Extracting Emails and Phone Numbers from Web Pages - Data Parsing

Extract email addresses and phone numbers from scraped web pages using regex patterns, BeautifulSoup, and validation techniques.

Extracting contact information from web pages is one of the most common scraping tasks. Emails and phone numbers appear in various formats and need robust patterns to catch them all.

Extracting Email Addresses

import re
import requests
from bs4 import BeautifulSoup

def extract_emails(text):
    """Return the email addresses found in ``text``.

    Uniqueness is judged case-insensitively. For each distinct address the
    first spelling encountered is kept, and results are returned in order
    of first appearance.
    """
    email_re = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
    # Dicts keep insertion order, so keying on the lowercased address gives
    # ordered, case-insensitive de-duplication in a single pass.
    first_spelling = {}
    for match in email_re.finditer(text):
        address = match.group(0)
        first_spelling.setdefault(address.lower(), address)
    return list(first_spelling.values())

# From a web page: download it, reduce the HTML to plain text, then scan
# that text with extract_emails().
response = requests.get("https://quotes.toscrape.com/", timeout=15)
# Parse the response body with the lxml parser.
soup = BeautifulSoup(response.text, "lxml")
# get_text() returns only text content; addresses that live solely in
# attributes (for example a mailto: href) are not part of page_text.
page_text = soup.get_text()
emails = extract_emails(page_text)
print(f"Found emails: {emails}")

Handling Obfuscated Emails

Sites often disguise emails to prevent scraping:

import re
from bs4 import BeautifulSoup

# Sample markup with one address per technique handled by
# extract_all_emails(): "[at]/[dot]" text, "(at)/(dot)" text, a mailto:
# link, and an address split across data-user / data-domain attributes.
html = """
<p>Contact: support [at] example [dot] com</p>
<p>Email: info(at)example(dot)org</p>
<a href="mailto:sales@example.com">Email us</a>
<span class="email" data-user="admin" data-domain="example.com">protected</span>
"""

# Parse the sample so it can be queried with CSS selectors.
soup = BeautifulSoup(html, "lxml")

def extract_all_emails(soup):
    """Collect email addresses from a parsed page, including obfuscated ones.

    Three sources are checked, in this order:

    1. ``mailto:`` links (any ``?subject=...`` query part is dropped).
    2. Elements carrying both ``data-user`` and ``data-domain`` attributes.
    3. The text content, after rewriting ``[at]`` / ``(at)`` and
       ``[dot]`` / ``(dot)`` markers into ``@`` and ``.``.

    Args:
        soup: A parsed BeautifulSoup document (or tag) to search.

    Returns:
        A list of unique addresses in first-seen order. Duplicates are
        detected case-insensitively and the first spelling is kept.
    """
    found = []
    seen = set()

    def add(candidate):
        # Ignore blanks (e.g. href="mailto:" or "mailto:?subject=Hi") and
        # case-insensitive repeats, while keeping first-seen order.
        candidate = candidate.strip()
        key = candidate.lower()
        if candidate and key not in seen:
            seen.add(key)
            found.append(candidate)

    # 1. Standard mailto links
    for a in soup.select("a[href^='mailto:']"):
        # The selector guarantees the prefix, so slice it off rather than
        # replacing every "mailto:" occurrence inside the value.
        add(a["href"][len("mailto:"):].split("?")[0])

    # 2. Data-attribute based obfuscation
    for el in soup.select("[data-user][data-domain]"):
        user = el["data-user"].strip()
        domain = el["data-domain"].strip()
        # Both halves are required; a lone "@domain" is not an address.
        if user and domain:
            add(f"{user}@{domain}")

    # 3. Text-based obfuscation
    text = soup.get_text()
    # Accept the markers in any letter case and with or without surrounding
    # spaces/tabs ("user [at] host", "user[AT]host", "user (at) host").
    # Newlines are deliberately not swallowed so separate lines stay apart.
    deobfuscated = re.sub(
        r'[ \t]*(?:\[at\]|\(at\))[ \t]*', '@', text, flags=re.IGNORECASE
    )
    deobfuscated = re.sub(
        r'[ \t]*(?:\[dot\]|\(dot\))[ \t]*', '.', deobfuscated, flags=re.IGNORECASE
    )
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    for match in re.findall(pattern, deobfuscated):
        add(match)

    return found

# Run the extractor on the parsed sample and print one address per line.
found = extract_all_emails(soup)
for email in found:
    print(email)

Extracting Phone Numbers

import re

def extract_phones(text):
    """Extract phone numbers in various formats.

    Recognises US/CA numbers (optionally prefixed with ``1`` or ``+1``),
    ``+``-prefixed international numbers, and 10-digit Indian mobile
    numbers written as two groups of five.

    Args:
        text: The text to scan.

    Returns:
        A list of the matched numbers exactly as written in ``text``, in
        order of appearance. Only candidates with 10 to 15 digits are kept,
        and a number whose digits repeat an earlier match (for example
        ``(415) 555-0123`` and ``415.555.0123``) is reported once.
    """
    patterns = [
        # The country-code prefix is one optional unit so that a stray
        # leading "-", "." or space is never pulled into the match.
        r'(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',  # US/CA
        r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}',  # International
        r'\d{5}\s?\d{5}',  # Indian mobile (10 digits)
    ]
    # Scanning once with a single alternation yields non-overlapping matches,
    # so "+91 98765 43210" is not reported again as "98765 43210".
    # The digit lookarounds stop matches from starting or ending inside a
    # longer run of digits such as an order ID or a timestamp.
    combined = re.compile(r'(?<!\d)(?:' + '|'.join(patterns) + r')(?!\d)')

    phones = []
    seen_digits = set()
    for match in combined.finditer(text):
        phone = match.group(0)
        digits = re.sub(r'\D', '', phone)
        if not 10 <= len(digits) <= 15:  # Valid phone length
            continue
        if digits in seen_digits:
            continue
        seen_digits.add(digits)
        phones.append(phone)

    return phones

# Sample text with one number per style: bracketed area code, +1 with
# dashes, a +44 number, two groups of five digits, and dot-separated.
text = """
Call us: (415) 555-0123 or +1-800-555-0199
UK office: +44 20 7946 0958
India: 98765 43210
Fax: 415.555.0100
"""

# Extract the numbers and print one per line.
phones = extract_phones(text)
for phone in phones:
    print(phone)

Complete Contact Extractor

import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def scrape_contacts(url):
    """Extract all contact information from a page.

    Fetches ``url``, runs ``extract_emails`` and ``extract_phones`` over the
    page text, then adds any address or number found in ``mailto:`` and
    ``tel:`` link targets that the text scan did not already produce.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict with the keys ``"url"`` (the argument as given), ``"emails"``
        and ``"phones"`` (both lists of strings).
    """
    response = requests.get(url, timeout=15)
    soup = BeautifulSoup(response.text, "lxml")
    # Join text nodes with a space so content from adjacent elements does
    # not run together into one token.
    text = soup.get_text(separator=" ")

    contacts = {
        "url": url,
        "emails": extract_emails(text),
        "phones": extract_phones(text),
    }

    # Check mailto links
    # Compare case-insensitively so "Info@Example.com" in a link is not
    # added next to "info@example.com" from the text.
    known_emails = {email.lower() for email in contacts["emails"]}
    for a in soup.select("a[href^='mailto:']"):
        # The selector guarantees the prefix; slice it off and drop any
        # "?subject=..." query part.
        email = a["href"][len("mailto:"):].split("?")[0].strip()
        # An empty target (href="mailto:") carries no address.
        if email and email.lower() not in known_emails:
            known_emails.add(email.lower())
            contacts["emails"].append(email)

    # Check tel links
    for a in soup.select("a[href^='tel:']"):
        phone = a["href"][len("tel:"):].strip()
        # Skip empty targets (href="tel:") and exact repeats.
        if phone and phone not in contacts["phones"]:
            contacts["phones"].append(phone)

    return contacts

# Also check common contact pages
def find_contact_pages(base_url, soup):
    """Find links to contact/about pages.

    A link qualifies when its ``href`` or its visible text contains
    "contact", "about" or "support" (case-insensitive).

    Args:
        base_url: URL used to resolve relative ``href`` values.
        soup: Parsed BeautifulSoup document (or tag) to search.

    Returns:
        A list of unique absolute URLs in first-seen order. Links whose
        resolved URL uses a non-web scheme (``mailto:``, ``tel:``,
        ``javascript:`` and so on) are left out because they are not pages.
    """
    # Imported here so the function does not rely on extra file-level imports.
    from urllib.parse import urlsplit

    keywords = ("contact", "about", "support")
    contact_urls = []
    seen = set()
    for a in soup.select("a[href]"):
        raw_href = a["href"]
        href = raw_href.lower()
        text = a.get_text(strip=True).lower()
        if not any(kw in href or kw in text for kw in keywords):
            continue

        absolute = urljoin(base_url, raw_href)
        # Only reject explicit non-web schemes; a result with no scheme
        # (possible when base_url itself has none) is kept as before.
        scheme = urlsplit(absolute).scheme
        if scheme and scheme not in ("http", "https"):
            continue

        if absolute not in seen:
            seen.add(absolute)
            contact_urls.append(absolute)
    return contact_urls

Validation

import re

def validate_email(email):
    """Basic email validation.

    Checks the overall shape only: a local part, ``@``, a domain, and a
    final label of at least two letters. It does not verify that the
    address exists.

    Args:
        email: Candidate address.

    Returns:
        True when the whole string has that shape, otherwise False.
    """
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # fullmatch() must consume the entire string. match() with "$" would
    # also accept a value ending in a newline, because "$" matches just
    # before a trailing "\n".
    return bool(re.fullmatch(pattern, email))

def validate_phone(phone):
    """Report whether ``phone`` contains between 10 and 15 digits inclusive.

    Only digits are counted; spaces, dashes, brackets and a leading ``+``
    are ignored.
    """
    digit_count = len(re.findall(r'\d', phone))
    if digit_count < 10:
        return False
    return digit_count <= 15

# Filter results
# NOTE(review): raw_emails and raw_phones are not defined in this snippet;
# presumably they are the lists returned by the extraction functions —
# confirm and assign them before running these lines.
valid_emails = [e for e in raw_emails if validate_email(e)]
valid_phones = [p for p in raw_phones if validate_phone(p)]

For scraping contact pages behind bot protection, route your requests through ScraperAPI with proxy rotation enabled.

Next Steps

Parse dates and prices from scraped text
Deduplicate extracted contact data
Clean and normalize contact information