Extracting Emails and Phone Numbers from Web Pages
Extract email addresses and phone numbers from scraped web pages using regex patterns, BeautifulSoup, and validation techniques.
Data Parsing · #10 · Beginner · 3 min read
Extracting contact information from web pages is one of the most common scraping tasks. Emails and phone numbers appear in various formats and need robust patterns to catch them all.
Extracting Email Addresses
import re
import requests
from bs4 import BeautifulSoup
def extract_emails(text):
    """Return the email addresses found in ``text``.

    Addresses that differ only in letter case are treated as duplicates; the
    first spelling encountered is the one kept. Results are returned in order
    of first appearance.
    """
    email_re = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # Maps the lowercased address to the first spelling seen. Dicts keep
    # insertion order, so the values come back in order of first appearance.
    first_spelling = {}
    for candidate in re.findall(email_re, text):
        first_spelling.setdefault(candidate.lower(), candidate)
    return list(first_spelling.values())
# From a web page
response = requests.get("https://quotes.toscrape.com/", timeout=15)
soup = BeautifulSoup(response.text, "lxml")
# Join text nodes with a space: without a separator, text from adjacent
# elements is concatenated and the email regex can swallow neighbouring words.
page_text = soup.get_text(separator=" ")
emails = extract_emails(page_text)
print(f"Found emails: {emails}")
Handling Obfuscated Emails
Sites often disguise emails to prevent scraping:
import re
from bs4 import BeautifulSoup
# Sample markup showing four ways an address can appear on a page:
#   1. bracketed words:      support [at] example [dot] com
#   2. parenthesised words:  info(at)example(dot)org
#   3. a plain mailto: link
#   4. user and domain split across data-* attributes
html = """
<p>Contact: support [at] example [dot] com</p>
<p>Email: info(at)example(dot)org</p>
<a href="mailto:sales@example.com">Email us</a>
<span class="email" data-user="admin" data-domain="example.com">protected</span>
"""
# Parse the sample with the lxml parser so it can be queried with CSS selectors.
soup = BeautifulSoup(html, "lxml")
def extract_all_emails(soup):
    """Collect email addresses from a parsed page, including obfuscated ones.

    Three sources are checked, in this order:

    1. ``mailto:`` links (query string dropped, percent-encoding decoded,
       comma-separated recipients split).
    2. Elements carrying both ``data-user`` and ``data-domain`` attributes.
    3. Visible text, after rewriting ``[at]`` / ``(at)`` / ``[dot]`` /
       ``(dot)`` markers (any letter case, optional surrounding spaces) back
       to ``@`` and ``.``.

    Args:
        soup: A parsed BeautifulSoup document (or tag).

    Returns:
        A list of addresses in order of discovery. Addresses that differ only
        in letter case are reported once, using the first spelling seen.
    """
    from urllib.parse import unquote

    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    found = {}  # lowercased address -> first spelling seen (insertion-ordered)

    def remember(address):
        address = address.strip()
        if address:  # an empty "mailto:" link must not yield ""
            found.setdefault(address.lower(), address)

    # 1. Standard mailto links
    for a in soup.select("a[href^='mailto:']"):
        # The selector guarantees the prefix, so slice it off rather than
        # replacing every occurrence of "mailto:" in the value.
        target = a["href"][len("mailto:"):].split("?")[0]
        for address in unquote(target).split(","):
            remember(address)

    # 2. Data-attribute based obfuscation
    for el in soup.select("[data-user][data-domain]"):
        user = el["data-user"].strip()
        domain = el["data-domain"].strip()
        if user and domain:
            remember(f"{user}@{domain}")

    # 3. Text-based obfuscation
    # A separator keeps text from adjacent elements from being fused together.
    text = soup.get_text(separator=" ")
    deobfuscated = re.sub(r'\s*[\[(]\s*at\s*[\])]\s*', '@', text, flags=re.IGNORECASE)
    deobfuscated = re.sub(r'\s*[\[(]\s*dot\s*[\])]\s*', '.', deobfuscated, flags=re.IGNORECASE)
    for address in re.findall(pattern, deobfuscated):
        remember(address)

    return list(found.values())
# Run the extractor on the sample document and list each address on its own line.
found = extract_all_emails(soup)
for address in found:
    print(address)
Extracting Phone Numbers
import re
def extract_phones(text):
    """Extract phone numbers in various formats.

    Each pattern is tried over the whole text. Candidates must contain 10-15
    digits and must not be a slice of a longer run of digits. When candidates
    from different patterns overlap (e.g. an international number and the
    national fragment inside it), only the longest one is kept.

    Args:
        text: The text to search. Empty or ``None`` yields an empty list.

    Returns:
        A list of unique phone strings, exactly as written in the text, in
        order of first appearance.
    """
    if not text:
        return []

    patterns = [
        r'\+?1?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',  # US/CA
        r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}',  # International
        r'\d{5}\s?\d{5}',  # Indian mobile (10 digits)
    ]

    candidates = []  # (start, end, phone)
    for pattern in patterns:
        # Digit boundaries stop the pattern from matching a 10-digit slice of
        # a longer number such as an order ID.
        bounded = r'(?<!\d)(?:' + pattern + r')(?!\d)'
        for match in re.finditer(bounded, text):
            raw = match.group()
            # The optional leading separator may have captured a space, dash
            # or dot that is not part of the number itself.
            lead = re.match(r'[-.\s]*', raw).end()
            phone = raw[lead:]
            digits = re.sub(r'\D', '', phone)
            if 10 <= len(digits) <= 15:  # Valid phone length
                candidates.append((match.start() + lead, match.end(), phone))

    # Longest candidates first, so a full number wins over a fragment of it.
    candidates.sort(key=lambda c: (c[0] - c[1], c[0]))
    accepted = []
    for start, end, phone in candidates:
        if all(end <= s or e <= start for s, e, _ in accepted):
            accepted.append((start, end, phone))

    # Back to document order, then drop exact repeats while keeping that order.
    accepted.sort()
    return list(dict.fromkeys(phone for _, _, phone in accepted))
# Sample text covering US, UK and Indian number formats plus a dotted fax number.
text = """
Call us: (415) 555-0123 or +1-800-555-0199
UK office: +44 20 7946 0958
India: 98765 43210
Fax: 415.555.0100
"""
phones = extract_phones(text)
for number in phones:
    print(number)
Complete Contact Extractor
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
def scrape_contacts(url):
    """Extract all contact information from a page.

    Fetches ``url``, runs ``extract_emails`` and ``extract_phones`` over the
    page's visible text, then adds any addresses or numbers that appear only
    in ``mailto:`` / ``tel:`` link targets.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict with keys ``"url"``, ``"emails"`` and ``"phones"``; the last
        two are lists without duplicates.
    """
    from urllib.parse import unquote

    response = requests.get(url, timeout=15)
    soup = BeautifulSoup(response.text, "lxml")
    text = soup.get_text(separator=" ")

    contacts = {
        "url": url,
        "emails": extract_emails(text),
        "phones": extract_phones(text),
    }

    # Check mailto links
    # Compare case-insensitively, matching how extract_emails de-duplicates.
    known_emails = {email.lower() for email in contacts["emails"]}
    for a in soup.select("a[href^='mailto:']"):
        # Drop the query part (?subject=...) before decoding, so an encoded
        # "?" inside the address is not mistaken for the query delimiter.
        target = a["href"][len("mailto:"):].split("?")[0]
        for email in unquote(target).split(","):  # a link may list several recipients
            email = email.strip()
            if email and email.lower() not in known_emails:
                known_emails.add(email.lower())
                contacts["emails"].append(email)

    # Check tel links
    known_phones = set(contacts["phones"])
    for a in soup.select("a[href^='tel:']"):
        phone = unquote(a["href"][len("tel:"):]).strip()
        if phone and phone not in known_phones:  # skip empty "tel:" targets
            known_phones.add(phone)
            contacts["phones"].append(phone)

    return contacts
# Also check common contact pages
def find_contact_pages(base_url, soup):
    """Find links to contact/about pages.

    A link qualifies when its ``href`` or its visible text contains
    "contact", "about" or "support" (case-insensitive). Links whose resolved
    URL uses a scheme other than http/https (``mailto:``, ``tel:``,
    ``javascript:`` ...) are skipped because they are not pages.

    Args:
        base_url: URL that relative links are resolved against.
        soup: A parsed BeautifulSoup document (or tag).

    Returns:
        A list of unique absolute URLs in document order.
    """
    from urllib.parse import urlparse

    keywords = ("contact", "about", "support")
    pages = {}  # used as an insertion-ordered set
    for a in soup.select("a[href]"):
        href = a["href"]
        href_lower = href.lower()
        label = a.get_text(strip=True).lower()
        if not any(kw in href_lower or kw in label for kw in keywords):
            continue
        absolute = urljoin(base_url, href)
        # An empty scheme is tolerated so a scheme-less base_url still yields
        # results, as it did before this filter was added.
        if urlparse(absolute).scheme not in ("", "http", "https"):
            continue
        pages.setdefault(absolute, None)
    return list(pages)
Validation
import re
def validate_email(email):
    """Basic email validation.

    Checks only the overall ``local@domain.tld`` shape; it does not verify
    that the domain exists or that the mailbox is deliverable.

    Args:
        email: The candidate address.

    Returns:
        True if the entire string has the expected shape, otherwise False.
        Non-string input returns False.
    """
    if not isinstance(email, str):
        return False
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # fullmatch instead of match with "$": "$" also matches just before a
    # trailing newline, which would let "user@example.com\n" pass.
    return re.fullmatch(pattern, email) is not None
def validate_phone(phone):
    """Return True when ``phone`` contains between 10 and 15 digits.

    Only digits are counted; spaces, punctuation and letters are ignored.
    """
    digit_count = len(re.findall(r'\d', phone))
    return not (digit_count < 10 or digit_count > 15)
# Filter results
# NOTE(review): raw_emails / raw_phones are not defined in this snippet;
# presumably they are the lists produced by the extraction step - confirm.
valid_emails = list(filter(validate_email, raw_emails))
valid_phones = list(filter(validate_phone, raw_phones))
For scraping contact pages behind bot protection, route your requests through ScraperAPI with proxy rotation enabled.
Next Steps
- Parse dates and prices from scraped text
- Deduplicate extracted contact data
- Clean and normalize contact information