Deduplication of Scraped Data
Remove duplicate records from scraped datasets using exact matching, fuzzy matching, and content hashing techniques in Python.
Data Parsing · #13 · intermediate · 3 min read
Scraping often produces duplicates: overlapping pagination, retried requests, and re-scraping the same site over time all yield repeated records. Deduplication is critical for data quality.
Exact Deduplication with pandas
import pandas as pd
# Simulated scraped data with duplicates
data = [
    {"title": "ScraperAPI Review", "url": "https://example.com/scraperapi", "price": 49.99},
    {"title": "ScrapingAnt Guide", "url": "https://example.com/scrapingant", "price": 29.00},
    {"title": "ScraperAPI Review", "url": "https://example.com/scraperapi", "price": 49.99},  # exact dup
    {"title": "scraperapi review", "url": "https://example.com/scraperapi", "price": 49.99},  # case variant
    {"title": "Bright Data Review", "url": "https://example.com/brightdata", "price": 99.00},
]
df = pd.DataFrame(data)
print(f"Before: {len(df)} rows")

# Exact duplicates: a row is dropped only when every column matches an
# earlier row, so the lower-case title variant survives this pass.
df_exact = df.drop_duplicates()
print(f"After exact dedup: {len(df_exact)} rows")

# Case-insensitive dedup on specific column.
# The normalized key lives in its own Series rather than in a temporary
# column, so the source frame `df` is left exactly as it was scraped.
title_key = df["title"].str.lower().str.strip()
# duplicated() flags every repeat after the first occurrence; keep the rest.
df_ci = df[~title_key.duplicated()]
print(f"After case-insensitive dedup: {len(df_ci)} rows")
URL-Based Deduplication
When scraping, URLs are often the best unique identifier:
import pandas as pd
from urllib.parse import urlparse, parse_qs, urlencode
def normalize_url(url):
    """Return a canonical form of *url* for use as a deduplication key.

    Normalization steps:
      * scheme and host are lower-cased (they are case-insensitive);
      * the path and query keep their case, because they are case-sensitive
        on most servers and lower-casing them would merge distinct pages;
      * trailing slashes are removed from the path;
      * query parameters are sorted by name, and parameters with an empty
        value are preserved;
      * the fragment is not included in the result.

    Args:
        url: Absolute URL string.

    Returns:
        The normalized URL string, without a trailing "?" when there is
        no query.
    """
    parsed = urlparse(url.strip())
    scheme = parsed.scheme.lower()
    host = parsed.netloc.lower()
    # Strip the slash from the path itself so "/product/?id=1" and
    # "/product?id=1" normalize to the same key.
    path = parsed.path.rstrip("/")
    # Sort query parameters for consistent comparison. keep_blank_values
    # stops "?a=&b=1" from silently collapsing to "?b=1".
    params = parse_qs(parsed.query, keep_blank_values=True)
    sorted_query = urlencode(sorted(params.items()), doseq=True)
    return f"{scheme}://{host}{path}?{sorted_query}".rstrip("?")
# URLs that look different but point to the same page
urls = [
    "https://example.com/product?id=1&ref=home",
    "https://example.com/product?ref=home&id=1",  # Same, different param order
    "https://Example.com/product?id=1&ref=home",  # Same, different case
    "https://example.com/product?id=2&ref=home",  # Different product
]
normalized = [normalize_url(u) for u in urls]
# dict.fromkeys keeps the first occurrence of each key in input order, so
# the result is reproducible from run to run; list(set(...)) is not,
# because string hashing is randomized per process.
unique_urls = list(dict.fromkeys(normalized))
print(f"Unique URLs: {len(unique_urls)} from {len(urls)}")
Fuzzy Deduplication
For near-duplicates with minor text differences, use fuzzy matching:
pip install thefuzz python-Levenshtein
from thefuzz import fuzz
import pandas as pd
# Sample headlines as (title, source) pairs, expanded into record dicts.
data = [
    {"title": headline, "source": origin}
    for headline, origin in (
        ("Best Web Scraping Tools 2025", "blog_a"),
        ("Best Web Scraping Tools for 2025", "blog_b"),  # near-dup
        ("Top 10 Proxy Services Compared", "blog_a"),
        ("Top Ten Proxy Services Compared", "blog_c"),  # near-dup
        ("Python Requests Library Tutorial", "blog_b"),
    )
]
df = pd.DataFrame(data)
def fuzzy_dedup(df, column, threshold=85):
    """Remove near-duplicate rows based on fuzzy string matching.

    Every value in *column* is compared, case-insensitively, against each
    later value with ``fuzz.ratio``. When the score reaches *threshold* the
    later row is dropped and the earlier one is kept. Each dropped row is
    reported with ``print``.

    The number of comparisons grows quadratically with the number of rows.

    Args:
        df: DataFrame to deduplicate. It is not modified.
        column: Name of the string column to compare.
        threshold: Minimum ``fuzz.ratio`` score for two values to count as
            duplicates.

    Returns:
        A new DataFrame containing the surviving rows, re-indexed from 0.
    """
    titles = df[column].tolist()
    # Lower-case once up front instead of on every pairwise comparison.
    lowered = [title.lower() for title in titles]
    to_drop = set()
    for i, current in enumerate(lowered):
        if i in to_drop:
            continue
        for j in range(i + 1, len(lowered)):
            if j in to_drop:
                continue
            similarity = fuzz.ratio(current, lowered[j])
            if similarity >= threshold:
                to_drop.add(j)
                print(f" Duplicate: '{titles[j]}' matches '{titles[i]}' ({similarity}%)")
    # to_drop holds row positions, not index labels, so select by position.
    # df.drop(index=...) would raise or drop the wrong rows whenever the
    # frame does not carry the default 0..n-1 index (e.g. after filtering).
    keep = [pos for pos in range(len(titles)) if pos not in to_drop]
    return df.iloc[keep].reset_index(drop=True)
# Run the fuzzy pass on the titles with a threshold of 80, then report
# how many rows went in and how many came out.
df_clean = fuzzy_dedup(df, "title", threshold=80)
rows_before, rows_after = len(df), len(df_clean)
print(f"\nBefore: {rows_before} rows -> After: {rows_after} rows")
Content Hash Deduplication
For large-scale deduplication, hash the content instead of comparing strings:
import hashlib
import pandas as pd
def content_hash(text):
    """Return the MD5 hex digest of *text* after normalization.

    The text is lower-cased and every run of whitespace is collapsed to a
    single space (leading and trailing whitespace is removed), so inputs
    that differ only in case or spacing produce the same digest.
    """
    words = text.lower().split()
    canonical = " ".join(words)
    digest = hashlib.md5()
    digest.update(canonical.encode())
    return digest.hexdigest()
# /page2 differs from /page1 only in spacing and letter case, so an exact
# string comparison treats them as different while the normalized content
# hash treats them as the same document.
data = [
    {"url": "/page1", "content": "Web scraping with Python is powerful."},
    {"url": "/page2", "content": "  Web   scraping with PYTHON is powerful.  "},  # Same after normalization
    {"url": "/page3", "content": "Learn to scrape websites with Python."},
]
df = pd.DataFrame(data)
# Hash each document once, dedup on the hash, then discard the helper
# column from the result.
df["hash"] = df["content"].apply(content_hash)
df_deduped = df.drop_duplicates(subset=["hash"]).drop(columns=["hash"])
print(f"Unique content: {len(df_deduped)} from {len(df)}")
Incremental Deduplication (Across Scraping Runs)
import json
import hashlib
from pathlib import Path
class DeduplicationTracker:
    """Track which records have been seen, across scraping runs.

    Each record is reduced to an MD5 digest of its key-sorted JSON form.
    The set of digests is held in memory in ``self.seen`` and persisted as
    a JSON list in ``seen_file`` when ``save()`` is called.
    """

    def __init__(self, seen_file="seen_hashes.json"):
        """Load previously saved digests from *seen_file*, if it exists."""
        self.seen_file = Path(seen_file)
        self.seen = self._load()

    def _load(self):
        """Return the saved digests as a set, or an empty set if no file exists."""
        try:
            raw = self.seen_file.read_text(encoding="utf-8")
        except FileNotFoundError:
            return set()
        return set(json.loads(raw))

    def save(self):
        """Persist the seen digests to ``seen_file``.

        The data is written to a sibling ".tmp" file and then renamed over
        the target, so an interrupted run cannot leave a truncated JSON
        file that would make the next load fail.
        """
        # Sorted output keeps the file stable between runs with equal content.
        payload = json.dumps(sorted(self.seen))
        tmp_file = self.seen_file.with_name(self.seen_file.name + ".tmp")
        tmp_file.write_text(payload, encoding="utf-8")
        tmp_file.replace(self.seen_file)

    def is_new(self, record):
        """Check if a record is new (not seen before).

        A new record's digest is added to ``self.seen`` as a side effect,
        so asking about the same record again returns False.

        Args:
            record: JSON-serializable object, typically a dict.

        Returns:
            True if the record's digest was not yet in ``self.seen``.
        """
        # sort_keys makes the digest independent of dict key order.
        key = hashlib.md5(json.dumps(record, sort_keys=True).encode()).hexdigest()
        if key in self.seen:
            return False
        self.seen.add(key)
        return True
# Usage across scraping runs
# NOTE(review): `scraped_items` is not defined in this snippet; presumably
# it is the list of record dicts collected by the current run - confirm.
tracker = DeduplicationTracker()
# is_new() records each unseen item as it goes, so repeats inside the same
# batch are filtered out as well.
new_items = list(filter(tracker.is_new, scraped_items))
tracker.save()
print(f"New items: {len(new_items)} out of {len(scraped_items)}")
Next Steps
- Normalize and validate deduplicated data
- Convert clean data to different output formats
- Build a complete data processing pipeline