Normalizing and Validating Scraped Data
Ensure scraped data quality through normalization and validation using Pydantic models, custom validators, and pandas techniques.
Data Parsing · #14 · Intermediate · 4 min read
Scraped data can contain unexpected values, wrong types, and inconsistent formats. Validation catches problems early; normalization makes data consistent and comparable.
Validation with Pydantic
Pydantic models define the expected shape and types for your data. Invalid records are rejected with clear error messages:
pip install pydantic
from pydantic import BaseModel, field_validator, HttpUrl
from typing import Optional
from datetime import date
class Product(BaseModel):
    """Expected shape of a single scraped product record.

    Each validator below checks one field and raises ``ValueError`` when
    the value is unacceptable; otherwise it returns the (possibly cleaned)
    value to store on the model.
    """

    name: str
    price: float
    currency: str = "USD"  # default applied when the record has no currency
    url: HttpUrl
    rating: Optional[float] = None  # a record may carry no rating
    scraped_date: date

    @field_validator("name")
    @classmethod
    def name_not_empty(cls, value):
        """Strip surrounding whitespace and reject names that end up blank."""
        trimmed = value.strip()
        if trimmed:
            return trimmed
        raise ValueError("Name cannot be empty")

    @field_validator("price")
    @classmethod
    def price_positive(cls, value):
        """Reject zero or negative prices; round accepted ones to 2 decimals."""
        if value <= 0:
            raise ValueError("Price must be positive")
        return round(value, 2)

    @field_validator("rating")
    @classmethod
    def rating_range(cls, value):
        """Allow a missing rating, or one inside the inclusive range 0..5."""
        if value is None:
            return value
        if not (0 <= value <= 5):
            raise ValueError("Rating must be between 0 and 5")
        return value
# Validate scraped records
from pydantic import ValidationError

raw_products = [
    {"name": "ScraperAPI", "price": 49.99, "url": "https://scraperapi.com", "rating": 4.5, "scraped_date": "2025-03-15"},
    {"name": "", "price": 29.00, "url": "https://scrapingant.com", "scraped_date": "2025-03-15"},  # bad: empty name
    {"name": "Bright Data", "price": -10, "url": "https://brightdata.com", "scraped_date": "2025-03-15"},  # bad: negative price
    {"name": "Oxylabs", "price": 99.00, "url": "https://oxylabs.io", "rating": 6.0, "scraped_date": "2025-03-15"},  # bad: rating > 5
]

valid = []   # dumped dicts of records that passed validation
errors = []  # {"data": <raw record>, "error": <message>} for rejected records

for raw in raw_products:
    try:
        product = Product(**raw)
    except ValidationError as e:
        # Only validation failures are collected as rejected records; any
        # other exception is a programming error and should surface.
        errors.append({"data": raw, "error": str(e)})
    else:
        valid.append(product.model_dump())

print(f"Valid: {len(valid)}, Errors: {len(errors)}")
for err in errors:
    # Messages are cut to 80 characters to keep the report compact.
    print(f" Rejected: {err['data'].get('name', 'N/A')} - {err['error'][:80]}")
Text Normalization
import re
import unicodedata
def normalize_text(text):
    """Standardize text for consistent storage and comparison.

    Steps, in order:
      1. Decode HTML entities (``&amp;`` -> ``&``, ``&lt;`` -> ``<``, ...).
      2. Apply Unicode NFKC normalization (compatibility composition), which
         among other things turns a non-breaking space into a plain space.
      3. Remove zero-width characters (U+200B, U+200C, U+200D, U+FEFF).
      4. Collapse every whitespace run to one space and trim both ends.

    Args:
        text: The raw string. Any falsy value (``None``, ``""``) yields ``""``.

    Returns:
        The cleaned string.
    """
    import html  # local import so the function does not rely on extra file-level imports

    if not text:
        return ""

    # Decode entities first so characters they produce (e.g. &nbsp;) go
    # through the same normalization as literal characters.
    text = html.unescape(text)
    text = unicodedata.normalize("NFKC", text)

    # Zero-width characters are not matched by \s, so they must be removed
    # BEFORE collapsing whitespace; otherwise "a \u200b b" would keep two spaces.
    text = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)

    return re.sub(r'\s+', ' ', text).strip()
def normalize_price(price_str):
    """Convert various price formats to a float rounded to 2 decimals.

    Everything except digits, ``.`` and ``,`` is discarded first, so currency
    symbols, codes and spaces are ignored. Note that a minus sign is also
    discarded by this step.

    Separator rules:
      * Both ``,`` and ``.`` present: whichever appears last is the decimal
        separator (``1.234,56`` and ``1,234.56`` both give ``1234.56``).
      * Only commas: a final group of 1 or 2 digits is a decimal part
        (``29,99``, ``1,5``); otherwise commas are thousands separators.
      * Several dots and no comma: a number cannot have two decimal points,
        so a 3-digit final group means all dots are thousands separators;
        otherwise only the last dot is the decimal point.

    Args:
        price_str: A price as text or as a number. ``None`` means "missing".

    Returns:
        The price as ``float``, or ``None`` if the value is missing or no
        number can be parsed from it.
    """
    # Compare against None explicitly: a numeric 0 is a real price, not "missing".
    if price_str is None:
        return None

    cleaned = re.sub(r'[^\d.,]', '', str(price_str))

    if ',' in cleaned and '.' in cleaned:
        if cleaned.rindex(',') > cleaned.rindex('.'):
            # European format (1.234,56 -> 1234.56)
            cleaned = cleaned.replace('.', '').replace(',', '.')
        else:
            cleaned = cleaned.replace(',', '')
    elif ',' in cleaned:
        # Could be decimal comma (29,99) or thousands (1,000). Thousands
        # groups always have 3 digits, so a shorter final group is decimal.
        head, _, tail = cleaned.rpartition(',')
        if len(tail) in (1, 2):
            # Only the LAST comma is the decimal point; earlier ones group thousands.
            cleaned = head.replace(',', '') + '.' + tail
        else:
            cleaned = cleaned.replace(',', '')
    elif cleaned.count('.') > 1:
        head, _, tail = cleaned.rpartition('.')
        if len(tail) == 3:
            cleaned = cleaned.replace('.', '')
        else:
            cleaned = head.replace('.', '') + '.' + tail

    try:
        return round(float(cleaned), 2)
    except ValueError:
        # Nothing numeric survived cleaning (e.g. "", "N/A").
        return None
# Test: print the normalized value of each sample (expected output in the trailing comment)
for sample in (
    "$1,299.99",  # 1299.99
    "EUR 29,99",  # 29.99
    "1.234,56",   # 1234.56
):
    print(normalize_price(sample))
Schema Validation with pandas
import pandas as pd
import numpy as np
def validate_dataframe(df, schema):
    """Validate a DataFrame against a schema definition.

    Args:
        df: The pandas DataFrame to check.
        schema: Mapping of column name -> rule dict. Recognized rule keys:
            ``nullable`` (bool, default True) - whether nulls are allowed;
            ``type`` - only ``"numeric"`` is checked;
            ``min`` / ``max`` - inclusive bounds for the numeric value.

    Returns:
        A list of human-readable issue strings, in schema order. The list is
        empty when no rule is violated.
    """
    issues = []

    for col, rules in schema.items():
        if col not in df.columns:
            issues.append(f"Missing column: {col}")
            continue

        # Check for nulls
        null_count = df[col].isna().sum()
        if not rules.get("nullable", True) and null_count > 0:
            issues.append(f"{col}: {null_count} null values")

        is_numeric_rule = rules.get("type") == "numeric"
        if not (is_numeric_rule or "min" in rules or "max" in rules):
            continue

        # Coerce once and reuse; values that cannot be parsed become NaN.
        numeric = pd.to_numeric(df[col], errors="coerce")

        # Check data type
        if is_numeric_rule:
            # NaNs after coercion include the original nulls; subtract them so
            # the report counts only values that are present but not numeric.
            non_numeric = numeric.isna().sum() - null_count
            if non_numeric > 0:
                issues.append(f"{col}: {non_numeric} non-numeric values")

        # Check value range (comparisons with NaN are False, so nulls and
        # unparseable values are never counted as out of range)
        if "min" in rules:
            below = (numeric < rules["min"]).sum()
            if below > 0:
                issues.append(f"{col}: {below} values below {rules['min']}")
        if "max" in rules:
            above = (numeric > rules["max"]).sum()
            if above > 0:
                issues.append(f"{col}: {above} values above {rules['max']}")

    return issues
# Define schema: one rule dict per column that must be checked
schema = {
    "name": {"nullable": False},
    "price": {"type": "numeric", "nullable": False, "min": 0},
    "rating": {"type": "numeric", "min": 0, "max": 5},
}

# Sample data, given column by column; rows 2 and 3 contain deliberate problems
# (a missing name, a negative price and a rating above the maximum).
df = pd.DataFrame({
    "name": ["ScraperAPI", None, "Test"],
    "price": [49.99, 29.00, -5],
    "rating": [4.5, 3.8, 6.0],
})

# Run the validation and report every problem found
issues = validate_dataframe(df, schema)
for problem in issues:
    print(f" Issue: {problem}")
Normalization Pipeline
def _lowercase_authority(url):
    """Lowercase the scheme and host of *url*, leaving path, query and fragment as they are."""
    # Matches an optional "scheme://" followed by everything up to the first
    # "/", "?" or "#". Only that prefix is case-insensitive in a URL.
    return re.sub(
        r'^(?:[a-zA-Z][a-zA-Z0-9+.-]*://)?[^/?#]*',
        lambda match: match.group(0).lower(),
        url,
        count=1,
    )


def _clamp_rating(value):
    """Coerce *value* to a float in [0, 5]; missing or unparseable values give 0.0."""
    try:
        rating = float(value or 0)
    except (TypeError, ValueError):
        # e.g. "N/A" - treated the same way as a missing rating
        return 0.0
    if rating != rating:  # NaN compares unequal to itself and would slip through min/max
        return 0.0
    return min(max(rating, 0.0), 5.0)


def normalize_record(raw):
    """Full normalization pipeline for a scraped record.

    Args:
        raw: Dict of scraped values. The keys read are ``name``, ``price``,
            ``url``, ``description`` and ``rating``; each may be absent or None.

    Returns:
        A new dict with:
          * ``name`` - cleaned with ``normalize_text``;
          * ``price`` - parsed with ``normalize_price`` (float or None);
          * ``url`` - trimmed, trailing slashes removed, scheme and host
            lowercased (the path and query keep their case, since they can
            be case-sensitive);
          * ``description`` - cleaned with ``normalize_text`` and cut to 500
            characters;
          * ``rating`` - float clamped to the range 0..5; a missing or
            unparseable rating becomes 0.0.
    """
    # `or ""` also covers a key that is present with a None value.
    url = str(raw.get("url") or "").strip().rstrip("/")

    return {
        "name": normalize_text(raw.get("name", "")),
        "price": normalize_price(raw.get("price")),
        "url": _lowercase_authority(url),
        "description": normalize_text(raw.get("description", ""))[:500],
        "rating": _clamp_rating(raw.get("rating")),
    }
# Two raw records with typical scraping noise: padding spaces, currency
# markers, mixed-case host, invisible characters and a missing rating.
raw_records = [
    {
        "name": " ScraperAPI ",
        "price": "$49.99",
        "url": "https://ScraperAPI.com/",
        "description": "Best\u00a0proxy\u200bservice",
        "rating": "4.5",
    },
    {
        "name": "ScrapingAnt",
        "price": "EUR 29,00",
        "url": "https://scrapingant.com",
        "description": "",
        "rating": None,
    },
]

# Run every record through the pipeline, then show the results one per line
normalized = list(map(normalize_record, raw_records))
for record in normalized:
    print(record)
Next Steps
- Convert validated data to CSV, JSON, Excel, and SQL
- Build a complete data processing pipeline
- Automate data quality checks in production