Normalizing and Validating Scraped Data - Data Parsing

Ensure scraped data quality through normalization and validation using Pydantic models, custom validators, and pandas techniques.

Scraped data can contain unexpected values, wrong types, and inconsistent formats. Validation catches problems early; normalization makes data consistent and comparable.

Validation with Pydantic

Pydantic models define the expected shape and types for your data. Invalid records are rejected with clear error messages:

pip install "pydantic>=2"  # the examples below use the Pydantic v2 API (field_validator, model_dump)

from pydantic import BaseModel, field_validator, HttpUrl
from typing import Optional
from datetime import date

class Product(BaseModel):
    """Expected shape of a single scraped product record.

    ``currency`` defaults to "USD" and ``rating`` may be omitted; every other
    field is required. The validators below additionally trim the name,
    require a strictly positive price (rounded to two decimals) and keep the
    rating inside the 0-5 range.
    """

    name: str
    price: float
    currency: str = "USD"
    url: HttpUrl
    rating: Optional[float] = None
    scraped_date: date

    @field_validator("name")
    @classmethod
    def name_not_empty(cls, value):
        """Return the name without surrounding whitespace; reject blank names."""
        trimmed = value.strip()
        if trimmed:
            return trimmed
        raise ValueError("Name cannot be empty")

    @field_validator("price")
    @classmethod
    def price_positive(cls, value):
        """Reject zero or negative prices and round the rest to two decimals."""
        is_non_positive = value <= 0
        if is_non_positive:
            raise ValueError("Price must be positive")
        return round(value, 2)

    @field_validator("rating")
    @classmethod
    def rating_range(cls, value):
        """Accept a missing rating or one between 0 and 5 inclusive."""
        if value is None or 0 <= value <= 5:
            return value
        raise ValueError("Rating must be between 0 and 5")

# Validate scraped records
from pydantic import ValidationError

raw_products = [
    {"name": "ScraperAPI", "price": 49.99, "url": "https://scraperapi.com", "rating": 4.5, "scraped_date": "2025-03-15"},
    {"name": "", "price": 29.00, "url": "https://scrapingant.com", "scraped_date": "2025-03-15"},  # bad: empty name
    {"name": "Bright Data", "price": -10, "url": "https://brightdata.com", "scraped_date": "2025-03-15"},  # bad: negative price
    {"name": "Oxylabs", "price": 99.00, "url": "https://oxylabs.io", "rating": 6.0, "scraped_date": "2025-03-15"},  # bad: rating > 5
]

valid = []
errors = []
for raw in raw_products:
    try:
        product = Product(**raw)
    except ValidationError as e:
        # Only validation failures count as rejected records; any other
        # exception is a programming error and should surface, not be logged
        # as bad data.
        errors.append({"data": raw, "error": str(e)})
    else:
        valid.append(product.model_dump())

print(f"Valid: {len(valid)}, Errors: {len(errors)}")
for err in errors:
    # The error text spans several lines; collapse whitespace so the
    # 80-character preview stays on a single output line.
    summary = " ".join(err["error"].split())
    print(f"  Rejected: {err['data'].get('name', 'N/A')} - {summary[:80]}")

Text Normalization

import re
import unicodedata

def normalize_text(text):
    """Standardize text for consistent storage and comparison.

    Steps, in order: decode HTML entities, apply Unicode NFKC normalization,
    remove zero-width characters, then collapse every whitespace run to a
    single space and trim both ends.

    Args:
        text: Raw string. ``None`` and ``""`` are accepted.

    Returns:
        The cleaned string, or ``""`` when ``text`` is falsy.
    """
    import html  # function-scope import keeps this snippet self-contained

    if not text:
        return ""

    # Decode entities in a single pass (so "&amp;lt;" becomes "&lt;", not "<")
    # and before normalization, so that "&nbsp;" ends up as ordinary whitespace.
    text = html.unescape(text)

    # NFKC composes characters and folds compatibility forms (non-breaking
    # space, full-width letters, ligatures) into their plain equivalents.
    text = unicodedata.normalize("NFKC", text)

    # Remove zero-width characters BEFORE collapsing whitespace; otherwise
    # "a \u200b b" would be left with a double space and "\ufeff x" with a
    # leading one.
    text = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)

    # Collapse whitespace
    return re.sub(r'\s+', ' ', text).strip()

def normalize_price(price_str):
    """Convert various price formats to a float rounded to two decimals.

    Handles currency symbols/codes, US style ("$1,299.99"), European style
    ("1.234,56", "29,99", "1.234.567") and space-grouped thousands
    ("1 234,56").

    Args:
        price_str: A price as text, or an already-numeric ``int``/``float``.

    Returns:
        The price as a ``float`` rounded to 2 decimals, or ``None`` when the
        input is ``None``, empty, non-finite, or contains no parsable number.
        A minus sign is honoured only when it is the first character of the
        text; elsewhere it is discarded like any other non-numeric character.
    """
    # Numbers need no text parsing. Going through str() would drop the sign
    # (-5 -> "5"), mangle scientific notation (1e-07 -> "107") and, with a
    # plain truthiness check, turn a legitimate price of 0 into None.
    if isinstance(price_str, (int, float)) and not isinstance(price_str, bool):
        value = float(price_str)
        if value != value or abs(value) == float("inf"):  # NaN / infinity
            return None
        return round(value, 2)

    if price_str is None:
        return None

    text = str(price_str).strip()
    negative = text.startswith(('-', '\u2212'))  # ASCII hyphen or Unicode minus

    # A '.' or ',' that is not directly followed by a digit is punctuation,
    # not a separator: "Rs. 500", "Price: $29.99.", "29,- EUR".
    text = re.sub(r'[.,](?!\d)', '', text)
    cleaned = re.sub(r'[^\d.,]', '', text)

    if ',' in cleaned and '.' in cleaned:
        # Whichever separator appears last is the decimal mark.
        if cleaned.rindex(',') > cleaned.rindex('.'):
            # European format (1.234,56 -> 1234.56)
            cleaned = cleaned.replace('.', '').replace(',', '.')
        else:
            cleaned = cleaned.replace(',', '')
    elif ',' in cleaned:
        # Could be decimal comma (29,99 / 12,5) or thousands (1,000).
        # Thousands groups always have three digits, so a shorter tail
        # means the comma is a decimal mark.
        parts = cleaned.split(',')
        if len(parts[-1]) < 3:
            cleaned = cleaned.replace(',', '.')
        else:
            cleaned = cleaned.replace(',', '')
    elif cleaned.count('.') > 1:
        # Several dots can only be thousands grouping (1.234.567), and only
        # when every group after the first has exactly three digits.
        groups = cleaned.split('.')
        if all(len(group) == 3 for group in groups[1:]):
            cleaned = ''.join(groups)

    try:
        value = float(cleaned)
    except ValueError:
        return None
    return round(-value if negative else value, 2)

# Quick check; expected output, in order: 1299.99, 29.99, 1234.56
for sample in ("$1,299.99", "EUR 29,99", "1.234,56"):
    print(normalize_price(sample))

Schema Validation with pandas

import pandas as pd
import numpy as np

def validate_dataframe(df, schema):
    """Validate a DataFrame against a schema definition.

    Args:
        df: The DataFrame to check.
        schema: Mapping of column name to a rules dict. Recognised rule keys:
            ``"nullable"`` (default ``True``), ``"type"`` (only ``"numeric"``
            is checked), ``"min"`` and ``"max"`` (inclusive bounds).

    Returns:
        A list of human-readable issue strings; empty when nothing is wrong.
        A column missing from ``df`` yields one "Missing column" issue and no
        further checks for that column.
    """
    issues = []

    for col, rules in schema.items():
        if col not in df.columns:
            issues.append(f"Missing column: {col}")
            continue

        column = df[col]
        null_count = column.isna().sum()

        # Check for nulls
        if not rules.get("nullable", True) and null_count > 0:
            issues.append(f"{col}: {null_count} null values")

        expected_type = rules.get("type")
        if expected_type != "numeric" and "min" not in rules and "max" not in rules:
            continue

        # Coerce once and reuse for the type and range checks.
        numeric = pd.to_numeric(column, errors="coerce")

        # Check data type. Values that were already null also come out as NaN,
        # so subtract them to count only values that failed conversion.
        if expected_type == "numeric":
            non_numeric = numeric.isna().sum() - null_count
            if non_numeric > 0:
                issues.append(f"{col}: {non_numeric} non-numeric values")

        # Check value range (NaN compares False, so nulls are never counted).
        if "min" in rules:
            below = (numeric < rules["min"]).sum()
            if below > 0:
                issues.append(f"{col}: {below} values below {rules['min']}")

        if "max" in rules:
            above = (numeric > rules["max"]).sum()
            if above > 0:
                issues.append(f"{col}: {above} values above {rules['max']}")

    return issues

# Rules every column has to satisfy
schema = dict(
    name=dict(nullable=False),
    price=dict(type="numeric", nullable=False, min=0),
    rating=dict(type="numeric", min=0, max=5),
)

# Three sample rows, written column by column
df = pd.DataFrame({
    "name": ["ScraperAPI", None, "Test"],
    "price": [49.99, 29.00, -5],
    "rating": [4.5, 3.8, 6.0],
})

# Run the checks and report each finding on its own line
issues = validate_dataframe(df, schema)
for issue in issues:
    print(f"  Issue: {issue}")

Normalization Pipeline

def _normalize_url(url):
    """Trim a URL and lowercase only its scheme and host."""
    from urllib.parse import urlsplit, urlunsplit

    url = (url or "").strip()
    if not url:
        return ""
    try:
        parts = urlsplit(url)
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 brackets): keep the text
        # as-is rather than guess which part is the host.
        return url.rstrip("/")
    # Credentials before "@" are case-sensitive; only the host part is not.
    userinfo, at, host = parts.netloc.rpartition("@")
    rebuilt = urlunsplit(
        (parts.scheme.lower(), userinfo + at + host.lower(),
         parts.path, parts.query, parts.fragment)
    )
    return rebuilt.rstrip("/")


def _clamp_rating(value):
    """Coerce a rating to a float in [0, 5]; missing or unparsable becomes 0.0."""
    try:
        rating = float(value or 0)
    except (TypeError, ValueError):
        return 0.0
    if rating != rating:  # NaN would slip through min()/max() unchanged
        return 0.0
    return min(max(rating, 0.0), 5.0)


def normalize_record(raw):
    """Full normalization pipeline for a scraped record.

    Args:
        raw: Mapping with optional keys ``name``, ``price``, ``url``,
            ``description`` and ``rating``. Missing keys and ``None`` values
            are tolerated.

    Returns:
        A dict with the same five keys: cleaned ``name``; ``price`` as
        returned by ``normalize_price``; ``url`` trimmed, without trailing
        slashes and with scheme/host lowercased (path and query keep their
        case because they can be case-sensitive; a URL without a scheme is
        only trimmed); ``description`` cleaned and cut to 500 characters;
        ``rating`` as a float clamped to the 0-5 range.
    """
    return {
        "name": normalize_text(raw.get("name", "")),
        "price": normalize_price(raw.get("price")),
        "url": _normalize_url(raw.get("url")),
        "description": normalize_text(raw.get("description", ""))[:500],
        "rating": _clamp_rating(raw.get("rating")),
    }

# Two raw samples: one with stray whitespace, mixed-case URL and invisible
# characters; one with a European price and no rating.
raw_records = [
    {
        "name": "  ScraperAPI  ",
        "price": "$49.99",
        "url": "https://ScraperAPI.com/",
        "description": "Best\u00a0proxy\u200bservice",
        "rating": "4.5",
    },
    {
        "name": "ScrapingAnt",
        "price": "EUR 29,00",
        "url": "https://scrapingant.com",
        "description": "",
        "rating": None,
    },
]

# Normalize every record, then show the cleaned results one per line
normalized = list(map(normalize_record, raw_records))
for r in normalized:
    print(r)

Next Steps

Convert validated data to CSV, JSON, Excel, and SQL
Build a complete data processing pipeline
Automate data quality checks in production