Parsing Dates and Prices from Scraped Text - Data Parsing

Extract and normalize dates, prices, and currencies from messy scraped text using Python's dateutil, regex, and locale-aware parsing.

Dates and prices appear in wildly different formats across websites. Normalizing them into consistent, machine-readable values is essential for analysis.

Parsing Dates with dateutil

The dateutil library handles almost any date format automatically:

pip install python-dateutil

from dateutil import parser as date_parser

# dateutil recognizes most absolute date formats without a format string
date_strings = [
    "March 15, 2025",
    "15/03/2025",  # read day-first only because 15 cannot be a month
    "2025-03-15",
    "Mar 15th, 2025",
    "15 Mar 2025 14:30:00",
    "3/15/25",
    "Yesterday",  # Relative dates won't work
]

for ds in date_strings:
    try:
        dt = date_parser.parse(ds)
    except (ValueError, OverflowError) as e:
        # dateutil signals unparseable text with ParserError (a ValueError
        # subclass) and out-of-range dates with OverflowError; anything else
        # is a real bug and should not be hidden.
        print(f"{ds:30s} -> Error: {e}")
    else:
        print(f"{ds:30s} -> {dt.strftime('%Y-%m-%d')}")

March 15, 2025                 -> 2025-03-15
15/03/2025                     -> 2025-03-15
2025-03-15                     -> 2025-03-15
Mar 15th, 2025                 -> 2025-03-15
15 Mar 2025 14:30:00           -> 2025-03-15
3/15/25                        -> 2025-03-15
Yesterday                      -> Error: Unknown string format: Yesterday

Extracting Dates from Mixed Text

import re
from dateutil import parser as date_parser

def extract_dates(text):
    """Find and parse dates embedded in text.

    Scans ``text`` with a few regular expressions for common date layouts
    and hands every candidate substring to dateutil for parsing.

    Args:
        text: String to search.

    Returns:
        A list of ``{"raw": <matched substring>, "parsed": <datetime>}``
        dicts, ordered by where each date appears in ``text``. Candidates
        that dateutil rejects are skipped.
    """
    # Common date patterns
    patterns = [
        r'\d{4}-\d{2}-\d{2}',                          # 2025-03-15
        r'\d{1,2}/\d{1,2}/\d{2,4}',                    # 3/15/2025
        r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}',  # March 15, 2025
        r'\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}',    # 15 March 2025
    ]

    found = []
    for pattern in patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            raw = match.group()
            try:
                parsed = date_parser.parse(raw)
            except (ValueError, OverflowError):
                # Shaped like a date but not a valid one (e.g. "13/45/2025").
                continue
            found.append((match.start(), {"raw": raw, "parsed": parsed}))

    # The patterns are tried one after another, so matches arrive grouped by
    # pattern. Sort by position so callers taking the first element really
    # get the first date in the text.
    found.sort(key=lambda item: item[0])
    return [entry for _, entry in found]

text = """
Product launched on March 15, 2025. Updated 2025-04-01.
Sale ends 12/31/2025. Originally listed 1 Jan 2024.
"""

# Show every date found in the sample next to its ISO (YYYY-MM-DD) form.
found_dates = extract_dates(text)
for d in found_dates:
    print(f"{d['raw']:25s} -> {d['parsed'].strftime('%Y-%m-%d')}")

Parsing Prices and Currencies

import re
from decimal import Decimal

def parse_price(text):
    """Extract the first price and its currency from text.

    Two layouts are recognized, tried in this order:

    1. a currency symbol or code followed by the number ("$49.99", "EUR 29")
    2. a number followed by a currency code ("199.50 USD")

    Commas are treated as thousands separators and a dot as the decimal
    point.

    Args:
        text: Scraped string that may contain a price.

    Returns:
        ``{"currency": <code>, "amount": <float>, "raw": <matched text>}``,
        or ``None`` when ``text`` is not a string or contains no price.
    """
    # Missing scraped fields often arrive as None/NaN; treat them as "no price"
    # instead of letting re.search raise TypeError.
    if not isinstance(text, str):
        return None

    # Currency symbols and codes
    currency_map = {
        "$": "USD", "\u20ac": "EUR", "\u00a3": "GBP",
        "\u00a5": "JPY", "\u20b9": "INR",
        "USD": "USD", "EUR": "EUR", "GBP": "GBP", "INR": "INR",
    }

    # The number must start with a digit. Without that, a lone comma after a
    # currency ("USD, EUR accepted") would match and float("") would raise.
    number = r'\d[\d,]*\.?\d*'

    # Match currency symbol/code followed by number
    pattern = r'([$\u20ac\u00a3\u00a5\u20b9]|USD|EUR|GBP|INR)\s?(' + number + r')'
    match = re.search(pattern, text)

    if match:
        currency_str = match.group(1)
        amount_str = match.group(2).replace(",", "")
        return {
            "currency": currency_map.get(currency_str, currency_str),
            "amount": float(amount_str),
            "raw": match.group(0),
        }

    # Try number followed by currency code
    pattern2 = r'(' + number + r')\s?(USD|EUR|GBP|INR)'
    match2 = re.search(pattern2, text)
    if match2:
        return {
            "currency": match2.group(2),
            "amount": float(match2.group(1).replace(",", "")),
            "raw": match2.group(0),
        }

    return None

# Run the parser over a handful of representative price strings
prices = [
    "Price: $49.99/month",
    "Only \u20ac29.00!",
    "Cost: \u00a31,299.99",
    "Starting at 199.50 USD",
    "\u20b94,999 per year",
    "Free trial, then $9.99",
]

for text in prices:
    result = parse_price(text)
    if not result:
        # Nothing recognizable as a price in this string; print nothing.
        continue
    print(f"{text:30s} -> {result['currency']} {result['amount']:.2f}")

Price: $49.99/month            -> USD 49.99
Only €29.00!                   -> EUR 29.00
Cost: £1,299.99                -> GBP 1299.99
Starting at 199.50 USD         -> USD 199.50
₹4,999 per year                -> INR 4999.00
Free trial, then $9.99         -> USD 9.99

Batch Processing Scraped Data

import pandas as pd
from dateutil import parser as date_parser

# Simulated scraped product data
raw_data = [
    {"name": "ScraperAPI", "price_text": "$49.99/mo", "date_text": "Updated Mar 2025"},
    {"name": "ScrapingAnt", "price_text": "\u20ac29/month", "date_text": "Since January 15, 2024"},
    {"name": "Bright Data", "price_text": "From $199.00", "date_text": "Last reviewed 2025-02-28"},
]

df = pd.DataFrame(raw_data)

# Parse prices: run parse_price once per row and derive both columns from
# that single result (rows without a recognizable price get None in both).
parsed_prices = df["price_text"].apply(parse_price)
df["price"] = parsed_prices.apply(lambda p: p["amount"] if p else None)
df["currency"] = parsed_prices.apply(lambda p: p["currency"] if p else None)

# Parse dates
def safe_parse_date(text):
    """Return the parsed value of the first entry from extract_dates(text), or None if it found nothing."""
    found = extract_dates(text)
    if not found:
        return None
    return found[0]["parsed"]

# Add one parsed date per row, then show only the normalized columns
df["date"] = df["date_text"].apply(safe_parse_date)

summary_columns = ["name", "price", "currency", "date"]
print(df[summary_columns])

Next Steps

Use jq and jsonpath for JSON date/price fields
Deduplicate scraped records
Normalize and validate all extracted data