Scraping Central is reader-supported. When you buy through links on our site, we may earn an affiliate commission.

Parsing Dates and Prices from Scraped Text

Extract and normalize dates, prices, and currencies from messy scraped text using Python's dateutil, regex, and locale-aware parsing.

Data Parsing · #11 · intermediate · 3 min read
Share: WhatsApp · LinkedIn

Dates and prices appear in wildly different formats across websites. Normalizing them into consistent, machine-readable values is essential for analysis.

Parsing Dates with dateutil

The dateutil library handles almost any date format automatically:

pip install python-dateutil
from dateutil import parser as date_parser

# dateutil recognises most absolute date formats without a format string.
# Ambiguous numeric dates are read month-first by default; "15/03/2025" only
# works because 15 cannot be a month. Pass dayfirst=True for day-first sources.
date_strings = [
    "March 15, 2025",
    "15/03/2025",
    "2025-03-15",
    "Mar 15th, 2025",
    "15 Mar 2025 14:30:00",
    "3/15/25",
    "Yesterday",  # Relative dates won't work
]

for ds in date_strings:
    try:
        dt = date_parser.parse(ds)
    except (ValueError, OverflowError) as e:
        # Unrecognised strings raise ValueError (ParserError subclasses it);
        # OverflowError covers values too large for the platform. Anything
        # else is a genuine bug and should not be hidden by a broad except.
        print(f"{ds:30s} -> Error: {e}")
    else:
        print(f"{ds:30s} -> {dt.strftime('%Y-%m-%d')}")
March 15, 2025                 -> 2025-03-15
15/03/2025                     -> 2025-03-15
2025-03-15                     -> 2025-03-15
Mar 15th, 2025                 -> 2025-03-15
15 Mar 2025 14:30:00           -> 2025-03-15
3/15/25                        -> 2025-03-15
Yesterday                      -> Error: Unknown string format: Yesterday

Extracting Dates from Mixed Text

import re
from dateutil import parser as date_parser

def extract_dates(text):
    """Find and parse dates embedded in free-form text.

    Candidate substrings are located with a fixed set of regular expressions
    (ISO dates, slash-separated dates, and month-name dates in either order)
    and each candidate is handed to dateutil for parsing. Candidates that
    dateutil rejects are skipped.

    Args:
        text: The string to search.

    Returns:
        A list of dicts, one per parsed date, each with the keys "raw" (the
        matched substring) and "parsed" (the datetime returned by dateutil).
        Entries are ordered by where the date appears in text, so the first
        element is the first date a reader would encounter.
    """
    # (?<!\d) / (?!\d) stop the numeric patterns from matching inside a longer
    # digit run, e.g. pulling a bogus "25/03/15" out of "2025/03/15". The \b
    # before month names avoids matching "may" inside a word like "dismay".
    patterns = [
        r'(?<!\d)\d{4}-\d{2}-\d{2}(?!\d)',                    # 2025-03-15
        r'(?<!\d)\d{1,2}/\d{1,2}/\d{2,4}(?!\d)',              # 3/15/2025
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}(?!\d)',  # March 15, 2025
        r'(?<!\d)\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}(?!\d)',  # 15 March 2025
    ]

    located = []
    for pattern in patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            raw = match.group()
            try:
                parsed = date_parser.parse(raw)
            except (ValueError, OverflowError):
                # The regex matched something date-shaped that is not a real
                # date (e.g. "99/99/2025"); drop it and keep scanning.
                continue
            located.append((match.start(), {"raw": raw, "parsed": parsed}))

    # Patterns are scanned one after another, so restore document order here.
    # The sort is stable: matches starting at the same offset keep pattern order.
    located.sort(key=lambda item: item[0])
    return [entry for _, entry in located]

# Sample copy containing one date in each supported layout.
text = """
Product launched on March 15, 2025. Updated 2025-04-01.
Sale ends 12/31/2025. Originally listed 1 Jan 2024.
"""

# Show every extracted date next to its normalised YYYY-MM-DD form.
found_dates = extract_dates(text)
for d in found_dates:
    print(f"{d['raw']:25s} -> {d['parsed'].strftime('%Y-%m-%d')}")

Parsing Prices and Currencies

import re
from decimal import Decimal

def parse_price(text):
    """Extract a price and its currency from free-form text.

    Two layouts are recognised, tried in this order over the whole string:

    1. A currency symbol or code followed by a number ("$49.99", "USD 20").
    2. A number followed by a currency code ("199.50 USD").

    Commas inside the number are removed as thousands separators, so a
    decimal comma ("29,00") is not supported. "$" is reported as USD and the
    yen sign as JPY.

    Args:
        text: The scraped string to search.

    Returns:
        A dict with the keys "currency" (currency code), "amount" (float) and
        "raw" (the matched substring), or None when no price is found.
    """
    # Currency symbols and codes, normalised to currency codes
    currency_map = {
        "$": "USD", "\u20ac": "EUR", "\u00a3": "GBP",
        "\u00a5": "JPY", "\u20b9": "INR",
        "USD": "USD", "EUR": "EUR", "GBP": "GBP", "INR": "INR",
    }

    # The number must start with a digit. Allowing a leading comma lets plain
    # punctuation ("We accept USD, EUR") match with no digits at all, which
    # then reaches float("") and raises ValueError.
    number = r'\d[\d,]*\.?\d*'

    # Match currency symbol/code followed by number
    pattern = r'([$\u20ac\u00a3\u00a5\u20b9]|USD|EUR|GBP|INR)\s?(' + number + r')'
    match = re.search(pattern, text)
    if match:
        currency_str = match.group(1)
        amount_str = match.group(2).replace(",", "")
        return {
            "currency": currency_map.get(currency_str, currency_str),
            "amount": float(amount_str),
            "raw": match.group(0),
        }

    # Try number followed by currency code
    pattern2 = r'(' + number + r')\s?(USD|EUR|GBP|INR)'
    match2 = re.search(pattern2, text)
    if match2:
        return {
            "currency": match2.group(2),
            "amount": float(match2.group(1).replace(",", "")),
            "raw": match2.group(0),
        }

    return None

# Sample inputs: symbol-prefixed, code-suffixed and comma-grouped prices
prices = [
    "Price: $49.99/month",
    "Only \u20ac29.00!",
    "Cost: \u00a31,299.99",
    "Starting at 199.50 USD",
    "\u20b94,999 per year",
    "Free trial, then $9.99",
]

for text in prices:
    result = parse_price(text)
    if not result:
        # Nothing price-like was found in this string; print nothing for it.
        continue
    print(f"{text:30s} -> {result['currency']} {result['amount']:.2f}")
Price: $49.99/month            -> USD 49.99
Only €29.00!                   -> EUR 29.00
Cost: £1,299.99                -> GBP 1299.99
Starting at 199.50 USD         -> USD 199.50
₹4,999 per year                -> INR 4999.00
Free trial, then $9.99         -> USD 9.99

Batch Processing Scraped Data

import pandas as pd
from dateutil import parser as date_parser

# Simulated scraped product data
raw_data = [
    {"name": "ScraperAPI", "price_text": "$49.99/mo", "date_text": "Updated Mar 2025"},
    {"name": "ScrapingAnt", "price_text": "\u20ac29/month", "date_text": "Since January 15, 2024"},
    {"name": "Bright Data", "price_text": "From $199.00", "date_text": "Last reviewed 2025-02-28"},
]

df = pd.DataFrame(raw_data)

# Parse prices: run parse_price once per row and derive both columns from
# that single result instead of re-parsing the same string for every lookup.
parsed_prices = df["price_text"].apply(parse_price)
df["price"] = parsed_prices.apply(lambda p: p["amount"] if p else None)
df["currency"] = parsed_prices.apply(lambda p: p["currency"] if p else None)

# Parse dates
def safe_parse_date(text):
    """Return the parsed value of the first entry from extract_dates(text).

    Returns None when extract_dates finds nothing in text.
    """
    found = extract_dates(text)
    if not found:
        return None
    return found[0]["parsed"]

# Applied row by row; safe_parse_date yields None for rows without a match.
df["date"] = df["date_text"].apply(safe_parse_date)

print(df[["name", "price", "currency", "date"]])

Next Steps

  • Use jq and jsonpath for JSON date/price fields
  • Deduplicate scraped records
  • Normalize and validate all extracted data