Parsing Dates and Prices from Scraped Text
Extract and normalize dates, prices, and currencies from messy scraped text using Python's dateutil, regex, and locale-aware parsing.
Data Parsing · #11 · Intermediate · 3 min read
Dates and prices appear in wildly different formats across websites. Normalizing them into consistent, machine-readable values is essential for analysis.
Parsing Dates with dateutil
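The dateutil library parses most common absolute date formats without needing a format string. Two caveats: ambiguous numeric dates such as 03/04/2025 are read month-first by default (pass dayfirst=True to change this), and relative expressions such as "Yesterday" are not supported: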
pip install python-dateutil
from dateutil import parser as date_parser
# dateutil parses every format below except the last one, which is included
# to show what a failure looks like.
date_strings = [
    "March 15, 2025",
    "15/03/2025",  # read day-first only because 15 cannot be a month
    "2025-03-15",
    "Mar 15th, 2025",
    "15 Mar 2025 14:30:00",
    "3/15/25",  # two-digit year
    "Yesterday",  # Relative dates won't work
]

for ds in date_strings:
    # Keep the try body to the one call that can fail. dateutil reports
    # unparseable input with ParserError (a ValueError subclass) and
    # out-of-range values with OverflowError.
    try:
        dt = date_parser.parse(ds)
    except (ValueError, OverflowError) as e:
        print(f"{ds:30s} -> Error: {e}")
    else:
        print(f"{ds:30s} -> {dt.strftime('%Y-%m-%d')}")
March 15, 2025 -> 2025-03-15
15/03/2025 -> 2025-03-15
2025-03-15 -> 2025-03-15
Mar 15th, 2025 -> 2025-03-15
15 Mar 2025 14:30:00 -> 2025-03-15
3/15/25 -> 2025-03-15
Yesterday -> Error: Unknown string format: Yesterday
Extracting Dates from Mixed Text
import re
from dateutil import parser as date_parser
# Building blocks for the date-candidate regex used by extract_dates.
_MONTH_NAME = r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*'
_DAY_OF_MONTH = r'\d{1,2}(?:st|nd|rd|th)?'  # 15, 15th, 1st, ...

# One alternation instead of several separate passes, so matches come back in
# text order and never overlap. The lookarounds stop a pattern from matching
# in the middle of a longer run of digits (e.g. "25/03/15" inside
# "2025/03/15", or an ISO-looking slice of a product ID).
_DATE_CANDIDATE_RE = re.compile(
    '|'.join([
        r'(?<!\d)\d{4}-\d{2}-\d{2}(?!\d)',  # 2025-03-15
        # 3/15/2025 or 3/15/25; a 3-digit year is rejected
        r'(?<![\d/])\d{1,2}/\d{1,2}/(?:\d{4}|\d{2})(?![\d/])',
        # March 15, 2025 / Mar 15th, 2025
        r'\b' + _MONTH_NAME + r'\s+' + _DAY_OF_MONTH + r',?\s+\d{4}(?!\d)',
        # 15 March 2025 / 1st Jan 2024
        r'(?<!\d)' + _DAY_OF_MONTH + r'\s+' + _MONTH_NAME + r'\s+\d{4}(?!\d)',
    ]),
    re.IGNORECASE,
)


def extract_dates(text):
    """Find and parse dates embedded in free text.

    Recognised shapes: ISO ``YYYY-MM-DD``, numeric ``M/D/YY`` or ``M/D/YYYY``,
    ``Month D, YYYY`` and ``D Month YYYY`` (month names may be abbreviated,
    day numbers may carry an ordinal suffix such as ``15th``).

    Args:
        text: The text to scan. Anything that is not a ``str`` (for example
            ``None`` or a NaN from a missing scraped field) yields ``[]``.

    Returns:
        A list of ``{"raw": <matched substring>, "parsed": <datetime>}``
        dicts, ordered by where each date appears in ``text``. Candidates
        that dateutil cannot parse are skipped.

    Note:
        Ambiguous numeric dates such as ``03/04/2025`` are parsed with
        dateutil's default settings, which read them month-first.
    """
    if not isinstance(text, str):
        return []

    dates = []
    for match in _DATE_CANDIDATE_RE.finditer(text):
        raw = match.group()
        try:
            parsed = date_parser.parse(raw)
        except (ValueError, OverflowError):
            # Looked like a date but is not a valid one (e.g. 99/99/2025).
            continue
        dates.append({"raw": raw, "parsed": parsed})
    return dates
# Sample text mixing several date formats in ordinary sentences.
text = """
Product launched on March 15, 2025. Updated 2025-04-01.
Sale ends 12/31/2025. Originally listed 1 Jan 2024.
"""

# Show each matched substring next to its normalized ISO date.
for found in extract_dates(text):
    iso_day = found["parsed"].strftime("%Y-%m-%d")
    print(f"{found['raw']:25s} -> {iso_day}")
Parsing Prices and Currencies
import re
from decimal import Decimal
# Currency symbols and three-letter codes recognised by parse_price, mapped to
# the three-letter code that is returned.
_CURRENCY_MAP = {
    "$": "USD", "\u20ac": "EUR", "\u00a3": "GBP",
    "\u00a5": "JPY", "\u20b9": "INR",
    "USD": "USD", "EUR": "EUR", "GBP": "GBP", "INR": "INR",
}

# An amount must START with a digit. Without that, a lone comma (as in
# "USD, EUR accepted") would count as an amount and crash float("").
_AMOUNT = r'\d[\d,]*\.?\d*'

# Currency symbol/code followed by a number: "$49.99", "USD 10"
_PREFIX_PRICE_RE = re.compile(
    r'([$\u20ac\u00a3\u00a5\u20b9]|USD|EUR|GBP|INR)\s?(' + _AMOUNT + r')'
)
# Number followed by a currency code: "199.50 USD"
_SUFFIX_PRICE_RE = re.compile(r'(' + _AMOUNT + r')\s?(USD|EUR|GBP|INR)')


def parse_price(text):
    """Extract the first price and its currency from text.

    A currency symbol or code placed before the number is tried first
    (``$49.99``, ``USD 10``); if none is found, a number followed by a
    currency code is tried (``199.50 USD``).

    Args:
        text: The text to scan. Anything that is not a ``str`` (for example
            ``None`` or a NaN from a missing scraped field) yields ``None``.

    Returns:
        ``{"currency": <code>, "amount": <float>, "raw": <matched text>}``,
        or ``None`` when no price is found.

    Note:
        Commas are treated as thousands separators and removed, so
        decimal-comma formats such as ``1.299,99`` are not supported and
        will be misread.
    """
    if not isinstance(text, str):
        return None

    match = _PREFIX_PRICE_RE.search(text)
    if match:
        currency_str = match.group(1)
        return {
            "currency": _CURRENCY_MAP.get(currency_str, currency_str),
            "amount": float(match.group(2).replace(",", "")),
            "raw": match.group(0),
        }

    match = _SUFFIX_PRICE_RE.search(text)
    if match:
        return {
            "currency": match.group(2),
            "amount": float(match.group(1).replace(",", "")),
            "raw": match.group(0),
        }

    return None
# Sample inputs covering symbol-prefixed, code-suffixed and comma-grouped prices.
prices = [
    "Price: $49.99/month",
    "Only \u20ac29.00!",
    "Cost: \u00a31,299.99",
    "Starting at 199.50 USD",
    "\u20b94,999 per year",
    "Free trial, then $9.99",
]

for text in prices:
    result = parse_price(text)
    if not result:
        # Nothing recognisable as a price in this string.
        continue
    currency, amount = result["currency"], result["amount"]
    print(f"{text:30s} -> {currency} {amount:.2f}")
Price: $49.99/month -> USD 49.99
Only €29.00! -> EUR 29.00
Cost: £1,299.99 -> GBP 1299.99
Starting at 199.50 USD -> USD 199.50
₹4,999 per year -> INR 4999.00
Free trial, then $9.99 -> USD 9.99
Batch Processing Scraped Data
import pandas as pd
from dateutil import parser as date_parser
# Simulated scraped product data
raw_data = [
    {"name": "ScraperAPI", "price_text": "$49.99/mo", "date_text": "Updated Mar 2025"},
    {"name": "ScrapingAnt", "price_text": "\u20ac29/month", "date_text": "Since January 15, 2024"},
    {"name": "Bright Data", "price_text": "From $199.00", "date_text": "Last reviewed 2025-02-28"},
]
df = pd.DataFrame(raw_data)

# Parse prices: run parse_price once per row, then derive both columns from
# that single result instead of re-parsing the same text for each column.
parsed_prices = df["price_text"].apply(parse_price)
# isinstance() rather than truthiness: a missing result must map to None even
# if it is represented as NaN, which is truthy.
df["price"] = parsed_prices.apply(lambda p: p["amount"] if isinstance(p, dict) else None)
df["currency"] = parsed_prices.apply(lambda p: p["currency"] if isinstance(p, dict) else None)
# Parse dates
def safe_parse_date(text):
    """Return the first date extract_dates reports for text, or None if it reports none."""
    found = extract_dates(text)
    if not found:
        return None
    return found[0]["parsed"]


date_column = df["date_text"].apply(safe_parse_date)
df["date"] = date_column

# Print only the normalized columns.
report_columns = ["name", "price", "currency", "date"]
print(df[report_columns])
Next Steps
- Use jq and jsonpath for JSON date/price fields
- Deduplicate scraped records
- Normalize and validate all extracted data