Tutorial
Building a Web Scraping Monitoring Dashboard
Learn how to build a monitoring dashboard for your web scrapers. Track success rates, response times, data quality, and get alerts on failures.
Production scrapers need monitoring. Without it, you discover data is stale or missing only when a customer complains. Here is how to build a monitoring dashboard for your scraping operations.
Key Metrics to Track
- Success rate: percentage of requests returning valid data
- Response time: how long each scrape takes
- Data freshness: time since the last successful scrape per source
- Queue depth: number of pending URLs
- Data quality: percentage of records passing validation
- Proxy health: success rate per proxy or API provider
Metrics Collection
import time
import json
from datetime import datetime
from dataclasses import dataclass, asdict
from typing import Optional
@dataclass
class ScrapeMetric:
    """Outcome of a single scrape attempt, stored as one flat record."""

    # Address that was requested and the host portion extracted from it.
    url: str
    domain: str

    # Result of the attempt.
    status_code: int
    success: bool

    # Wall time of the attempt in milliseconds and how many records it yielded.
    response_time_ms: float
    records_extracted: int

    # When the attempt was recorded, as a string.
    timestamp: str

    # Error text for a failed attempt; stays None when nothing went wrong.
    error: Optional[str] = None
class MetricsCollector:
    """Append scrape metrics to a JSON Lines file (one JSON object per line).

    Args:
        metrics_file: Path of the file that records are appended to.
    """

    def __init__(self, metrics_file="metrics.jsonl"):
        self.metrics_file = metrics_file

    def record(self, metric: ScrapeMetric):
        """Serialize ``metric`` to JSON and append it as one line to the file."""
        # Explicit encoding so the file does not depend on the platform's
        # locale default.
        with open(self.metrics_file, "a", encoding="utf-8") as f:
            f.write(json.dumps(asdict(metric)) + "\n")

    def scrape_with_metrics(self, url, scrape_func):
        """Call ``scrape_func(url)`` and record one metric for the attempt.

        Args:
            url: Address passed to ``scrape_func``; its network location is
                stored as the metric's domain.
            scrape_func: Callable taking the URL and returning the scraped
                data.

        Returns:
            Whatever ``scrape_func`` returned.

        Raises:
            Exception: Any exception raised by ``scrape_func`` is re-raised
                after a failure metric has been recorded.
        """
        from datetime import timezone
        from urllib.parse import urlparse

        domain = urlparse(url).netloc
        # perf_counter is monotonic, so a system clock adjustment during the
        # scrape cannot produce a negative or inflated duration.
        started = time.perf_counter()

        # Only the scrape itself is guarded: a failure while writing the
        # success metric must not be re-recorded as a scrape failure.
        try:
            result = scrape_func(url)
        except Exception as e:
            self.record(ScrapeMetric(
                url=url,
                domain=domain,
                # The wrapper never sees an HTTP response; 0 marks "no status".
                status_code=0,
                success=False,
                response_time_ms=(time.perf_counter() - started) * 1000,
                records_extracted=0,
                timestamp=self._utc_timestamp(timezone),
                error=str(e),
            ))
            raise

        elapsed_ms = (time.perf_counter() - started) * 1000
        self.record(ScrapeMetric(
            url=url,
            domain=domain,
            # A call that returned without raising is reported as 200.
            status_code=200,
            success=True,
            response_time_ms=elapsed_ms,
            # A list result counts its items; any other result counts as one.
            records_extracted=len(result) if isinstance(result, list) else 1,
            timestamp=self._utc_timestamp(timezone),
        ))
        return result

    @staticmethod
    def _utc_timestamp(timezone):
        """Return the current UTC time as a naive ISO-8601 string."""
        # Same format datetime.utcnow().isoformat() produced, without calling
        # the deprecated utcnow().
        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
Dashboard with Flask
from flask import Flask, render_template_string, jsonify
import json
from collections import defaultdict
from datetime import datetime, timedelta
# Flask application object; the route decorators below register handlers on it.
app = Flask(__name__)
def load_metrics(hours=24, metrics_file="metrics.jsonl"):
    """Load the metric records written within the last ``hours`` hours.

    Args:
        hours: Size of the look-back window, in hours.
        metrics_file: Path of the JSON Lines file to read.

    Returns:
        A list of dicts, one per record whose ``timestamp`` is newer than the
        cutoff, in file order. Empty when the file does not exist yet.

    Raises:
        KeyError: If a decoded record has no ``timestamp`` key.
        ValueError: If a ``timestamp`` is not a valid ISO-8601 string.
    """
    from datetime import timezone

    # Naive UTC "now", comparable with naive ISO timestamps in the file.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    cutoff = now_utc - timedelta(hours=hours)

    metrics = []
    try:
        f = open(metrics_file, encoding="utf-8")
    except FileNotFoundError:
        # Nothing has been recorded yet: report no metrics instead of failing.
        return metrics

    with f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                m = json.loads(line)
            except json.JSONDecodeError:
                # The file is append-only and may be read while a line is
                # still being written; skip the incomplete record.
                continue
            recorded_at = datetime.fromisoformat(m["timestamp"])
            if recorded_at.tzinfo is not None:
                # Normalize aware timestamps to naive UTC so the comparison
                # with the naive cutoff cannot raise TypeError.
                recorded_at = recorded_at.astimezone(timezone.utc).replace(tzinfo=None)
            if recorded_at > cutoff:
                metrics.append(m)
    return metrics
@app.route("/api/stats")
def stats():
    """Serve aggregate scrape statistics as JSON.

    Reads the records returned by ``load_metrics()`` with its default window
    and reports the total number of scrapes, the overall success percentage
    (0 when there are no records) and, per domain, the success percentage,
    the rounded mean response time in milliseconds and the request count.
    """
    records = load_metrics()

    # domain -> [request count, successful count, summed response time in ms]
    tallies = {}
    for rec in records:
        tally = tallies.setdefault(rec["domain"], [0, 0, 0])
        tally[0] += 1
        if rec["success"]:
            tally[1] += 1
        tally[2] += rec["response_time_ms"]

    domain_stats = {
        name: {
            "success_rate": round(ok / count * 100, 1),
            "avg_response_ms": round(total_ms / count),
            "total_requests": count,
        }
        for name, (count, ok, total_ms) in tallies.items()
    }

    if records:
        # Every record belongs to exactly one domain, so the per-domain
        # success counts add up to the overall success count.
        succeeded = sum(ok for _, ok, _ in tallies.values())
        overall = round(succeeded / len(records) * 100, 1)
    else:
        overall = 0

    return jsonify({
        "total_scrapes": len(records),
        "overall_success_rate": overall,
        "domains": domain_stats
    })
Alerting on Failures
import smtplib
from email.mime.text import MIMEText
def check_alerts(metrics, failure_threshold=20):
    """Check for conditions that should trigger alerts.

    Computes the failure percentage per domain and builds one alert line for
    every domain whose percentage is strictly above ``failure_threshold``.
    When at least one alert exists, the lines are joined with newlines and
    passed to ``send_alert_email`` (not defined in this snippet; it must be
    provided elsewhere in the module).

    Args:
        metrics: Iterable of record dicts with ``domain`` and ``success`` keys.
        failure_threshold: Failure percentage above which a domain is
            reported.

    Returns:
        The list of alert messages; empty when nothing crossed the threshold.
    """
    from collections import Counter

    # Count totals and failures in a single pass instead of rescanning all
    # metrics once per failing domain.
    totals = Counter()
    failures = Counter()
    for m in metrics:
        totals[m["domain"]] += 1
        if not m["success"]:
            failures[m["domain"]] += 1

    alerts = []
    for domain, failed in failures.items():
        failure_rate = failed / totals[domain] * 100
        if failure_rate > failure_threshold:
            alerts.append(f"HIGH FAILURE RATE: {domain} at {failure_rate:.0f}%")

    if alerts:
        send_alert_email("\n".join(alerts))
    return alerts
Integration with ScraperAPI
When using ScraperAPI, you can track API-specific metrics:
import requests
def scrape_with_tracking(url, api_key, timeout=70):
    """Fetch ``url`` through ScraperAPI and return request-level metrics.

    Args:
        url: Target page, sent as the ``url`` query parameter.
        api_key: ScraperAPI key, sent as the ``api_key`` query parameter.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout, ``requests`` waits indefinitely for a stalled response.

    Returns:
        A dict with ``status`` (HTTP status code), ``response_time`` (elapsed
        seconds), ``credits_used`` (value of the ``sa-credits-used`` response
        header, or 0 when absent) and ``content_length`` (number of
        characters in the decoded body).

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    # perf_counter is monotonic, so clock adjustments cannot skew the timing.
    start = time.perf_counter()
    response = requests.get(
        # HTTPS so the API key in the query string is not sent in cleartext.
        "https://api.scraperapi.com",
        params={"api_key": api_key, "url": url},
        timeout=timeout,
    )
    elapsed = time.perf_counter() - start
    return {
        "status": response.status_code,
        "response_time": elapsed,
        # Header values are strings when present; the fallback is the int 0.
        # NOTE(review): confirm the header name against the ScraperAPI docs.
        "credits_used": response.headers.get("sa-credits-used", 0),
        "content_length": len(response.text)
    }
Monitoring is what separates hobby scrapers from production systems. Start with basic metrics and alerting, then expand to dashboards as your scraping operation grows.