Scraping Central is reader-supported. When you buy through links on our site, we may earn an affiliate commission.

Tutorial

Building a Web Scraping Monitoring Dashboard

Learn how to build a monitoring dashboard for your web scrapers. Track success rates, response times, data quality, and get alerts on failures.

Production scrapers need monitoring. Without it, you discover data is stale or missing only when a customer complains. Here is how to build a monitoring dashboard for your scraping operations.

Key Metrics to Track

  1. Success rate: percentage of requests returning valid data
  2. Response time: how long each scrape takes
  3. Data freshness: time since the last successful scrape per source
  4. Queue depth: number of pending URLs
  5. Data quality: percentage of records passing validation
  6. Proxy health: success rate per proxy or API provider

Metrics Collection

import time
import json
from datetime import datetime
from dataclasses import dataclass, asdict
from typing import Optional

@dataclass
class ScrapeMetric:
    """Outcome of a single scrape attempt, serialized as one JSON line."""
    # URL that was passed to the scrape function.
    url: str
    # Network location (host[:port]) of ``url``, used to group metrics.
    domain: str
    # MetricsCollector writes 200 for a successful call and 0 for a failed one.
    status_code: int
    # True when the scrape function returned without raising.
    success: bool
    # Elapsed time of the scrape call, in milliseconds.
    response_time_ms: float
    # Number of records the scrape produced (0 on failure).
    records_extracted: int
    # ISO-8601 string; MetricsCollector fills it with the current UTC time.
    timestamp: str
    # String form of the raised exception; None when the scrape succeeded.
    error: Optional[str] = None

class MetricsCollector:
    """Append scrape outcomes to a JSON Lines file, one JSON object per line."""

    def __init__(self, metrics_file="metrics.jsonl"):
        """Remember the path of the file that metrics are appended to."""
        self.metrics_file = metrics_file

    def record(self, metric: "ScrapeMetric"):
        """Append *metric* to the metrics file as a single JSON line."""
        with open(self.metrics_file, "a", encoding="utf-8") as f:
            f.write(json.dumps(asdict(metric)) + "\n")

    def scrape_with_metrics(self, url, scrape_func):
        """Call ``scrape_func(url)`` and record a metric for the attempt.

        Args:
            url: URL handed unchanged to ``scrape_func``.
            scrape_func: Callable taking the URL and returning the scraped
                data. A list counts as ``len(result)`` records, ``None`` as
                zero records, and any other value as a single record.

        Returns:
            Whatever ``scrape_func`` returned.

        Raises:
            Exception: Anything raised by ``scrape_func`` is re-raised after
                a failure metric has been written.
        """
        from urllib.parse import urlparse

        domain = urlparse(url).netloc

        # perf_counter is monotonic, so the measured duration cannot go
        # negative or jump when the system clock is adjusted mid-scrape.
        start = time.perf_counter()
        try:
            result = scrape_func(url)
        except Exception as e:
            elapsed = (time.perf_counter() - start) * 1000
            self.record(ScrapeMetric(
                url=url,
                domain=domain,
                status_code=self._status_from_exception(e),
                success=False,
                response_time_ms=elapsed,
                records_extracted=0,
                timestamp=self._utc_timestamp(),
                error=str(e),
            ))
            raise

        # Written outside the try block on purpose: a failure to write the
        # metric must not be misreported as a failed scrape.
        elapsed = (time.perf_counter() - start) * 1000
        self.record(ScrapeMetric(
            url=url,
            domain=domain,
            # The wrapper cannot see the HTTP exchange inside scrape_func,
            # so a successful call is logged with a nominal 200.
            status_code=200,
            success=True,
            response_time_ms=elapsed,
            records_extracted=self._count_records(result),
            timestamp=self._utc_timestamp(),
        ))
        return result

    @staticmethod
    def _count_records(result):
        """Return how many records *result* represents."""
        if result is None:
            return 0
        if isinstance(result, list):
            return len(result)
        return 1

    @staticmethod
    def _status_from_exception(exc):
        """Return ``exc.response.status_code`` when it is an int, else 0."""
        status = getattr(getattr(exc, "response", None), "status_code", None)
        return status if isinstance(status, int) else 0

    @staticmethod
    def _utc_timestamp():
        """Return the current UTC time as a naive ISO-8601 string."""
        from datetime import timezone

        # Same text format as the deprecated datetime.utcnow().isoformat(),
        # so readers that parse the file as naive UTC keep working.
        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

Dashboard with Flask

from flask import Flask, render_template_string, jsonify
import json
from collections import defaultdict
from datetime import datetime, timedelta

# Flask application object; the @app.route handlers below register on it.
app = Flask(__name__)

def load_metrics(hours=24, metrics_file="metrics.jsonl"):
    """Load the metrics recorded within the last *hours* hours.

    Args:
        hours: Size of the look-back window, in hours.
        metrics_file: Path of the JSON Lines file to read. The default
            matches the file name used by the metrics collector.

    Returns:
        A list of metric dicts, in file order, whose ``timestamp`` is newer
        than the cutoff. The list is empty when the file does not exist yet.
    """
    from datetime import timezone

    cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
    metrics = []

    try:
        f = open(metrics_file, encoding="utf-8")
    except FileNotFoundError:
        # Nothing has been recorded yet; the dashboard should show an empty
        # state rather than fail.
        return metrics

    with f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                m = json.loads(line)
                ts = datetime.fromisoformat(m["timestamp"])
            except (ValueError, KeyError, TypeError):
                # The file is appended to while it is being read, so a line
                # can be torn or malformed; skip it instead of failing the
                # whole load.
                continue
            # Timestamps without an offset are taken to be UTC so they can be
            # compared with the timezone-aware cutoff.
            if ts.tzinfo is None:
                ts = ts.replace(tzinfo=timezone.utc)
            if ts > cutoff:
                metrics.append(m)
    return metrics

@app.route("/api/stats")
def stats():
    """Serve a JSON summary of the metrics returned by ``load_metrics()``.

    The payload holds the total number of scrapes, the overall success rate
    as a percentage (0 when there are no metrics), and for each domain its
    success rate, average response time in milliseconds and request count.
    """
    records = load_metrics()

    # Running counters per domain, created on first sight of the domain.
    buckets = {}
    for rec in records:
        bucket = buckets.setdefault(
            rec["domain"], {"total": 0, "success": 0, "total_time": 0}
        )
        bucket["total"] += 1
        if rec["success"]:
            bucket["success"] += 1
        bucket["total_time"] += rec["response_time_ms"]

    domain_stats = {
        name: {
            "success_rate": round(b["success"] / b["total"] * 100, 1),
            "avg_response_ms": round(b["total_time"] / b["total"]),
            "total_requests": b["total"],
        }
        for name, b in buckets.items()
    }

    # Guard the division: with no metrics the rate is reported as 0.
    if records:
        succeeded = sum(1 for rec in records if rec["success"])
        overall_rate = round(succeeded / len(records) * 100, 1)
    else:
        overall_rate = 0

    payload = {
        "total_scrapes": len(records),
        "overall_success_rate": overall_rate,
        "domains": domain_stats,
    }
    return jsonify(payload)

Alerting on Failures

import smtplib
from email.mime.text import MIMEText

def check_alerts(metrics, failure_threshold=20):
    """Check for conditions that should trigger alerts.

    A domain triggers an alert when its share of failed scrapes, as a
    percentage of all its scrapes in *metrics*, is strictly greater than
    *failure_threshold*.

    Args:
        metrics: Iterable of metric dicts with ``"domain"`` and ``"success"``
            keys.
        failure_threshold: Failure-rate percentage above which a domain is
            reported.

    Returns:
        The list of alert messages, in order of each domain's first failure;
        empty when nothing crossed the threshold. When the list is not
        empty, the messages are also passed, newline-joined, to
        ``send_alert_email``.
    """
    from collections import Counter

    # Count totals and failures in one pass instead of rescanning every
    # metric once per failing domain.
    totals = Counter()
    failures = Counter()
    for m in metrics:
        domain = m["domain"]
        totals[domain] += 1
        if not m["success"]:
            failures[domain] += 1

    alerts = []
    for domain, failed in failures.items():
        failure_rate = failed / totals[domain] * 100
        if failure_rate > failure_threshold:
            alerts.append(f"HIGH FAILURE RATE: {domain} at {failure_rate:.0f}%")

    if alerts:
        # NOTE(review): send_alert_email is not defined in this snippet; it
        # must be provided elsewhere in the module.
        send_alert_email("\n".join(alerts))
    return alerts

Integration with ScraperAPI

When using ScraperAPI, you can track API-specific metrics:

import requests

def scrape_with_tracking(url, api_key, timeout=70):
    """Fetch *url* through ScraperAPI and return per-request tracking data.

    Args:
        url: Target page to fetch through the API.
        api_key: ScraperAPI key, sent as the ``api_key`` query parameter.
        timeout: Seconds to wait for the API before ``requests`` raises a
            timeout error. Without one the call could block indefinitely.

    Returns:
        A dict with:
            ``status``: HTTP status code of the API response.
            ``response_time``: elapsed time of the request, in seconds.
            ``credits_used``: integer value of the ``sa-credits-used``
                response header, or 0 when it is missing or not an integer.
            ``content_length``: number of characters in the decoded body.

    Raises:
        requests.RequestException: On connection errors or timeout.
    """
    # perf_counter is monotonic, so system clock adjustments cannot distort
    # the measured duration.
    start = time.perf_counter()
    response = requests.get(
        # HTTPS keeps the API key in the query string off the wire in
        # plaintext.
        "https://api.scraperapi.com",
        params={"api_key": api_key, "url": url},
        timeout=timeout,
    )
    elapsed = time.perf_counter() - start

    # Header values arrive as strings; normalise so the field is always an
    # int, matching the fallback value.
    # NOTE(review): confirm the header name against the current ScraperAPI
    # response-header documentation.
    try:
        credits_used = int(response.headers.get("sa-credits-used", 0))
    except (TypeError, ValueError):
        credits_used = 0

    return {
        "status": response.status_code,
        "response_time": elapsed,
        "credits_used": credits_used,
        "content_length": len(response.text),
    }

Monitoring is what separates hobby scrapers from production systems. Start with basic metrics and alerting, then expand to dashboards as your scraping operation grows.