Scraping Central is reader-supported. When you buy through links on our site, we may earn an affiliate commission.

Scaling Scrapers Horizontally

Learn how to scale your web scraping operation horizontally with multiple workers, task queues, and distributed architecture.

Deployment · #8 · Advanced · 3 min read
Share: WhatsApp · LinkedIn

When a single scraper cannot keep up with your URL list, you need to scale horizontally, running multiple scraper workers that divide the work.

Scaling Architecture

                    ┌──────────────┐
                    │   URL Queue   │
                    │  (Redis/SQS)  │
                    └──────┬───────┘
                           │
              ┌────────────┼────────────┐
              │            │            │
        ┌─────▼────┐ ┌────▼─────┐ ┌────▼─────┐
        │ Worker 1 │ │ Worker 2 │ │ Worker 3 │
        └─────┬────┘ └────┬─────┘ └────┬─────┘
              │            │            │
              └────────────┼────────────┘
                           │
                    ┌──────▼───────┐
                    │   Results    │
                    │  (DB / S3)   │
                    └──────────────┘

Step 1: Build a URL Queue with Redis

# queue_manager.py
import redis
import json
from typing import Optional

class URLQueue:
    """Redis-backed URL work queue shared by every scraper worker.

    Three Redis keys are used:

    * ``scraper:urls``    - list of URLs waiting to be scraped (FIFO).
    * ``scraper:seen``    - set of every URL ever enqueued, for de-duplication.
    * ``scraper:results`` - hash mapping URL -> JSON-encoded scrape result.
    """

    def __init__(self, redis_url: str = "redis://localhost:6379"):
        """Create a Redis client for ``redis_url`` and set the key names."""
        self.redis = redis.from_url(redis_url)
        self.queue_key = "scraper:urls"
        self.seen_key = "scraper:seen"
        self.results_key = "scraper:results"

    def add_urls(self, urls: list[str]):
        """Add URLs to the queue, skipping duplicates.

        A URL is skipped when it was enqueued by an earlier call *or* appears
        earlier in ``urls`` itself.

        Returns:
            The number of URLs actually appended to the queue.
        """
        urls = list(urls)
        if not urls:
            return 0

        # SADD reports 1 only for a member that was not yet in the set, so its
        # return value is an atomic "is this URL new?" test. A separate
        # SISMEMBER check would miss duplicates inside this batch (their SADD
        # would still be buffered in the pipeline) and could race with other
        # producers.
        pipe = self.redis.pipeline()
        for url in urls:
            pipe.sadd(self.seen_key, url)
        newly_seen = pipe.execute()

        fresh = [url for url, is_new in zip(urls, newly_seen) if is_new]
        if fresh:
            # One RPUSH keeps the new URLs in their original order.
            self.redis.rpush(self.queue_key, *fresh)
        return len(fresh)

    def get_url(self) -> Optional[str]:
        """Pop the next URL from the head of the queue.

        Returns:
            The URL as ``str``, or ``None`` when the queue is empty.
        """
        result = self.redis.lpop(self.queue_key)
        return result.decode() if result else None

    def save_result(self, url: str, data: dict):
        """Store ``data`` as JSON in the results hash under ``url``."""
        self.redis.hset(self.results_key, url, json.dumps(data))

    @property
    def pending(self) -> int:
        """Number of URLs currently waiting in the queue."""
        return self.redis.llen(self.queue_key)

Step 2: Build a Worker

# worker.py
import requests
from bs4 import BeautifulSoup
import time
import logging
from queue_manager import URLQueue

# Configure logging at INFO level so the workers' progress messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

class ScraperWorker:
    """Long-running worker: pops URLs from a queue, fetches them, saves results."""

    def __init__(self, worker_id: int, queue: URLQueue):
        """Keep the worker id and queue, and set up one reusable HTTP session."""
        http = requests.Session()
        http.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/124.0.0.0"
        })
        self.worker_id = worker_id
        self.queue = queue
        self.session = http

    def scrape_url(self, url: str) -> dict:
        """Fetch ``url`` and return its URL, HTTP status code and <title> text.

        The title is an empty string when the page has no <title> element.
        """
        page = self.session.get(url, timeout=30)
        title_tag = BeautifulSoup(page.text, "html.parser").find("title")
        if title_tag:
            title_text = title_tag.text
        else:
            title_text = ""
        return {"url": url, "status": page.status_code, "title": title_text}

    def run(self):
        """Loop forever: take a URL, scrape it, store the result.

        Sleeps 5 seconds when the queue is empty and 1 second after each
        attempt. Any exception raised while scraping or saving is logged and
        the loop moves on to the next URL.
        """
        logger.info(f"Worker {self.worker_id} started")
        while True:
            url = self.queue.get_url()
            if url:
                try:
                    self.queue.save_result(url, self.scrape_url(url))
                    logger.info(f"Worker {self.worker_id}: scraped {url}")
                except Exception as e:
                    logger.error(f"Worker {self.worker_id}: failed {url} - {e}")
                # Polite delay between requests, applied after success or failure.
                time.sleep(1)
            else:
                logger.info(f"Worker {self.worker_id}: queue empty, waiting...")
                time.sleep(5)

if __name__ == "__main__":
    import os
    import sys

    # Optional first CLI argument is the worker id shown in log lines.
    worker_id = int(sys.argv[1]) if len(sys.argv) > 1 else 1

    # Use REDIS_URL when it is set (the Docker Compose setup exports it);
    # otherwise fall back to URLQueue's built-in default.
    redis_url = os.environ.get("REDIS_URL")
    queue = URLQueue(redis_url) if redis_url else URLQueue()

    worker = ScraperWorker(worker_id, queue)
    worker.run()

Step 3: Run Multiple Workers

With Docker Compose:

# docker-compose.yml
# One Redis service plus a replicated worker service built from this directory.
version: "3.8"

services:
  redis:
    image: redis:7-alpine
    # Publish Redis on the host so scripts run outside Compose can reach it.
    ports: ["6379:6379"]

  worker:
    build: .
    # No worker-id argument is passed, so each replica uses worker.py's default id.
    command: python worker.py
    deploy:
      replicas: 5
    depends_on: [redis]
    environment:
      # Redis address by Compose service name. worker.py must read REDIS_URL
      # and pass it to URLQueue for this setting to take effect.
      - REDIS_URL=redis://redis:6379
# Start the stack in the background, scaled to 5 worker containers
docker compose up -d --scale worker=5

# Seed the queue from the host. URLQueue() defaults to redis://localhost:6379,
# which is the port the redis service publishes.
python -c "
from queue_manager import URLQueue
q = URLQueue()
urls = [f'https://example.com/page/{i}' for i in range(1000)]
print(f'Added {q.add_urls(urls)} URLs')
"

Scaling Guidelines

| URLs to Scrape | Workers | Proxy Recommendation |
|---|---|---|
| < 1,000 | 1-2 | Single proxy fine |
| 1,000 - 10,000 | 3-5 | Rotating proxy pool |
| 10,000 - 100,000 | 5-20 | ScraperAPI or large pool |
| 100,000+ | 20-50+ | Managed service essential |

Tips

  • Start with fewer workers and scale up based on the target site's tolerance
  • Use a deduplication layer (the seen_key in the example) to avoid scraping the same URL twice
  • Implement a dead letter queue for URLs that fail repeatedly
  • Monitor worker health and restart failed workers automatically