Scaling Scrapers Horizontally - Deployment

Learn how to scale your web scraping operation horizontally with multiple workers, task queues, and distributed architecture.

When a single scraper cannot keep up with your URL list, you need to scale horizontally: run multiple scraper workers that pull from a shared queue and divide the work between them.

Scaling Architecture

                    ┌──────────────┐
                    │   URL Queue   │
                    │  (Redis/SQS)  │
                    └──────┬───────┘
                           │
              ┌────────────┼────────────┐
              │            │            │
        ┌─────▼────┐ ┌────▼─────┐ ┌────▼─────┐
        │ Worker 1 │ │ Worker 2 │ │ Worker 3 │
        └─────┬────┘ └────┬─────┘ └────┬─────┘
              │            │            │
              └────────────┼────────────┘
                           │
                    ┌──────▼───────┐
                    │   Results    │
                    │  (DB / S3)   │
                    └──────────────┘

Step 1: Build a URL Queue with Redis

# queue_manager.py
import redis
import json
from typing import Optional

class URLQueue:
    """Redis-backed URL queue shared by the scraper workers.

    Pending URLs are kept in a Redis list, every URL that has been enqueued
    is recorded in a Redis set so it is not queued again, and scrape results
    are stored in a Redis hash keyed by URL.
    """

    def __init__(self, redis_url: str = "redis://localhost:6379"):
        """Create a Redis client for *redis_url* and set the key names used."""
        self.redis = redis.from_url(redis_url)
        self.queue_key = "scraper:urls"  # list of pending URLs
        self.seen_key = "scraper:seen"  # set of URLs already enqueued
        self.results_key = "scraper:results"  # hash: URL -> JSON result

    def add_urls(self, urls: list[str]) -> int:
        """Add URLs to the queue, skipping duplicates.

        A URL is skipped when it is already in the seen set or when it
        appears earlier in the same batch.

        Args:
            urls: Candidate URLs, in the order they should be queued.

        Returns:
            The number of URLs actually enqueued.
        """
        # Deduplicate inside the batch first (order-preserving). Checking
        # only the seen set is not enough: nothing from this batch has been
        # written yet, so a repeated URL would pass the check every time.
        candidates = list(dict.fromkeys(urls))
        if not candidates:
            return 0

        # Send all membership checks in one round trip instead of one
        # network call per URL.
        lookup = self.redis.pipeline()
        for url in candidates:
            lookup.sismember(self.seen_key, url)
        already_seen = lookup.execute()

        fresh = [url for url, seen in zip(candidates, already_seen) if not seen]
        if not fresh:
            return 0

        # Queue and seen set are updated in the same pipeline so they stay
        # in step with each other.
        # NOTE(review): the check above and this write are separate round
        # trips, so two producers adding the same URL at the same moment can
        # still both enqueue it.
        writer = self.redis.pipeline()
        writer.rpush(self.queue_key, *fresh)
        writer.sadd(self.seen_key, *fresh)
        writer.execute()
        return len(fresh)

    def get_url(self) -> Optional[str]:
        """Pop and return the next URL to scrape, or None if the queue is empty."""
        result = self.redis.lpop(self.queue_key)
        # The value popped from Redis is decoded to str before it is returned.
        return result.decode() if result else None

    def save_result(self, url: str, data: dict):
        """Store the scrape result for *url* as a JSON string in the results hash."""
        self.redis.hset(self.results_key, url, json.dumps(data))

    @property
    def pending(self) -> int:
        """Number of URLs currently waiting in the queue."""
        return self.redis.llen(self.queue_key)

Step 2: Build a Worker

# worker.py
import requests
from bs4 import BeautifulSoup
import time
import logging
from queue_manager import URLQueue

# Configure the root logger at INFO level so the worker's info/error
# messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

class ScraperWorker:
    """A single scraper process that pulls URLs from a shared URLQueue.

    Each worker owns one HTTP session and loops forever: take a URL, fetch
    it, store the extracted fields back through the queue object.
    """

    def __init__(self, worker_id: int, queue: URLQueue):
        """Remember the worker id and queue, and prepare the HTTP session."""
        self.worker_id = worker_id
        self.queue = queue

        # One session per worker; every request carries this User-Agent.
        session = requests.Session()
        session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/124.0.0.0"
        })
        self.session = session

    def scrape_url(self, url: str) -> dict:
        """Fetch *url* and return its URL, HTTP status code and <title> text.

        The title is an empty string when the page has no <title> element.
        """
        page = self.session.get(url, timeout=30)
        title_tag = BeautifulSoup(page.text, "html.parser").find("title")
        if title_tag:
            title_text = title_tag.text
        else:
            title_text = ""
        return {"url": url, "status": page.status_code, "title": title_text}

    def run(self):
        """Process URLs from the queue forever; this method never returns."""
        logger.info(f"Worker {self.worker_id} started")
        while True:
            url = self.queue.get_url()
            if url:
                # Any failure while scraping or saving is logged and the
                # worker moves on to the next URL.
                try:
                    data = self.scrape_url(url)
                    self.queue.save_result(url, data)
                    logger.info(f"Worker {self.worker_id}: scraped {url}")
                except Exception as e:
                    logger.error(f"Worker {self.worker_id}: failed {url} - {e}")

                time.sleep(1)  # Polite delay
            else:
                # Nothing to do right now: back off before polling again.
                logger.info(f"Worker {self.worker_id}: queue empty, waiting...")
                time.sleep(5)

if __name__ == "__main__":
    import os
    import sys

    # Optional first CLI argument: numeric worker id used in log messages.
    worker_id = int(sys.argv[1]) if len(sys.argv) > 1 else 1

    # Honor REDIS_URL when it is set (the Compose file sets it for the worker
    # service); otherwise fall back to the same localhost address that
    # URLQueue uses by default, so running the script directly is unchanged.
    redis_url = os.environ.get("REDIS_URL", "redis://localhost:6379")

    queue = URLQueue(redis_url)
    worker = ScraperWorker(worker_id, queue)
    worker.run()

Step 3: Run Multiple Workers

With Docker Compose:

# docker-compose.yml
version: "3.8"

services:
  # Redis instance that holds the URL queue; port 6379 is published to the host.
  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"

  # Scraper worker image built from the current directory.
  worker:
    build: .
    command: python worker.py
    depends_on:
      - redis
    deploy:
      replicas: 5
    environment:
      # Address of the redis service on the Compose network.
      REDIS_URL: redis://redis:6379

# Start the stack in the background (-d) with the worker service scaled to 5 containers
docker compose up -d --scale worker=5

# Add 1000 example URLs to the queue and print how many were added.
# URLQueue() is called without arguments here, so it uses its default redis://localhost:6379.
python -c "
from queue_manager import URLQueue
q = URLQueue()
urls = [f'https://example.com/page/{i}' for i in range(1000)]
print(f'Added {q.add_urls(urls)} URLs')
"

Scaling Guidelines

URLs to Scrape	Workers	Proxy Recommendation
< 1,000	1-2	Single proxy fine
1,000 - 10,000	3-5	Rotating proxy pool
10,000 - 100,000	5-20	ScraperAPI or large pool
100,000+	20-50+	Managed service essential

Tips

- Start with fewer workers and scale up based on the target site's tolerance.
- Use a deduplication layer (the seen_key set in the example) to avoid scraping the same URL twice.
- Implement a dead letter queue for URLs that fail repeatedly. The example worker only logs a failure and drops the URL, so failed URLs are never retried.
- Monitor worker health and restart failed workers automatically.