Scaling Scrapers Horizontally
Learn how to scale your web scraping operation horizontally with multiple workers, task queues, and distributed architecture.
Deployment · #8 · advanced · 3 min read
When a single scraper cannot keep up with your URL list, you need to scale horizontally, running multiple scraper workers that divide the work.
Scaling Architecture
            ┌──────────────┐
            │  URL Queue   │
            │ (Redis/SQS)  │
            └──────┬───────┘
                   │
      ┌────────────┼────────────┐
      │            │            │
┌─────▼────┐  ┌────▼─────┐ ┌────▼─────┐
│ Worker 1 │  │ Worker 2 │ │ Worker 3 │
└─────┬────┘  └────┬─────┘ └────┬─────┘
      │            │            │
      └────────────┼────────────┘
                   │
            ┌──────▼───────┐
            │   Results    │
            │  (DB / S3)   │
            └──────────────┘
Step 1: Build a URL Queue with Redis
# queue_manager.py
import redis
import json
from typing import Optional
class URLQueue:
    """Redis-backed URL queue with de-duplication and result storage.

    Three Redis keys are used:

    * ``scraper:urls``    - list of URLs waiting to be scraped (FIFO).
    * ``scraper:seen``    - set of every URL that has ever been enqueued.
    * ``scraper:results`` - hash mapping URL -> JSON-encoded scrape result.
    """

    def __init__(self, redis_url: str = "redis://localhost:6379"):
        """Connect to the Redis server at *redis_url*."""
        self.redis = redis.from_url(redis_url)
        self.queue_key = "scraper:urls"
        self.seen_key = "scraper:seen"
        self.results_key = "scraper:results"

    def add_urls(self, urls: list[str]) -> int:
        """Add URLs to the queue, skipping duplicates.

        A URL is skipped when it is already in the seen-set or when it
        appears earlier in the same *urls* batch.

        Returns:
            The number of URLs actually enqueued.
        """
        # De-duplicate inside the batch first (order preserved). Checking the
        # seen-set alone is not enough: nothing is written to it until the
        # final execute(), so repeats within one call would all look new.
        unique = list(dict.fromkeys(urls))
        if not unique:
            return 0

        # Batch all membership checks into a single round trip.
        check = self.redis.pipeline()
        for url in unique:
            check.sismember(self.seen_key, url)
        already_seen = check.execute()

        fresh = [url for url, seen in zip(unique, already_seen) if not seen]
        if not fresh:
            return 0

        # Queue the URLs and mark them seen in the same pipeline so the two
        # structures are updated together.
        write = self.redis.pipeline()
        write.rpush(self.queue_key, *fresh)
        write.sadd(self.seen_key, *fresh)
        write.execute()
        return len(fresh)

    def get_url(self) -> Optional[str]:
        """Pop and return the next URL to scrape, or None if the queue is empty."""
        raw = self.redis.lpop(self.queue_key)
        if raw is None:
            return None
        # The client is created without decode_responses, so values are bytes.
        return raw.decode()

    def save_result(self, url: str, data: dict):
        """Store the scrape result for *url* as JSON in the results hash."""
        self.redis.hset(self.results_key, url, json.dumps(data))

    @property
    def pending(self) -> int:
        """Number of URLs currently waiting in the queue."""
        return self.redis.llen(self.queue_key)
Step 2: Build a Worker
# worker.py
import requests
from bs4 import BeautifulSoup
import time
import logging
from queue_manager import URLQueue
# Emit INFO-level records so the workers' progress messages are shown.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by ScraperWorker for all of its log output.
logger = logging.getLogger(__name__)
class ScraperWorker:
    """Worker that drains a URLQueue, fetching each page and saving its title.

    Each worker owns one requests.Session, so the same User-Agent header is
    sent with every request it makes.
    """

    def __init__(self, worker_id: int, queue: URLQueue):
        """Remember the worker's id and queue and prepare its HTTP session."""
        self.worker_id = worker_id
        self.queue = queue
        http = requests.Session()
        http.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/124.0.0.0"
        })
        self.session = http

    def scrape_url(self, url: str) -> dict:
        """Fetch *url* and return its URL, HTTP status code and <title> text.

        The title is an empty string when the page has no <title> element.
        """
        page = self.session.get(url, timeout=30)
        title_tag = BeautifulSoup(page.text, "html.parser").find("title")
        if title_tag:
            page_title = title_tag.text
        else:
            page_title = ""
        return {"url": url, "status": page.status_code, "title": page_title}

    def _handle(self, url: str) -> None:
        """Scrape one URL and store the result; log (never raise) on failure."""
        try:
            self.queue.save_result(url, self.scrape_url(url))
            logger.info(f"Worker {self.worker_id}: scraped {url}")
        except Exception as exc:
            logger.error(f"Worker {self.worker_id}: failed {url} - {exc}")

    def run(self):
        """Loop forever: take a URL, scrape it, pause; idle while the queue is empty."""
        logger.info(f"Worker {self.worker_id} started")
        while True:
            next_url = self.queue.get_url()
            if next_url:
                self._handle(next_url)
                time.sleep(1)  # Polite delay
            else:
                logger.info(f"Worker {self.worker_id}: queue empty, waiting...")
                time.sleep(5)
if __name__ == "__main__":
    import os
    import sys

    # Optional first CLI argument is the worker id; defaults to 1.
    worker_id = int(sys.argv[1]) if len(sys.argv) > 1 else 1

    # Honor REDIS_URL when it is set (docker-compose.yml sets it for the
    # worker service). Without this, a containerised worker would try
    # URLQueue's localhost default and never reach the Redis service.
    redis_url = os.environ.get("REDIS_URL")
    queue = URLQueue(redis_url) if redis_url else URLQueue()

    worker = ScraperWorker(worker_id, queue)
    worker.run()
Step 3: Run Multiple Workers
With Docker Compose:
# docker-compose.yml
version: "3.8"

services:
  # Redis instance the workers connect to; port 6379 is also published on
  # the host so URLs can be enqueued from outside the Compose network.
  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"

  # Scraper worker built from the current directory, run as 5 replicas.
  worker:
    build: .
    command: python worker.py
    depends_on:
      - redis
    environment:
      REDIS_URL: redis://redis:6379
    deploy:
      replicas: 5
# Launch the stack in the background with five worker containers
docker compose up --detach --scale worker=5

# Seed the queue with 1000 example URLs
python - <<'PY'
from queue_manager import URLQueue
q = URLQueue()
urls = [f'https://example.com/page/{i}' for i in range(1000)]
print(f'Added {q.add_urls(urls)} URLs')
PY
Scaling Guidelines
| URLs to Scrape | Workers | Proxy Recommendation |
|---|---|---|
| < 1,000 | 1-2 | Single proxy fine |
| 1,000 - 10,000 | 3-5 | Rotating proxy pool |
| 10,000 - 100,000 | 5-20 | ScraperAPI or large pool |
| 100,000+ | 20-50+ | Managed service essential |
Tips
- Start with fewer workers and scale up based on the target site's tolerance
- Use a deduplication layer (the `seen_key` in the example) to avoid scraping the same URL twice
- Implement a dead letter queue for URLs that fail repeatedly
- Monitor worker health and restart failed workers automatically