Scraping Multiple Pages Concurrently
Speed up scraping with concurrent requests using threading, multiprocessing, and asyncio. Learn to balance speed with politeness.
Python Scraping · #21 · intermediate · 3 min read
Sequential scraping is simple but slow. When you have hundreds or thousands of URLs to process, concurrent execution can reduce your total scrape time from hours to minutes.
Method 1: ThreadPoolExecutor
The simplest way to add concurrency to an existing requests-based scraper.
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
def scrape_page(url):
    """Scrape a single page and return extracted data.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict with the requested ``url`` and a ``quotes`` list; each quote
        is a dict holding its ``text`` and ``author``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=15)
    # Without this check an error page would parse into an empty quote list
    # and be reported as a successful scrape.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    quotes = [
        {
            "text": quote.select_one("span.text").get_text(),
            "author": quote.select_one("small.author").get_text(),
        }
        for quote in soup.select("div.quote")
    ]
    return {"url": url, "quotes": quotes}
# Pages 1 through 10 of the demo site.
urls = [f"https://quotes.toscrape.com/page/{i}/" for i in range(1, 11)]

start = time.time()
results = []

with ThreadPoolExecutor(max_workers=5) as executor:
    # Remember which URL each future belongs to so it can be named in the report.
    futures = {}
    for url in urls:
        futures[executor.submit(scrape_page, url)] = url

    # Handle pages as they finish rather than in submission order.
    for future in as_completed(futures):
        url = futures[future]
        try:
            result = future.result()
            results.append(result)
            print(f"Done: {url} ({len(result['quotes'])} quotes)")
        except Exception as e:
            # One failed page is reported and skipped; the rest continue.
            print(f"Failed: {url} - {e}")

elapsed = time.time() - start

# Sum the quote counts over the pages that succeeded.
total_quotes = 0
for page in results:
    total_quotes += len(page["quotes"])

print(f"\nScraped {total_quotes} quotes from {len(results)} pages in {elapsed:.1f}s")
Method 2: asyncio + HTTPX
For higher concurrency, use async I/O.
import asyncio
import httpx
from bs4 import BeautifulSoup
import time
# Size of the semaphore created in main(), which every scrape_page call acquires.
MAX_CONCURRENT = 5
async def scrape_page(client, url, semaphore):
    """Fetch one page through the shared client and extract its quotes.

    Args:
        client: Object whose awaitable ``get(url)`` returns the response.
        url: Address of the page to fetch.
        semaphore: Async context manager held while the page is processed.

    Returns:
        A dict with the requested ``url`` and a ``quotes`` list; each quote
        is a dict holding its ``text`` and ``author``.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
    """
    async with semaphore:
        response = await client.get(url)
        # Without this check an error page would parse into an empty quote
        # list and be reported as a successful scrape.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return {
            "url": url,
            "quotes": [
                {
                    "text": q.select_one("span.text").get_text(),
                    "author": q.select_one("small.author").get_text(),
                }
                for q in soup.select("div.quote")
            ],
        }
async def main():
    """Scrape pages 1-10 concurrently and print one line per page.

    Each page prints its URL and quote count; any result that is not a dict
    (an exception collected by ``gather``) prints as an error instead.
    """
    page_urls = [f"https://quotes.toscrape.com/page/{n}/" for n in range(1, 11)]
    limiter = asyncio.Semaphore(MAX_CONCURRENT)

    async with httpx.AsyncClient(timeout=30.0) as client:
        pending = []
        for page_url in page_urls:
            pending.append(scrape_page(client, page_url, limiter))
        # return_exceptions=True hands failures back as values instead of raising.
        outcomes = await asyncio.gather(*pending, return_exceptions=True)

        for outcome in outcomes:
            if not isinstance(outcome, dict):
                print(f"Error: {outcome}")
                continue
            print(f"{outcome['url']}: {len(outcome['quotes'])} quotes")
# Time the whole run, from before asyncio.run() starts until it returns.
start = time.time()
asyncio.run(main())
elapsed = time.time() - start
print(f"Completed in {elapsed:.1f}s")
Method 3: multiprocessing for CPU-Bound Parsing
When parsing is the bottleneck (large HTML documents), use multiple processes.
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool
import time
def scrape_and_parse(url):
    """Fetch a page and count its quotes. Runs in a separate process.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict with the requested ``url`` and ``count``, the number of
        quotes found on the page.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=15)
    # Fail loudly rather than return a count taken from an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    quotes = [
        q.select_one("span.text").get_text()
        for q in soup.select("div.quote")
    ]
    return {"url": url, "count": len(quotes)}
if __name__ == "__main__":
    # The pool is only created when this file is run as a script.
    urls = [f"https://quotes.toscrape.com/page/{i}/" for i in range(1, 11)]

    start = time.time()
    with Pool(processes=4) as pool:
        # map() returns the results in the same order as `urls`.
        results = pool.map(scrape_and_parse, urls)

    for page in results:
        page_url, quote_count = page["url"], page["count"]
        print(f"{page_url}: {quote_count} quotes")

    elapsed = time.time() - start
    print(f"Completed in {elapsed:.1f}s")
Comparison
| Method | Best For | Concurrency Model |
|---|---|---|
| ThreadPoolExecutor | Simple I/O-bound tasks | Threads |
| asyncio + HTTPX | High-volume I/O | Event loop |
| multiprocessing | CPU-heavy parsing | Processes |
| Scrapy | Full scraping projects | Twisted event loop |
Rate Limiting and Politeness
Always limit your concurrent requests to avoid overwhelming servers.
import time
from concurrent.futures import ThreadPoolExecutor
def rate_limited_scrape(url):
    """Pause for half a second, then fetch ``url`` and return its HTTP status code."""
    time.sleep(0.5)  # Delay per request
    return requests.get(url, timeout=15).status_code
# Three workers, each sleeping before its request.
with ThreadPoolExecutor(max_workers=3) as pool:
    # Unpack the lazy map() iterator inside the block to collect every status code.
    results = [*pool.map(rate_limited_scrape, urls)]
Tips
- Start with 3-5 concurrent workers and increase gradually.
- Monitor your error rate; if it spikes, reduce concurrency.
- Use ScraperAPI when scraping concurrently at scale. It handles rate limits and IP rotation across all your concurrent requests.
- ThreadPoolExecutor is usually sufficient; reach for asyncio only when you need 50+ concurrent connections.
- Always add error handling to prevent one failed page from crashing the entire batch.
Next Steps
- Learn regex patterns for extracting data from unstructured text
- Combine concurrent scraping with database storage for production pipelines