Web Scraping Best Practices and Patterns
Master web scraping best practices: respectful scraping, anti-detection, data quality, error recovery, project architecture, and legal considerations.
After 26 tutorials, you have the tools and techniques to scrape almost anything. This final guide covers the best practices and patterns that separate fragile scripts from production-grade scrapers.
1. Be a Respectful Scraper
Your scraper is a guest on someone else's server. Treat it that way.
import requests
import time
from urllib.robotparser import RobotFileParser
def check_robots_txt(base_url, path="/", user_agent="*"):
    """Check whether robots.txt allows fetching a path on a site.

    Args:
        base_url: Site root, e.g. "https://example.com". A trailing slash
            is tolerated.
        path: Path to test, relative to ``base_url``. A missing leading
            slash is added.
        user_agent: User-agent token the rules are evaluated for. Defaults
            to "*", which is what earlier callers implicitly used.

    Returns:
        True if robots.txt permits ``user_agent`` to fetch the path,
        False otherwise.

    Note:
        Errors raised by ``RobotFileParser.read()`` while retrieving
        robots.txt are not caught here and propagate to the caller.
    """
    # Normalise both joins. Without this, "https://host/" produces
    # "https://host//robots.txt", and the doubled (or missing) slash also
    # ends up in the path that is matched against the Disallow rules, so
    # disallowed paths can be reported as allowed.
    root = base_url.rstrip("/")
    if not path.startswith("/"):
        path = f"/{path}"

    rp = RobotFileParser()
    rp.set_url(f"{root}/robots.txt")
    rp.read()
    return rp.can_fetch(user_agent, f"{root}{path}")
# Consult robots.txt first and report the verdict before any scraping starts.
allowed = check_robots_txt("https://quotes.toscrape.com")
print("Scraping is allowed" if allowed else "Scraping is disallowed by robots.txt")
Politeness Rules
| Rule | Implementation |
|---|---|
| Respect robots.txt | Check before scraping |
| Add delays | time.sleep(1) between requests |
| Identify yourself | Set a descriptive User-Agent |
| Limit concurrency | Max 3-5 concurrent requests |
| Scrape off-peak | Run during low-traffic hours |
| Cache responses | Do not fetch the same page twice |
2. Build Resilient Scrapers
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("scraper")


class ResilientScraper:
    """HTTP fetcher with a shared session, automatic retries and a fixed delay.

    Failures are logged and reported as ``None`` rather than raised, so a
    crawl loop can skip a bad page and carry on.
    """

    def __init__(self, base_delay=1.0, max_retries=3):
        """Create the session and install the retry policy.

        Args:
            base_delay: Seconds to sleep before every request.
            max_retries: Total retry attempts handed to urllib3's ``Retry``.
        """
        self.session = requests.Session()
        self.base_delay = base_delay

        # Automatic retries with backoff for rate limiting (429) and
        # server-side errors (5xx).
        retry = Retry(
            total=max_retries,
            backoff_factor=0.5,
            status_forcelist=[429, 500, 502, 503, 504],
        )
        self.session.mount("http://", HTTPAdapter(max_retries=retry))
        self.session.mount("https://", HTTPAdapter(max_retries=retry))

        # Identify the bot honestly so site owners can contact the operator.
        self.session.headers.update({
            "User-Agent": "ScrapingCentral Bot/1.0 (+https://scrapingcentral.com)",
            "Accept": "text/html,application/xhtml+xml",
            "Accept-Language": "en-US,en;q=0.9",
        })

    def get(self, url, **kwargs):
        """Fetch a URL with automatic retries and delay.

        Args:
            url: Address to request.
            **kwargs: Extra keyword arguments forwarded to ``Session.get``.
                ``timeout`` defaults to 15 seconds when not supplied.

        Returns:
            The response object on success, or ``None`` if the request
            raised a ``requests`` exception (including HTTP error statuses).
        """
        import time

        # Supply the timeout as a default instead of a fixed keyword:
        # passing timeout=15 alongside **kwargs made get(url, timeout=...)
        # fail with "got multiple values for keyword argument 'timeout'".
        kwargs.setdefault("timeout", 15)

        time.sleep(self.base_delay)
        try:
            response = self.session.get(url, **kwargs)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.error("FAILED: %s - %s", url, e)
            return None

        logger.info("OK %s: %s", response.status_code, url)
        return response

    def parse(self, url):
        """Fetch and parse a URL into BeautifulSoup.

        Returns:
            A ``BeautifulSoup`` document, or ``None`` when the fetch failed.
        """
        response = self.get(url)
        if response is None:
            return None
        return BeautifulSoup(response.content, "html.parser")
3. Structure Your Scraping Projects
my_scraper/
    config.py        # URLs, settings, credentials
    scraper.py       # Core scraping logic
    parsers.py       # HTML parsing functions
    storage.py       # Database/file storage
    models.py        # Data classes
    utils.py         # Helpers (retry, logging)
    run.py           # Entry point
    tests/
        test_parsers.py
Separate Fetching from Parsing
# parsers.py - pure functions that take HTML and return data
from bs4 import BeautifulSoup
def parse_quote(html):
    """Extract every quote on a page as a list of dicts.

    Each dict has the keys "text", "author" and "tags" (a list of tag
    strings). Quote containers missing either the text or the author
    element are skipped.
    """
    document = BeautifulSoup(html, "html.parser")
    results = []
    for block in document.select("div.quote"):
        body = block.select_one("span.text")
        byline = block.select_one("small.author")
        # Skip incomplete quote containers instead of emitting partial rows.
        if not (body and byline):
            continue
        tag_names = [tag.get_text() for tag in block.select("a.tag")]
        results.append({
            "text": body.get_text(),
            "author": byline.get_text(),
            "tags": tag_names,
        })
    return results
def parse_next_page(html):
    """Return the href of the page's "next" link, or None if there is none."""
    anchor = BeautifulSoup(html, "html.parser").select_one("li.next > a")
    if not anchor:
        return None
    return anchor["href"]
This pattern makes your parsers easy to test with saved HTML files.
4. Cache Responses
Never fetch the same page twice. Cache raw HTML so you can re-parse without re-fetching.
import hashlib
import os
import requests
class CachedFetcher:
    """Fetch pages over HTTP and keep the raw HTML in an on-disk cache.

    Each URL maps to one file in ``cache_dir``, so a page can be re-parsed
    any number of times without being requested again.
    """

    def __init__(self, cache_dir="cache"):
        """Create the cache directory if needed and open an HTTP session.

        Args:
            cache_dir: Directory that holds the cached ``.html`` files.
        """
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)
        self.session = requests.Session()

    def _cache_path(self, url):
        """Return the cache file path for ``url``."""
        # MD5 is used only to derive a fixed-length, filesystem-safe name.
        url_hash = hashlib.md5(url.encode()).hexdigest()
        return os.path.join(self.cache_dir, f"{url_hash}.html")

    def get(self, url, force_refresh=False):
        """Return the HTML for ``url``, from cache when available.

        Args:
            url: Address of the page.
            force_refresh: When true, ignore any cached copy, fetch the page
                again and overwrite the cache entry.

        Returns:
            The page text.

        Raises:
            Whatever ``session.get`` or ``raise_for_status`` raise when the
            page has to be fetched and the request fails.
        """
        cache_path = self._cache_path(url)

        if not force_refresh:
            try:
                # newline="" disables newline translation so a cache hit
                # returns exactly the text that was fetched (otherwise
                # "\r\n" in the page comes back as "\n").
                with open(cache_path, "r", encoding="utf-8", newline="") as f:
                    return f.read()
            except FileNotFoundError:
                pass  # Cache miss: fall through to the network fetch.

        response = self.session.get(url, timeout=15)
        response.raise_for_status()
        html = response.text

        # Write to a temporary file and rename it into place so an
        # interrupted write cannot leave a truncated entry that later
        # calls would serve as if it were the full page.
        tmp_path = f"{cache_path}.tmp"
        with open(tmp_path, "w", encoding="utf-8", newline="") as f:
            f.write(html)
        os.replace(tmp_path, cache_path)
        return html
fetcher = CachedFetcher()
quotes_url = "https://quotes.toscrape.com/"
html = fetcher.get(quotes_url)  # First call: goes to the network
html = fetcher.get(quotes_url)  # Second call: served from the cache
5. Monitor and Alert
import logging
from datetime import datetime
class ScrapingMonitor:
    """Collect simple counters for a scraping run and print a summary."""

    def __init__(self):
        """Start a run with all counters at zero and the start time recorded."""
        self.stats = {
            "started_at": datetime.now().isoformat(),
            "pages_fetched": 0,
            "pages_failed": 0,
            "items_extracted": 0,
            "duplicates_skipped": 0,
        }

    def record_fetch(self, success=True):
        """Count one page fetch as either successful or failed."""
        key = "pages_fetched" if success else "pages_failed"
        self.stats[key] += 1

    def record_items(self, count):
        """Add ``count`` to the number of extracted items."""
        self.stats["items_extracted"] += count

    def record_duplicate(self, count=1):
        """Add ``count`` to the number of duplicates skipped.

        Without this method the "duplicates_skipped" counter is initialised
        but can never change.
        """
        self.stats["duplicates_skipped"] += count

    def report(self):
        """Print the counters and the fetch success rate to stdout."""
        total = self.stats["pages_fetched"] + self.stats["pages_failed"]
        # Guard the division: a run with no fetches reports 0% rather than
        # raising ZeroDivisionError.
        success_rate = (
            self.stats["pages_fetched"] / total * 100 if total else 0
        )
        print("\n--- Scraping Report ---")
        print(f"Pages fetched: {self.stats['pages_fetched']}")
        print(f"Pages failed: {self.stats['pages_failed']}")
        print(f"Success rate: {success_rate:.1f}%")
        print(f"Items extracted: {self.stats['items_extracted']}")
        print(f"Duplicates skipped: {self.stats['duplicates_skipped']}")
6. Legal and Ethical Considerations
- Check the Terms of Service: some sites explicitly prohibit scraping.
- Respect robots.txt: it is not legally binding everywhere, but it signals the site owner's intent.
- Do not scrape personal data without a lawful basis (GDPR, CCPA).
- Do not overload servers: treat rate limits as instructions, not suggestions.
- Attribute data sources when publishing scraped data.
7. Use Proxy Services for Scale
When you outgrow simple scraping, proxy services handle the infrastructure for you.
- ScraperAPI: proxy rotation, CAPTCHA solving, and JS rendering via a simple API.
- ScrapingAnt: a headless browser API with anti-detection built in.
Both services let you focus on data extraction while they handle the reliability and anti-bot challenges.
Summary Checklist
- Check robots.txt before scraping
- Set a descriptive User-Agent
- Add delays between requests
- Handle errors with retries and backoff
- Cache responses to avoid redundant fetches
- Separate fetching, parsing, and storage
- Log everything for debugging
- Test parsers with saved HTML fixtures
- Store data incrementally to survive crashes
- Monitor success rates and alert on failures
You now have a complete toolkit for web scraping with Python. Build responsibly, scrape politely, and always focus on extracting the data you actually need.