Building a Stealth Scraping Setup
Combine every anti-detection technique into a single robust stealth scraping setup with Python and Playwright.
Anti-Detection · #16 · advanced · 3 min read
This guide combines proxy rotation, fingerprint spoofing, human behavior simulation, and smart error handling into a single production-ready stealth scraper.
The Architecture
A stealth setup needs four layers:
- Network layer: rotating residential proxies
- TLS layer: browser-authentic SSL fingerprints
- Browser layer: stealth-patched headless browser
- Behavior layer: human-like interaction patterns
Complete Stealth Scraper
import random
import time
from playwright.sync_api import sync_playwright
from playwright_stealth import stealth_sync
# Pool of desktop user-agent strings to pick from when creating a context.
USER_AGENTS = [
    # Chrome 124 on Windows 10/11.
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    # Chrome 124 on macOS. Chrome reports a frozen "10_15_7" platform version
    # regardless of the real macOS release, so a newer version token here
    # would not match what a genuine Chrome sends.
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    # Firefox 125 on Windows. NOTE: only consistent with a Gecko-based
    # browser; pairing it with a Chromium engine is a fingerprint mismatch.
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
]
# Desktop viewport sizes to pick from; each entry is a dict with
# "width" and "height" keys, built from (width, height) pairs.
VIEWPORTS = [
    dict(width=width, height=height)
    for width, height in (
        (1920, 1080),
        (1366, 768),
        (1536, 864),
        (1440, 900),
    )
]
class StealthScraper:
    """Context-managed headless Chromium scraper with randomized behavior.

    Use as ``with StealthScraper(proxy=...) as scraper:``. The browser is
    launched on entry and shut down on exit; every ``scrape`` call runs in
    its own browser context with a randomly chosen viewport and user agent.
    """

    def __init__(self, proxy: str = None):
        """Store settings; nothing is launched until the ``with`` block starts.

        Args:
            proxy: Optional proxy URL such as ``http://user:pass@host:8080``.
        """
        self.proxy = proxy
        self.playwright = None
        self.browser = None

    @staticmethod
    def _proxy_settings(proxy: str) -> dict:
        """Translate a proxy URL into Playwright's ``proxy`` launch option.

        Playwright carries proxy credentials in dedicated ``username`` and
        ``password`` fields, so credentials embedded in the URL are split out
        (and percent-decoded) instead of being left inside ``server``.
        URLs without a scheme or without credentials are passed through as-is.
        """
        from urllib.parse import unquote, urlsplit

        if "://" not in proxy:
            return {"server": proxy}
        parts = urlsplit(proxy)
        if not parts.username:
            return {"server": proxy}
        # Everything after the last "@" is host[:port], without credentials.
        host_port = parts.netloc.rpartition("@")[2]
        settings = {
            "server": f"{parts.scheme}://{host_port}",
            "username": unquote(parts.username),
        }
        if parts.password is not None:
            settings["password"] = unquote(parts.password)
        return settings

    def __enter__(self):
        """Start Playwright and launch headless Chromium; return ``self``."""
        self.playwright = sync_playwright().start()
        try:
            launch_options = {"headless": True}
            if self.proxy:
                launch_options["proxy"] = self._proxy_settings(self.proxy)
            self.browser = self.playwright.chromium.launch(**launch_options)
        except Exception:
            # __exit__ is not invoked when __enter__ raises, so the driver
            # started above must be stopped here to avoid leaking it.
            self.playwright.stop()
            self.playwright = None
            raise
        return self

    def __exit__(self, *args):
        """Close the browser and stop Playwright, tolerating partial setup."""
        try:
            if self.browser is not None:
                self.browser.close()
        finally:
            # Stop the driver even if closing the browser failed.
            self.browser = None
            if self.playwright is not None:
                self.playwright.stop()
                self.playwright = None

    def new_context(self):
        """Create a browser context with a random viewport and user agent.

        Returns:
            The new Playwright browser context; the caller must close it.
        """
        viewport = random.choice(VIEWPORTS)
        # The browser is Chromium, so prefer Chrome-family user agents; a
        # Firefox string on a Chromium engine is an easy inconsistency to
        # detect. Fall back to the full pool if it has no Chrome entries.
        chromium_agents = [ua for ua in USER_AGENTS if "Chrome/" in ua]
        return self.browser.new_context(
            viewport=viewport,
            screen=viewport,
            user_agent=random.choice(chromium_agents or USER_AGENTS),
            locale="en-US",
            timezone_id="America/New_York",
            color_scheme="light",
            java_script_enabled=True,
        )

    def scrape(self, url: str) -> str:
        """Load ``url`` with randomized pauses and input, return the page HTML.

        Args:
            url: Address of the page to fetch.

        Returns:
            The page content as returned by ``page.content()``.
        """
        context = self.new_context()
        try:
            page = context.new_page()
            stealth_sync(page)

            # Human-like navigation
            page.goto(url, wait_until="domcontentloaded")
            time.sleep(random.uniform(1.0, 3.0))

            # Random mouse movement
            page.mouse.move(
                random.randint(100, 800),
                random.randint(100, 400),
            )
            time.sleep(random.uniform(0.3, 1.0))

            # Random scroll
            for _ in range(random.randint(1, 3)):
                page.mouse.wheel(0, random.randint(100, 400))
                time.sleep(random.uniform(0.5, 1.5))

            # Wait for content
            page.wait_for_load_state("networkidle")
            return page.content()
        finally:
            # Always release the context, including when navigation fails.
            context.close()
# Usage: send traffic through a proxy, scrape one page, and preview the
# first 500 characters of the returned HTML.
proxy = "http://user:pass@residential-proxy.example.com:8080"

with StealthScraper(proxy=proxy) as scraper:
    html = scraper.scrape("https://target-site.com/products")
    print(html[:500])
HTTP-Level Stealth with curl_cffi
For pages that do not require JavaScript, use curl_cffi for fast stealth requests:
from curl_cffi import requests
import random
import time
class StealthHTTPClient:
    """Thin wrapper over a curl_cffi session that impersonates a browser.

    One impersonation target is picked at random per client and each request
    is preceded by a random pause. The client can be used as a context
    manager (``with StealthHTTPClient(...) as client:``) or closed explicitly
    with ``close()`` so the underlying session is released.
    """

    # Impersonation targets handed to curl_cffi; one is chosen per client.
    BROWSERS = ["chrome124", "chrome123", "safari17_4_1", "edge101"]

    def __init__(self, proxy: str = None):
        """Create the underlying session.

        Args:
            proxy: Optional proxy URL forwarded to the session.
        """
        self.session = requests.Session(
            impersonate=random.choice(self.BROWSERS),
            proxy=proxy,
        )

    def __enter__(self):
        """Return the client itself for use in a ``with`` block."""
        return self

    def __exit__(self, *args):
        """Close the session when the ``with`` block ends."""
        self.close()

    def close(self) -> None:
        """Release the underlying session."""
        self.session.close()

    def get(self, url: str) -> requests.Response:
        """Sleep 1-3 seconds, then GET ``url`` with a 30-second timeout.

        Args:
            url: Address to request.

        Returns:
            The response object returned by the session.
        """
        time.sleep(random.uniform(1.0, 3.0))
        return self.session.get(url, timeout=30)
# Usage: fetch a JSON endpoint through the proxy and print the decoded body.
client = StealthHTTPClient(
    proxy="http://user:pass@proxy.example.com:8080",
)
resp = client.get("https://target-site.com/api/data")
print(resp.json())
When to Build Your Own vs Use a Service
| Scenario | Recommendation |
|---|---|
| Scraping a few protected sites | Build your own |
| Scraping 10+ different sites | Use ScraperAPI |
| Need 99%+ success rate | Use a managed service |
| Budget is tight | Build your own with free proxies |
| Time is tight | Use ScrapingAnt |
Installation
# playwright-stealth is capped below 2.x because the scraper imports the
# 1.x-style stealth_sync helper (confirm the newer API before lifting the pin).
pip install playwright "playwright-stealth<2" curl-cffi fake-useragent
playwright install chromium
Building your own stealth setup gives you full control, but maintaining it against evolving anti-bot systems is an ongoing effort. Evaluate whether the time investment makes sense compared to using a managed API.