Intercepting Network Requests with Playwright
Learn to intercept, modify, and block network requests in Playwright for faster scraping and direct API data extraction.
One of the most powerful scraping techniques is intercepting network requests. Instead of parsing HTML, you can capture the underlying API calls that a website makes to load its data. This often gives you structured JSON directly, bypassing the need to scrape the DOM entirely.
Monitoring Network Requests
Listen to every network response and log the ones that look like API calls:
from playwright.sync_api import Error as PlaywrightError
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()

    def on_response(response):
        """Log each status-200 response whose URL contains "api".

        Prints the URL and, when the body parses to a JSON object, up to
        five of its top-level keys. Responses whose body is not JSON, or
        whose body cannot be read, are logged by URL only.
        """
        if "api" not in response.url or response.status != 200:
            return
        print(f"API call: {response.url}")
        try:
            data = response.json()
        except (ValueError, PlaywrightError):
            # ValueError: the body is not valid JSON (HTML, scripts, ...).
            # PlaywrightError: Playwright could not retrieve the body.
            return
        # A JSON payload may be an array or a scalar; only objects have keys.
        if isinstance(data, dict):
            print(f" Data keys: {list(data)[:5]}")

    # Attach the listener before navigating so early responses are seen too.
    page.on("response", on_response)
    page.goto("https://quotes.toscrape.com/js/")
    page.wait_for_load_state("networkidle")
    browser.close()
Capturing API Responses
Wait for a specific API call and capture its response:
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=True)
    page = browser.new_page()

    def is_products_response(resp):
        """Return True for a status-200 response from the products endpoint."""
        return "/api/products" in resp.url and resp.status == 200

    # Start waiting before the navigation that triggers the API call.
    with page.expect_response(is_products_response) as captured:
        page.goto("https://example.com/shop")

    products = captured.value.json()
    print(f"Captured {len(products)} products from API")
    browser.close()
Blocking Unnecessary Resources
Speed up scraping by blocking images, fonts, and stylesheets (tracking scripts are covered in the next section):
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=True)
    page = browser.new_page()

    def block_resources(route):
        """Abort image, font, and stylesheet requests; continue all others."""
        is_needed = route.request.resource_type not in ("image", "font", "stylesheet")
        if is_needed:
            route.continue_()
            return
        route.abort()

    # Route requests matching "**/*" through the handler above.
    page.route("**/*", block_resources)
    page.goto("https://quotes.toscrape.com")
    print(page.title())  # Page loads faster without images/CSS
    browser.close()
Blocking Specific Domains
Block analytics and tracking to speed up page loads:
from urllib.parse import urlparse

# Domains of analytics/tracking services whose requests should be aborted.
blocked_domains = [
    "google-analytics.com",
    "doubleclick.net",
    "facebook.net",
    "hotjar.com",
]


def block_tracking(route):
    """Abort requests sent to a blocked domain or one of its subdomains.

    Only the request's hostname is compared, not the whole URL, so a
    blocked domain that merely appears in a path or query string (for
    example ``https://example.com/?ref=facebook.net``) does not cause the
    request to be aborted. All other requests continue unchanged.
    """
    # hostname is None for URLs without a network location (e.g. data:).
    host = urlparse(route.request.url).hostname or ""
    is_blocked = any(
        host == domain or host.endswith("." + domain)
        for domain in blocked_domains
    )
    if is_blocked:
        route.abort()
    else:
        route.continue_()
# Register block_tracking as the handler for requests matching "**/*".
page.route(url="**/*", handler=block_tracking)
Modifying Requests
Intercept and modify requests before they are sent:
def modify_headers(route):
    """Continue the request with its language and referer headers overridden.

    All headers already on the request are kept; ``accept-language`` and
    ``referer`` are set (or replaced) with fixed values.
    """
    # Build a new mapping instead of mutating the one exposed by the
    # request object, so the override only affects what is sent.
    headers = {
        **route.request.headers,
        "accept-language": "en-US,en;q=0.9",
        "referer": "https://www.google.com/",
    }
    route.continue_(headers=headers)
# Register modify_headers as the handler for requests matching "**/*".
page.route(url="**/*", handler=modify_headers)
Mocking API Responses
Useful for testing or bypassing rate limits during development:
def mock_api(route):
    """Fulfill the request with a canned one-product JSON response."""
    canned_response = {
        "status": 200,
        "content_type": "application/json",
        "body": '[{"name": "Test Product", "price": 9.99}]',
    }
    route.fulfill(**canned_response)
# Register mock_api as the handler for requests matching the products glob.
page.route(url="**/api/products*", handler=mock_api)
Practical Example: Extract Data from Hidden API
import json
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    captured_data = []

    def capture_api(response):
        """Record the URL and parsed JSON of each OK response under "/api/".

        Responses whose body is not JSON, or whose body cannot be read,
        are skipped.
        """
        if "/api/" not in response.url or not response.ok:
            return
        try:
            payload = response.json()
        except (ValueError, PlaywrightError):
            # ValueError: body is not valid JSON.
            # PlaywrightError: Playwright could not retrieve the body.
            return
        captured_data.append({"url": response.url, "data": payload})

    # Attach the listener before navigating so early responses are captured.
    page.on("response", capture_api)
    page.goto("https://example.com")
    page.wait_for_load_state("networkidle")

    # Click through pages to trigger more API calls
    for _ in range(5):
        try:
            page.click("button.load-more", timeout=3000)
            page.wait_for_load_state("networkidle")
        except PlaywrightTimeoutError:
            # The button could not be clicked in time (or the page never
            # went idle): stop paging. Other errors, and Ctrl-C, propagate.
            break

    print(f"Captured {len(captured_data)} API responses")
    browser.close()
Next Steps
- Scrape single-page applications (React, Vue, Angular)
- Manage browser contexts and sessions
- Learn parallel browser scraping