Handling Malformed HTML
Learn techniques for parsing broken, incomplete, and malformed HTML that you commonly encounter when web scraping.
Data Parsing · #9 · Intermediate · 3 min read
The web is full of broken HTML: unclosed tags, mismatched nesting, invalid attributes, and encoding errors. Your scraper needs to handle all of it gracefully.
Parser Comparison
Different parsers handle broken HTML differently:
from bs4 import BeautifulSoup
# Deliberately malformed sample: an unclosed <h2> and <span>, an unquoted
# attribute value, a <div ...> start tag missing its ">", and no closing
# </div> for the second product.
broken_html = """
<div class="product">
<h2>ScraperAPI
<p>Best proxy service</p>
<span class=price>$49.99
</div>
<div class="product"
<h2>ScrapingAnt</h2>
<p>Headless browser API</p>
"""

# Parse the same markup once per parser, before anything is printed.
soup1 = BeautifulSoup(broken_html, "html.parser")  # lenient, built-in
soup2 = BeautifulSoup(broken_html, "lxml")  # fast, fixes structure aggressively
soup3 = BeautifulSoup(broken_html, "html5lib")  # most lenient, matches browser behavior

# Report how many product <div>s each parser recovered.
trees_by_parser = {"html.parser": soup1, "lxml": soup2, "html5lib": soup3}
for name, soup in trees_by_parser.items():
    products = soup.find_all("div", class_="product")
    print(f"{name}: found {len(products)} products")
# Install the two third-party parsers (html.parser is built in)
pip install lxml html5lib
| Parser | Speed | Leniency | Dependencies |
|---|---|---|---|
| html.parser | Medium | Medium | Built-in |
| lxml | Fast | Good | C library |
| html5lib | Slow | Best | Pure Python |
Fixing Common HTML Problems
Unclosed Tags
from bs4 import BeautifulSoup
# Three <p> start tags and not a single closing </p>
html = "<p>First paragraph<p>Second paragraph<p>Third"

# lxml auto-closes tags
soup = BeautifulSoup(html, "lxml")

# Print the text of every paragraph the parser produced
for p in soup.find_all("p"):
    print(f"Content: '{p.get_text(strip=True)}'")
Missing Quotes on Attributes
from bs4 import BeautifulSoup
# None of the attribute values in this markup are quoted
html = '<div class=product data-id=123><span class=name>Test</span></div>'

# Parsers handle unquoted attributes fine
product = BeautifulSoup(html, "lxml").find("div", class_="product")
print(f"ID: {product['data-id']}")  # 123
Encoding Issues
import requests
from bs4 import BeautifulSoup
response = requests.get("https://example.com", timeout=15)

# Let BeautifulSoup detect encoding.
# requests falls back to ISO-8859-1 for text/* responses whose Content-Type
# header carries no charset, so response.encoding is only passed on when the
# server actually declared one. Otherwise from_encoding=None leaves detection
# to BeautifulSoup, which inspects the raw bytes and any <meta> charset.
content_type = response.headers.get("content-type", "").lower()
declared_encoding = response.encoding if "charset" in content_type else None
soup = BeautifulSoup(response.content, "lxml", from_encoding=declared_encoding)

# Or force UTF-8 for stubborn pages; undecodable bytes become U+FFFD
soup = BeautifulSoup(
    response.content.decode("utf-8", errors="replace"),
    "lxml",
)
Defensive Parsing Strategy
from bs4 import BeautifulSoup
import logging
logger = logging.getLogger(__name__)
def safe_parse(html_content, selectors):
    """Parse HTML with fallback strategies.

    Each parser is tried in turn, fastest first. The first one whose tree
    contains a match for the ``selectors["test"]`` CSS selector wins.

    Args:
        html_content: Markup to parse; passed unchanged to ``BeautifulSoup``.
        selectors: Mapping with a ``"test"`` key holding a CSS selector that
            is expected to match at least one element of a good parse.

    Returns:
        The ``BeautifulSoup`` object built by the first parser whose tree
        matches the test selector.

    Raises:
        ValueError: If ``selectors`` has no ``"test"`` entry, or if no parser
            produces a tree matching it.
    """
    # Look the selector up once, outside the per-parser try block: a missing
    # key is a caller error and must not be logged as three parser failures.
    try:
        test_selector = selectors["test"]
    except (KeyError, TypeError) as exc:
        raise ValueError('selectors must contain a "test" CSS selector') from exc

    # Try parsers in order of speed
    for parser in ("lxml", "html.parser", "html5lib"):
        try:
            soup = BeautifulSoup(html_content, parser)
            # Verify we can find expected elements
            test_result = soup.select_one(test_selector)
        except Exception as e:
            # Deliberately broad: this is a best-effort fallback chain, so any
            # failure of one parser just moves on to the next.
            logger.warning("%s failed: %s", parser, e)
            continue
        if test_result is not None:
            return soup
        logger.warning("%s: test selector found nothing", parser)
    raise ValueError("All parsers failed to extract expected elements")
# Usage
# NOTE: html_content is not defined in this snippet; presumably it holds the
# raw HTML of the page being scraped, fetched beforehand.
soup = safe_parse(html_content, {"test": "div.product"})
Cleaning HTML Before Parsing
import re
from bs4 import BeautifulSoup
# Compiled once at import time. IGNORECASE because HTML tag names are
# case-insensitive (<SCRIPT> is as valid as <script>); DOTALL so "." spans the
# newlines inside a block. "\b" stops look-alike names such as <scripture>
# from counting as an opening tag, and "\s*" accepts "</script >".
_SCRIPT_BLOCK_RE = re.compile(
    r"<script\b[^>]*>.*?</script\s*>", re.IGNORECASE | re.DOTALL
)
_STYLE_BLOCK_RE = re.compile(
    r"<style\b[^>]*>.*?</style\s*>", re.IGNORECASE | re.DOTALL
)

# Common encoding artifacts, handled in a single str.translate pass.
_ARTIFACT_TABLE = str.maketrans({
    "\u00a0": " ",  # Non-breaking space
    "\u200b": "",  # Zero-width space
    "\ufffd": "",  # Replacement character
})


def clean_html(raw_html):
    """Pre-process HTML to fix common issues.

    Strips null bytes, removes complete ``<script>`` and ``<style>`` blocks
    (tag names matched case-insensitively), and normalizes a few characters
    that are typical encoding artifacts.

    Args:
        raw_html: The HTML document as a ``str``.

    Returns:
        The cleaned HTML as a ``str``. Script or style blocks that are never
        closed are left in place for the parser to deal with.
    """
    # Remove null bytes first so a tag name split by one is still recognized
    cleaned = raw_html.replace("\x00", "")
    # Remove JavaScript and CSS blocks that can confuse parsers
    cleaned = _SCRIPT_BLOCK_RE.sub("", cleaned)
    cleaned = _STYLE_BLOCK_RE.sub("", cleaned)
    # Fix common encoding artifacts
    return cleaned.translate(_ARTIFACT_TABLE)
import requests  # needed for the fetch below; absent from this snippet's imports

# Fetch a page, clean the decoded text, then hand it to the parser
raw = requests.get("https://example.com", timeout=15).text
soup = BeautifulSoup(clean_html(raw), "lxml")
When All Else Fails
If HTML is too broken for any parser, try extracting data with regex as a last resort, or use ScrapingAnt to get a properly rendered version of the page from a headless browser.
Next Steps
- Extract emails and phone numbers from parsed pages
- Parse dates and prices from scraped text
- Validate and normalize extracted data