Handling Malformed HTML - Data Parsing

Learn techniques for parsing broken, incomplete, and malformed HTML that you commonly encounter when web scraping.

The web is full of broken HTML: unclosed tags, mismatched nesting, invalid attributes, and encoding errors. Your scraper needs to handle all of it gracefully.

Parser Comparison

Different parsers handle broken HTML differently:

from bs4 import BeautifulSoup

broken_html = """
<div class="product">
  <h2>ScraperAPI
  <p>Best proxy service</p>
  <span class=price>$49.99
</div>
<div class="product"
  <h2>ScrapingAnt</h2>
  <p>Headless browser API</p>
"""

# Parse the identical markup once per backend so the results can be compared
# side by side.
soup1 = BeautifulSoup(broken_html, features="html.parser")  # lenient, built-in
soup2 = BeautifulSoup(broken_html, features="lxml")  # fast, fixes structure aggressively
soup3 = BeautifulSoup(broken_html, features="html5lib")  # most lenient, matches browser behavior

# Report how many product <div>s each backend recovered from the broken input.
soups_by_parser = {"html.parser": soup1, "lxml": soup2, "html5lib": soup3}
for name, soup in soups_by_parser.items():
    products = soup.find_all("div", class_="product")
    print(f"{name}: found {len(products)} products")

pip install lxml html5lib

Parser	Speed	Leniency	Dependencies
`html.parser`	Medium	Medium	Built-in
`lxml`	Fast	Good	C library
`html5lib`	Slow	Best	Pure Python

Fixing Common HTML Problems

Unclosed Tags

from bs4 import BeautifulSoup

# Three paragraphs, none of which has a closing </p> tag.
html = "<p>First paragraph<p>Second paragraph<p>Third"

# lxml auto-closes tags
soup = BeautifulSoup(html, "lxml")
paragraphs = soup.find_all("p")
for p in paragraphs:
    # strip=True trims surrounding whitespace from the extracted text.
    print(f"Content: '{p.get_text(strip=True)}'")

Missing Quotes on Attributes

from bs4 import BeautifulSoup

# None of the class= / data-id= attribute values below are quoted.
html = '<div class=product data-id=123><span class=name>Test</span></div>'

# Parsers handle unquoted attributes fine
soup = BeautifulSoup(html, "lxml")
product = soup.find("div", class_="product")
# Attribute values are read with dict-style indexing on the tag.
print(f"ID: {product['data-id']}")  # 123

Encoding Issues

import requests
from bs4 import BeautifulSoup

response = requests.get("https://example.com", timeout=15)

# Let BeautifulSoup detect encoding: given the raw bytes it inspects the
# document itself. The HTTP charset is forwarded as a hint only when the
# server actually declared one -- for text/* responses without a charset,
# requests reports ISO-8859-1, and passing that guess as from_encoding would
# take priority over BeautifulSoup's own detection.
content_type = response.headers.get("Content-Type", "")
header_encoding = response.encoding if "charset" in content_type.lower() else None
soup = BeautifulSoup(response.content, "lxml", from_encoding=header_encoding)

# Or force UTF-8 for stubborn pages; undecodable bytes become U+FFFD instead
# of raising.
soup = BeautifulSoup(
    response.content.decode("utf-8", errors="replace"),
    "lxml"
)

Defensive Parsing Strategy

from bs4 import BeautifulSoup
import logging

logger = logging.getLogger(__name__)

def safe_parse(html_content, selectors):
    """Parse HTML with fallback strategies.

    Each parser is tried in turn and the first soup in which the
    ``selectors["test"]`` CSS selector matches an element is returned.

    Args:
        html_content: Markup to parse; passed unchanged to ``BeautifulSoup``.
        selectors: Mapping that must contain a ``"test"`` key holding a CSS
            selector expected to match at least one element.

    Returns:
        The ``BeautifulSoup`` object built by the first parser whose tree
        contains a match for the test selector.

    Raises:
        ValueError: If ``selectors`` has no usable ``"test"`` entry, or if no
            parser produces a tree matching it.
    """
    # Resolve the selector once, up front. Doing the lookup inside the loop
    # would let the broad handler below swallow a missing key and report it
    # as three unrelated "parser failed" warnings.
    try:
        test_selector = selectors["test"]
    except (KeyError, TypeError):
        raise ValueError('selectors must include a "test" CSS selector') from None

    # Try parsers in order of speed
    for parser in ("lxml", "html.parser", "html5lib"):
        try:
            soup = BeautifulSoup(html_content, parser)
            # Verify we can find expected elements
            if soup.select_one(test_selector) is not None:
                return soup
            logger.warning("%s: test selector found nothing", parser)
        except Exception as e:
            # Deliberately broad: any failure of one parser (including it not
            # being installed) should fall through to the next candidate.
            logger.warning("%s failed: %s", parser, e)

    raise ValueError("All parsers failed to extract expected elements")

# Usage
# NOTE: html_content is not defined in this snippet; it is assumed to hold
# markup obtained earlier. "div.product" is the selector each parse is
# checked against.
soup = safe_parse(html_content, {"test": "div.product"})

Cleaning HTML Before Parsing

import re
from bs4 import BeautifulSoup

# Compiled once at import time. IGNORECASE because HTML tag names are
# case-insensitive (<SCRIPT> is as valid as <script>); \b stops the pattern
# from matching longer tag names that merely start with "script"/"style";
# \s* tolerates whitespace before the closing ">" as in "</script >".
_SCRIPT_BLOCK_RE = re.compile(
    r"<script\b[^>]*>.*?</script\s*>", re.DOTALL | re.IGNORECASE
)
_STYLE_BLOCK_RE = re.compile(
    r"<style\b[^>]*>.*?</style\s*>", re.DOTALL | re.IGNORECASE
)

# Common encoding artifacts, applied in a single pass with str.translate.
_ARTIFACT_TABLE = str.maketrans({
    "\u00a0": " ",      # Non-breaking space
    "\u200b": "",       # Zero-width space
    "\ufffd": "",       # Replacement character
})


def clean_html(raw_html):
    """Pre-process HTML to fix common issues.

    Steps, in order: drop null bytes, remove complete ``<script>...</script>``
    and ``<style>...</style>`` blocks (any letter case), then normalise a few
    stray characters (non-breaking space becomes a regular space; zero-width
    space and U+FFFD are removed).

    A ``<script>`` or ``<style>`` block with no closing tag is left untouched,
    because the patterns require the closing tag.

    Args:
        raw_html: Markup as a ``str``.

    Returns:
        The cleaned markup as a ``str``.
    """
    # Null bytes go first so they cannot split a tag name and hide a block
    # from the patterns below.
    cleaned = raw_html.replace("\x00", "")

    # Remove embedded JavaScript and CSS that can confuse parsers
    cleaned = _SCRIPT_BLOCK_RE.sub("", cleaned)
    cleaned = _STYLE_BLOCK_RE.sub("", cleaned)

    return cleaned.translate(_ARTIFACT_TABLE)

# Fetch the page as text, scrub it with clean_html(), then parse the result.
# NOTE: requests must be imported for this line; the import lines directly
# above this snippet only bring in re and BeautifulSoup.
raw = requests.get("https://example.com", timeout=15).text
soup = BeautifulSoup(clean_html(raw), "lxml")

When All Else Fails

If HTML is too broken for any parser, try extracting data with regex as a last resort, or use ScrapingAnt to get a properly rendered version of the page from a headless browser.

Next Steps

Extract emails and phone numbers from parsed pages
Parse dates and prices from scraped text
Validate and normalize extracted data