Scraping Images and Files - Python Scraping

Download images, PDFs, and other files while web scraping. Learn URL resolution, streaming downloads, and file organization best practices.

Scraping is not limited to text data. You can download images, PDFs, documents, and other files from websites. The key is extracting the file URLs, resolving relative paths, and downloading efficiently.

Downloading a Single Image

import requests
from pathlib import Path

url = "https://books.toscrape.com/media/cache/2c/da/2cdad67c44b002e7c0cc7c2e94813c12.jpg"

# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops an HTML error page from being saved as a ".jpg".
response = requests.get(url, timeout=10)
response.raise_for_status()

download_dir = Path("downloads")
download_dir.mkdir(exist_ok=True)

# The last path segment of the URL doubles as the local file name.
filename = url.split("/")[-1]
filepath = download_dir / filename

# response.content is the whole body as bytes; fine for a single small image.
filepath.write_bytes(response.content)

print(f"Downloaded: {filepath} ({len(response.content)} bytes)")

Scraping All Images from a Page

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pathlib import Path

base_url = "https://books.toscrape.com/"
response = requests.get(base_url, timeout=10)
# Fail fast instead of parsing an error page that contains no real images.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

image_dir = Path("downloads/images")
image_dir.mkdir(parents=True, exist_ok=True)

images = soup.select("img")
print(f"Found {len(images)} images")

for img in images:
    src = img.get("src")
    # Skip tags without a source and inline "data:" images, which have
    # nothing to download.
    if not src or src.startswith("data:"):
        continue

    # Resolve relative URLs against the URL the page was actually served
    # from, which differs from base_url if the request was redirected.
    full_url = urljoin(response.url, src)

    # Drop any query string or fragment so it does not end up in the name.
    filename = full_url.split("/")[-1].split("?")[0].split("#")[0]
    if not filename:
        # A URL ending in "/" yields no file name to save under.
        continue

    img_response = requests.get(full_url, timeout=10)
    if img_response.status_code == 200:
        filepath = image_dir / filename
        filepath.write_bytes(img_response.content)
        print(f"Saved: {filename}")

Streaming Large File Downloads

For large files, use streaming to avoid loading the entire file into memory.

import requests
from pathlib import Path


def download_file(url, dest_folder="downloads", timeout=30):
    """Stream the file at *url* to disk without holding it all in memory.

    Args:
        url: Absolute URL of the file to download.
        dest_folder: Directory to save into; created (including any missing
            parent directories) if it does not exist.
        timeout: Seconds to wait for the server to connect/send data before
            giving up.

    Returns:
        pathlib.Path of the saved file.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # Function-scope import keeps this snippet self-contained.
    from urllib.parse import urlsplit

    dest_dir = Path(dest_folder)
    dest_dir.mkdir(parents=True, exist_ok=True)

    # Use only the URL *path* so query strings and fragments never leak into
    # the file name. Fall back to a fixed name when the path ends in "/";
    # otherwise open() would be pointed at the directory itself.
    filename = urlsplit(url).path.rsplit("/", 1)[-1] or "download"
    filepath = dest_dir / filename

    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()

        downloaded = 0
        with open(filepath, "wb") as f:
            # stream=True defers the body; iter_content yields it in
            # chunks of at most 8 KiB.
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
                downloaded += len(chunk)

    print(f"Downloaded: {filename} ({downloaded:,} bytes)")
    return filepath


# Download a PDF into the default "downloads" folder.
# The URL is a placeholder; replace it with a real file URL.
download_file("https://example.com/report.pdf")

Bulk Image Scraper with Filtering

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pathlib import Path
import time

# Lower-case file suffixes that count as downloadable images.
ALLOWED_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".webp"}


def scrape_images(url, output_dir="scraped_images", min_size=5000,
                  delay=0.5, timeout=10):
    """Scrape images from a URL, filtering by extension and size.

    Args:
        url: Page whose <img> tags are scanned.
        output_dir: Directory to save into; created (including any missing
            parent directories) if it does not exist.
        min_size: Images whose body is smaller than this many bytes are
            skipped (tiny icons, tracking pixels).
        delay: Seconds to pause before each image request.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        Number of images saved.

    Raises:
        requests.HTTPError: If the page itself answers with a 4xx/5xx status.
        requests.RequestException: If the page itself cannot be fetched.
    """
    # Function-scope import keeps this snippet self-contained.
    from urllib.parse import urlsplit

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    downloaded = 0
    seen = set()
    for img in soup.select("img[src]"):
        # Resolve against the final page URL in case the request redirected.
        src = urljoin(response.url, img["src"])

        # The same image is often referenced several times on one page.
        if src in seen:
            continue
        seen.add(src)

        # Filter by extension, read from the URL path only so a query
        # string or fragment cannot hide or corrupt the suffix.
        ext = Path(urlsplit(src).path).suffix.lower()
        if ext not in ALLOWED_EXTENSIONS:
            continue

        # Pause before every image request, not only after a saved file,
        # so skipped or failed images are rate-limited as well.
        time.sleep(delay)

        try:
            img_resp = requests.get(src, timeout=timeout)
        except requests.RequestException as exc:
            # One unreachable image should not abort the whole run.
            print(f"Skipped {src}: {exc}")
            continue
        if img_resp.status_code != 200:
            continue

        # Filter by file size (skip tiny icons)
        if len(img_resp.content) < min_size:
            continue

        # Sequential names avoid collisions between identically named files.
        filename = f"img_{downloaded:04d}{ext}"
        (out_dir / filename).write_bytes(img_resp.content)
        downloaded += 1

    print(f"Downloaded {downloaded} images to {output_dir}/")
    return downloaded


# Run the scraper with its defaults: images are saved to "scraped_images"
# and anything smaller than 5000 bytes is skipped.
scrape_images("https://books.toscrape.com/")

Extracting File Links (PDFs, CSVs, etc.)

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

response = requests.get("https://example.com/documents", timeout=10)
# Fail fast rather than scanning an error page for links.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Find all PDF links. The trailing "i" flag makes the suffix match
# case-insensitive, so "report.PDF" is found as well. Relative hrefs are
# resolved against the final page URL (response.url), after any redirects.
pdf_links = [
    {
        "text": a.get_text(strip=True),
        "url": urljoin(response.url, a["href"]),
    }
    for a in soup.select('a[href$=".pdf" i]')
]

for pdf in pdf_links:
    print(f"{pdf['text']}: {pdf['url']}")

Tips

Always use urljoin() to resolve relative URLs; never build them by concatenating strings manually.
Set a minimum file size filter to skip tiny icons and tracking pixels.
Use streaming downloads (stream=True) for files larger than a few MB.
Add delays between downloads to avoid overwhelming the server.
When downloading from sites with anti-bot protection, route requests through ScraperAPI or ScrapingAnt to avoid blocks.

Next Steps

Build a complete price monitoring scraper that tracks product images
Learn to scrape multiple pages concurrently for faster downloads