Scraping Images and Files
Download images, PDFs, and other files while web scraping. Learn URL resolution, streaming downloads, and file organization best practices.
Python Scraping · #19 · Intermediate · 3 min read
Scraping is not limited to text data. You can download images, PDFs, documents, and other files from websites. The key is extracting the file URLs, resolving relative paths, and downloading efficiently.
Downloading a Single Image
import requests
from pathlib import Path
# Download one image and store it under ./downloads using the URL's last
# path segment as the file name.
url = "https://books.toscrape.com/media/cache/2c/da/2cdad67c44b002e7c0cc7c2e94813c12.jpg"

# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx instead of saving an HTML error page as a ".jpg".
response.raise_for_status()

download_dir = Path("downloads")
download_dir.mkdir(exist_ok=True)

filename = url.split("/")[-1]
filepath = download_dir / filename
with open(filepath, "wb") as f:
    f.write(response.content)

print(f"Downloaded: {filepath} ({len(response.content)} bytes)")
Scraping All Images from a Page
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pathlib import Path
# Fetch a page, find every <img> tag, and save each image into
# downloads/images under the file name taken from its URL.
base_url = "https://books.toscrape.com/"

# A timeout prevents an unresponsive server from blocking the script forever.
response = requests.get(base_url, timeout=10)
# Stop early on 4xx/5xx rather than parsing an error page for images.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

image_dir = Path("downloads/images")
image_dir.mkdir(parents=True, exist_ok=True)

images = soup.select("img")
print(f"Found {len(images)} images")

for img in images:
    src = img.get("src")
    if not src:
        continue

    # Resolve relative URLs to absolute. response.url is the final URL after
    # any redirects, which is what relative paths in the page refer to.
    full_url = urljoin(response.url, src)

    # Inline "data:" images and other non-HTTP schemes cannot be fetched
    # with requests.get().
    if not full_url.startswith(("http://", "https://")):
        continue

    filename = full_url.split("/")[-1].split("?")[0]
    if not filename:
        # The URL ends with "/", so there is no usable file name.
        continue

    img_response = requests.get(full_url, timeout=10)
    if img_response.status_code == 200:
        filepath = image_dir / filename
        with open(filepath, "wb") as f:
            f.write(img_response.content)
        print(f"Saved: {filename}")
Streaming Large File Downloads
For large files, use streaming to avoid loading the entire file into memory.
import requests
from pathlib import Path
def download_file(url, dest_folder="downloads", timeout=30):
    """Stream the file at *url* to disk and return the path it was saved to.

    The response body is written in 8 KiB chunks so the whole file is never
    held in memory at once.

    Args:
        url: Absolute URL of the file to download.
        dest_folder: Directory to save into; created (including missing
            parent directories) if it does not exist.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        ``pathlib.Path`` of the saved file.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Local import: the surrounding file does not import urlsplit.
    from urllib.parse import urlsplit

    dest = Path(dest_folder)
    dest.mkdir(parents=True, exist_ok=True)

    # Use only the URL's path component so query strings and fragments never
    # end up in the file name.
    filename = Path(urlsplit(url).path).name
    if filename in ("", ".."):
        # No usable last segment (e.g. "https://host/"): fall back to a
        # fixed name instead of trying to open the directory itself.
        filename = "download"
    filepath = dest / filename

    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        downloaded = 0
        with open(filepath, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
                downloaded += len(chunk)

    print(f"Downloaded: {filename} ({downloaded:,} bytes)")
    return filepath
# Example: stream a PDF into the default "downloads" folder.
report_url = "https://example.com/report.pdf"
download_file(report_url)
Bulk Image Scraper with Filtering
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pathlib import Path
import time
# File extensions (lower-case, with leading dot) that count as images.
ALLOWED_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".webp"}


def scrape_images(
    url,
    output_dir="scraped_images",
    min_size=5000,
    timeout=10,
    delay=0.5,
):
    """Scrape images from a URL, filtering by extension and size.

    Args:
        url: Page whose ``<img src=...>`` tags are scanned.
        output_dir: Directory to save into; created (including missing
            parent directories) if it does not exist.
        min_size: Images smaller than this many bytes are skipped
            (filters out tiny icons).
        timeout: Seconds passed to each ``requests.get`` call as ``timeout``.
        delay: Seconds to pause after each image request.

    Returns:
        Number of images saved. Files are named ``img_0000<ext>``,
        ``img_0001<ext>``, ... in the order they were saved.

    Raises:
        requests.HTTPError: If the page itself answers with a 4xx/5xx status.
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    response = requests.get(url, timeout=timeout)
    # Do not silently parse an error page and report "0 images".
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    downloaded = 0
    for img in soup.select("img[src]"):
        # response.url is the final URL after redirects, which is what
        # relative src values in the page refer to.
        src = urljoin(response.url, img["src"])

        # Filter by extension, ignoring any query string or fragment
        ext = Path(src.split("?")[0].split("#")[0]).suffix.lower()
        if ext not in ALLOWED_EXTENSIONS:
            continue

        img_resp = requests.get(src, timeout=timeout)
        # Pause after every request, including ones rejected below, so the
        # server is never hit in a tight loop.
        time.sleep(delay)

        if img_resp.status_code != 200:
            continue

        # Filter by file size (skip tiny icons)
        if len(img_resp.content) < min_size:
            continue

        filename = f"img_{downloaded:04d}{ext}"
        with open(out_dir / filename, "wb") as f:
            f.write(img_resp.content)
        downloaded += 1

    print(f"Downloaded {downloaded} images to {output_dir}/")
    return downloaded
# Example: scrape one page using the default output folder and size filter.
target_url = "https://books.toscrape.com/"
scrape_images(target_url)
Extracting File Links (PDFs, CSVs, etc.)
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Collect the link text and absolute URL of every PDF linked from a page.
response = requests.get("https://example.com/documents", timeout=10)
# Stop on 4xx/5xx instead of scanning an error page for links.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Find all PDF links
pdf_links = []
for a in soup.select("a[href]"):
    full_url = urljoin(response.url, a["href"])

    # Test only the path part, case-insensitively, so links such as
    # "Report.PDF" or "file.pdf?download=1" are not missed.
    path_part = full_url.split("#")[0].split("?")[0]
    if not path_part.lower().endswith(".pdf"):
        continue

    pdf_links.append({
        "text": a.get_text(strip=True),
        "url": full_url,
    })

for pdf in pdf_links:
    print(f"{pdf['text']}: {pdf['url']}")
Tips
- Always use `urljoin()` to resolve relative URLs; never concatenate strings manually.
- Set a minimum file size filter to skip tiny icons and tracking pixels.
- Use streaming downloads (`stream=True`) for files larger than a few MB.
- Add delays between downloads to avoid overwhelming the server.
- When downloading from sites with anti-bot protection, route requests through ScraperAPI or ScrapingAnt to avoid blocks.
Next Steps
- Build a complete price monitoring scraper that tracks product images
- Learn to scrape multiple pages concurrently for faster downloads