Scraping Central is reader-supported. When you buy through links on our site, we may earn an affiliate commission.

Storing Scraper Output in Cloud Storage (S3, GCS)

Learn how to store your web scraper output in AWS S3 and Google Cloud Storage for reliable, scalable data storage.

Deployment · #11 · Beginner · 3 min read
Share: WhatsApp · LinkedIn

Local files are fine for small projects, but cloud storage gives you durability, scalability, and easy access from any service. AWS S3 and Google Cloud Storage are the most popular options.

AWS S3

Setup

# Install boto3, the library the S3 example below imports
pip install boto3
aws configure  # Set your AWS credentials

Saving Scraped Data to S3

import boto3
import json
from datetime import datetime

class S3Storage:
    def __init__(self, bucket_name: str):
        self.s3 = boto3.client("s3")
        self.bucket = bucket_name

    def save_json(self, data: dict | list, prefix: str = "scrapes") -> str:
        """Save JSON data to S3 with a timestamped key."""
        timestamp = datetime.utcnow().strftime("%Y/%m/%d/%H%M%S")
        key = f"{prefix}/{timestamp}.json"

        self.s3.put_object(
            Bucket=self.bucket,
            Key=key,
            Body=json.dumps(data, indent=2),
            ContentType="application/json",
        )
        return f"s3://{self.bucket}/{key}"

    def save_html(self, html: str, url: str) -> str:
        """Save raw HTML for archival."""
        safe_name = url.replace("https://", "").replace("/", "_")[:100]
        timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
        key = f"html/{safe_name}_{timestamp}.html"

        self.s3.put_object(
            Bucket=self.bucket,
            Key=key,
            Body=html.encode("utf-8"),
            ContentType="text/html",
        )
        return key

    def list_recent(self, prefix: str = "scrapes", limit: int = 10) -> list[str]:
        """List recent files in a prefix."""
        response = self.s3.list_objects_v2(
            Bucket=self.bucket,
            Prefix=prefix,
            MaxKeys=limit,
        )
        return [obj["Key"] for obj in response.get("Contents", [])]

# Usage
scraped_data = [
    {"title": "Product A", "price": 29.99},
    {"title": "Product B", "price": 49.99},
]

# With prefix="products" the object key is "products/<YYYY>/<MM>/<DD>/<HHMMSS>.json".
storage = S3Storage("my-scraper-data")
path = storage.save_json(scraped_data, prefix="products")
print(f"Saved to {path}")

Google Cloud Storage

Setup

# Install the package that provides `google.cloud.storage`, imported by the example below
pip install google-cloud-storage
# Log in locally; presumably this is where `storage.Client()` finds its credentials (confirm)
gcloud auth application-default login

Saving Data to GCS

from google.cloud import storage
import json
from datetime import datetime

class GCSStorage:
    def __init__(self, bucket_name: str):
        self.client = storage.Client()
        self.bucket = self.client.bucket(bucket_name)

    def save_json(self, data: dict | list, prefix: str = "scrapes") -> str:
        """Save JSON data to GCS."""
        timestamp = datetime.utcnow().strftime("%Y/%m/%d/%H%M%S")
        blob_name = f"{prefix}/{timestamp}.json"

        blob = self.bucket.blob(blob_name)
        blob.upload_from_string(
            json.dumps(data, indent=2),
            content_type="application/json",
        )
        return f"gs://{self.bucket.name}/{blob_name}"

    def load_json(self, blob_name: str) -> dict:
        """Load JSON data from GCS."""
        blob = self.bucket.blob(blob_name)
        content = blob.download_as_text()
        return json.loads(content)

# Usage
# Sample records are defined here so this snippet runs on its own instead of
# relying on a variable created in the S3 example.
scraped_data = [
    {"title": "Product A", "price": 29.99},
    {"title": "Product B", "price": 49.99},
]

gcs = GCSStorage("my-scraper-data")
path = gcs.save_json({"products": scraped_data})
print(f"Saved to {path}")

Organizing Your Data

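Use a consistent directory structure. The layout below keeps everything under a single `scrapes/` root, so pass a nested prefix such as `prefix="scrapes/products"` when saving (the usage examples above write to `products/...` and `html/...` at the bucket root instead):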

scrapes/
├── products/
│   ├── 2025/
│   │   ├── 01/
│   │   │   ├── 15/
│   │   │   │   ├── 080000.json
│   │   │   │   └── 200000.json
│   │   │   └── 16/
│   │   │       └── 080000.json
├── html/
│   └── raw page backups
└── errors/
    └── failed scrape logs

This date-partitioned structure makes it easy to query specific time ranges and set up lifecycle policies.

Lifecycle Policies

Automatically manage old data to control costs:

# AWS S3: Move data to cheaper storage after 30 days, delete after 1 year
import boto3

s3 = boto3.client("s3")
# NOTE: this call replaces the bucket's entire lifecycle configuration, so
# include any existing rules you want to keep.
s3.put_bucket_lifecycle_configuration(
    Bucket="my-scraper-data",
    LifecycleConfiguration={
        "Rules": [{
            "ID": "archive-old-scrapes",
            "Status": "Enabled",
            # Rule-level "Prefix" is deprecated; scope the rule with "Filter".
            # Only keys that start with "scrapes/" are affected.
            "Filter": {"Prefix": "scrapes/"},
            "Transitions": [
                {"Days": 30, "StorageClass": "STANDARD_IA"},
                {"Days": 90, "StorageClass": "GLACIER"},
            ],
            "Expiration": {"Days": 365},
        }]
    },
)

Cost Comparison

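| Storage       | Price per GB/month | Best For        |
|---------------|--------------------|-----------------|
| S3 Standard   | $0.023             | Frequent access |
| S3 Infrequent | $0.0125            | Monthly access  |
| S3 Glacier    | $0.004             | Archival        |
| GCS Standard  | $0.020             | Frequent access |
| GCS Nearline  | $0.010             | Monthly access  |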

For most scrapers producing a few GB per month, cloud storage costs under $1/month.