Storing Scraper Output in Cloud Storage (S3, GCS)
Learn how to store your web scraper output in AWS S3 and Google Cloud Storage for reliable, scalable data storage.
Deployment · #11 · Beginner · 3 min read
Local files are fine for small projects, but cloud storage gives you durability, scalability, and easy access from any service. AWS S3 and Google Cloud Storage are the most popular options.
AWS S3
Setup
pip install boto3
aws configure # Set your AWS credentials
Saving Scraped Data to S3
import boto3
import json
from datetime import datetime
class S3Storage:
def __init__(self, bucket_name: str):
self.s3 = boto3.client("s3")
self.bucket = bucket_name
def save_json(self, data: dict | list, prefix: str = "scrapes") -> str:
"""Save JSON data to S3 with a timestamped key."""
timestamp = datetime.utcnow().strftime("%Y/%m/%d/%H%M%S")
key = f"{prefix}/{timestamp}.json"
self.s3.put_object(
Bucket=self.bucket,
Key=key,
Body=json.dumps(data, indent=2),
ContentType="application/json",
)
return f"s3://{self.bucket}/{key}"
def save_html(self, html: str, url: str) -> str:
"""Save raw HTML for archival."""
safe_name = url.replace("https://", "").replace("/", "_")[:100]
timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
key = f"html/{safe_name}_{timestamp}.html"
self.s3.put_object(
Bucket=self.bucket,
Key=key,
Body=html.encode("utf-8"),
ContentType="text/html",
)
return key
def list_recent(self, prefix: str = "scrapes", limit: int = 10) -> list[str]:
"""List recent files in a prefix."""
response = self.s3.list_objects_v2(
Bucket=self.bucket,
Prefix=prefix,
MaxKeys=limit,
)
return [obj["Key"] for obj in response.get("Contents", [])]
# Usage
storage = S3Storage("my-scraper-data")
scraped_data = [
    {"title": "Product A", "price": 29.99},
    {"title": "Product B", "price": 49.99},
]
# Keep product scrapes under the top-level "scrapes/" prefix so the keys
# follow the scrapes/products/YYYY/MM/DD/HHMMSS.json layout and are matched
# by lifecycle rules scoped to "scrapes/".
path = storage.save_json(scraped_data, prefix="scrapes/products")
print(f"Saved to {path}")
Google Cloud Storage
Setup
pip install google-cloud-storage
gcloud auth application-default login
Saving Data to GCS
from google.cloud import storage
import json
from datetime import datetime
class GCSStorage:
def __init__(self, bucket_name: str):
self.client = storage.Client()
self.bucket = self.client.bucket(bucket_name)
def save_json(self, data: dict | list, prefix: str = "scrapes") -> str:
"""Save JSON data to GCS."""
timestamp = datetime.utcnow().strftime("%Y/%m/%d/%H%M%S")
blob_name = f"{prefix}/{timestamp}.json"
blob = self.bucket.blob(blob_name)
blob.upload_from_string(
json.dumps(data, indent=2),
content_type="application/json",
)
return f"gs://{self.bucket.name}/{blob_name}"
def load_json(self, blob_name: str) -> dict:
"""Load JSON data from GCS."""
blob = self.bucket.blob(blob_name)
content = blob.download_as_text()
return json.loads(content)
# Usage
gcs = GCSStorage("my-scraper-data")
# Sample payload defined here so this snippet runs on its own.
scraped_data = [
    {"title": "Product A", "price": 29.99},
    {"title": "Product B", "price": 49.99},
]
path = gcs.save_json({"products": scraped_data})
print(f"Saved to {path}")
Organizing Your Data
Use a consistent directory structure:
scrapes/
├── products/
│   ├── 2025/
│   │   ├── 01/
│   │   │   ├── 15/
│   │   │   │   ├── 080000.json
│   │   │   │   └── 200000.json
│   │   │   └── 16/
│   │   │       └── 080000.json
├── html/
│   └── raw page backups
└── errors/
    └── failed scrape logs
This date-partitioned structure makes it easy to query specific time ranges and set up lifecycle policies.
Lifecycle Policies
Automatically manage old data to control costs:
# AWS S3: Move data to cheaper storage after 30 days, delete after 1 year
import boto3

s3 = boto3.client("s3")
s3.put_bucket_lifecycle_configuration(
    Bucket="my-scraper-data",
    LifecycleConfiguration={
        "Rules": [{
            "ID": "archive-old-scrapes",
            "Status": "Enabled",
            # The rule-level "Prefix" key is deprecated; the rule is scoped
            # to objects under "scrapes/" through "Filter" instead.
            "Filter": {"Prefix": "scrapes/"},
            "Transitions": [
                {"Days": 30, "StorageClass": "STANDARD_IA"},
                {"Days": 90, "StorageClass": "GLACIER"},
            ],
            "Expiration": {"Days": 365},
        }]
    },
)
Cost Comparison
| Storage | Price per GB/month | Best For |
|---|---|---|
| S3 Standard | $0.023 | Frequent access |
| S3 Standard-IA (Infrequent Access) | $0.0125 | Monthly access |
| S3 Glacier | $0.004 | Archival |
| GCS Standard | $0.020 | Frequent access |
| GCS Nearline | $0.010 | Monthly access |
For most scrapers producing a few GB per month, cloud storage costs under $1/month.