Converting Scraped Data to Different Formats (CSV, JSON, Excel, SQL)
Export scraped data to CSV, JSON, Excel, SQLite, and other formats using Python. Learn best practices for each format and when to use them.
After scraping and cleaning data, you need to store it in the right format. Each format has trade-offs in readability, size, and downstream compatibility.
Format Comparison
| Format | Best For | Human Readable | Supports Nesting | File Size |
|---|---|---|---|---|
| CSV | Spreadsheets, simple data | Yes | No | Small |
| JSON | APIs, nested data | Yes | Yes | Medium |
| Excel | Business stakeholders | Yes (with formatting) | Limited | Medium |
| SQLite | Querying, large datasets | No | Via relations | Compact |
| Parquet | Data science, big data | No | Yes | Smallest |
Export to CSV
import csv
data = [
{"name": "ScraperAPI", "price": 49.99, "rating": 4.5, "url": "https://scraperapi.com"},
{"name": "ScrapingAnt", "price": 29.00, "rating": 4.2, "url": "https://scrapingant.com"},
{"name": "Bright Data", "price": 99.00, "rating": 4.7, "url": "https://brightdata.com"},
]
# With the csv module
with open("products.csv", "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=data[0].keys())
writer.writeheader()
writer.writerows(data)
# With pandas (simpler for DataFrames)
import pandas as pd
df = pd.DataFrame(data)
df.to_csv("products_pandas.csv", index=False, encoding="utf-8")
Export to JSON
import json
# Pretty-printed JSON (human-readable)
with open("products.json", "w", encoding="utf-8") as f:
json.dump(data, f, indent=2, ensure_ascii=False)
# JSON Lines format (one JSON object per line, better for streaming)
with open("products.jsonl", "w", encoding="utf-8") as f:
for item in data:
f.write(json.dumps(item, ensure_ascii=False) + "\n")
# With pandas
df.to_json("products_pd.json", orient="records", indent=2, force_ascii=False)
Export to Excel
pip install openpyxl
import pandas as pd
df = pd.DataFrame(data)
# Basic Excel export
df.to_excel("products.xlsx", index=False, sheet_name="Products")
# Multiple sheets in one file
with pd.ExcelWriter("scraping_report.xlsx", engine="openpyxl") as writer:
df.to_excel(writer, sheet_name="Products", index=False)
# Summary sheet
summary = pd.DataFrame([{
"total_products": len(df),
"avg_price": df["price"].mean(),
"avg_rating": df["rating"].mean(),
}])
summary.to_excel(writer, sheet_name="Summary", index=False)
print("Excel report saved with 2 sheets")
Export to SQLite
import sqlite3
import pandas as pd
df = pd.DataFrame(data)
# pandas makes this trivial
conn = sqlite3.connect("scraping.db")
df.to_sql("products", conn, if_exists="replace", index=False)
# Verify the data
result = pd.read_sql("SELECT * FROM products WHERE price < 50", conn)
print(result)
conn.close()
For more control over the schema:
import sqlite3
conn = sqlite3.connect("scraping.db")
cursor = conn.cursor()
cursor.execute("""
CREATE TABLE IF NOT EXISTS products (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
price REAL,
rating REAL,
url TEXT UNIQUE,
scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
for item in data:
cursor.execute(
"INSERT OR REPLACE INTO products (name, price, rating, url) VALUES (?, ?, ?, ?)",
(item["name"], item["price"], item["rating"], item["url"]),
)
conn.commit()
# Query it
cursor.execute("SELECT name, price FROM products ORDER BY price ASC")
for row in cursor.fetchall():
print(f"{row[0]}: ${row[1]}")
conn.close()
Export to Parquet (for Data Science)
pip install pyarrow
import pandas as pd
df = pd.DataFrame(data)
df.to_parquet("products.parquet", index=False)
# Read it back
df_loaded = pd.read_parquet("products.parquet")
print(f"Loaded {len(df_loaded)} rows from Parquet")
Universal Export Function
import pandas as pd
import json
from pathlib import Path
def export_data(data, filename, fmt="csv"):
"""Export scraped data to any supported format."""
df = pd.DataFrame(data)
path = Path(filename)
exporters = {
"csv": lambda: df.to_csv(path.with_suffix(".csv"), index=False),
"json": lambda: path.with_suffix(".json").write_text(
json.dumps(data, indent=2, ensure_ascii=False)
),
"jsonl": lambda: path.with_suffix(".jsonl").write_text(
"\n".join(json.dumps(r, ensure_ascii=False) for r in data)
),
"excel": lambda: df.to_excel(path.with_suffix(".xlsx"), index=False),
"parquet": lambda: df.to_parquet(path.with_suffix(".parquet"), index=False),
"sql": lambda: _export_sql(df, path.stem),
}
if fmt not in exporters:
raise ValueError(f"Unsupported format: {fmt}. Use: {list(exporters.keys())}")
exporters[fmt]()
print(f"Exported {len(data)} records as {fmt}")
def _export_sql(df, table_name):
import sqlite3
conn = sqlite3.connect(f"{table_name}.db")
df.to_sql(table_name, conn, if_exists="replace", index=False)
conn.close()
# Usage
export_data(data, "products", "csv")
export_data(data, "products", "json")
export_data(data, "products", "excel")
export_data(data, "products", "sql")
Choosing the Right Format
| Scenario | Best Format |
|---|---|
| Share with non-technical users | Excel |
| Feed into another application | JSON or CSV |
| Store incrementally (append) | JSON Lines or SQLite |
| Large-scale data analysis | Parquet |
| Import into a database | CSV or direct SQL insert |
| Version control friendly | JSON Lines (one record per line) |
Whether you are scraping with ScraperAPI or ScrapingAnt, these export methods work the same, get your data clean, then save it in whatever format your downstream workflow needs.
Summary
Start with CSV for simplicity, use JSON when you have nested data, Excel for business reports, SQLite when you need queries, and Parquet for large analytical datasets. The universal export function above covers all common cases.