Converting Scraped Data to Different Formats (CSV, JSON, Excel, SQL) - Data Parsing

Export scraped data to CSV, JSON, Excel, SQLite, and other formats using Python. Learn best practices for each format and when to use them.

After scraping and cleaning data, you need to store it in the right format. Each format has trade-offs in readability, size, and downstream compatibility.

Format Comparison

Format	Best For	Human Readable	Supports Nesting	File Size
CSV	Spreadsheets, simple data	Yes	No	Small
JSON	APIs, nested data	Yes	Yes	Medium
Excel	Business stakeholders	Yes (with formatting)	Limited	Medium
SQLite	Querying, large datasets	No	Via relations	Compact
Parquet	Data science, big data	No	Yes	Smallest

Export to CSV

import csv

data = [
    {"name": "ScraperAPI", "price": 49.99, "rating": 4.5, "url": "https://scraperapi.com"},
    {"name": "ScrapingAnt", "price": 29.00, "rating": 4.2, "url": "https://scrapingant.com"},
    {"name": "Bright Data", "price": 99.00, "rating": 4.7, "url": "https://brightdata.com"},
]

# With the csv module
with open("products.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=data[0].keys())
    writer.writeheader()
    writer.writerows(data)

# With pandas (simpler for DataFrames)
import pandas as pd
df = pd.DataFrame(data)
df.to_csv("products_pandas.csv", index=False, encoding="utf-8")

Export to JSON

import json

# Pretty-printed JSON (human-readable)
with open("products.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

# JSON Lines format (one JSON object per line, better for streaming)
with open("products.jsonl", "w", encoding="utf-8") as f:
    for item in data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

# With pandas
df.to_json("products_pd.json", orient="records", indent=2, force_ascii=False)

Export to Excel

pip install openpyxl

import pandas as pd

df = pd.DataFrame(data)

# Basic Excel export
df.to_excel("products.xlsx", index=False, sheet_name="Products")

# Multiple sheets in one file
with pd.ExcelWriter("scraping_report.xlsx", engine="openpyxl") as writer:
    df.to_excel(writer, sheet_name="Products", index=False)

    # Summary sheet
    summary = pd.DataFrame([{
        "total_products": len(df),
        "avg_price": df["price"].mean(),
        "avg_rating": df["rating"].mean(),
    }])
    summary.to_excel(writer, sheet_name="Summary", index=False)

print("Excel report saved with 2 sheets")

Export to SQLite

import sqlite3
import pandas as pd

df = pd.DataFrame(data)

# pandas makes this trivial
conn = sqlite3.connect("scraping.db")
df.to_sql("products", conn, if_exists="replace", index=False)

# Verify the data
result = pd.read_sql("SELECT * FROM products WHERE price < 50", conn)
print(result)
conn.close()

For more control over the schema:

import sqlite3

conn = sqlite3.connect("scraping.db")
cursor = conn.cursor()

cursor.execute("""
    CREATE TABLE IF NOT EXISTS products (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT NOT NULL,
        price REAL,
        rating REAL,
        url TEXT UNIQUE,
        scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
""")

for item in data:
    cursor.execute(
        "INSERT OR REPLACE INTO products (name, price, rating, url) VALUES (?, ?, ?, ?)",
        (item["name"], item["price"], item["rating"], item["url"]),
    )

conn.commit()

# Query it
cursor.execute("SELECT name, price FROM products ORDER BY price ASC")
for row in cursor.fetchall():
    print(f"{row[0]}: ${row[1]}")

conn.close()

Export to Parquet (for Data Science)

pip install pyarrow

import pandas as pd

df = pd.DataFrame(data)
df.to_parquet("products.parquet", index=False)

# Read it back
df_loaded = pd.read_parquet("products.parquet")
print(f"Loaded {len(df_loaded)} rows from Parquet")

Universal Export Function

import pandas as pd
import json
from pathlib import Path

def export_data(data, filename, fmt="csv"):
    """Export scraped data to any supported format."""
    df = pd.DataFrame(data)
    path = Path(filename)

    exporters = {
        "csv": lambda: df.to_csv(path.with_suffix(".csv"), index=False),
        "json": lambda: path.with_suffix(".json").write_text(
            json.dumps(data, indent=2, ensure_ascii=False)
        ),
        "jsonl": lambda: path.with_suffix(".jsonl").write_text(
            "\n".join(json.dumps(r, ensure_ascii=False) for r in data)
        ),
        "excel": lambda: df.to_excel(path.with_suffix(".xlsx"), index=False),
        "parquet": lambda: df.to_parquet(path.with_suffix(".parquet"), index=False),
        "sql": lambda: _export_sql(df, path.stem),
    }

    if fmt not in exporters:
        raise ValueError(f"Unsupported format: {fmt}. Use: {list(exporters.keys())}")

    exporters[fmt]()
    print(f"Exported {len(data)} records as {fmt}")

def _export_sql(df, table_name):
    import sqlite3
    conn = sqlite3.connect(f"{table_name}.db")
    df.to_sql(table_name, conn, if_exists="replace", index=False)
    conn.close()

# Usage
export_data(data, "products", "csv")
export_data(data, "products", "json")
export_data(data, "products", "excel")
export_data(data, "products", "sql")

Choosing the Right Format

Scenario	Best Format
Share with non-technical users	Excel
Feed into another application	JSON or CSV
Store incrementally (append)	JSON Lines or SQLite
Large-scale data analysis	Parquet
Import into a database	CSV or direct SQL insert
Version control friendly	JSON Lines (one record per line)

Whether you are scraping with ScraperAPI or ScrapingAnt, these export methods work the same, get your data clean, then save it in whatever format your downstream workflow needs.

Summary

Start with CSV for simplicity, use JSON when you have nested data, Excel for business reports, SQLite when you need queries, and Parquet for large analytical datasets. The universal export function above covers all common cases.