Building a RAG Knowledge Base with Scraped Web Data

Learn how to build a Retrieval-Augmented Generation (RAG) system using scraped web data. Covers crawling, embedding, vector storage, and querying.

Retrieval-Augmented Generation (RAG) lets you build AI chatbots that answer questions using your own data. Web scraping is a common way to populate these knowledge bases. Here is how to build one end to end.

Architecture Overview

Web Scraping → Text Extraction → Chunking → Embedding → Vector DB → Query + LLM

Step 1: Scrape the Content

import requests
from bs4 import BeautifulSoup

# Placeholder: substitute your real ScraperAPI key. Prefer loading it from an
# environment variable or secrets store over committing it to source control.
API_KEY = "YOUR_SCRAPERAPI_KEY"

def scrape_page(url):
    """Fetch a page through ScraperAPI and return its cleaned text.

    Args:
        url: Address of the page to scrape.

    Returns:
        A dict with three keys: "url" (the address that was requested),
        "title" (the text of the page's <title>, or "" when there is none)
        and "text" (the remaining page text, one fragment per line).

    Raises:
        requests.HTTPError: If the API responds with a 4xx/5xx status.
        requests.Timeout: If no response arrives within the timeout.
    """
    response = requests.get(
        # HTTPS so the API key in the query string is not sent in clear text.
        "https://api.scraperapi.com",
        params={"api_key": API_KEY, "url": url},
        # Generous limit: proxied fetches can be slow, but never wait forever.
        timeout=70,
    )
    # Fail loudly rather than indexing an error page as knowledge.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Remove noise
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()

    # get_text() always yields a str; .string is None when <title> contains
    # nested markup, and a None title cannot be stored as metadata later on.
    title = soup.title.get_text(strip=True) if soup.title else ""

    return {
        "url": url,
        "title": title,
        "text": soup.get_text(separator="\n", strip=True)
    }

# Documentation pages that seed the knowledge base.
urls = [
    "https://docs.example.com/getting-started",
    "https://docs.example.com/api-reference",
    "https://docs.example.com/tutorials"
]

# Scrape each page in listed order; one result dict per URL.
documents = list(map(scrape_page, urls))

Step 2: Chunk the Text

def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words each chunk shares with the previous one;
            must satisfy 0 <= overlap < chunk_size.

    Returns:
        A list of chunk strings, each made of words joined by single spaces.
        The list is empty when text contains no words.

    Raises:
        ValueError: If chunk_size is not positive or overlap is outside
            the range [0, chunk_size).
    """
    # An overlap >= chunk_size would make the window step zero or negative,
    # which either crashes range() or silently yields no chunks at all.
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError(
            "chunk_size must be positive and overlap must satisfy "
            f"0 <= overlap < chunk_size (got chunk_size={chunk_size}, "
            f"overlap={overlap})"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would only
        # repeat words that are already covered by this chunk.
        if start + chunk_size >= len(words):
            break

    return chunks

# Flatten every document into chunk records, each carrying the URL and title
# of the page it came from.
all_chunks = [
    {"text": piece, "source": page["url"], "title": page["title"]}
    for page in documents
    for piece in chunk_text(page["text"])
]

print(f"Created {len(all_chunks)} chunks from {len(documents)} documents")

Step 3: Generate Embeddings and Store

import openai
import chromadb

# OpenAI client used for both embeddings and chat completions.
client = openai.OpenAI()
chroma_client = chromadb.PersistentClient(path="./chroma_db")
# The collection persists on disk, so a plain create_collection() would fail
# on every run after the first; get_or_create keeps the script re-runnable.
collection = chroma_client.get_or_create_collection("knowledge_base")

# Batch embed and store
batch_size = 100
for i in range(0, len(all_chunks), batch_size):
    batch = all_chunks[i:i + batch_size]
    texts = [c["text"] for c in batch]

    embeddings_response = client.embeddings.create(
        model="text-embedding-3-small",
        input=texts
    )

    # upsert instead of add: after a re-scrape the same ids are overwritten
    # with fresh content. NOTE: if a later scrape yields fewer chunks, ids
    # beyond the new count remain from the earlier run and must be deleted
    # separately.
    collection.upsert(
        # i + j is the chunk's position in all_chunks, so ids are unique
        # across batches.
        ids=[f"chunk_{i + j}" for j in range(len(batch))],
        embeddings=[e.embedding for e in embeddings_response.data],
        documents=texts,
        # Fall back to "" so a missing title never reaches the store as None.
        metadatas=[{"source": c["source"], "title": c["title"] or ""} for c in batch]
    )

print(f"Stored {len(all_chunks)} chunks in vector database")

Step 4: Query the Knowledge Base

def ask(question, n_results=5):
    """Answer a question using the most relevant stored chunks as context.

    Args:
        question: The user's question as plain text.
        n_results: How many chunks to retrieve from the collection and pass
            to the model as context.

    Returns:
        The model's answer as a string.
    """
    # Embed the question with the same model used for the stored chunks so
    # the vectors are comparable.
    q_embedding = client.embeddings.create(
        model="text-embedding-3-small",
        input=question
    ).data[0].embedding

    # Find relevant chunks
    results = collection.query(
        query_embeddings=[q_embedding],
        n_results=n_results
    )

    # Results are grouped per query embedding; only one was sent, so take [0].
    passages = results["documents"][0]
    metadatas = results["metadatas"][0]

    # Label every passage with its source URL: the system prompt asks the
    # model to cite sources, which it can only do if they are in the context.
    context = "\n\n".join(
        f"[Source: {(meta or {}).get('source', 'unknown')}]\n{passage}"
        for passage, meta in zip(passages, metadatas)
    )

    # Generate answer with LLM
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Answer based on the provided context. Cite sources."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"}
        ]
    )

    return response.choices[0].message.content

# Example query against the freshly built knowledge base.
answer = ask(
    "How do I authenticate with the API?"
)
print(answer)

Tips for Better RAG with Scraped Data

- Clean HTML thoroughly before chunking to avoid noise in embeddings.
- Include metadata (URL, title, date) for source citation.
- Use overlapping chunks so context is not lost at boundaries.
- Re-scrape periodically to keep the knowledge base current.
- Use ScraperAPI for protected sources that require proxy rotation or JavaScript rendering.