Scraping Central is reader-supported. When you buy through links on our site, we may earn an affiliate commission.

Tutorial

How to Scrape GitHub Repository and Profile Data

Learn how to extract GitHub repository data, user profiles, stars, and commit history using the GitHub API and Python web scraping.

GitHub is a goldmine for developer tooling research, open source trend analysis, and technology landscape mapping. The GitHub API makes structured data extraction straightforward.

Method 1: GitHub REST API (Recommended)

The GitHub API is generous with rate limits (5,000 requests/hour with authentication) and returns clean JSON.

import requests

# Placeholder personal access token; substitute a real one before running.
GITHUB_TOKEN = "ghp_your_personal_access_token"

# Authorization header passed along with the API requests in this tutorial.
headers = dict(Authorization=f"token {GITHUB_TOKEN}")

def get_repo_info(owner, repo):
    """Fetch summary metadata for a single GitHub repository.

    Args:
        owner: Account or organization that owns the repository.
        repo: Repository name.

    Returns:
        A dict with the keys ``name`` (the ``owner/repo`` full name),
        ``stars``, ``forks``, ``language``, ``description``, ``created``
        and ``updated`` (taken from the API's ``pushed_at`` field).

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status, e.g.
            an unknown repository, a rejected token or an exhausted
            rate limit.
    """
    response = requests.get(
        f"https://api.github.com/repos/{owner}/{repo}",
        headers=headers,
        # requests never times out by default; don't let a stalled
        # connection hang the script forever.
        timeout=30,
    )
    # Error payloads carry none of the repository fields, so fail here with
    # the real HTTP error instead of a misleading KeyError further down.
    response.raise_for_status()
    data = response.json()
    return {
        "name": data["full_name"],
        "stars": data["stargazers_count"],
        "forks": data["forks_count"],
        "language": data["language"],
        "description": data["description"],
        "created": data["created_at"],
        "updated": data["pushed_at"],
    }

# Look up the Scrapy repository and print a one-line popularity summary.
repo = get_repo_info("scrapy", "scrapy")
summary = f"{repo['name']}: {repo['stars']} stars, {repo['forks']} forks"
print(summary)

Searching Repositories

def search_repos(query, sort="stars", limit=30):
    """Search GitHub repositories and return up to ``limit`` matches.

    Args:
        query: Search expression, passed through as the ``q`` parameter.
        sort: Field to sort by; results are requested in descending order.
        limit: Maximum number of repositories to return. Values above
            1,000 are treated as 1,000, the most the search API will
            serve for one query.

    Returns:
        A list of dicts with the keys ``name``, ``stars``, ``language``
        and ``description``, in the order GitHub returned them. The list
        is shorter than ``limit`` when there are fewer matches.

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status, e.g.
            an invalid query or an exhausted search rate limit.
    """
    # Only the first 1,000 matches are reachable; asking for a later page is
    # answered with an error, so never plan to read past that point.
    limit = min(limit, 1000)

    # The page size must stay fixed for the whole run: GitHub derives the
    # offset from (page, per_page), so shrinking per_page on a later request
    # would re-fetch rows that were already collected.
    per_page = min(100, limit)
    repos = []
    page = 1

    while len(repos) < limit:
        response = requests.get(
            "https://api.github.com/search/repositories",
            headers=headers,
            params={
                "q": query,
                "sort": sort,
                "order": "desc",
                "per_page": per_page,
                "page": page,
            },
            # requests never times out by default.
            timeout=30,
        )
        # Surface API errors instead of mistaking them for "no more results".
        response.raise_for_status()
        items = response.json().get("items", [])
        if not items:
            break

        # The final page may hold more rows than are still needed.
        remaining = limit - len(repos)
        repos.extend(
            {
                "name": item["full_name"],
                "stars": item["stargazers_count"],
                "language": item["language"],
                "description": item["description"],
            }
            for item in items[:remaining]
        )
        page += 1

    return repos

# List up to 20 repositories matching "web scraping", one per line.
scrapers = search_repos("web scraping", limit=20)
for r in scrapers:
    line = f"{r['name']} ({r['language']}): {r['stars']} stars"
    print(line)

Scraping User Profiles

def get_user_repos(username):
    """Return a user's repositories, most-starred first.

    Args:
        username: GitHub login whose repositories are listed.

    Returns:
        A list of dicts with the keys ``name``, ``stars`` and ``language``,
        ordered by star count in descending order.

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status, e.g.
            an unknown user or an exhausted rate limit.
    """
    repos = []
    page = 1

    while True:
        response = requests.get(
            f"https://api.github.com/users/{username}/repos",
            headers=headers,
            # No "sort" parameter: this endpoint does not document a
            # star-count ordering, so ordering is done locally below.
            params={"per_page": 100, "page": page},
            # requests never times out by default.
            timeout=30,
        )
        # An error payload is a non-empty dict, which would slip past the
        # emptiness check below and keep this loop running; fail fast.
        response.raise_for_status()
        data = response.json()
        if not data:
            break

        repos.extend(data)
        page += 1

    # Stable sort: repositories with equal stars keep the API's order.
    repos.sort(key=lambda r: r["stargazers_count"], reverse=True)

    return [{
        "name": r["name"],
        "stars": r["stargazers_count"],
        "language": r["language"]
    } for r in repos]

# Total up the stars across every repository returned for one account.
user_repos = get_user_repos("sindresorhus")
total_stars = 0
for entry in user_repos:
    total_stars += entry["stars"]
print(f"Total repos: {len(user_repos)}, Total stars: {total_stars}")

Extracting Commit History

def get_commits(owner, repo, since="2026-01-01"):
    """Collect a repository's commits, following pagination to the end.

    Args:
        owner: Account or organization that owns the repository.
        repo: Repository name.
        since: Lower bound forwarded as the API's ``since`` parameter.

    Returns:
        A list of dicts with the keys ``sha`` (first 7 characters),
        ``message`` (first line only), ``author`` (author name) and
        ``date`` (author date), in the order GitHub returned them.

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status, e.g.
            an unknown repository or an exhausted rate limit.
    """
    commits = []
    page = 1

    while True:
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}/commits",
            headers=headers,
            params={"since": since, "per_page": 100, "page": page},
            # requests never times out by default.
            timeout=30,
        )
        # An error payload is a dict; iterating it below would yield its
        # string keys and crash with an opaque TypeError, so fail here.
        response.raise_for_status()
        data = response.json()
        if not data:
            break

        for commit in data:
            details = commit["commit"]
            commits.append({
                "sha": commit["sha"][:7],
                # Keep only the subject line of multi-line messages.
                "message": details["message"].split("\n")[0],
                "author": details["author"]["name"],
                "date": details["author"]["date"],
            })
        page += 1

    return commits

Rate Limit Tips

  • Authenticated requests get 5,000/hour vs 60/hour unauthenticated
  • Use conditional requests: send a previously returned ETag in the If-None-Match header, and a 304 Not Modified response will not count against your rate limit
  • For bulk data, use GitHub's GraphQL API to fetch multiple resources in one request
  • GitHub's Archive Program provides bulk dataset downloads for research

The GitHub API is one of the best-documented and most generous APIs available. Always prefer the API over HTML scraping.