Tutorial
How to Scrape GitHub Repository and Profile Data
Learn how to extract GitHub repository data, user profiles, stars, and commit history using the GitHub API and Python web scraping.
GitHub is a goldmine for developer tooling research, open source trend analysis, and technology landscape mapping. The GitHub API makes structured data extraction straightforward.
Method 1: GitHub REST API (Recommended)
The GitHub API is generous with rate limits (5,000 requests/hour with authentication) and returns clean JSON.
import os

import requests
# Read the personal access token from the environment so a real secret never
# has to be committed to source; the placeholder is kept as a fallback so the
# snippet still shows the expected token shape.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") or "ghp_your_personal_access_token"

# Shared request headers used by every API call in this tutorial.
headers = {"Authorization": f"token {GITHUB_TOKEN}"}
def get_repo_info(owner, repo):
    """Fetch summary metadata for a single GitHub repository.

    Args:
        owner: Account (user or organization) that owns the repository.
        repo: Repository name.

    Returns:
        A dict with the keys ``name``, ``stars``, ``forks``, ``language``,
        ``description``, ``created`` and ``updated``. ``updated`` holds the
        API's ``pushed_at`` field.

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status
            (unknown repository, bad token, rate limit exceeded, ...).
    """
    response = requests.get(
        f"https://api.github.com/repos/{owner}/{repo}",
        headers=headers,
        timeout=30,  # requests never times out by default
    )
    # Error bodies are JSON objects without the repository fields; fail here
    # with the real HTTP status instead of a confusing KeyError below.
    response.raise_for_status()
    data = response.json()
    return {
        "name": data["full_name"],
        "stars": data["stargazers_count"],
        "forks": data["forks_count"],
        "language": data["language"],
        "description": data["description"],
        "created": data["created_at"],
        "updated": data["pushed_at"],
    }
# Example: look up the scrapy/scrapy repository and print a one-line summary
# of its full name, star count and fork count.
repo = get_repo_info("scrapy", "scrapy")
print(f"{repo['name']}: {repo['stars']} stars, {repo['forks']} forks")
Searching Repositories
def search_repos(query, sort="stars", limit=30):
    """Search GitHub repositories and return up to ``limit`` summaries.

    Args:
        query: Search query string, sent as the ``q`` parameter.
        sort: Field to sort by, sent as the ``sort`` parameter.
        limit: Maximum number of repositories to return.

    Returns:
        A list of dicts with the keys ``name``, ``stars``, ``language`` and
        ``description``, in the (descending) order returned by the API.

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status.
    """
    # The search API serves at most 1,000 results per query; asking for more
    # pages only produces an error response, so cap the target up front.
    target = min(limit, 1000)

    # The page size must stay constant across requests: the server computes
    # the offset as (page - 1) * per_page, so shrinking per_page on later
    # pages would re-fetch or skip results.
    per_page = min(100, target)

    repos = []
    page = 1
    while len(repos) < target:
        response = requests.get(
            "https://api.github.com/search/repositories",
            headers=headers,
            params={
                "q": query,
                "sort": sort,
                "order": "desc",
                "per_page": per_page,
                "page": page,
            },
            timeout=30,
        )
        # Surface rate-limit/validation errors instead of silently treating
        # them as "no more results".
        response.raise_for_status()
        items = response.json().get("items", [])
        if not items:
            break

        # Trim the final page so the result never exceeds the requested limit.
        remaining = target - len(repos)
        repos.extend(
            {
                "name": item["full_name"],
                "stars": item["stargazers_count"],
                "language": item["language"],
                "description": item["description"],
            }
            for item in items[:remaining]
        )

        if len(items) < per_page:
            # A short page means the result set is exhausted.
            break
        page += 1
    return repos
# Find top web scraping libraries and print one line per repository
# (name, primary language, star count).
scrapers = search_repos("web scraping", limit=20)
for r in scrapers:
    print(f"{r['name']} ({r['language']}): {r['stars']} stars")
Scraping User Profiles
def get_user_repos(username):
    """List all public repositories of a user, most-starred first.

    Args:
        username: GitHub login whose repositories are listed.

    Returns:
        A list of dicts with the keys ``name``, ``stars`` and ``language``,
        sorted by star count in descending order.

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status.
    """
    per_page = 100
    repos = []
    page = 1
    while True:
        response = requests.get(
            f"https://api.github.com/users/{username}/repos",
            headers=headers,
            params={"per_page": per_page, "page": page},
            timeout=30,
        )
        # An error body is a non-empty JSON object; without this check the
        # loop would extend the list with its keys and never terminate.
        response.raise_for_status()
        data = response.json()
        if not data:
            break
        repos.extend(data)
        if len(data) < per_page:
            # A short page is the last page; skip the extra empty request.
            break
        page += 1

    summaries = [
        {
            "name": r["name"],
            "stars": r["stargazers_count"],
            "language": r["language"],
        }
        for r in repos
    ]
    # This endpoint documents only created/updated/pushed/full_name as sort
    # keys, so ordering by stars has to happen client-side.
    summaries.sort(key=lambda r: r["stars"], reverse=True)
    return summaries
# Example: fetch every repository of one user and report the repository
# count together with the sum of their star counts.
user_repos = get_user_repos("sindresorhus")
total_stars = sum(r["stars"] for r in user_repos)
print(f"Total repos: {len(user_repos)}, Total stars: {total_stars}")
Extracting Commit History
def get_commits(owner, repo, since="2026-01-01"):
    """Collect the commit history of a repository from a given date onward.

    Args:
        owner: Account (user or organization) that owns the repository.
        repo: Repository name.
        since: Lower bound for commits, passed through unchanged as the
            API's ``since`` query parameter.

    Returns:
        A list of dicts with the keys ``sha`` (first 7 characters),
        ``message`` (first line only), ``author`` and ``date``, in the order
        returned by the API.

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status.
    """
    per_page = 100
    commits = []
    page = 1
    while True:
        response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}/commits",
            headers=headers,
            params={"since": since, "per_page": per_page, "page": page},
            timeout=30,
        )
        # An error body is a JSON object, not a list of commits; iterating it
        # would fail with an unrelated TypeError, so report the HTTP error.
        response.raise_for_status()
        data = response.json()
        if not data:
            break

        for commit in data:
            details = commit["commit"]
            commits.append({
                "sha": commit["sha"][:7],
                # Keep only the subject line of multi-line commit messages.
                "message": details["message"].split("\n")[0],
                "author": details["author"]["name"],
                "date": details["author"]["date"],
            })

        if len(data) < per_page:
            # A short page is the last page; skip the extra empty request.
            break
        page += 1
    return commits
Rate Limit Tips
- Authenticated requests get 5,000/hour vs 60/hour unauthenticated
- Use conditional requests with `If-None-Match` headers to avoid counting cached responses
- For bulk data, use GitHub's GraphQL API to fetch multiple resources in one request
- GitHub's Archive Program provides bulk dataset downloads for research
The GitHub API is one of the best-documented and most generous APIs available. Always prefer the API over HTML scraping.