Extract and process data from GitHub repositories
Extracts READMEs, docs, code examples, issues, and releases from GitHub repositories for RAG ingestion. Use when you need to harvest repository content into a collection for context-aware coding or analysis.
/plugin marketplace add mindmorass/reflex
/plugin install reflex@mindmorass-reflex

This skill inherits all available tools. When active, it can use any tool Claude has access to.
Extract and ingest content from GitHub repositories into RAG.
GitHub repositories contain valuable documentation, code examples, and discussions. This skill covers extraction with the gh CLI and the GitHub API, chunking strategies for READMEs, code files, and issues, and a complete harvesting pipeline into RAG collections.
# GitHub CLI (recommended)
brew install gh # macOS
# or: https://cli.github.com/
# Python libraries
pip install PyGithub httpx
# Authenticate with GitHub CLI
gh auth login
# Or set token for API access
export GITHUB_TOKEN="ghp_..."
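Before a large harvest, it is worth confirming the token is picked up; a minimal check with PyGithub (unauthenticated access also works, but is limited to 60 requests/hour):

# Verify authentication and remaining API quota
import os
from github import Github

gh = Github(os.getenv("GITHUB_TOKEN"))
print(gh.get_rate_limit().core.remaining)  # authenticated tokens get 5000/hour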
Best for quick extraction and authenticated access.
#!/bin/bash
# Extract repo content using gh CLI
REPO="$1" # owner/repo format
# Clone with depth 1 for content only
gh repo clone "$REPO" -- --depth 1
# Get repo info
gh repo view "$REPO" --json name,description

# Fetch the README via the REST API (raw content)
gh api "repos/$REPO/readme" -H "Accept: application/vnd.github.raw"
# Get issues
gh issue list --repo "$REPO" --limit 100 --json title,body,comments
# Get discussions (if enabled): repository discussions are GraphQL-only
gh api graphql -f query='
  query($owner: String!, $name: String!) {
    repository(owner: $owner, name: $name) {
      discussions(first: 50) { nodes { title body url } }
    }
  }' -f owner="${REPO%/*}" -f name="${REPO#*/}"
# Get releases
gh release list --repo "$REPO" --limit 20
Better for programmatic access and complex queries.
#!/usr/bin/env python3
"""GitHub content extraction using PyGithub."""
from github import Github
from typing import Dict, List, Optional
import base64
import os
class GitHubExtractor:
"""Extract content from GitHub repositories."""
    def __init__(self, token: Optional[str] = None):
self.token = token or os.getenv("GITHUB_TOKEN")
self.github = Github(self.token) if self.token else Github()
def get_repo(self, repo_name: str):
"""Get repository object."""
return self.github.get_repo(repo_name)
def get_readme(self, repo_name: str) -> Dict:
"""Extract README content."""
repo = self.get_repo(repo_name)
try:
readme = repo.get_readme()
content = base64.b64decode(readme.content).decode('utf-8')
return {
"content": content,
"path": readme.path,
"size": readme.size,
"url": readme.html_url
}
except Exception as e:
return {"error": str(e)}
def get_docs(self, repo_name: str) -> List[Dict]:
"""Extract documentation files."""
repo = self.get_repo(repo_name)
docs = []
# Common doc locations
doc_paths = ['docs', 'doc', 'documentation', '.github']
for path in doc_paths:
try:
contents = repo.get_contents(path)
docs.extend(self._extract_dir(repo, contents))
except Exception:
continue
# Also get root markdown files
try:
root_contents = repo.get_contents("")
for item in root_contents:
if item.type == "file" and item.name.endswith('.md'):
content = base64.b64decode(item.content).decode('utf-8')
docs.append({
"path": item.path,
"content": content,
"url": item.html_url
})
except Exception:
pass
return docs
def _extract_dir(self, repo, contents) -> List[Dict]:
"""Recursively extract directory contents."""
docs = []
if not isinstance(contents, list):
contents = [contents]
for item in contents:
if item.type == "dir":
sub_contents = repo.get_contents(item.path)
docs.extend(self._extract_dir(repo, sub_contents))
elif item.type == "file":
if item.name.endswith(('.md', '.rst', '.txt')):
try:
content = base64.b64decode(item.content).decode('utf-8')
docs.append({
"path": item.path,
"content": content,
"url": item.html_url
})
except Exception:
pass
return docs
def get_code_examples(
self,
repo_name: str,
        patterns: Optional[List[str]] = None
) -> List[Dict]:
"""Extract code examples from repository."""
repo = self.get_repo(repo_name)
examples = []
if patterns is None:
patterns = ['examples', 'samples', 'demo', 'tutorials']
for pattern in patterns:
try:
contents = repo.get_contents(pattern)
examples.extend(self._extract_code(repo, contents))
except Exception:
continue
return examples
def _extract_code(self, repo, contents) -> List[Dict]:
"""Extract code files."""
code = []
code_extensions = ['.py', '.js', '.ts', '.go', '.rs', '.java', '.rb']
if not isinstance(contents, list):
contents = [contents]
for item in contents:
if item.type == "dir":
sub = repo.get_contents(item.path)
code.extend(self._extract_code(repo, sub))
elif item.type == "file":
if any(item.name.endswith(ext) for ext in code_extensions):
try:
content = base64.b64decode(item.content).decode('utf-8')
code.append({
"path": item.path,
"content": content,
"language": self._detect_language(item.name),
"url": item.html_url
})
except Exception:
pass
return code
def _detect_language(self, filename: str) -> str:
"""Detect programming language from filename."""
ext_map = {
'.py': 'python',
'.js': 'javascript',
'.ts': 'typescript',
'.go': 'go',
'.rs': 'rust',
'.java': 'java',
'.rb': 'ruby',
'.sh': 'bash',
}
for ext, lang in ext_map.items():
if filename.endswith(ext):
return lang
return 'unknown'
def get_issues(
self,
repo_name: str,
state: str = "all",
limit: int = 100
) -> List[Dict]:
"""Extract issues with comments."""
repo = self.get_repo(repo_name)
issues = []
        for issue in repo.get_issues(state=state)[:limit]:
            if issue.pull_request:  # get_issues also returns PRs; skip them
                continue
issue_data = {
"number": issue.number,
"title": issue.title,
"body": issue.body or "",
"state": issue.state,
"labels": [l.name for l in issue.labels],
"created_at": issue.created_at.isoformat(),
"url": issue.html_url,
"comments": []
}
# Get comments
for comment in issue.get_comments():
issue_data["comments"].append({
"body": comment.body,
"author": comment.user.login,
"created_at": comment.created_at.isoformat()
})
issues.append(issue_data)
return issues
    def get_discussions(self, repo_name: str, limit: int = 50) -> List[Dict]:
        """Extract discussions via the GraphQL API (REST does not expose them)."""
        import httpx
        if not self.token:
            return []  # GraphQL requires an authenticated token
        query = """
        query($owner: String!, $name: String!, $first: Int!) {
          repository(owner: $owner, name: $name) {
            discussions(first: $first) {
              nodes {
                title
                body
                url
                category { name }
                comments(first: 10) {
                  nodes { body }
                }
              }
            }
          }
        }
        """
        owner, name = repo_name.split('/')
        response = httpx.post(
            "https://api.github.com/graphql",
            json={"query": query,
                  "variables": {"owner": owner, "name": name, "first": limit}},
            headers={"Authorization": f"Bearer {self.token}"},
            timeout=30.0,
        )
        response.raise_for_status()
        # Guard against repos with discussions disabled (repository comes back null)
        data = response.json().get("data") or {}
        repo_data = data.get("repository") or {}
        discussions = repo_data.get("discussions") or {}
        return discussions.get("nodes") or []
def get_releases(self, repo_name: str, limit: int = 20) -> List[Dict]:
"""Extract release information."""
repo = self.get_repo(repo_name)
releases = []
for release in repo.get_releases()[:limit]:
releases.append({
"tag": release.tag_name,
"name": release.title,
"body": release.body or "",
"published_at": release.published_at.isoformat() if release.published_at else None,
"url": release.html_url,
"prerelease": release.prerelease
})
return releases
def get_repo_metadata(self, repo_name: str) -> Dict:
"""Get repository metadata."""
repo = self.get_repo(repo_name)
return {
"name": repo.name,
"full_name": repo.full_name,
"description": repo.description,
"topics": repo.get_topics(),
"language": repo.language,
"stars": repo.stargazers_count,
"forks": repo.forks_count,
"created_at": repo.created_at.isoformat(),
"updated_at": repo.updated_at.isoformat(),
"url": repo.html_url,
"homepage": repo.homepage
}
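A minimal usage sketch of the extractor (the repo name is illustrative):

# Pull metadata and README for a single repository
extractor = GitHubExtractor()
meta = extractor.get_repo_metadata("fastapi/fastapi")
readme = extractor.get_readme("fastapi/fastapi")
print(meta["stars"], readme.get("path"), len(readme.get("content", "")))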
def chunk_readme(content: str) -> List[Dict]:
"""Chunk README by sections."""
import re
sections = []
current_section = {"heading": "Overview", "content": "", "level": 1}
    for line in content.split('\n'):
heading_match = re.match(r'^(#{1,3})\s+(.+)$', line)
if heading_match:
# Save current section
if current_section["content"].strip():
sections.append(current_section)
level = len(heading_match.group(1))
heading = heading_match.group(2)
current_section = {"heading": heading, "content": "", "level": level}
else:
current_section["content"] += line + "
"
# Don't forget last section
if current_section["content"].strip():
sections.append(current_section)
return sections
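For example, on a tiny hypothetical README, chunk_readme yields one chunk per heading:

# Quick illustration with an inline sample
sample = "# MyLib\nA small library.\n## Install\npip install mylib\n"
for section in chunk_readme(sample):
    print(section["level"], section["heading"], section["content"].strip())
# 1 MyLib A small library.
# 2 Install pip install mylib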
def chunk_code_file(content: str, language: str) -> List[Dict]:
"""Chunk code file by functions/classes."""
import ast
if language != 'python':
# For non-Python, chunk by size
return [{"content": content, "type": "file"}]
try:
tree = ast.parse(content)
except SyntaxError:
return [{"content": content, "type": "file"}]
chunks = []
for node in ast.iter_child_nodes(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
source = ast.get_source_segment(content, node)
if source:
chunks.append({
"content": source,
"type": "function",
"name": node.name,
"docstring": ast.get_docstring(node)
})
elif isinstance(node, ast.ClassDef):
source = ast.get_source_segment(content, node)
if source:
chunks.append({
"content": source,
"type": "class",
"name": node.name,
"docstring": ast.get_docstring(node)
})
return chunks if chunks else [{"content": content, "type": "file"}]
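A short example of the Python path (the sample source is hypothetical):

# Top-level functions become individual chunks with their docstrings
src = 'def add(a, b):\n    """Add two numbers."""\n    return a + b\n'
for chunk in chunk_code_file(src, "python"):
    print(chunk["type"], chunk["name"], chunk["docstring"])
# function add Add two numbers.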
def chunk_issue(issue: Dict) -> List[Dict]:
"""Chunk issue with comments."""
chunks = []
# Issue body as main chunk
chunks.append({
"content": f"# {issue['title']}
{issue['body']}",
"type": "issue",
"issue_number": issue["number"]
})
# Significant comments as separate chunks
for i, comment in enumerate(issue.get("comments", [])):
if len(comment["body"]) > 200: # Only substantial comments
chunks.append({
"content": comment["body"],
"type": "comment",
"issue_number": issue["number"],
"comment_index": i
})
return chunks
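With a made-up issue, only the body and the one substantial comment survive chunking:

# Short comments (<= 200 chars) are dropped; the body always becomes a chunk
issue = {
    "number": 42,
    "title": "Crash on startup",
    "body": "Stack trace attached.",
    "comments": [{"body": "me too"}, {"body": "x" * 250}],
}
print([c["type"] for c in chunk_issue(issue)])
# ['issue', 'comment']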
#!/usr/bin/env python3
"""Complete GitHub harvesting pipeline."""
from datetime import datetime
from typing import Dict, List
import hashlib
async def harvest_github_repo(
repo_name: str,
collection: str,
include_readme: bool = True,
include_docs: bool = True,
include_examples: bool = True,
include_issues: bool = False,
include_releases: bool = True,
max_issues: int = 50
) -> Dict:
"""
Harvest a GitHub repository into RAG.
Args:
repo_name: Repository in owner/repo format
collection: Target RAG collection
include_*: What content to harvest
max_issues: Maximum issues to harvest
"""
extractor = GitHubExtractor()
# Get repo metadata
repo_meta = extractor.get_repo_metadata(repo_name)
base_metadata = {
"source_type": "github",
"repo": repo_name,
"repo_description": repo_meta.get("description"),
"repo_language": repo_meta.get("language"),
"repo_topics": repo_meta.get("topics", []),
"harvested_at": datetime.now().isoformat()
}
stats = {
"readme": 0,
"docs": 0,
"examples": 0,
"issues": 0,
"releases": 0
}
# Harvest README
if include_readme:
readme = extractor.get_readme(repo_name)
if "content" in readme:
sections = chunk_readme(readme["content"])
for i, section in enumerate(sections):
metadata = {
**base_metadata,
"content_type": "readme",
"section": section["heading"],
"section_level": section["level"],
"chunk_index": i,
"source_url": readme["url"]
}
await ingest(
content=section["content"],
collection=collection,
metadata=metadata,
doc_id=f"gh_{repo_name.replace('/', '_')}_readme_{i}"
)
stats["readme"] += 1
# Harvest docs
if include_docs:
docs = extractor.get_docs(repo_name)
for doc in docs:
metadata = {
**base_metadata,
"content_type": "documentation",
"file_path": doc["path"],
"source_url": doc["url"]
}
await ingest(
content=doc["content"],
collection=collection,
metadata=metadata,
doc_id=f"gh_{repo_name.replace('/', '_')}_doc_{hashlib.md5(doc['path'].encode()).hexdigest()[:8]}"
)
stats["docs"] += 1
# Harvest code examples
if include_examples:
examples = extractor.get_code_examples(repo_name)
for example in examples:
chunks = chunk_code_file(example["content"], example["language"])
for i, chunk in enumerate(chunks):
metadata = {
**base_metadata,
"content_type": "code_example",
"file_path": example["path"],
"language": example["language"],
"code_type": chunk.get("type", "file"),
"code_name": chunk.get("name", ""),
"source_url": example["url"]
}
await ingest(
content=chunk["content"],
collection=collection,
metadata=metadata,
doc_id=f"gh_{repo_name.replace('/', '_')}_code_{hashlib.md5(example['path'].encode()).hexdigest()[:8]}_{i}"
)
stats["examples"] += 1
# Harvest issues
if include_issues:
issues = extractor.get_issues(repo_name, limit=max_issues)
for issue in issues:
chunks = chunk_issue(issue)
for chunk in chunks:
metadata = {
**base_metadata,
"content_type": chunk["type"],
"issue_number": issue["number"],
"issue_title": issue["title"],
"issue_state": issue["state"],
"issue_labels": issue["labels"],
"source_url": issue["url"]
}
await ingest(
content=chunk["content"],
collection=collection,
metadata=metadata,
doc_id=f"gh_{repo_name.replace('/', '_')}_issue_{issue['number']}_{chunk.get('comment_index', 0)}"
)
stats["issues"] += 1
# Harvest releases
if include_releases:
releases = extractor.get_releases(repo_name)
for release in releases:
if release["body"]: # Only if has release notes
metadata = {
**base_metadata,
"content_type": "release",
"release_tag": release["tag"],
"release_name": release["name"],
"published_at": release["published_at"],
"source_url": release["url"]
}
await ingest(
content=f"# {release['name']}
{release['body']}",
collection=collection,
metadata=metadata,
doc_id=f"gh_{repo_name.replace('/', '_')}_release_{release['tag']}"
)
stats["releases"] += 1
return {
"status": "success",
"repo": repo_name,
"collection": collection,
"harvested": stats,
"total": sum(stats.values())
}
# GitHub content metadata
source_type: github
repo: owner/repo
repo_description: "Repository description"
repo_language: Python
repo_topics: [topic1, topic2]
content_type: readme|documentation|code_example|issue|release
file_path: docs/guide.md (for docs/code)
language: python (for code)
code_type: function|class|file
code_name: function_name
issue_number: 123
issue_title: "Issue title"
issue_state: open|closed
issue_labels: [bug, help wanted]
release_tag: v1.0.0
source_url: https://github.com/...
harvested_at: "2024-01-01T12:00:00Z"
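Concretely, a single README chunk produced by the pipeline carries metadata along these lines (all values illustrative):

chunk_metadata = {
    "source_type": "github",
    "repo": "fastapi/fastapi",
    "repo_language": "Python",
    "repo_topics": ["api", "async"],
    "content_type": "readme",
    "section": "Installation",
    "section_level": 2,
    "chunk_index": 3,
    "source_url": "https://github.com/fastapi/fastapi#readme",
    "harvested_at": "2024-01-01T12:00:00",
}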
# Full repository harvest
result = await harvest_github_repo(
repo_name="anthropics/anthropic-sdk-python",
collection="anthropic_sdk",
include_readme=True,
include_docs=True,
include_examples=True,
include_issues=False,
include_releases=True
)
# Issues focus
result = await harvest_github_repo(
repo_name="langchain-ai/langchain",
collection="langchain_issues",
include_readme=False,
include_docs=False,
include_issues=True,
max_issues=200
)
# Code examples only
result = await harvest_github_repo(
repo_name="fastapi/fastapi",
collection="fastapi_examples",
include_readme=True,
include_docs=False,
include_examples=True
)
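Since harvest_github_repo is a coroutine, a plain script can drive it with asyncio (assuming the host environment supplies the ingest function):

# Run the pipeline outside an existing event loop
import asyncio

result = asyncio.run(harvest_github_repo(
    repo_name="fastapi/fastapi",
    collection="fastapi_examples",
))
print(result["total"], "chunks ingested")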
# Using gh CLI for quick extraction
gh repo clone owner/repo -- --depth 1
gh api repos/owner/repo/readme -H "Accept: application/vnd.github.raw"
# Get issues as JSON
gh issue list --repo owner/repo --json title,body,comments --limit 50
This skill should be used when the user asks to "create an agent", "add an agent", "write a subagent", "agent frontmatter", "when to use description", "agent examples", "agent tools", "agent colors", "autonomous agent", or needs guidance on agent structure, system prompts, triggering conditions, or agent development best practices for Claude Code plugins.
This skill should be used when the user asks to "create a slash command", "add a command", "write a custom command", "define command arguments", "use command frontmatter", "organize commands", "create command with file references", "interactive command", "use AskUserQuestion in command", or needs guidance on slash command structure, YAML frontmatter fields, dynamic arguments, bash execution in commands, user interaction patterns, or command development best practices for Claude Code.
This skill should be used when the user asks to "create a hook", "add a PreToolUse/PostToolUse/Stop hook", "validate tool use", "implement prompt-based hooks", "use ${CLAUDE_PLUGIN_ROOT}", "set up event-driven automation", "block dangerous commands", or mentions hook events (PreToolUse, PostToolUse, Stop, SubagentStop, SessionStart, SessionEnd, UserPromptSubmit, PreCompact, Notification). Provides comprehensive guidance for creating and implementing Claude Code plugin hooks with focus on advanced prompt-based hooks API.