feat: ingest PR diffs through .diff URLs #166

Draft · wants to merge 2 commits into main
100 changes: 100 additions & 0 deletions src/gitingest/pull_request_ingest.py
@@ -0,0 +1,100 @@
import asyncio

from gitingest.query_parser import ParsedQuery


async def ingest_pull_request(
    query: ParsedQuery,
) -> tuple[str, str, str]:
    """
    Ingest a pull request and return its summary, the files changed, and the diff content.
    """
    summary = f"Pull request {query.pull_or_issue_number} from {query.url}"
    diff_url = await _pull_request_url_to_diff_url(query)
    files_changed, diff_content = await _ingest_diff_url(diff_url)
    return summary, files_changed, diff_content


async def _ingest_diff_url(diff_url: str) -> tuple[str, str]:
    """
    Fetch a diff URL and return the files changed and the diff content.
    """
    proc = await asyncio.create_subprocess_exec(
        "curl",
        diff_url,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, _ = await proc.communicate()

    if proc.returncode != 0:
        raise RuntimeError(f"Failed to fetch diff content from {diff_url}")

    diff_content = stdout.decode()

    files_changed = _summarize_files_changed_from_diff_content(diff_content)
    return files_changed, diff_content


def _summarize_files_changed_from_diff_content(diff_content: str) -> str:
    """
    Summarize the files changed in a diff.

    Given the diff content, return a summary of the changed files in the following format:
    ```
    path/to/file1.py (modified, +X/-Y)
    path/to/file2.py (added)
    path/to/file3.py (deleted)
    path/to/file4.py (renamed from path/to/file5.py)
    ```
    """
    raise NotImplementedError


async def _pull_request_url_to_diff_url(query: ParsedQuery) -> str:
    """Convert a pull request URL to a diff URL.

    The following providers are supported:

    - GitHub:
        <- https://github.com/cyclotruc/gitingest/pull/153
        -> https://github.com/cyclotruc/gitingest/pull/153.diff
    - Bitbucket:
        <- https://bitbucket.org/cyclotruc/gitingest/pullrequests/153
        -> https://api.bitbucket.org/2.0/repositories/cyclotruc/gitingest/pullrequests/153/diff
    - GitLab:
        <- https://gitlab.com/ruancomelli/testing-project/-/merge_requests/1
        -> https://gitlab.com/ruancomelli/testing-project/-/merge_requests/1.diff
    - Codeberg:
        <- https://codeberg.org/mergiraf/mergiraf/pulls/184
        -> https://codeberg.org/mergiraf/mergiraf/pulls/184.diff
    - Gitea:
        <- https://gitea.com/gitea/git/pulls/152
        -> https://gitea.com/gitea/git/pulls/152.diff
    - ... and any other provider in which diff URLs are of the form `<pr_url> + ".diff"`

    Note that Bitbucket URLs cannot be converted to diff URLs by simply appending `.diff` to the URL.
    Instead, the Bitbucket API must be used.

    Args:
        query: The parsed query describing the pull request.

    Returns:
        The URL of the diff.
    """
    if query.host == "bitbucket.org":
        return _bitbucket_pr_url_to_diff_url(query)
    else:
        return f"{query.url}.diff"


def _bitbucket_pr_url_to_diff_url(query: ParsedQuery) -> str:
    """Convert a Bitbucket pull request URL to a diff URL.

    Args:
        query: The parsed query describing the pull request.

    Returns:
        The URL of the diff.
    """
    return f"https://api.bitbucket.org/2.0/repositories/{query.user_name}/{query.repo_name}/pullrequests/{query.pull_or_issue_number}/diff"
6 changes: 5 additions & 1 deletion src/gitingest/query_ingestion.py
@@ -14,6 +14,7 @@
MaxFilesReachedError,
)
from gitingest.notebook_utils import process_notebook
from gitingest.pull_request_ingest import ingest_pull_request as _ingest_pull_request
from gitingest.query_parser import ParsedQuery


@@ -792,7 +793,7 @@ def _ingest_directory(path: Path, query: ParsedQuery) -> tuple[str, str, str]:
return summary, tree, files_content


def run_ingest_query(query: ParsedQuery) -> tuple[str, str, str]:
async def run_ingest_query(query: ParsedQuery) -> tuple[str, str, str]:
"""
Run the ingestion process for a parsed query.

@@ -815,6 +816,9 @@ def run_ingest_query(query: ParsedQuery) -> tuple[str, str, str]:
    ValueError
        If the specified path cannot be found or if the file is not a text file.
    """
    if query.type == "pull":
        return await _ingest_pull_request(query)

    path = query.local_path / query.subpath.lstrip("/")
    if not path.exists():
        raise ValueError(f"{query.slug} cannot be found")
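
Since `run_ingest_query` is now a coroutine, every caller has to await it. A minimal calling sketch (the surrounding driver code is hypothetical; for `type == "pull"` queries the second and third return values are the files-changed summary and the raw diff rather than a directory tree and file contents):

```python
import asyncio

from gitingest.query_ingestion import run_ingest_query
from gitingest.query_parser import ParsedQuery


async def ingest_one(parsed_query: ParsedQuery) -> str:
    # run_ingest_query must now be awaited instead of called synchronously.
    summary, files_or_tree, content = await run_ingest_query(parsed_query)
    return summary


# From synchronous code, something like:
# summary = asyncio.run(ingest_one(parsed_query))
```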
58 changes: 47 additions & 11 deletions src/gitingest/query_parser.py
@@ -1,4 +1,4 @@
""" This module contains functions to parse and validate input sources and patterns. """
"""This module contains functions to parse and validate input sources and patterns."""

import os
import re
@@ -37,6 +37,7 @@ class ParsedQuery: # pylint: disable=too-many-instance-attributes
subpath: str
local_path: Path
url: str | None
host: str | None
slug: str
id: str
type: str | None = None
@@ -46,6 +47,7 @@ class ParsedQuery: # pylint: disable=too-many-instance-attributes
ignore_patterns: set[str] | None = None
include_patterns: set[str] | None = None
pattern_type: str | None = None
pull_or_issue_number: int | None = None


async def parse_query(
@@ -82,7 +84,11 @@ async def parse_query(
"""

# Determine the parsing method based on the source type
if from_web or urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS):
if (
from_web
or urlparse(source).scheme in ("https", "http")
or any(h in source for h in KNOWN_GIT_HOSTS)
):
# We either have a full URL or a domain-less slug
parsed_query = await _parse_repo_source(source)
else:
@@ -97,14 +103,17 @@ async def parse_query(
# Process include patterns and override ignore patterns accordingly
if include_patterns:
parsed_include = _parse_patterns(include_patterns)
ignore_patterns_set = _override_ignore_patterns(ignore_patterns_set, include_patterns=parsed_include)
ignore_patterns_set = _override_ignore_patterns(
ignore_patterns_set, include_patterns=parsed_include
)
else:
parsed_include = None

return ParsedQuery(
user_name=parsed_query.user_name,
repo_name=parsed_query.repo_name,
url=parsed_query.url,
host=parsed_query.host,
subpath=parsed_query.subpath,
local_path=parsed_query.local_path,
slug=parsed_query.slug,
@@ -152,7 +161,9 @@ async def _parse_repo_source(source: str) -> ParsedQuery:
_validate_host(tmp_host)
else:
# No scheme, no domain => user typed "user/repo", so we'll guess the domain.
host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source))
host = await try_domains_for_user_and_repo(
*_get_user_and_repo_from_path(source)
)
source = f"{host}/{source}"

source = "https://" + source
@@ -170,6 +181,7 @@ async def _parse_repo_source(source: str) -> ParsedQuery:
user_name=user_name,
repo_name=repo_name,
url=url,
host=host,
subpath="/",
local_path=local_path,
slug=slug,
@@ -181,18 +193,23 @@ async def _parse_repo_source(source: str) -> ParsedQuery:
if not remaining_parts:
return parsed

possible_type = remaining_parts.pop(0) # e.g. 'issues', 'pull', 'tree', 'blob'
possible_type = remaining_parts.pop(0) # e.g. 'issues', 'pull', 'tree', 'blob' or '-' (on GitLab)

if possible_type == "-" and remaining_parts:
possible_type = remaining_parts.pop(0)

parsed.type = _possible_type_to_parsed_query_type(possible_type)

# If no extra path parts, just return
if not remaining_parts:
return parsed

# If this is an issues page or pull requests, return early without processing subpath
if remaining_parts and possible_type in ("issues", "pull"):
if parsed.type in ("issue", "pull"):
parsed.pull_or_issue_number = int(remaining_parts.pop(0))
parsed.url = f"{url}/{parsed.type}/{parsed.pull_or_issue_number}"
return parsed

parsed.type = possible_type

# Commit or branch
commit_or_branch = remaining_parts[0]
if _is_valid_git_commit_hash(commit_or_branch):
@@ -208,7 +225,21 @@ async def _parse_repo_source(source: str) -> ParsedQuery:
return parsed


async def _configure_branch_and_subpath(remaining_parts: list[str], url: str) -> str | None:
def _possible_type_to_parsed_query_type(possible_type: str) -> str:
"""
Convert a possible type to a parsed query type.
"""
if possible_type in ("issues", "issue"):
return "issue"
elif possible_type in ("pulls", "pull", "merge_requests", "pullrequest"):
return "pull"
else:
return possible_type


async def _configure_branch_and_subpath(
remaining_parts: list[str], url: str
) -> str | None:
"""
Configure the branch and subpath based on the remaining parts of the URL.
Parameters
@@ -324,7 +355,9 @@ def _parse_patterns(pattern: set[str] | str) -> set[str]:
return {_normalize_pattern(p) for p in parsed_patterns}


def _override_ignore_patterns(ignore_patterns: set[str], include_patterns: set[str]) -> set[str]:
def _override_ignore_patterns(
ignore_patterns: set[str], include_patterns: set[str]
) -> set[str]:
"""
Remove patterns from ignore_patterns that are present in include_patterns using set difference.

@@ -361,6 +394,7 @@ def _parse_path(path_str: str) -> ParsedQuery:
return ParsedQuery(
user_name=None,
repo_name=None,
host=None,
url=None,
subpath="/",
local_path=path_obj,
@@ -415,7 +449,9 @@ async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str:
candidate = f"https://{domain}/{user_name}/{repo_name}"
if await _check_repo_exists(candidate):
return domain
raise ValueError(f"Could not find a valid repository host for '{user_name}/{repo_name}'.")
raise ValueError(
f"Could not find a valid repository host for '{user_name}/{repo_name}'."
)


def _get_user_and_repo_from_path(path: str) -> tuple[str, str]:
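
As a rough illustration of the new parsing path (not taken from the PR's tests), a pull-request URL should come back with `type`, `host`, and `pull_or_issue_number` populated, with `issues`/`pulls`/`merge_requests`/`pullrequest` path segments normalized by `_possible_type_to_parsed_query_type`. This sketch exercises the internal helper directly; the asserted values are what the logic shown above implies, not verified output:

```python
import asyncio

from gitingest.query_parser import _parse_repo_source

# Illustrative check against a GitHub-style pull request URL.
parsed = asyncio.run(_parse_repo_source("https://github.com/cyclotruc/gitingest/pull/153"))
assert parsed.type == "pull"
assert parsed.host == "github.com"
assert parsed.pull_or_issue_number == 153
assert parsed.url == "https://github.com/cyclotruc/gitingest/pull/153"
```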
10 changes: 4 additions & 6 deletions src/gitingest/repository_ingest.py
@@ -63,7 +63,7 @@ async def ingest(
ignore_patterns=exclude_patterns,
)

if parsed_query.url:
if parsed_query.url and parsed_query.type != "pull":
selected_branch = branch if branch else parsed_query.branch # prioritize branch argument
parsed_query.branch = selected_branch

@@ -84,15 +84,13 @@ async def ingest(
else:
raise TypeError("clone_repo did not return a coroutine as expected.")

summary, tree, content = run_ingest_query(parsed_query)
summary, tree, content = await run_ingest_query(parsed_query)

if output is not None:
with open(output, "w", encoding="utf-8") as f:
f.write(tree + "\n" + content)

return summary, tree, content
finally:
# Clean up the temporary directory if it was created
if parsed_query.url:
# Clean up the temporary directory
shutil.rmtree(TMP_BASE_PATH, ignore_errors=True)
# Clean up the temporary directory
shutil.rmtree(TMP_BASE_PATH, ignore_errors=True)
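
End to end, the feature is reached through the public `ingest` entry point. A hedged usage sketch, assuming the other `ingest` arguments keep their defaults; since the file summarizer is still a stub, this would currently raise `NotImplementedError`, so it only shows the intended call shape:

```python
import asyncio

from gitingest.repository_ingest import ingest

# For a pull-request URL the clone step is skipped and the .diff is fetched instead.
summary, files_changed, diff_content = asyncio.run(
    ingest("https://github.com/cyclotruc/gitingest/pull/153")
)
print(summary)  # e.g. "Pull request 153 from https://github.com/cyclotruc/gitingest/pull/153"
```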
5 changes: 3 additions & 2 deletions src/server/query_processor.py
@@ -90,8 +90,9 @@ async def process_query(
commit=parsed_query.commit,
branch=parsed_query.branch,
)
await clone_repo(clone_config)
summary, tree, content = run_ingest_query(parsed_query)
if parsed_query.type != "pull":
await clone_repo(clone_config)
summary, tree, content = await run_ingest_query(parsed_query)
with open(f"{clone_config.local_path}.txt", "w", encoding="utf-8") as f:
f.write(tree + "\n" + content)
except Exception as e:
1 change: 1 addition & 0 deletions tests/conftest.py
@@ -33,6 +33,7 @@ def sample_query() -> ParsedQuery:
user_name="test_user",
repo_name="test_repo",
url=None,
host="github.com",
subpath="/",
local_path=Path("/tmp/test_repo").resolve(),
slug="test_user/test_repo",
4 changes: 2 additions & 2 deletions tests/test_query_ingestion.py
@@ -176,7 +176,7 @@ def test_include_src_patterns(temp_directory: Path, sample_query: ParsedQuery, i
assert file_paths == expected_paths, "Missing or unexpected files in result"


def test_run_ingest_query(temp_directory: Path, sample_query: ParsedQuery) -> None:
async def test_run_ingest_query(temp_directory: Path, sample_query: ParsedQuery) -> None:
"""
Test `run_ingest_query` to ensure it processes the directory and returns expected results.

@@ -188,7 +188,7 @@ def test_run_ingest_query(temp_directory: Path, sample_query: ParsedQuery) -> None:
sample_query.subpath = "/"
sample_query.type = None

summary, _, content = run_ingest_query(sample_query)
summary, _, content = await run_ingest_query(sample_query)

assert "Repository: test_user/test_repo" in summary
assert "Files analyzed: 8" in summary
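
The ingestion test above became a coroutine, so it presumably needs an async-capable test runner. A sketch assuming `pytest-asyncio`; the marker and fixture wiring are assumptions, and the project's actual async test configuration may differ:

```python
import pytest

from gitingest.query_ingestion import run_ingest_query


@pytest.mark.asyncio  # marker assumes pytest-asyncio; adjust to the configured plugin
async def test_run_ingest_query(temp_directory, sample_query) -> None:
    summary, _, content = await run_ingest_query(sample_query)
    assert "Repository: test_user/test_repo" in summary
```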