Skip to content

Rework check_repo_exists to use git ls-remote #508

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 34 additions & 30 deletions src/gitingest/utils/git_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@
from typing import TYPE_CHECKING, Final, Iterable
from urllib.parse import urlparse

import httpx
from starlette.status import HTTP_200_OK, HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND


from gitingest.utils.compat_func import removesuffix
from gitingest.utils.exceptions import InvalidGitHubTokenError
Expand Down Expand Up @@ -112,7 +111,7 @@ async def ensure_git_installed() -> None:


async def check_repo_exists(url: str, token: str | None = None) -> bool:
"""Check whether a remote Git repository is reachable.
"""Check whether a remote Git repository is reachable using git ls-remote.

Parameters
----------
Expand All @@ -126,35 +125,40 @@ async def check_repo_exists(url: str, token: str | None = None) -> bool:
bool
``True`` if the repository exists, ``False`` otherwise.

Raises
------
RuntimeError
If the host returns an unrecognised status code.

"""
headers = {}

if token and is_github_host(url):
host, owner, repo = _parse_github_url(url)
# Public GitHub vs. GitHub Enterprise
base_api = "https://api.github.com" if host == "github.com" else f"https://{host}/api/v3"
url = f"{base_api}/repos/{owner}/{repo}"
headers["Authorization"] = f"Bearer {token}"

async with httpx.AsyncClient(follow_redirects=True) as client:
try:
response = await client.head(url, headers=headers)
except httpx.RequestError:
return False

status_code = response.status_code

if status_code == HTTP_200_OK:
return True
if status_code in {HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND}:
cmd = ["git", "ls-remote"]

# Add authentication header if token is provided
if token:
if is_github_host(url):
# Use GitHub-specific authentication
cmd.extend(["-c", create_git_auth_header(token, url=url)])
else:
# For non-GitHub repositories, use generic HTTP basic auth
# This works for GitLab, Bitbucket, and other Git hosting services
parsed_url = urlparse(url)
if parsed_url.hostname:
basic_auth = base64.b64encode(f"oauth2:{token}".encode()).decode()
auth_header = f"http.https://{parsed_url.hostname}/.extraheader=Authorization: Basic {basic_auth}"
cmd.extend(["-c", auth_header])

cmd.extend(["--exit-code", url, "HEAD"])

try:
proc = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout, stderr = await proc.communicate()

# git ls-remote returns 0 if repository exists and is accessible
# returns non-zero if repository doesn't exist or is not accessible
return proc.returncode == 0

except Exception:
# If any exception occurs (e.g., git not available), assume repo doesn't exist
return False
msg = f"Unexpected HTTP status {status_code} for {url}"
raise RuntimeError(msg)


def _parse_github_url(url: str) -> tuple[str, str, str]:
Expand Down
105 changes: 85 additions & 20 deletions tests/test_clone.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,7 @@
from typing import TYPE_CHECKING
from unittest.mock import AsyncMock

import httpx
import pytest
from starlette.status import HTTP_200_OK, HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND

from gitingest.clone import clone_repo
from gitingest.schemas import CloneConfig
Expand Down Expand Up @@ -101,24 +99,30 @@ async def test_clone_nonexistent_repository(repo_exists_true: AsyncMock) -> None

@pytest.mark.asyncio
@pytest.mark.parametrize(
("status_code", "expected"),
("returncode", "expected"),
[
(HTTP_200_OK, True),
(HTTP_401_UNAUTHORIZED, False),
(HTTP_403_FORBIDDEN, False),
(HTTP_404_NOT_FOUND, False),
(0, True), # Repository exists and is accessible
(2, False), # Repository doesn't exist or is not accessible
(128, False), # Git error (e.g., authentication failure)
],
)
async def test_check_repo_exists(status_code: int, *, expected: bool, mocker: MockerFixture) -> None:
"""Verify that ``check_repo_exists`` interprets httpx results correctly."""
mock_client = AsyncMock()
mock_client.__aenter__.return_value = mock_client # context-manager protocol
mock_client.head.return_value = httpx.Response(status_code=status_code)
mocker.patch("httpx.AsyncClient", return_value=mock_client)
async def test_check_repo_exists(returncode: int, *, expected: bool, mocker: MockerFixture) -> None:
"""Verify that ``check_repo_exists`` interprets git ls-remote results correctly."""
mock_exec = mocker.patch("asyncio.create_subprocess_exec", new_callable=AsyncMock)
mock_process = AsyncMock()
mock_process.communicate.return_value = (b"", b"")
mock_process.returncode = returncode
mock_exec.return_value = mock_process

result = await check_repo_exists(DEMO_URL)

assert result is expected
# Verify that git ls-remote was called with the correct arguments
mock_exec.assert_called_once_with(
"git", "ls-remote", "--exit-code", DEMO_URL, "HEAD",
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)


@pytest.mark.asyncio
Expand Down Expand Up @@ -190,24 +194,85 @@ async def test_clone_commit(run_command_mock: AsyncMock) -> None:


@pytest.mark.asyncio
async def test_check_repo_exists_with_redirect(mocker: MockerFixture) -> None:
"""Test ``check_repo_exists`` when a redirect (302) is returned.
async def test_check_repo_exists_with_exception(mocker: MockerFixture) -> None:
"""Test ``check_repo_exists`` when an exception occurs during git ls-remote.

Given a URL that responds with "302 Found":
Given a git ls-remote command that raises an exception:
When ``check_repo_exists`` is called,
Then it should return ``False``, indicating the repo is inaccessible.
"""
mock_exec = mocker.patch("asyncio.create_subprocess_exec", new_callable=AsyncMock)
mock_process = AsyncMock()
mock_process.communicate.return_value = (b"302\n", b"")
mock_process.returncode = 0 # Simulate successful request
mock_exec.return_value = mock_process
mock_exec.side_effect = Exception("Git command failed")

repo_exists = await check_repo_exists(DEMO_URL)

assert repo_exists is False


@pytest.mark.asyncio
async def test_check_repo_exists_with_github_token(mocker: MockerFixture) -> None:
    """Check that a GitHub token reaches ``git ls-remote`` as an auth header.

    Given a GitHub URL and a token:
    When ``check_repo_exists`` is called,
    Then the value produced by ``create_git_auth_header`` should be passed
    through ``-c`` in the ``git ls-remote`` command line.
    """
    github_url = "https://github.com/owner/repo"

    # Stand-in for the spawned git process: exits 0 and prints nothing.
    exec_mock = mocker.patch("asyncio.create_subprocess_exec", new_callable=AsyncMock)
    fake_proc = AsyncMock()
    fake_proc.communicate.return_value = (b"", b"")
    fake_proc.returncode = 0
    exec_mock.return_value = fake_proc

    # Replace the header builder so the expected ``-c`` value is fixed.
    header_mock = mocker.patch(
        "gitingest.utils.git_utils.create_git_auth_header",
        return_value="http.extraheader=Authorization: Bearer test_token",
    )

    outcome = await check_repo_exists(github_url, token="test_token")

    assert outcome is True
    # The header must be built from the token and the repository URL.
    header_mock.assert_called_once_with("test_token", url=github_url)
    # The subprocess must receive the auth config ahead of the ls-remote arguments.
    expected_argv = (
        "git",
        "ls-remote",
        "-c",
        "http.extraheader=Authorization: Bearer test_token",
        "--exit-code",
        github_url,
        "HEAD",
    )
    exec_mock.assert_called_once_with(
        *expected_argv,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )


@pytest.mark.asyncio
async def test_check_repo_exists_with_non_github_token(mocker: MockerFixture) -> None:
    """Check that a token for a non-GitHub host is sent as HTTP basic auth.

    Given a non-GitHub URL and a token:
    When ``check_repo_exists`` is called,
    Then the ``git ls-remote`` command line should carry a host-scoped
    ``Authorization: Basic`` extra header built from ``oauth2:<token>``.
    """
    gitlab_url = "https://gitlab.com/owner/repo"

    # Stand-in for the spawned git process: exits 0 and prints nothing.
    exec_mock = mocker.patch("asyncio.create_subprocess_exec", new_callable=AsyncMock)
    fake_proc = AsyncMock()
    fake_proc.communicate.return_value = (b"", b"")
    fake_proc.returncode = 0
    exec_mock.return_value = fake_proc

    # Pin the encoded credential so the expected header text is predictable.
    b64_mock = mocker.patch("base64.b64encode")
    b64_mock.return_value.decode.return_value = "encoded_token"

    outcome = await check_repo_exists(gitlab_url, token="test_token")

    assert outcome is True
    # The credential handed to the encoder is "oauth2:<token>" as bytes.
    b64_mock.assert_called_once_with(b"oauth2:test_token")
    # The subprocess must receive the host-scoped basic-auth config.
    expected_argv = (
        "git",
        "ls-remote",
        "-c",
        "http.https://gitlab.com/.extraheader=Authorization: Basic encoded_token",
        "--exit-code",
        gitlab_url,
        "HEAD",
    )
    exec_mock.assert_called_once_with(
        *expected_argv,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )


@pytest.mark.asyncio
async def test_clone_with_timeout(run_command_mock: AsyncMock) -> None:
"""Test cloning a repository when a timeout occurs.
Expand Down
Loading