Skip to content

feat: include_submodules option #313

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jul 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,9 @@ gitingest https://github.com/username/private-repo --token github_pat_...
# Or set it as an environment variable
export GITHUB_TOKEN=github_pat_...
gitingest https://github.com/username/private-repo

# Include repository submodules
gitingest https://github.com/username/repo-with-submodules --include-submodules
```

By default, files listed in `.gitignore` are skipped. Use `--include-gitignored` if you
Expand Down Expand Up @@ -163,6 +166,9 @@ summary, tree, content = ingest("https://github.com/username/private-repo", toke
import os
os.environ["GITHUB_TOKEN"] = "github_pat_..."
summary, tree, content = ingest("https://github.com/username/private-repo")

# Include repository submodules
summary, tree, content = ingest("https://github.com/username/repo-with-submodules", include_submodules=True)
```

By default, this won't write a file but can be enabled with the `output` argument.
Expand Down
18 changes: 16 additions & 2 deletions src/gitingest/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class _CLIArgs(TypedDict):
include_pattern: tuple[str, ...]
branch: str | None
include_gitignored: bool
include_submodules: bool
token: str | None
output: str | None

Expand Down Expand Up @@ -47,6 +48,12 @@ class _CLIArgs(TypedDict):
default=False,
help="Include files matched by .gitignore and .gitingestignore",
)
@click.option(
"--include-submodules",
is_flag=True,
help="Include repository's submodules in the analysis",
default=False,
)
@click.option(
"--token",
"-t",
Expand Down Expand Up @@ -94,6 +101,9 @@ def main(**cli_kwargs: Unpack[_CLIArgs]) -> None:
$ gitingest https://github.com/user/private-repo -t ghp_token
$ GITHUB_TOKEN=ghp_token gitingest https://github.com/user/private-repo

Include submodules:
$ gitingest https://github.com/user/repo --include-submodules

"""
asyncio.run(_async_main(**cli_kwargs))

Expand All @@ -106,6 +116,7 @@ async def _async_main(
include_pattern: tuple[str, ...] | None = None,
branch: str | None = None,
include_gitignored: bool = False,
include_submodules: bool = False,
token: str | None = None,
output: str | None = None,
) -> None:
Expand All @@ -129,6 +140,8 @@ async def _async_main(
Git branch to ingest. If ``None``, the repository's default branch is used.
include_gitignored : bool
If ``True``, also ingest files matched by ``.gitignore`` or ``.gitingestignore`` (default: ``False``).
include_submodules : bool
If ``True``, recursively include all Git submodules within the repository (default: ``False``).
token : str | None
GitHub personal access token (PAT) for accessing private repositories.
Can also be set via the ``GITHUB_TOKEN`` environment variable.
Expand All @@ -155,14 +168,15 @@ async def _async_main(
click.echo(f"Analyzing source, output will be written to '{output_target}'...", err=True)

summary, _, _ = await ingest_async(
source=source,
source,
max_file_size=max_size,
include_patterns=include_patterns,
exclude_patterns=exclude_patterns,
branch=branch,
output=output_target,
include_gitignored=include_gitignored,
include_submodules=include_submodules,
token=token,
output=output_target,
)
except Exception as exc:
# Convert any exception into Click.Abort so that exit status is non-zero
Expand Down
31 changes: 23 additions & 8 deletions src/gitingest/clone.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,9 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:
clone_cmd += ["-c", create_git_auth_header(token, url=url)]

clone_cmd += ["clone", "--single-branch"]
# TODO: Re-enable --recurse-submodules when submodule support is needed

if config.include_submodules:
clone_cmd += ["--recurse-submodules"]

if partial_clone:
clone_cmd += ["--filter=blob:none", "--sparse"]
Expand All @@ -86,15 +88,28 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:

# Checkout the subpath if it is a partial clone
if partial_clone:
subpath = config.subpath.lstrip("/")
if config.blob:
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
subpath = str(Path(subpath).parent.as_posix())

checkout_cmd = create_git_command(["git"], local_path, url, token)
await run_command(*checkout_cmd, "sparse-checkout", "set", subpath)
await _checkout_partial_clone(config, token)

# Checkout the commit if it is provided
if commit:
checkout_cmd = create_git_command(["git"], local_path, url, token)
await run_command(*checkout_cmd, "checkout", commit)


async def _checkout_partial_clone(config: CloneConfig, token: str | None) -> None:
    """Restrict a partially cloned repository to the requested subpath.

    Runs ``git sparse-checkout set <subpath>`` for the repository described by ``config``.

    Parameters
    ----------
    config : CloneConfig
        The clone configuration; its ``subpath``, ``blob``, ``local_path`` and ``url`` fields are read.
    token : str | None
        GitHub personal access token (PAT), forwarded to ``create_git_command``.

    """
    sparse_target = config.subpath.lstrip("/")
    if config.blob:
        # A blob URL (e.g. blob/branch/path/file.txt) names a file, so use its parent directory instead.
        sparse_target = Path(sparse_target).parent.as_posix()

    git_cmd = create_git_command(["git"], config.local_path, config.url, token)
    await run_command(*git_cmd, "sparse-checkout", "set", sparse_target)
9 changes: 9 additions & 0 deletions src/gitingest/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ async def ingest_async(
branch: str | None = None,
tag: str | None = None,
include_gitignored: bool = False,
include_submodules: bool = False,
token: str | None = None,
output: str | None = None,
) -> tuple[str, str, str]:
Expand All @@ -52,6 +53,8 @@ async def ingest_async(
The tag to clone and ingest. If ``None``, no tag is used.
include_gitignored : bool
If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``).
include_submodules : bool
If ``True``, recursively include all Git submodules within the repository (default: ``False``).
token : str | None
GitHub personal access token (PAT) for accessing private repositories.
Can also be set via the ``GITHUB_TOKEN`` environment variable.
Expand Down Expand Up @@ -86,6 +89,8 @@ async def ingest_async(
if query.url:
_override_branch_and_tag(query, branch=branch, tag=tag)

query.include_submodules = include_submodules

async with _clone_repo_if_remote(query, token=token):
summary, tree, content = ingest_query(query)
await _write_output(tree, content=content, target=output)
Expand All @@ -101,6 +106,7 @@ def ingest(
branch: str | None = None,
tag: str | None = None,
include_gitignored: bool = False,
include_submodules: bool = False,
token: str | None = None,
output: str | None = None,
) -> tuple[str, str, str]:
Expand All @@ -126,6 +132,8 @@ def ingest(
The tag to clone and ingest. If ``None``, no tag is used.
include_gitignored : bool
If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``).
include_submodules : bool
If ``True``, recursively include all Git submodules within the repository (default: ``False``).
token : str | None
GitHub personal access token (PAT) for accessing private repositories.
Can also be set via the ``GITHUB_TOKEN`` environment variable.
Expand Down Expand Up @@ -156,6 +164,7 @@ def ingest(
branch=branch,
tag=tag,
include_gitignored=include_gitignored,
include_submodules=include_submodules,
token=token,
output=output,
),
Expand Down
9 changes: 8 additions & 1 deletion src/gitingest/schemas/ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


@dataclass
class CloneConfig:
class CloneConfig: # pylint: disable=too-many-instance-attributes
"""Configuration for cloning a Git repository.

This class holds the necessary parameters for cloning a repository to a local path, including
Expand All @@ -33,6 +33,8 @@ class CloneConfig:
The subpath to clone from the repository (default: ``"/"``).
blob: bool
Whether the repository is a blob (default: ``False``).
include_submodules: bool
Whether to clone submodules (default: ``False``).

"""

Expand All @@ -43,6 +45,7 @@ class CloneConfig:
tag: str | None = None
subpath: str = "/"
blob: bool = False
include_submodules: bool = False


class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
Expand Down Expand Up @@ -78,6 +81,8 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
The patterns to ignore (default: ``set()``).
include_patterns : set[str] | None
The patterns to include.
include_submodules : bool
Whether to include all Git submodules within the repository (default: ``False``).

"""

Expand All @@ -95,6 +100,7 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
max_file_size: int = Field(default=MAX_FILE_SIZE)
ignore_patterns: set[str] = set() # TODO: ignore_patterns and include_patterns have the same type
include_patterns: set[str] | None = None
include_submodules: bool = False

def extract_clone_config(self) -> CloneConfig:
"""Extract the relevant fields for the CloneConfig object.
Expand Down Expand Up @@ -122,6 +128,7 @@ def extract_clone_config(self) -> CloneConfig:
tag=self.tag,
subpath=self.subpath,
blob=self.type == "blob",
include_submodules=self.include_submodules,
)

def ensure_url(self) -> None:
Expand Down
1 change: 1 addition & 0 deletions tests/query_parser/test_git_host_agnostic.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ async def test_parse_query_without_host(
"commit": None,
"max_file_size": 50,
"include_patterns": None,
"include_submodules": False,
}

assert actual == expected
1 change: 1 addition & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"tests/",
"--include-pattern",
"src/",
"--include-submodules",
],
True,
id="custom-options",
Expand Down
31 changes: 28 additions & 3 deletions tests/test_clone.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,10 @@ async def test_clone_default_shallow_clone(run_command_mock: AsyncMock) -> None:


@pytest.mark.asyncio
async def test_clone_commit_without_branch(run_command_mock: AsyncMock) -> None:
"""Test cloning when a commit hash is provided but no branch is specified.
async def test_clone_commit(run_command_mock: AsyncMock) -> None:
"""Test cloning when a commit hash is provided.

Given a valid URL and a commit hash (but no branch):
Given a valid URL and a commit hash:
When ``clone_repo`` is called,
Then the repository should be cloned and checked out at that commit.
"""
Expand Down Expand Up @@ -414,3 +414,28 @@ async def test_clone_with_commit_and_subpath(run_command_mock: AsyncMock) -> Non
)

assert run_command_mock.call_count == expected_call_count


@pytest.mark.asyncio
async def test_clone_with_include_submodules(run_command_mock: AsyncMock) -> None:
    """Verify that ``include_submodules=True`` adds ``--recurse-submodules`` to the clone command.

    Given a valid URL, a branch and ``include_submodules=True``:
    When ``clone_repo`` is called,
    Then exactly one command should be run, and it should be ``git clone`` with ``--recurse-submodules``.
    """
    config = CloneConfig(
        url=DEMO_URL,
        local_path=LOCAL_REPO_PATH,
        branch="main",
        include_submodules=True,
    )
    # The single expected invocation: no commit checkout and no sparse checkout follow the clone.
    expected_clone_args = (
        "git",
        "clone",
        "--single-branch",
        "--recurse-submodules",
        "--depth=1",
        config.url,
        config.local_path,
    )

    await clone_repo(config)

    assert run_command_mock.call_count == 1
    run_command_mock.assert_called_once_with(*expected_clone_args)
Loading