inventory¶

Retrieve and parse a GitHub user's repositories/files.

Inventory ¶

Inventory(
    username: str,
    lazy: bool = False,
    token: str | None = None,
    use_cache: bool = True,
    force_refresh: bool = False,
    filter_exprs: tuple[str, ...] | tuple[Expr, ...] = None,
    select_exprs: tuple[str, ...] | tuple[Expr, ...] = None,
    addcols_exprs: tuple[str, ...] | tuple[Expr, ...] = None,
    show_tbl_cols: int | None = None,
    show_tbl_rows: int | None = None,
)

Retrieve and parse a GitHub user's public repositories into a Polars DataFrame.

Results are cached locally to avoid repeated API calls.

username: The GitHub username to fetch repositories for.
lazy: Whether to allow lazy Polars operations (not all transformations may be supported).
token: An optional GitHub personal access token for higher rate limits.
use_cache: Whether to use cached results if available.
force_refresh: If True, always refetch from GitHub and overwrite the cache.
filter_exprs: One or more Polars schema (column) names to filter (where True),
              or an Expr to filter the repository listing or file walk tree.
select_exprs: One or more Polars schema (column) names to select, or an
              Expr to evaluate for the repository listing or file walk tree.
addcols_exprs: One or more Polars schema (column) names to with_columns, or an
               Expr to evaluate for the repository listing or file walk tree.
show_tbl_cols: Configure Polars to print N columns if `int` (default: None).
show_tbl_rows: Configure Polars to print N rows if `int` (default: None).

Source code in src/octopols/inventory.py

def __init__(
    self,
    username: str,
    lazy: bool = False,
    token: str | None = None,
    use_cache: bool = True,
    force_refresh: bool = False,
    filter_exprs: tuple[str, ...] | tuple[pl.Expr, ...] | None = None,
    select_exprs: tuple[str, ...] | tuple[pl.Expr, ...] | None = None,
    addcols_exprs: tuple[str, ...] | tuple[pl.Expr, ...] | None = None,
    show_tbl_cols: int | None = None,
    show_tbl_rows: int | None = None,
) -> None:
    """Initialise the Inventory object.

    Stores the settings on the instance, normalises the expression arguments
    through `prepare_expr`, creates the per-user cache directory and applies
    the optional Polars table display limits.

    Args:
    ----
        username: The GitHub username to fetch repositories for.
        lazy: Whether to allow lazy Polars operations (not all transformations may be supported).
        token: An optional GitHub personal access token for higher rate limits.
               When None, the module-level `ENV_GH_TOKEN` value is used instead.
        use_cache: Whether to use cached results if available.
        force_refresh: If True, always refetch from GitHub and overwrite the cache.
        filter_exprs: One or more Polars schema (column) names to filter (where True),
                      or an Expr to filter the repository listing or file walk tree.
                      None (the default) means no filters.
        select_exprs: One or more Polars schema (column) names to select, or an
                      Expr to evaluate for the repository listing or file walk tree.
                      None (the default) means no selection.
        addcols_exprs: One or more Polars schema (column) names to with_columns, or an
                       Expr to evaluate for the repository listing or file walk tree.
                       None (the default) means no added columns.
        show_tbl_cols: Configure Polars to print N columns if `int` (default: None).
        show_tbl_rows: Configure Polars to print N rows if `int` (default: None).

    """
    self.username = username
    self.lazy = lazy
    # An explicitly passed token wins; otherwise fall back to the environment value.
    self.token = token if token is not None else ENV_GH_TOKEN
    self.use_cache = use_cache
    self.force_refresh = force_refresh
    # `or []` turns a missing (None) or empty argument into "no expressions".
    self.filter_exprs = tuple(map(prepare_expr, filter_exprs or []))
    self.select_exprs = tuple(map(prepare_expr, select_exprs or []))
    self.addcols_exprs = tuple(map(prepare_expr, addcols_exprs or []))
    # Populated by list_repos(); None means the listing has not been fetched yet.
    self._inventory_df: pl.DataFrame | None = None

    # Initialize the cache location (one JSON file per username).
    self._cache_dir = Path(user_cache_dir(appname="octopols"))
    self._cache_dir.mkdir(parents=True, exist_ok=True)
    self._cache_file = self._cache_dir / f"{username}_repos.json"

    # Display limits are only touched when the caller asked for them.
    self._cfg = pl.Config()
    if show_tbl_cols is not None:
        self._cfg.set_tbl_cols(show_tbl_cols)
    if show_tbl_rows is not None:
        self._cfg.set_tbl_rows(show_tbl_rows)

list_repos ¶

list_repos() -> pl.DataFrame

Fetch and parse the public repositories for the specified GitHub user.

Checks the local cache first (unless force_refresh=True). Returns a Polars DataFrame whose columns include 'name', 'html_url', and 'description'; walk_file_trees additionally reads a 'default_branch' column from this listing.

Source code in src/octopols/inventory.py

def list_repos(self) -> pl.DataFrame:
    """Fetch the user's repository listing and remember it on the instance.

    The retrieval itself (including any cache handling) is delegated to
    `_retrieve_repos`; the result is stored in `_inventory_df` so that later
    calls such as `walk_file_trees` can reuse it.

    Returns a Polars DataFrame documented as carrying the columns 'name',
    'html_url', and 'description'. Note that `walk_file_trees` also reads a
    'default_branch' column from this frame.
    """
    repos = self._retrieve_repos()
    self._inventory_df = repos
    return repos

review_version_changes ¶

review_version_changes(from_v: str = 'first', to_v: str = 'latest') -> pl.DataFrame

Compare repository metadata across two versions (placeholder).

Currently returns a trivial DataFrame.

Source code in src/octopols/inventory.py

def review_version_changes(
    self,
    from_v: str = "first",
    to_v: str = "latest",
) -> pl.DataFrame:
    """Compare repository metadata across two versions (placeholder).

    No comparison is performed yet: the result is a one-row DataFrame that
    simply echoes the two version labels back in the columns "from_v" and
    "to_v".
    """
    placeholder = {"from_v": [from_v], "to_v": [to_v]}
    return pl.DataFrame(placeholder)

walk_file_trees ¶

walk_file_trees(
    pattern: str = "**",
    no_recurse: bool = False,
    skip_larger_than_mb: int | None = None,
) -> pl.DataFrame

Walk (recursively enumerate) files in each repository via UPath.

Discovers (but does not read) file paths that match a given glob pattern.

pattern: Glob pattern for file listing. By default "**" (recursive).
no_recurse: If True, uses "*" (non-recursive) instead of the default "**".
skip_larger_than_mb: If set, skip listing files larger than this many MB.
                     By default, None (don't skip based on size).

Returns a Polars DataFrame with columns:
    - "repository_name": str
    - "file_path": str
    - "is_directory": bool
    - "file_size_bytes": int

Source code in src/octopols/inventory.py

def walk_file_trees(
    self,
    pattern: str = "**",
    no_recurse: bool = False,
    skip_larger_than_mb: int | None = None,
) -> pl.DataFrame:
    """Walk (recursively enumerate) files in each repository via UPath.

    Discovers (but does not read) file paths that match a given glob pattern.
    The repository listing is fetched first via `list_repos` if it has not been
    loaded yet. Directories are recorded with a size of 0 and are never dropped
    by the size limit.

    After the listing is built, the instance's filter/select/addcols expressions
    are registered on the frame's `hopper` namespace and applied; whatever
    expressions the hopper still lists afterwards are stored back on the
    instance.

    Args:
    ----
        pattern: Glob pattern for file listing. By default "**" (recursive).
        no_recurse: If True, uses "*" (non-recursive) instead of the default "**".
        skip_larger_than_mb: If set, skip listing files larger than this many MB.
                             By default, None (don't skip based on size).

    Returns:
    -------
        A Polars DataFrame with columns (before any select/addcols expressions
        are applied):
            - "repository_name": str
            - "file_path": str
            - "is_directory": bool
            - "file_size_bytes": int

    """
    # Local import: remote GitHub paths are always "/"-separated, so they must
    # not be joined with the host OS separator (os.path.join yields "\\" on
    # Windows, which would break re-resolving the path in read_files).
    import posixpath

    if self._inventory_df is None:
        self.list_repos()
    if no_recurse:
        pattern = "*"

    # Loop-invariant: convert the MB limit to bytes once (1 MB = 1_048_576 bytes).
    threshold_bytes = (
        None if skip_larger_than_mb is None else skip_larger_than_mb * 1_048_576
    )

    records = []
    for row in self._inventory_df.to_dicts():
        repo_name = row["name"]
        default_branch = row["default_branch"]
        # Root of the repository at its default branch.
        ghpath = UPath(
            "/",
            protocol="github",
            org=self.username,
            repo=repo_name,
            sha=default_branch,
            username=self.username,
            token=self.token,
        )
        for p in ghpath.glob(pattern):
            is_dir = p.is_dir()
            if is_dir:
                file_size_bytes = 0
            else:
                file_size_bytes = p.stat().st_size
                # The size limit only applies to regular files.
                if threshold_bytes is not None and file_size_bytes > threshold_bytes:
                    continue
            records.append(
                {
                    "repository_name": repo_name,
                    "file_path": posixpath.join(*p.parts),
                    "is_directory": is_dir,
                    "file_size_bytes": file_size_bytes,
                },
            )
    # An explicit schema keeps the column dtypes stable even when nothing matched.
    files = pl.DataFrame(
        records,
        schema={
            "repository_name": pl.String,
            "file_path": pl.String,
            "is_directory": pl.Boolean,
            "file_size_bytes": pl.Int64,
        },
    )
    files.hopper.add_filters(*self.filter_exprs)
    files.hopper.add_selects(*self.select_exprs)
    files.hopper.add_addcols(*self.addcols_exprs)
    files = files.hopper.apply_ready_exprs()
    # Keep whatever the hopper still lists after applying
    # (presumably the expressions it could not apply yet; confirm against hopper docs).
    self.filter_exprs = tuple(files.hopper.list_filters())
    self.select_exprs = tuple(files.hopper.list_selects())
    self.addcols_exprs = tuple(files.hopper.list_addcols())
    return files

read_files ¶

read_files(
    pattern: str = "**",
    no_recurse: bool = False,
    skip_larger_than_mb: int | None = None,
) -> pl.DataFrame

Read all file contents in each matched repository path.

This enumerates all files (via walk_file_trees) that match pattern, then reads their text content if they are not directories.

pattern: Glob pattern for file listing. Default "**" means recursive.
no_recurse: If True, uses "*" instead of "**".
skip_larger_than_mb: Optional size limit in MB. If set, skip any file above it.

Returns a Polars DataFrame with columns:
    - "repository_name": str
    - "file_path": str
    - "file_size_bytes": int
    - "content": str (file content, or empty if directory/failed)

Source code in src/octopols/inventory.py

def read_files(
    self,
    pattern: str = "**",
    no_recurse: bool = False,
    skip_larger_than_mb: int | None = None,
) -> pl.DataFrame:
    """Read *all* file contents in each matched repository path.

    This enumerates all files (via walk_file_trees) that match `pattern`,
    then reads their text content if they are not directories. Reading is
    best-effort: a file whose content cannot be read yields an empty string
    rather than an error.

    The listing rows must still carry the "repository_name", "file_path",
    "is_directory" and "file_size_bytes" columns after walk_file_trees has
    applied the instance's select expressions, because this method reads them
    by name.

    Args:
    ----
        pattern: Glob pattern for file listing. Default "**" means recursive.
        no_recurse: If True, uses "*" instead of "**".
        skip_larger_than_mb: Optional size limit in MB. If set, skip any file above it.

    Returns:
    -------
        A Polars DataFrame with columns:
            - "repository_name": str
            - "file_path": str
            - "file_size_bytes": int
            - "content": str (file content, or empty if directory/failed)

        When nothing matches, an empty DataFrame with these same columns
        is returned.

    """
    # First, get a listing
    file_tree = self.walk_file_trees(
        pattern=pattern,
        no_recurse=no_recurse,
        skip_larger_than_mb=skip_larger_than_mb,
    )
    if file_tree.is_empty():
        # Return the documented columns rather than the listing frame, which
        # has "is_directory" and no "content".
        return pl.DataFrame(
            schema={
                "repository_name": pl.String,
                "file_path": pl.String,
                "file_size_bytes": pl.Int64,
                "content": pl.String,
            },
        )

    # One repository root per repo: the default-branch lookup and the UPath
    # construction do not depend on the individual file.
    repo_roots = {}
    rows = []
    for row in file_tree.to_dicts():
        repo_name = row["repository_name"]
        file_path = row["file_path"]
        content_str = ""  # directories have no content to read

        if not row["is_directory"]:
            ghpath = repo_roots.get(repo_name)
            if ghpath is None:
                # Look up the default branch from the known repos
                default_branch = (
                    self._inventory_df.filter(pl.col("name") == repo_name)
                    .select("default_branch")
                    .item()
                )
                ghpath = UPath(
                    "/",
                    protocol="github",
                    org=self.username,
                    repo=repo_name,
                    sha=default_branch,
                    username=self.username,
                    token=self.token,
                )
                repo_roots[repo_name] = ghpath
            try:
                content_str = (ghpath / file_path).read_text()
            except Exception:
                # Deliberate best-effort: unreadable files (e.g. binary data or
                # fetch errors) are reported with empty content.
                content_str = ""

        rows.append(
            {
                "repository_name": repo_name,
                "file_path": file_path,
                "file_size_bytes": row["file_size_bytes"],
                "content": content_str,
            },
        )
    return pl.DataFrame(rows)