#!/usr/bin/env python3
"""Status/reporting helper for the codebase genome pipeline.
This lands a parent-epic slice for timmy-home #665 by making the current genome
coverage across repos inspectable: which repos have artifacts, which have tests,
what duplicates exist, and which repo is still uncovered next.
"""
from __future__ import annotations
import argparse
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable
import urllib.request
def artifact_repo_name(path: Path, host_repo_name: str = 'timmy-home') -> str | None:
normalized = path.as_posix()
name = path.name
if normalized == 'GENOME.md':
return host_repo_name
if path.parts[:1] == ('genomes',) and name == 'GENOME.md' and len(path.parts) == 3:
return path.parts[1]
if path.parts[:1] == ('genomes',) and name.endswith('-GENOME.md'):
return name[:-len('-GENOME.md')]
if path.parent == Path('.') and name.startswith('GENOME-') and name.endswith('.md'):
return name[len('GENOME-'):-len('.md')]
if path.parent == Path('.') and name.endswith('-GENOME.md'):
return name[:-len('-GENOME.md')]
return None
def test_repo_name(path: Path, host_repo_name: str = 'timmy-home') -> str | None:
if path.name == 'test_codebase_genome_pipeline.py':
return host_repo_name
stem = path.stem
if not stem.startswith('test_') or not stem.endswith('_genome'):
return None
middle = stem[len('test_'):-len('_genome')]
return middle.replace('_', '-') if middle else None
def scan_artifacts(repo_root: Path, host_repo_name: str = 'timmy-home') -> dict[str, list[str]]:
    """Walk *repo_root* for genome markdown artifacts, grouped by repo.

    Returns a mapping of repo name -> list of repo-relative POSIX paths
    (in sorted path order).  A repo with more than one entry has duplicate
    artifacts.
    """
    found: dict[str, list[str]] = {}
    for md_path in sorted(repo_root.rglob('*.md')):
        relative = md_path.relative_to(repo_root)
        owner = artifact_repo_name(relative, host_repo_name=host_repo_name)
        if owner is not None:
            found.setdefault(owner, []).append(relative.as_posix())
    return found
def scan_tests(repo_root: Path, host_repo_name: str = 'timmy-home') -> set[str]:
    """Collect the set of repo names that have a genome test under ``tests/``."""
    covered: set[str] = set()
    tests_dir = repo_root / 'tests'
    if not tests_dir.exists():
        return covered
    for test_file in sorted(tests_dir.rglob('test_*.py')):
        owner = test_repo_name(test_file.relative_to(repo_root), host_repo_name=host_repo_name)
        if owner:
            covered.add(owner)
    return covered
def build_status_summary(
    *,
    repo_root: str | Path,
    expected_repos: Iterable[str],
    state: dict | None = None,
    host_repo_name: str = 'timmy-home',
) -> dict:
    """Assemble the JSON-serializable genome-coverage summary.

    Keyword-only arguments:
        repo_root: host repo checkout to scan for artifacts and tests.
        expected_repos: full roster of repo names that should have genomes.
        state: optional nightly-rotation state dict (only ``last_repo`` is read).
        host_repo_name: name the host repo's own artifacts/tests map to.
    """
    root = Path(repo_root)
    roster = list(expected_repos)
    artifact_map = scan_artifacts(root, host_repo_name=host_repo_name)
    tested = scan_tests(root, host_repo_name=host_repo_name)

    coverage: dict = {}
    duplicate_paths: dict = {}
    for repo_name in sorted(artifact_map):
        repo_paths = artifact_map[repo_name]
        coverage[repo_name] = {
            'artifact_paths': repo_paths,
            'has_test': repo_name in tested,
        }
        # More than one artifact path for the same repo means duplication.
        if len(repo_paths) > 1:
            duplicate_paths[repo_name] = repo_paths

    missing = [name for name in roster if name not in artifact_map]
    # Trailing 'Z' keeps the timestamp in the compact UTC form.
    timestamp = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
    return {
        'generated_at': timestamp,
        'total_expected_repos': len(roster),
        'artifact_count': len(artifact_map),
        'tested_artifact_count': sum(1 for name in artifact_map if name in tested),
        'last_repo': (state or {}).get('last_repo'),
        'next_uncovered_repo': missing[0] if missing else None,
        'missing_repos': missing,
        'duplicates': duplicate_paths,
        'artifacts': coverage,
    }
def render_markdown(summary: dict) -> str:
    """Render the dict from build_status_summary as a markdown report."""
    out = [
        '# Codebase Genome Status',
        '',
        f"Generated: {summary['generated_at']}",
        '',
        '## Summary',
        '',
        f"- expected repos: {summary['total_expected_repos']}",
        f"- repos with genome artifacts: {summary['artifact_count']}",
        f"- repos with genome tests: {summary['tested_artifact_count']}",
    ]
    # Optional rotation hints — omitted entirely when absent/falsy.
    last_repo = summary.get('last_repo')
    if last_repo:
        out.append(f"- last repo processed by nightly rotation: {last_repo}")
    next_repo = summary.get('next_uncovered_repo')
    if next_repo:
        out.append(f"- next uncovered repo: {next_repo}")

    out.extend([
        '',
        '## Coverage Matrix',
        '',
        '| Repo | Artifact Paths | Test? |',
        '|------|----------------|-------|',
    ])
    for repo, info in summary['artifacts'].items():
        joined = '<br>'.join(info['artifact_paths'])
        status = 'yes' if info['has_test'] else 'no'
        out.append(f'| `{repo}` | `{joined}` | {status} |')

    out.extend(['', '## Missing Repo Artifacts', ''])
    if summary['missing_repos']:
        out.extend(f'- `{repo}`' for repo in summary['missing_repos'])
    else:
        out.append('- none')

    out.extend(['', '## Duplicate Artifact Paths', ''])
    if summary['duplicates']:
        for repo, paths in summary['duplicates'].items():
            out.append(f'- `{repo}`')
            out.extend(f' - `{path}`' for path in paths)
    else:
        out.append('- none')
    return '\n'.join(out) + '\n'
def load_state(path: str | Path | None) -> dict:
if not path:
return {}
state_path = Path(path).expanduser()
if not state_path.exists():
return {}
return json.loads(state_path.read_text(encoding='utf-8'))
def fetch_org_repo_names(org: str, host: str, token_file: str | Path, *, include_archived: bool = False) -> list[str]:
    """List repository names in a Gitea org via its REST API.

    Reads an API token from *token_file*, pages through
    ``/api/v1/orgs/{org}/repos`` 100 entries at a time, and returns the
    sorted, de-duplicated names.  Archived repos are skipped unless
    *include_archived* is set; dot-prefixed repos are always skipped.
    """
    api_token = Path(token_file).expanduser().read_text(encoding='utf-8').strip()
    request_headers = {'Authorization': f'token {api_token}', 'Accept': 'application/json'}
    base_url = host.rstrip('/')
    names: set[str] = set()
    page = 1
    while True:
        request = urllib.request.Request(
            f'{base_url}/api/v1/orgs/{org}/repos?limit=100&page={page}',
            headers=request_headers,
        )
        with urllib.request.urlopen(request, timeout=30) as response:
            page_repos = json.loads(response.read().decode('utf-8'))
        if not page_repos:
            break
        for entry in page_repos:
            if entry.get('archived') and not include_archived:
                continue
            repo_name = entry['name']
            if not repo_name.startswith('.'):
                names.add(repo_name)
        # A short page means the API has no more results.
        if len(page_repos) < 100:
            break
        page += 1
    return sorted(names)
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: print the genome status report, optionally saving it.

    Returns 0 on success (propagated to the shell by the __main__ guard).
    """
    parser = argparse.ArgumentParser(description='Summarize codebase genome coverage across repos')
    parser.add_argument('--repo-root', default='.', help='timmy-home repo root')
    parser.add_argument('--expected-repos-json', help='JSON array of expected repo names')
    parser.add_argument('--org', default='Timmy_Foundation', help='Gitea org to inspect when expected repos are not provided')
    parser.add_argument('--host', default='https://forge.alexanderwhitestone.com', help='Gitea host')
    parser.add_argument('--token-file', default='~/.config/gitea/token', help='Gitea token file for org scan fallback')
    parser.add_argument('--include-archived', action='store_true', help='Include archived repos in org scan fallback')
    parser.add_argument('--state-path', default='~/.timmy/codebase_genome_state.json')
    parser.add_argument('--output', help='Optional markdown output path')
    args = parser.parse_args(argv)

    # Prefer the explicit roster; otherwise fall back to a live org scan.
    if args.expected_repos_json:
        roster = json.loads(args.expected_repos_json)
    else:
        roster = fetch_org_repo_names(args.org, args.host, args.token_file, include_archived=args.include_archived)

    summary = build_status_summary(
        repo_root=args.repo_root,
        expected_repos=roster,
        state=load_state(args.state_path),
    )
    report = render_markdown(summary)
    print(report)
    if args.output:
        destination = Path(args.output)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(report, encoding='utf-8')
    return 0
# Propagate main()'s integer return value as the process exit code.
if __name__ == '__main__':
    raise SystemExit(main())