#!/usr/bin/env python3
"""Status/reporting helper for the codebase genome pipeline.

This lands a parent-epic slice for timmy-home #665 by making the current genome
coverage across repos inspectable: which repos have artifacts, which have tests,
which have duplicate artifacts, and which uncovered repo comes next.
"""

from __future__ import annotations

import argparse
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable
import urllib.request


def artifact_repo_name(path: Path, host_repo_name: str = 'timmy-home') -> str | None:
    """Map a repo-relative markdown path to the repo whose genome it holds.

    Recognized layouts (``path`` must be relative to the host repo root):

    * ``GENOME.md`` at the root              -> ``host_repo_name``
    * ``genomes/<repo>/GENOME.md``           -> ``<repo>``
    * ``<repo>-GENOME.md`` below ``genomes/`` (any depth) -> ``<repo>``
    * ``GENOME-<repo>.md`` at the root       -> ``<repo>``
    * ``<repo>-GENOME.md`` at the root       -> ``<repo>``

    Args:
        path: Candidate file path, relative to the repo root.
        host_repo_name: Name reported for the root-level ``GENOME.md``.

    Returns:
        The repo name, or ``None`` when the path matches no layout or the
        file name carries an empty repo part (e.g. ``GENOME-.md``).
    """
    if path.as_posix() == 'GENOME.md':
        return host_repo_name

    name = path.name
    parts = path.parts
    in_genomes = parts[:1] == ('genomes',)
    at_root = path.parent == Path('.')

    repo = None
    if in_genomes and name == 'GENOME.md' and len(parts) == 3:
        repo = parts[1]
    elif in_genomes and name.endswith('-GENOME.md'):
        repo = name[:-len('-GENOME.md')]
    elif at_root and name.startswith('GENOME-') and name.endswith('.md'):
        repo = name[len('GENOME-'):-len('.md')]
    elif at_root and name.endswith('-GENOME.md'):
        repo = name[:-len('-GENOME.md')]

    # An empty name (from e.g. ``GENOME-.md``) is not a repo; treat it like a
    # non-match, mirroring how test_repo_name() rejects an empty middle part.
    return repo or None


def test_repo_name(path: Path, host_repo_name: str = 'timmy-home') -> str | None:
    """Return the repo whose genome a test file covers, if any.

    ``test_codebase_genome_pipeline.py`` counts as the host repo's own test.
    Any other file whose stem looks like ``test_<repo>_genome`` maps to
    ``<repo>`` with underscores turned into hyphens
    (``test_the_nexus_genome.py`` -> ``the-nexus``).

    Args:
        path: Path of the candidate test file; only its file name is used.
        host_repo_name: Name reported for the pipeline's own test file.

    Returns:
        The repo name, or ``None`` when the file is not a genome test or the
        ``<repo>`` part of the stem is empty.
    """
    if path.name == 'test_codebase_genome_pipeline.py':
        return host_repo_name
    stem = path.stem
    if not (stem.startswith('test_') and stem.endswith('_genome')):
        return None
    middle = stem[len('test_'):-len('_genome')]
    return middle.replace('_', '-') if middle else None


# The ``test_`` prefix is part of this helper's public name, so pytest would
# collect it as a test (and fail on the unknown ``path`` fixture) whenever it
# is imported into a test module. Opt out of collection explicitly.
test_repo_name.__test__ = False


def scan_artifacts(repo_root: Path, host_repo_name: str = 'timmy-home') -> dict[str, list[str]]:
    """Collect genome artifact paths under ``repo_root``, grouped by repo.

    Every ``*.md`` file below ``repo_root`` is classified with
    ``artifact_repo_name``; files it does not recognize are skipped.

    Args:
        repo_root: Directory to search recursively.
        host_repo_name: Forwarded to ``artifact_repo_name``.

    Returns:
        Mapping of repo name to the POSIX-style, root-relative paths of its
        artifacts, in sorted path order.
    """
    found: dict[str, list[str]] = {}
    relative_paths = (
        markdown.relative_to(repo_root)
        for markdown in sorted(repo_root.rglob('*.md'))
    )
    for relative in relative_paths:
        owner = artifact_repo_name(relative, host_repo_name=host_repo_name)
        if owner is not None:
            if owner not in found:
                found[owner] = []
            found[owner].append(relative.as_posix())
    return found


def scan_tests(repo_root: Path, host_repo_name: str = 'timmy-home') -> set[str]:
    """Return the repos that have a genome test under ``repo_root/tests``.

    Each ``test_*.py`` file below the ``tests`` directory is classified with
    ``test_repo_name``; files that map to no repo are ignored.

    Args:
        repo_root: Root of the checkout containing the ``tests`` directory.
        host_repo_name: Forwarded to ``test_repo_name``.

    Returns:
        Set of repo names with at least one genome test; empty when the
        ``tests`` directory does not exist.
    """
    tests_dir = repo_root / 'tests'
    if not tests_dir.exists():
        return set()
    candidates = (
        test_repo_name(test_file.relative_to(repo_root), host_repo_name=host_repo_name)
        for test_file in sorted(tests_dir.rglob('test_*.py'))
    )
    return {candidate for candidate in candidates if candidate}


def build_status_summary(
    *,
    repo_root: str | Path,
    expected_repos: Iterable[str],
    state: dict | None = None,
    host_repo_name: str = 'timmy-home',
) -> dict:
    """Assemble the genome coverage report for one checkout.

    Args:
        repo_root: Directory scanned for genome artifacts and tests.
        expected_repos: Repo names that are supposed to have an artifact;
            their order decides which missing repo is reported as next.
        state: Optional pipeline state; only its ``last_repo`` key is read.
        host_repo_name: Name used for the checkout's own artifact and test.

    Returns:
        A dict with the generation timestamp (UTC, ``Z`` suffix), counts,
        ``last_repo``, ``next_uncovered_repo``, ``missing_repos``,
        ``duplicates`` (repos with more than one artifact path) and
        ``artifacts`` (per-repo paths and ``has_test`` flag, sorted by repo).
    """
    base = Path(repo_root)
    wanted = list(expected_repos)
    paths_by_repo = scan_artifacts(base, host_repo_name=host_repo_name)
    repos_with_tests = scan_tests(base, host_repo_name=host_repo_name)

    ordered_repos = sorted(paths_by_repo)
    coverage = {
        name: {
            'artifact_paths': paths_by_repo[name],
            'has_test': name in repos_with_tests,
        }
        for name in ordered_repos
    }
    duplicates = {
        name: paths_by_repo[name]
        for name in ordered_repos
        if len(paths_by_repo[name]) > 1
    }

    # Keep the caller's ordering so the first missing entry is the "next" one.
    missing = [name for name in wanted if name not in paths_by_repo]
    last_repo = state.get('last_repo') if state else None
    timestamp = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')

    return {
        'generated_at': timestamp,
        'total_expected_repos': len(wanted),
        'artifact_count': len(paths_by_repo),
        'tested_artifact_count': sum(1 for entry in coverage.values() if entry['has_test']),
        'last_repo': last_repo,
        'next_uncovered_repo': next(iter(missing), None),
        'missing_repos': missing,
        'duplicates': duplicates,
        'artifacts': coverage,
    }


def render_markdown(summary: dict) -> str:
    """Render a coverage summary as a Markdown report.

    Args:
        summary: Dict with the keys ``generated_at``, ``total_expected_repos``,
            ``artifact_count``, ``tested_artifact_count``, ``artifacts``,
            ``missing_repos`` and ``duplicates``; ``last_repo`` and
            ``next_uncovered_repo`` are optional and skipped when falsy.

    Returns:
        The report text, terminated by a single newline.
    """
    lines = [
        '# Codebase Genome Status',
        '',
        f"Generated: {summary['generated_at']}",
        '',
        '## Summary',
        '',
        f"- expected repos: {summary['total_expected_repos']}",
        f"- repos with genome artifacts: {summary['artifact_count']}",
        f"- repos with genome tests: {summary['tested_artifact_count']}",
    ]
    if summary.get('last_repo'):
        lines.append(f"- last repo processed by nightly rotation: {summary['last_repo']}")
    if summary.get('next_uncovered_repo'):
        lines.append(f"- next uncovered repo: {summary['next_uncovered_repo']}")

    lines += [
        '',
        '## Coverage Matrix',
        '',
        '| Repo | Artifact Paths | Test? |',
        '|------|----------------|-------|',
    ]
    for repo, data in summary['artifacts'].items():
        # One code span per path: a `<br>` placed inside a single code span is
        # shown literally instead of breaking the line within the table cell.
        artifact_paths = '<br>'.join(f'`{path}`' for path in data['artifact_paths'])
        has_test = 'yes' if data['has_test'] else 'no'
        lines.append(f'| `{repo}` | {artifact_paths} | {has_test} |')

    lines += ['', '## Missing Repo Artifacts', '']
    if summary['missing_repos']:
        lines.extend(f'- `{repo}`' for repo in summary['missing_repos'])
    else:
        lines.append('- none')

    lines += ['', '## Duplicate Artifact Paths', '']
    if summary['duplicates']:
        for repo, paths in summary['duplicates'].items():
            lines.append(f'- `{repo}`')
            lines.extend(f'  - `{path}`' for path in paths)
    else:
        lines.append('- none')

    return '\n'.join(lines) + '\n'


def load_state(path: str | Path | None) -> dict:
    """Read the pipeline state JSON, tolerating a missing file.

    Args:
        path: Location of the state file (``~`` is expanded). ``None`` or an
            empty string means "no state file".

    Returns:
        The decoded JSON document, or an empty dict when no path was given or
        the file does not exist.
    """
    candidate = Path(path).expanduser() if path else None
    if candidate is None or not candidate.exists():
        return {}
    raw = candidate.read_text(encoding='utf-8')
    return json.loads(raw)


def fetch_org_repo_names(org: str, host: str, token_file: str | Path, *, include_archived: bool = False) -> list[str]:
    """List the repository names of an organization via the Gitea REST API.

    Pages through ``GET {host}/api/v1/orgs/{org}/repos`` until the server
    returns an empty page.

    Args:
        org: Organization name as it appears in the API URL.
        host: Base URL of the server; a trailing slash is ignored.
        token_file: File holding the API token (``~`` is expanded, surrounding
            whitespace is stripped).
        include_archived: Keep repos whose ``archived`` field is truthy.

    Returns:
        Sorted, de-duplicated repo names. Repos whose name starts with ``.``
        are always skipped.

    Raises:
        OSError: If the token file cannot be read or a request fails
            (``urllib.error.URLError`` is an ``OSError`` subclass).
    """
    token = Path(token_file).expanduser().read_text(encoding='utf-8').strip()
    headers = {'Authorization': f'token {token}', 'Accept': 'application/json'}
    base_url = f"{host.rstrip('/')}/api/v1/orgs/{org}/repos"
    names: set[str] = set()
    page = 1
    while True:
        req = urllib.request.Request(f'{base_url}?limit=100&page={page}', headers=headers)
        with urllib.request.urlopen(req, timeout=30) as resp:
            batch = json.loads(resp.read().decode('utf-8'))
        # An empty page is the only reliable end marker: the server may cap the
        # page size below the requested ``limit`` (Gitea's MAX_RESPONSE_ITEMS),
        # so a page shorter than 100 entries is not necessarily the last one.
        if not batch:
            break
        for repo in batch:
            if repo.get('archived') and not include_archived:
                continue
            name = repo['name']
            if name.startswith('.'):
                continue
            names.add(name)
        page += 1
    return sorted(names)


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: print the genome coverage report and optionally save it.

    The expected repo list comes from ``--expected-repos-json`` when given,
    otherwise from an org scan via ``fetch_org_repo_names``.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.

    Returns:
        ``0`` on success. Invalid arguments end the process through
        ``parser.error`` (``SystemExit`` with status 2).
    """
    parser = argparse.ArgumentParser(description='Summarize codebase genome coverage across repos')
    parser.add_argument('--repo-root', default='.', help='timmy-home repo root')
    parser.add_argument('--expected-repos-json', help='JSON array of expected repo names')
    parser.add_argument('--org', default='Timmy_Foundation', help='Gitea org to inspect when expected repos are not provided')
    parser.add_argument('--host', default='https://forge.alexanderwhitestone.com', help='Gitea host')
    parser.add_argument('--token-file', default='~/.config/gitea/token', help='Gitea token file for org scan fallback')
    parser.add_argument('--include-archived', action='store_true', help='Include archived repos in org scan fallback')
    parser.add_argument('--state-path', default='~/.timmy/codebase_genome_state.json')
    parser.add_argument('--output', help='Optional markdown output path')
    args = parser.parse_args(argv)

    if args.expected_repos_json:
        expected = json.loads(args.expected_repos_json)
        # A JSON string or object would otherwise be iterated character by
        # character / key by key and reported as bogus "repos".
        if not isinstance(expected, list) or not all(isinstance(name, str) for name in expected):
            parser.error('--expected-repos-json must be a JSON array of repo names')
    else:
        expected = fetch_org_repo_names(args.org, args.host, args.token_file, include_archived=args.include_archived)

    # Expand ``~`` here as well, matching how the state and token paths are handled.
    summary = build_status_summary(
        repo_root=Path(args.repo_root).expanduser(),
        expected_repos=expected,
        state=load_state(args.state_path),
    )
    rendered = render_markdown(summary)
    print(rendered)
    if args.output:
        out = Path(args.output).expanduser()
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(rendered, encoding='utf-8')
    return 0


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status via SystemExit.
if __name__ == '__main__':
    raise SystemExit(main())