feat: add sovereign hashtag metrics for archive media (#48)
This commit was merged in pull request #48.
Files changed:
- scripts/twitter_archive/compute_hashtag_metrics.py (new file, 157 lines added)
@@ -0,0 +1,157 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compute sovereign hashtag metrics for the Twitter archive media lane."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from collections import Counter, defaultdict
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .common import ARCHIVE_DIR, EXTRACTED_DIR, load_jsonl, write_json
|
||||
|
||||
# Output locations for the hashtag-metric artifacts inside the archive's media lane.
MEDIA_DIR = ARCHIVE_DIR / "media"
MEDIA_MANIFEST_JSONL = MEDIA_DIR / "manifest.jsonl"  # one row per video/gif with tweet text + hashtags
HASHTAG_METRICS_JSON = MEDIA_DIR / "hashtag_metrics.json"  # machine-readable metrics payload
HASHTAG_METRICS_MD = MEDIA_DIR / "hashtag_metrics.md"  # human-readable local report
# Hashtags tracked by this script; matched case-insensitively via normalize_tag().
TARGET_TAGS = ["timmyTime", "TimmyChain"]
# Twitter archive "created_at" format, e.g. "Sat Mar 01 12:00:00 +0000 2025".
DATE_FMT = "%a %b %d %H:%M:%S %z %Y"
|
||||
|
||||
|
||||
def normalize_tag(tag: str) -> str:
    """Canonicalize a hashtag for comparison: trim surrounding whitespace, lowercase."""
    return tag.lower().strip()
|
||||
|
||||
|
||||
def month_bucket(created_at: str | None) -> str | None:
|
||||
if not created_at:
|
||||
return None
|
||||
return datetime.strptime(created_at, DATE_FMT).strftime("%Y-%m")
|
||||
|
||||
|
||||
def _top_cooccurring(tweets: list[dict[str, Any]], normalized_target: str, excluded: set[str]) -> list[dict[str, Any]]:
    """Rank the ten hashtags that most often co-occur with *normalized_target*.

    Tags listed in *excluded* (the other target tags) are never reported, so
    the list surfaces non-target companions only. Duplicate tags within one
    tweet are counted once per occurrence, matching the raw hashtag list.
    """
    tally: Counter[str] = Counter()
    for tweet in tweets:
        tags = [normalize_tag(raw) for raw in tweet.get("hashtags", [])]
        if normalized_target in tags:
            tally.update(tag for tag in tags if tag != normalized_target and tag not in excluded)
    return [{"tag": tag, "count": count} for tag, count in tally.most_common(10)]
|
||||
|
||||
|
||||
def _summarize_target(tweets: list[dict[str, Any]], media_records: list[dict[str, Any]], target: str, all_targets: set[str]) -> dict[str, Any]:
    """Build the per-target metrics payload for one hashtag.

    Counts distinct tweet ids (all posts and media-bearing posts), per-month
    activity buckets, and the top co-occurring hashtags. *all_targets* holds
    every normalized target tag so targets are excluded from co-occurrence.
    """
    target_norm = normalize_tag(target)
    # The tweet scan and the media-manifest scan are the same aggregation;
    # share one helper instead of duplicating the loop.
    tweet_ids, monthly_post_counts = _collect_tagged(tweets, target_norm)
    media_tweet_ids, monthly_media_counts = _collect_tagged(media_records, target_norm)

    return {
        "tag": target_norm,
        "total_posts": len(tweet_ids),
        "media_posts": len(media_tweet_ids),
        "monthly_post_counts": dict(sorted(monthly_post_counts.items())),
        "monthly_media_counts": dict(sorted(monthly_media_counts.items())),
        "top_cooccurring_hashtags": _top_cooccurring(tweets, target_norm, all_targets),
    }


def _collect_tagged(records: list[dict[str, Any]], target_norm: str) -> tuple[set[str], Counter]:
    """Gather distinct tweet ids and per-month record counts for records tagged *target_norm*.

    Monthly counts increment once per matching record (not per distinct id),
    and records without a parsable created_at contribute to ids only.
    """
    ids: set[str] = set()
    monthly: Counter = Counter()
    for record in records:
        tags = [normalize_tag(raw) for raw in record.get("hashtags", [])]
        if target_norm not in tags:
            continue
        ids.add(str(record["tweet_id"]))
        bucket = month_bucket(record.get("created_at"))
        if bucket:
            monthly[bucket] += 1
    return ids, monthly
|
||||
|
||||
|
||||
def build_hashtag_metrics(
    tweets: list[dict[str, Any]],
    media_records: list[dict[str, Any]],
    target_tags: list[str],
) -> dict[str, Any]:
    """Assemble the full, JSON-serializable metrics payload for *target_tags*.

    Includes a per-target summary plus the number of tweet/media posts that
    carry every target tag at once ("overlap").
    """
    normalized_targets = {normalize_tag(tag) for tag in target_tags}
    targets = {
        normalize_tag(tag): _summarize_target(tweets, media_records, tag, normalized_targets)
        for tag in target_tags
    }
    return {
        "schema_version": 1,
        "target_tags": [normalize_tag(tag) for tag in target_tags],
        "targets": targets,
        "overlap": {
            "tweet_posts": len(_overlap_ids(tweets, target_tags)),
            "media_posts": len(_overlap_ids(media_records, target_tags)),
        },
    }


def _overlap_ids(records: list[dict[str, Any]], target_tags: list[str]) -> set[str]:
    """Ids of records tagged with every target; empty when there are no targets.

    Ids are collected per tag and then intersected, so an id qualifies even
    when different records sharing that id carry different target tags.
    """
    if not target_tags:
        return set()
    id_sets = []
    for tag in target_tags:
        tag_norm = normalize_tag(tag)
        id_sets.append(
            {
                str(record["tweet_id"])
                for record in records
                if tag_norm in (normalize_tag(raw) for raw in record.get("hashtags", []))
            }
        )
    return set.intersection(*id_sets)
|
||||
|
||||
|
||||
def build_markdown_report(metrics: dict[str, Any]) -> str:
    """Render the metrics payload as the human-readable markdown report.

    Expects exactly two entries in metrics["target_tags"] (the sovereign
    hashtag pair) and returns the full document text, newline-terminated.
    """
    a, b = metrics["target_tags"]
    ta = metrics["targets"][a]
    tb = metrics["targets"][b]

    def bullets(summary: dict[str, Any]) -> str:
        # Top-10 co-occurring tags as markdown bullets; placeholder when empty.
        items = summary["top_cooccurring_hashtags"][:10]
        rendered = "\n".join(f"- #{item['tag']} ({item['count']})" for item in items)
        return rendered or "- none"

    lines = [
        "# Sovereign Hashtag Metrics",
        "",
        "Targets:",
        f"- #{a}",
        f"- #{b}",
        "",
        "## Totals",
        "",
        "| Tag | Total posts | Media posts |",
        "|---|---:|---:|",
        f"| #{a} | {ta['total_posts']} | {ta['media_posts']} |",
        f"| #{b} | {tb['total_posts']} | {tb['media_posts']} |",
        "",
        "## Overlap",
        "",
        f"- Tweet posts containing both: {metrics['overlap']['tweet_posts']}",
        f"- Media posts containing both: {metrics['overlap']['media_posts']}",
        "",
        "## Co-occurring hashtags",
        "",
        f"### #{a}",
        bullets(ta),
        "",
        f"### #{b}",
        bullets(tb),
        "",
        "## Notes",
        "",
        "These metrics are computed locally from downloaded archive data.",
        "No archive media was uploaded to cloud services for analysis.",
        "",
        "Artifacts:",
        "- `twitter-archive/media/hashtag_metrics.json`",
        "- `twitter-archive/media/hashtag_metrics.md`",
    ]
    return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
def write_text(path: Path, payload: str) -> None:
    """Write *payload* to *path* with exactly one trailing newline.

    Parent directories are created as needed; an all-whitespace payload
    produces an empty file.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    trimmed = payload.rstrip()
    content = f"{trimmed}\n" if trimmed else ""
    path.write_text(content)
|
||||
|
||||
|
||||
def main() -> None:
    """Compute sovereign hashtag metrics from the local archive and write artifacts.

    Reads extracted tweets and the media manifest, builds the metrics payload
    for TARGET_TAGS, writes the JSON and markdown artifacts under MEDIA_DIR,
    and prints the metrics to stdout as a single sorted-key JSON document.
    """
    MEDIA_DIR.mkdir(parents=True, exist_ok=True)
    tweets = load_jsonl(EXTRACTED_DIR / "tweets.jsonl")
    media_records = load_jsonl(MEDIA_MANIFEST_JSONL)
    metrics = build_hashtag_metrics(tweets, media_records, TARGET_TAGS)
    write_json(HASHTAG_METRICS_JSON, metrics)
    write_text(HASHTAG_METRICS_MD, build_markdown_report(metrics))
    # Echo the payload on stdout so a caller can capture it without re-reading the file.
    print(json.dumps(metrics, sort_keys=True))
|
||||
|
||||
|
||||
# Script entry point. NOTE(review): the relative import of .common means this
# is expected to run as a module (python -m ...); confirm the shebang path.
if __name__ == "__main__":
    main()
|
||||
Files changed:
- tests/test_twitter_hashtag_metrics.py (new file, 50 lines added)
@@ -0,0 +1,50 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from scripts.twitter_archive.compute_hashtag_metrics import build_hashtag_metrics
|
||||
|
||||
|
||||
def test_build_hashtag_metrics_counts_posts_media_overlap_and_cooccurrence() -> None:
    """Exercise totals, media counts, overlap, monthly buckets, and co-occurrence."""
    march_1 = "Sat Mar 01 12:00:00 +0000 2025"
    march_15 = "Sat Mar 15 12:00:00 +0000 2025"
    april_5 = "Sat Apr 05 12:00:00 +0000 2025"
    tweets = [
        {"tweet_id": "1", "created_at": march_1, "hashtags": ["timmyTime"]},
        {"tweet_id": "2", "created_at": march_15, "hashtags": ["TimmyChain", "timmyTime", "Bitcoin"]},
        {"tweet_id": "3", "created_at": april_5, "hashtags": ["TimmyChain"]},
    ]
    media_records = [
        {"tweet_id": "1", "hashtags": ["timmyTime"], "media_type": "video", "created_at": march_1},
        {"tweet_id": "2", "hashtags": ["TimmyChain", "timmyTime", "Bitcoin"], "media_type": "animated_gif", "created_at": march_15},
    ]

    metrics = build_hashtag_metrics(tweets, media_records, ["timmyTime", "TimmyChain"])

    timmytime = metrics["targets"]["timmytime"]
    timmychain = metrics["targets"]["timmychain"]
    assert (timmytime["total_posts"], timmychain["total_posts"]) == (2, 2)
    assert (timmytime["media_posts"], timmychain["media_posts"]) == (2, 1)
    assert metrics["overlap"]["tweet_posts"] == 1
    assert metrics["overlap"]["media_posts"] == 1
    assert timmytime["monthly_post_counts"]["2025-03"] == 2
    assert timmychain["monthly_post_counts"]["2025-04"] == 1
    assert timmychain["top_cooccurring_hashtags"][0]["tag"] == "bitcoin"
|
||||
|
||||
|
||||
def test_project_doc_mentions_hashtag_metrics_outputs() -> None:
    """The project doc must reference both artifacts and both target hashtags."""
    project = Path("twitter-archive/PROJECT.md").read_text()

    expected_mentions = [
        "hashtag_metrics.json",
        "hashtag_metrics.md",
        "#timmyTime",
        "#TimmyChain",
    ]
    for needle in expected_mentions:
        assert needle in project
|
||||
@@ -121,6 +121,8 @@ Late notes are terse deltas. THAT'S THE PROOF OF GROWTH.
|
||||
media/ # Local-first media understanding
|
||||
manifest.jsonl # one row per video/gif with tweet text + hashtags preserved
|
||||
manifest_summary.json # rollup counts and hashtag families
|
||||
hashtag_metrics.json # machine-readable metrics for #timmyTime / #TimmyChain
|
||||
hashtag_metrics.md # human-readable local report
|
||||
keyframes/ # future extracted frames
|
||||
audio/ # future demuxed audio
|
||||
style_cards/ # future per-video aesthetic summaries
|
||||
|
||||
Reference in New Issue
Block a user