From 4ca5e8e1003907bab282d582c5c2b51f095a7ff8 Mon Sep 17 00:00:00 2001
From: Timmy Time
Date: Sat, 28 Mar 2026 20:16:31 +0000
Subject: [PATCH] feat: add sovereign hashtag metrics for archive media (#48)

---
 .../compute_hashtag_metrics.py        | 207 ++++++++++++++++++
 tests/test_twitter_hashtag_metrics.py |  65 ++++++
 twitter-archive/PROJECT.md            |   2 +
 3 files changed, 274 insertions(+)
 create mode 100644 scripts/twitter_archive/compute_hashtag_metrics.py
 create mode 100644 tests/test_twitter_hashtag_metrics.py

diff --git a/scripts/twitter_archive/compute_hashtag_metrics.py b/scripts/twitter_archive/compute_hashtag_metrics.py
new file mode 100644
index 0000000..2dcedee
--- /dev/null
+++ b/scripts/twitter_archive/compute_hashtag_metrics.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python3
+"""Compute sovereign hashtag metrics for the Twitter archive media lane."""
+
+from __future__ import annotations
+
+import json
+from collections import Counter, defaultdict
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+from .common import ARCHIVE_DIR, EXTRACTED_DIR, load_jsonl, write_json
+
+MEDIA_DIR = ARCHIVE_DIR / "media"
+MEDIA_MANIFEST_JSONL = MEDIA_DIR / "manifest.jsonl"
+HASHTAG_METRICS_JSON = MEDIA_DIR / "hashtag_metrics.json"
+HASHTAG_METRICS_MD = MEDIA_DIR / "hashtag_metrics.md"
+TARGET_TAGS = ["timmyTime", "TimmyChain"]
+DATE_FMT = "%a %b %d %H:%M:%S %z %Y"
+
+
+def normalize_tag(tag: str) -> str:
+    """Return *tag* stripped and lower-cased so tag comparisons ignore case."""
+    return tag.strip().lower()
+
+
+def month_bucket(created_at: str | None) -> str | None:
+    """Return the ``YYYY-MM`` bucket of a ``DATE_FMT`` timestamp.
+
+    Returns ``None`` for a missing or empty timestamp. Raises ``ValueError``
+    (from ``datetime.strptime``) when the value does not match ``DATE_FMT``.
+    """
+    if not created_at:
+        return None
+    return datetime.strptime(created_at, DATE_FMT).strftime("%Y-%m")
+
+
+def _record_tags(record: dict[str, Any]) -> set[str]:
+    """Return the normalized, de-duplicated hashtags of one tweet/media row."""
+    # ``or []`` also covers rows that carry an explicit ``"hashtags": null``.
+    return {normalize_tag(tag) for tag in record.get("hashtags") or []}
+
+
+def _top_cooccurring(
+    tweets: list[dict[str, Any]],
+    normalized_target: str,
+    excluded: set[str],
+) -> list[dict[str, Any]]:
+    """Return up to ten hashtags that most often accompany the target tag.
+
+    A tweet counts once per co-occurring tag even if it repeats the tag, and
+    tags listed in *excluded* (the target tags) never appear in the ranking.
+    """
+    counts: Counter[str] = Counter()
+    for tweet in tweets:
+        tags = _record_tags(tweet)
+        if normalized_target in tags:
+            # Sorted so equally frequent tags are ranked deterministically.
+            counts.update(sorted(tags - excluded - {normalized_target}))
+    return [{"tag": tag, "count": count} for tag, count in counts.most_common(10)]
+
+
+def _summarize_target(
+    tweets: list[dict[str, Any]],
+    media_records: list[dict[str, Any]],
+    target: str,
+    all_targets: set[str],
+) -> dict[str, Any]:
+    """Summarize one target tag across tweets and media manifest rows.
+
+    ``total_posts`` / ``media_posts`` count distinct tweet ids, whereas the
+    monthly counters tally every matching row that has a ``created_at``.
+    """
+    target_norm = normalize_tag(target)
+    tweet_ids: set[str] = set()
+    media_tweet_ids: set[str] = set()
+    monthly_post_counts: Counter[str] = Counter()
+    monthly_media_counts: Counter[str] = Counter()
+
+    for rows, ids, monthly in (
+        (tweets, tweet_ids, monthly_post_counts),
+        (media_records, media_tweet_ids, monthly_media_counts),
+    ):
+        for row in rows:
+            if target_norm not in _record_tags(row):
+                continue
+            ids.add(str(row["tweet_id"]))
+            bucket = month_bucket(row.get("created_at"))
+            if bucket:
+                monthly[bucket] += 1
+
+    return {
+        "tag": target_norm,
+        "total_posts": len(tweet_ids),
+        "media_posts": len(media_tweet_ids),
+        "monthly_post_counts": dict(sorted(monthly_post_counts.items())),
+        "monthly_media_counts": dict(sorted(monthly_media_counts.items())),
+        "top_cooccurring_hashtags": _top_cooccurring(tweets, target_norm, all_targets),
+    }
+
+
+def _overlap_ids(records: list[dict[str, Any]], targets: set[str]) -> set[str]:
+    """Return the tweet ids that are tagged with every tag in *targets*."""
+    if not targets:
+        return set()
+    ids_by_tag: defaultdict[str, set[str]] = defaultdict(set)
+    for record in records:
+        for tag in _record_tags(record) & targets:
+            ids_by_tag[tag].add(str(record["tweet_id"]))
+    return set.intersection(*(ids_by_tag[tag] for tag in targets))
+
+
+def build_hashtag_metrics(
+    tweets: list[dict[str, Any]],
+    media_records: list[dict[str, Any]],
+    target_tags: list[str],
+) -> dict[str, Any]:
+    """Build the metrics payload for *target_tags*.
+
+    Target tags are normalized and de-duplicated (first occurrence wins), so
+    ``["timmyTime", "TIMMYTIME"]`` yields a single ``timmytime`` entry.
+    """
+    ordered_targets = list(dict.fromkeys(normalize_tag(tag) for tag in target_tags))
+    target_set = set(ordered_targets)
+    return {
+        "schema_version": 1,
+        "target_tags": ordered_targets,
+        "targets": {
+            tag: _summarize_target(tweets, media_records, tag, target_set)
+            for tag in ordered_targets
+        },
+        "overlap": {
+            "tweet_posts": len(_overlap_ids(tweets, target_set)),
+            "media_posts": len(_overlap_ids(media_records, target_set)),
+        },
+    }
+
+
+def build_markdown_report(metrics: dict[str, Any]) -> str:
+    """Render the payload of ``build_hashtag_metrics`` as a Markdown report.
+
+    Handles any number of target tags; the two-tag layout is unchanged.
+    """
+    tags = metrics["target_tags"]
+    targets = metrics["targets"]
+    overlap = metrics["overlap"]
+    scope = "both" if len(tags) == 2 else "all targets"
+
+    lines = ["# Sovereign Hashtag Metrics", "", "Targets:"]
+    lines += [f"- #{tag}" for tag in tags]
+    lines += ["", "## Totals", "", "| Tag | Total posts | Media posts |", "|---|---:|---:|"]
+    lines += [
+        f"| #{tag} | {targets[tag]['total_posts']} | {targets[tag]['media_posts']} |"
+        for tag in tags
+    ]
+    lines += [
+        "",
+        "## Overlap",
+        "",
+        f"- Tweet posts containing {scope}: {overlap['tweet_posts']}",
+        f"- Media posts containing {scope}: {overlap['media_posts']}",
+        "",
+        "## Co-occurring hashtags",
+    ]
+    for tag in tags:
+        related = targets[tag]["top_cooccurring_hashtags"][:10]
+        lines += ["", f"### #{tag}"]
+        lines += [f"- #{item['tag']} ({item['count']})" for item in related] or ["- none"]
+    lines += [
+        "",
+        "## Notes",
+        "",
+        "These metrics are computed locally from downloaded archive data.",
+        "No archive media was uploaded to cloud services for analysis.",
+        "",
+        "Artifacts:",
+        "- `twitter-archive/media/hashtag_metrics.json`",
+        "- `twitter-archive/media/hashtag_metrics.md`",
+        "",
+    ]
+    return "\n".join(lines)
+
+
+def write_text(path: Path, payload: str) -> None:
+    """Write *payload* to *path* as UTF-8, ending with exactly one newline.
+
+    Parent directories are created as needed; a payload that is empty after
+    stripping trailing whitespace produces an empty file.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    text = payload.rstrip()
+    path.write_text(f"{text}\n" if text else "", encoding="utf-8")
+
+
+def main() -> None:
+    """Compute the metrics from the local archive and write both artifacts."""
+    MEDIA_DIR.mkdir(parents=True, exist_ok=True)
+    tweets = load_jsonl(EXTRACTED_DIR / "tweets.jsonl")
+    media_records = load_jsonl(MEDIA_MANIFEST_JSONL)
+    metrics = build_hashtag_metrics(tweets, media_records, TARGET_TAGS)
+    write_json(HASHTAG_METRICS_JSON, metrics)
+    write_text(HASHTAG_METRICS_MD, build_markdown_report(metrics))
+    print(json.dumps(metrics, sort_keys=True))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_twitter_hashtag_metrics.py b/tests/test_twitter_hashtag_metrics.py
new file mode 100644
index 0000000..f570794
--- /dev/null
+++ b/tests/test_twitter_hashtag_metrics.py
@@ -0,0 +1,65 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from scripts.twitter_archive.compute_hashtag_metrics import (
+    build_hashtag_metrics,
+    build_markdown_report,
+)
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+
+
+def test_build_hashtag_metrics_counts_posts_media_overlap_and_cooccurrence() -> None:
+    tweets = [
+        {
+            "tweet_id": "1",
+            "created_at": "Sat Mar 01 12:00:00 +0000 2025",
+            "hashtags": ["timmyTime"],
+        },
+        {
+            "tweet_id": "2",
+            "created_at": "Sat Mar 15 12:00:00 +0000 2025",
+            "hashtags": ["TimmyChain", "timmyTime", "Bitcoin"],
+        },
+        {
+            "tweet_id": "3",
+            "created_at": "Sat Apr 05 12:00:00 +0000 2025",
+            "hashtags": ["TimmyChain"],
+        },
+    ]
+    media_records = [
+        {"tweet_id": "1", "hashtags": ["timmyTime"], "media_type": "video", "created_at": "Sat Mar 01 12:00:00 +0000 2025"},
+        {"tweet_id": "2", "hashtags": ["TimmyChain", "timmyTime", "Bitcoin"], "media_type": "animated_gif", "created_at": "Sat Mar 15 12:00:00 +0000 2025"},
+    ]
+
+    metrics = build_hashtag_metrics(tweets, media_records, ["timmyTime", "TimmyChain"])
+
+    assert metrics["targets"]["timmytime"]["total_posts"] == 2
+    assert metrics["targets"]["timmychain"]["total_posts"] == 2
+    assert metrics["targets"]["timmytime"]["media_posts"] == 2
+    assert metrics["targets"]["timmychain"]["media_posts"] == 1
+    assert metrics["overlap"]["tweet_posts"] == 1
+    assert metrics["overlap"]["media_posts"] == 1
+    assert metrics["targets"]["timmytime"]["monthly_post_counts"]["2025-03"] == 2
+    assert metrics["targets"]["timmychain"]["monthly_post_counts"]["2025-04"] == 1
+    assert metrics["targets"]["timmychain"]["top_cooccurring_hashtags"][0]["tag"] == "bitcoin"
+
+
+def test_build_markdown_report_handles_any_number_of_targets() -> None:
+    tweets = [{"tweet_id": "1", "hashtags": ["timmyTime", "Bitcoin", "bitcoin"]}]
+
+    report = build_markdown_report(build_hashtag_metrics(tweets, [], ["timmyTime"]))
+
+    assert "| #timmytime | 1 | 0 |" in report
+    assert "- #bitcoin (1)" in report
+    assert report.endswith("hashtag_metrics.md`\n")
+
+
+def test_project_doc_mentions_hashtag_metrics_outputs() -> None:
+    project = (REPO_ROOT / "twitter-archive" / "PROJECT.md").read_text(encoding="utf-8")
+
+    assert "hashtag_metrics.json" in project
+    assert "hashtag_metrics.md" in project
+    assert "#timmyTime" in project
+    assert "#TimmyChain" in project
diff --git a/twitter-archive/PROJECT.md b/twitter-archive/PROJECT.md
index 6b74b33..84c025c 100644
--- a/twitter-archive/PROJECT.md
+++ b/twitter-archive/PROJECT.md
@@ -121,6 +121,8 @@ Late notes are terse deltas. THAT'S THE PROOF OF GROWTH.
   media/  # Local-first media understanding
     manifest.jsonl  # one row per video/gif with tweet text + hashtags preserved
     manifest_summary.json  # rollup counts and hashtag families
+    hashtag_metrics.json  # machine-readable metrics for #timmyTime / #TimmyChain
+    hashtag_metrics.md  # human-readable local report
     keyframes/  # future extracted frames
     audio/  # future demuxed audio
     style_cards/  # future per-video aesthetic summaries