feat: add sovereign hashtag metrics for archive media (#48)

This commit was merged in pull request #48.
This commit is contained in:
2026-03-28 20:16:31 +00:00
parent 25c1b32bfb
commit 4ca5e8e100
3 changed files with 209 additions and 0 deletions

View File

@@ -0,0 +1,157 @@
#!/usr/bin/env python3
"""Compute sovereign hashtag metrics for the Twitter archive media lane."""
from __future__ import annotations
import json
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path
from typing import Any
from .common import ARCHIVE_DIR, EXTRACTED_DIR, load_jsonl, write_json
MEDIA_DIR = ARCHIVE_DIR / "media"
MEDIA_MANIFEST_JSONL = MEDIA_DIR / "manifest.jsonl"
HASHTAG_METRICS_JSON = MEDIA_DIR / "hashtag_metrics.json"
HASHTAG_METRICS_MD = MEDIA_DIR / "hashtag_metrics.md"
TARGET_TAGS = ["timmyTime", "TimmyChain"]
DATE_FMT = "%a %b %d %H:%M:%S %z %Y"
def normalize_tag(tag: str) -> str:
    """Canonicalize a hashtag for comparison: trim surrounding whitespace, lowercase."""
    trimmed = tag.strip()
    return trimmed.lower()
def month_bucket(created_at: str | None) -> str | None:
    """Collapse a Twitter-style timestamp into its "YYYY-MM" month bucket.

    Returns None for a missing/empty timestamp; raises ValueError if the
    string does not match DATE_FMT (same as the original strptime path).
    """
    if created_at:
        parsed = datetime.strptime(created_at, DATE_FMT)
        return parsed.strftime("%Y-%m")
    return None
def _top_cooccurring(tweets: list[dict[str, Any]], normalized_target: str, excluded: set[str]) -> list[dict[str, Any]]:
    """Rank hashtags that appear alongside *normalized_target*.

    Only tweets containing the target contribute; the target itself and any
    tag in *excluded* (the other sovereign targets) are skipped. Returns at
    most the 10 most common as ``{"tag": ..., "count": ...}`` dicts.
    """
    tally: Counter[str] = Counter()
    for entry in tweets:
        normalized = [normalize_tag(raw) for raw in entry.get("hashtags", [])]
        if normalized_target not in normalized:
            continue
        tally.update(
            tag
            for tag in normalized
            if tag != normalized_target and tag not in excluded
        )
    return [{"tag": tag, "count": count} for tag, count in tally.most_common(10)]
def _summarize_target(tweets: list[dict[str, Any]], media_records: list[dict[str, Any]], target: str, all_targets: set[str]) -> dict[str, Any]:
    """Summarize one target hashtag across the tweet and media lanes.

    Counts unique posts (by tweet_id) carrying the tag, per-month activity
    for both lanes, and the top co-occurring hashtags excluding the other
    sovereign targets.
    """
    target_norm = normalize_tag(target)

    def tally(records: list[dict[str, Any]]) -> tuple[set[str], Counter]:
        # One pass: collect unique tweet ids and month buckets for records
        # whose hashtags include the target.
        ids: set[str] = set()
        per_month: Counter = Counter()
        for record in records:
            normalized = [normalize_tag(raw) for raw in record.get("hashtags", [])]
            if target_norm not in normalized:
                continue
            ids.add(str(record["tweet_id"]))
            bucket = month_bucket(record.get("created_at"))
            if bucket:
                per_month[bucket] += 1
        return ids, per_month

    tweet_ids, monthly_post_counts = tally(tweets)
    media_tweet_ids, monthly_media_counts = tally(media_records)
    return {
        "tag": target_norm,
        "total_posts": len(tweet_ids),
        "media_posts": len(media_tweet_ids),
        "monthly_post_counts": dict(sorted(monthly_post_counts.items())),
        "monthly_media_counts": dict(sorted(monthly_media_counts.items())),
        "top_cooccurring_hashtags": _top_cooccurring(tweets, target_norm, all_targets),
    }
def build_hashtag_metrics(
    tweets: list[dict[str, Any]],
    media_records: list[dict[str, Any]],
    target_tags: list[str],
) -> dict[str, Any]:
    """Build the sovereign hashtag metrics payload.

    Args:
        tweets: tweet rows with at least ``tweet_id`` and ``hashtags``.
        media_records: media manifest rows with the same keys.
        target_tags: hashtags to track (case-insensitive).

    Returns:
        Dict with per-target summaries plus an ``overlap`` section counting
        unique posts whose hashtags include *every* target tag.
    """
    normalized_targets = {normalize_tag(tag) for tag in target_tags}
    targets = {
        normalize_tag(tag): _summarize_target(tweets, media_records, tag, normalized_targets)
        for tag in target_tags
    }

    def _overlap_ids(records: list[dict[str, Any]]) -> set[str]:
        # Single pass over the records: normalize each record's hashtags once
        # and test the subset condition, instead of the previous per-target
        # intersection that re-normalized every record once per target.
        if not target_tags:
            # Preserve the original guard: no targets means empty overlap,
            # not "every record matches vacuously".
            return set()
        ids: set[str] = set()
        for record in records:
            tags = {normalize_tag(raw) for raw in record.get("hashtags", [])}
            if normalized_targets.issubset(tags):
                ids.add(str(record["tweet_id"]))
        return ids

    return {
        "schema_version": 1,
        "target_tags": [normalize_tag(tag) for tag in target_tags],
        "targets": targets,
        "overlap": {
            "tweet_posts": len(_overlap_ids(tweets)),
            "media_posts": len(_overlap_ids(media_records)),
        },
    }
def build_markdown_report(metrics: dict[str, Any]) -> str:
    """Render the metrics payload as the human-readable Markdown report.

    Expects exactly two entries in ``metrics["target_tags"]`` (the two
    sovereign hashtags); the report ends with a single trailing newline.
    """
    first, second = metrics["target_tags"]
    summary_first = metrics["targets"][first]
    summary_second = metrics["targets"][second]

    def cooccurrence_section(summary: dict[str, Any]) -> str:
        # Up to 10 bullet lines, or a single "- none" placeholder.
        bullets = [
            f"- #{item['tag']} ({item['count']})"
            for item in summary["top_cooccurring_hashtags"][:10]
        ]
        return "\n".join(bullets) or "- none"

    lines = [
        "# Sovereign Hashtag Metrics",
        "Targets:",
        f"- #{first}",
        f"- #{second}",
        "## Totals",
        "| Tag | Total posts | Media posts |",
        "|---|---:|---:|",
        f"| #{first} | {summary_first['total_posts']} | {summary_first['media_posts']} |",
        f"| #{second} | {summary_second['total_posts']} | {summary_second['media_posts']} |",
        "## Overlap",
        f"- Tweet posts containing both: {metrics['overlap']['tweet_posts']}",
        f"- Media posts containing both: {metrics['overlap']['media_posts']}",
        "## Co-occurring hashtags",
        f"### #{first}",
        cooccurrence_section(summary_first),
        f"### #{second}",
        cooccurrence_section(summary_second),
        "## Notes",
        "These metrics are computed locally from downloaded archive data.",
        "No archive media was uploaded to cloud services for analysis.",
        "Artifacts:",
        "- `twitter-archive/media/hashtag_metrics.json`",
        "- `twitter-archive/media/hashtag_metrics.md`",
    ]
    return "\n".join(lines) + "\n"
def write_text(path: Path, payload: str) -> None:
    """Write *payload* to *path* as UTF-8, normalizing trailing whitespace.

    Parent directories are created on demand. A non-empty payload is written
    with exactly one trailing newline; an empty or whitespace-only payload
    yields an empty file.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    text = payload.rstrip()
    # Pin the encoding: without it, Path.write_text falls back to the
    # locale's preferred encoding, which can mangle non-ASCII hashtags on
    # non-UTF-8 platforms (e.g. Windows cp1252).
    path.write_text((text + "\n") if text else "", encoding="utf-8")
def main() -> None:
    """Entry point: compute hashtag metrics and emit JSON + Markdown artifacts."""
    MEDIA_DIR.mkdir(parents=True, exist_ok=True)
    metrics = build_hashtag_metrics(
        load_jsonl(EXTRACTED_DIR / "tweets.jsonl"),
        load_jsonl(MEDIA_MANIFEST_JSONL),
        TARGET_TAGS,
    )
    write_json(HASHTAG_METRICS_JSON, metrics)
    write_text(HASHTAG_METRICS_MD, build_markdown_report(metrics))
    # Echo the payload so callers/pipelines can capture it from stdout.
    print(json.dumps(metrics, sort_keys=True))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,50 @@
from __future__ import annotations
from pathlib import Path
from scripts.twitter_archive.compute_hashtag_metrics import build_hashtag_metrics
def test_build_hashtag_metrics_counts_posts_media_overlap_and_cooccurrence() -> None:
    """End-to-end check of totals, monthly buckets, overlap and co-occurrence."""
    march_1 = "Sat Mar 01 12:00:00 +0000 2025"
    march_15 = "Sat Mar 15 12:00:00 +0000 2025"
    april_5 = "Sat Apr 05 12:00:00 +0000 2025"
    tweets = [
        {"tweet_id": "1", "created_at": march_1, "hashtags": ["timmyTime"]},
        {"tweet_id": "2", "created_at": march_15, "hashtags": ["TimmyChain", "timmyTime", "Bitcoin"]},
        {"tweet_id": "3", "created_at": april_5, "hashtags": ["TimmyChain"]},
    ]
    media_records = [
        {"tweet_id": "1", "hashtags": ["timmyTime"], "media_type": "video", "created_at": march_1},
        {"tweet_id": "2", "hashtags": ["TimmyChain", "timmyTime", "Bitcoin"], "media_type": "animated_gif", "created_at": march_15},
    ]

    metrics = build_hashtag_metrics(tweets, media_records, ["timmyTime", "TimmyChain"])

    timmytime = metrics["targets"]["timmytime"]
    timmychain = metrics["targets"]["timmychain"]
    # Post / media totals are unique-tweet counts, case-insensitive on tags.
    assert timmytime["total_posts"] == 2
    assert timmychain["total_posts"] == 2
    assert timmytime["media_posts"] == 2
    assert timmychain["media_posts"] == 1
    # Only tweet "2" carries both sovereign tags, in both lanes.
    assert metrics["overlap"]["tweet_posts"] == 1
    assert metrics["overlap"]["media_posts"] == 1
    # Monthly bucketing by created_at.
    assert timmytime["monthly_post_counts"]["2025-03"] == 2
    assert timmychain["monthly_post_counts"]["2025-04"] == 1
    # Co-occurrence excludes the other target, leaving Bitcoin on top.
    assert timmychain["top_cooccurring_hashtags"][0]["tag"] == "bitcoin"
def test_project_doc_mentions_hashtag_metrics_outputs() -> None:
    """PROJECT.md must advertise both artifacts and both target hashtags."""
    doc = Path("twitter-archive/PROJECT.md").read_text()
    for needle in ("hashtag_metrics.json", "hashtag_metrics.md", "#timmyTime", "#TimmyChain"):
        assert needle in doc

View File

@@ -121,6 +121,8 @@ Late notes are terse deltas. THAT'S THE PROOF OF GROWTH.
media/ # Local-first media understanding
manifest.jsonl # one row per video/gif with tweet text + hashtags preserved
manifest_summary.json # rollup counts and hashtag families
hashtag_metrics.json # machine-readable metrics for #timmyTime / #TimmyChain
hashtag_metrics.md # human-readable local report
keyframes/ # future extracted frames
audio/ # future demuxed audio
style_cards/ # future per-video aesthetic summaries