feat: add sovereign hashtag metrics for archive media (#48)
This commit was merged in pull request #48.
Files changed:
- scripts/twitter_archive/compute_hashtag_metrics.py (new file, 157 lines added)
@@ -0,0 +1,157 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compute sovereign hashtag metrics for the Twitter archive media lane."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from collections import Counter, defaultdict
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .common import ARCHIVE_DIR, EXTRACTED_DIR, load_jsonl, write_json
|
||||
|
||||
# Output locations for the hashtag-metric artifacts inside the archive's media lane.
MEDIA_DIR = ARCHIVE_DIR / "media"
MEDIA_MANIFEST_JSONL = MEDIA_DIR / "manifest.jsonl"  # one row per video/gif with tweet text + hashtags
HASHTAG_METRICS_JSON = MEDIA_DIR / "hashtag_metrics.json"  # machine-readable metrics payload
HASHTAG_METRICS_MD = MEDIA_DIR / "hashtag_metrics.md"  # human-readable local report
# Hashtags tracked by this script; matched case-insensitively via normalize_tag().
TARGET_TAGS = ["timmyTime", "TimmyChain"]
# Twitter archive "created_at" format, e.g. "Sat Mar 01 12:00:00 +0000 2025".
DATE_FMT = "%a %b %d %H:%M:%S %z %Y"
|
||||
|
||||
|
||||
def normalize_tag(tag: str) -> str:
    """Canonicalize a hashtag for comparison: trim surrounding whitespace, lowercase."""
    return tag.lower().strip()
|
||||
|
||||
|
||||
def month_bucket(created_at: str | None) -> str | None:
|
||||
if not created_at:
|
||||
return None
|
||||
return datetime.strptime(created_at, DATE_FMT).strftime("%Y-%m")
|
||||
|
||||
|
||||
def _top_cooccurring(tweets: list[dict[str, Any]], normalized_target: str, excluded: set[str]) -> list[dict[str, Any]]:
    """Rank the ten hashtags that most often co-occur with *normalized_target*.

    Tags listed in *excluded* (the other target tags) are never reported, so
    the list surfaces non-target companions only. Duplicate tags within one
    tweet are counted once per occurrence, matching the raw hashtag list.
    """
    tally: Counter[str] = Counter()
    for tweet in tweets:
        tags = [normalize_tag(raw) for raw in tweet.get("hashtags", [])]
        if normalized_target in tags:
            tally.update(tag for tag in tags if tag != normalized_target and tag not in excluded)
    return [{"tag": tag, "count": count} for tag, count in tally.most_common(10)]
|
||||
|
||||
|
||||
def _summarize_target(tweets: list[dict[str, Any]], media_records: list[dict[str, Any]], target: str, all_targets: set[str]) -> dict[str, Any]:
    """Build the per-target metrics payload for one hashtag.

    Counts distinct tweet ids (all posts and media-bearing posts), per-month
    activity buckets, and the top co-occurring hashtags. *all_targets* holds
    every normalized target tag so targets are excluded from co-occurrence.
    """
    target_norm = normalize_tag(target)
    # The tweet scan and the media-manifest scan are the same aggregation;
    # share one helper instead of duplicating the loop.
    tweet_ids, monthly_post_counts = _collect_tagged(tweets, target_norm)
    media_tweet_ids, monthly_media_counts = _collect_tagged(media_records, target_norm)

    return {
        "tag": target_norm,
        "total_posts": len(tweet_ids),
        "media_posts": len(media_tweet_ids),
        "monthly_post_counts": dict(sorted(monthly_post_counts.items())),
        "monthly_media_counts": dict(sorted(monthly_media_counts.items())),
        "top_cooccurring_hashtags": _top_cooccurring(tweets, target_norm, all_targets),
    }


def _collect_tagged(records: list[dict[str, Any]], target_norm: str) -> tuple[set[str], Counter]:
    """Gather distinct tweet ids and per-month record counts for records tagged *target_norm*.

    Monthly counts increment once per matching record (not per distinct id),
    and records without a parsable created_at contribute to ids only.
    """
    ids: set[str] = set()
    monthly: Counter = Counter()
    for record in records:
        tags = [normalize_tag(raw) for raw in record.get("hashtags", [])]
        if target_norm not in tags:
            continue
        ids.add(str(record["tweet_id"]))
        bucket = month_bucket(record.get("created_at"))
        if bucket:
            monthly[bucket] += 1
    return ids, monthly
|
||||
|
||||
|
||||
def build_hashtag_metrics(
    tweets: list[dict[str, Any]],
    media_records: list[dict[str, Any]],
    target_tags: list[str],
) -> dict[str, Any]:
    """Assemble the full, JSON-serializable metrics payload for *target_tags*.

    Includes a per-target summary plus the number of tweet/media posts that
    carry every target tag at once ("overlap").
    """
    normalized_targets = {normalize_tag(tag) for tag in target_tags}
    targets = {
        normalize_tag(tag): _summarize_target(tweets, media_records, tag, normalized_targets)
        for tag in target_tags
    }
    return {
        "schema_version": 1,
        "target_tags": [normalize_tag(tag) for tag in target_tags],
        "targets": targets,
        "overlap": {
            "tweet_posts": len(_overlap_ids(tweets, target_tags)),
            "media_posts": len(_overlap_ids(media_records, target_tags)),
        },
    }


def _overlap_ids(records: list[dict[str, Any]], target_tags: list[str]) -> set[str]:
    """Ids of records tagged with every target; empty when there are no targets.

    Ids are collected per tag and then intersected, so an id qualifies even
    when different records sharing that id carry different target tags.
    """
    if not target_tags:
        return set()
    id_sets = []
    for tag in target_tags:
        tag_norm = normalize_tag(tag)
        id_sets.append(
            {
                str(record["tweet_id"])
                for record in records
                if tag_norm in (normalize_tag(raw) for raw in record.get("hashtags", []))
            }
        )
    return set.intersection(*id_sets)
|
||||
|
||||
|
||||
def build_markdown_report(metrics: dict[str, Any]) -> str:
    """Render the metrics payload as the human-readable markdown report.

    Expects exactly two entries in metrics["target_tags"] (the sovereign
    hashtag pair) and returns the full document text, newline-terminated.
    """
    a, b = metrics["target_tags"]
    ta = metrics["targets"][a]
    tb = metrics["targets"][b]

    def bullets(summary: dict[str, Any]) -> str:
        # Top-10 co-occurring tags as markdown bullets; placeholder when empty.
        items = summary["top_cooccurring_hashtags"][:10]
        rendered = "\n".join(f"- #{item['tag']} ({item['count']})" for item in items)
        return rendered or "- none"

    lines = [
        "# Sovereign Hashtag Metrics",
        "",
        "Targets:",
        f"- #{a}",
        f"- #{b}",
        "",
        "## Totals",
        "",
        "| Tag | Total posts | Media posts |",
        "|---|---:|---:|",
        f"| #{a} | {ta['total_posts']} | {ta['media_posts']} |",
        f"| #{b} | {tb['total_posts']} | {tb['media_posts']} |",
        "",
        "## Overlap",
        "",
        f"- Tweet posts containing both: {metrics['overlap']['tweet_posts']}",
        f"- Media posts containing both: {metrics['overlap']['media_posts']}",
        "",
        "## Co-occurring hashtags",
        "",
        f"### #{a}",
        bullets(ta),
        "",
        f"### #{b}",
        bullets(tb),
        "",
        "## Notes",
        "",
        "These metrics are computed locally from downloaded archive data.",
        "No archive media was uploaded to cloud services for analysis.",
        "",
        "Artifacts:",
        "- `twitter-archive/media/hashtag_metrics.json`",
        "- `twitter-archive/media/hashtag_metrics.md`",
    ]
    return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
def write_text(path: Path, payload: str) -> None:
    """Write *payload* to *path* with exactly one trailing newline.

    Parent directories are created as needed; an all-whitespace payload
    produces an empty file.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    trimmed = payload.rstrip()
    content = f"{trimmed}\n" if trimmed else ""
    path.write_text(content)
|
||||
|
||||
|
||||
def main() -> None:
    """Compute sovereign hashtag metrics from the local archive and write artifacts.

    Reads extracted tweets and the media manifest, builds the metrics payload
    for TARGET_TAGS, writes the JSON and markdown artifacts under MEDIA_DIR,
    and prints the metrics to stdout as a single sorted-key JSON document.
    """
    MEDIA_DIR.mkdir(parents=True, exist_ok=True)
    tweets = load_jsonl(EXTRACTED_DIR / "tweets.jsonl")
    media_records = load_jsonl(MEDIA_MANIFEST_JSONL)
    metrics = build_hashtag_metrics(tweets, media_records, TARGET_TAGS)
    write_json(HASHTAG_METRICS_JSON, metrics)
    write_text(HASHTAG_METRICS_MD, build_markdown_report(metrics))
    # Echo the payload on stdout so a caller can capture it without re-reading the file.
    print(json.dumps(metrics, sort_keys=True))
|
||||
|
||||
|
||||
# Script entry point. NOTE(review): the relative import of .common means this
# is expected to run as a module (python -m ...); confirm the shebang path.
if __name__ == "__main__":
    main()
|
||||
Files changed:
- tests/test_twitter_hashtag_metrics.py (new file, 50 lines added)
@@ -0,0 +1,50 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from scripts.twitter_archive.compute_hashtag_metrics import build_hashtag_metrics
|
||||
|
||||
|
||||
def test_build_hashtag_metrics_counts_posts_media_overlap_and_cooccurrence() -> None:
    """Exercise totals, media counts, overlap, monthly buckets, and co-occurrence."""
    march_1 = "Sat Mar 01 12:00:00 +0000 2025"
    march_15 = "Sat Mar 15 12:00:00 +0000 2025"
    april_5 = "Sat Apr 05 12:00:00 +0000 2025"
    tweets = [
        {"tweet_id": "1", "created_at": march_1, "hashtags": ["timmyTime"]},
        {"tweet_id": "2", "created_at": march_15, "hashtags": ["TimmyChain", "timmyTime", "Bitcoin"]},
        {"tweet_id": "3", "created_at": april_5, "hashtags": ["TimmyChain"]},
    ]
    media_records = [
        {"tweet_id": "1", "hashtags": ["timmyTime"], "media_type": "video", "created_at": march_1},
        {"tweet_id": "2", "hashtags": ["TimmyChain", "timmyTime", "Bitcoin"], "media_type": "animated_gif", "created_at": march_15},
    ]

    metrics = build_hashtag_metrics(tweets, media_records, ["timmyTime", "TimmyChain"])

    timmytime = metrics["targets"]["timmytime"]
    timmychain = metrics["targets"]["timmychain"]
    assert (timmytime["total_posts"], timmychain["total_posts"]) == (2, 2)
    assert (timmytime["media_posts"], timmychain["media_posts"]) == (2, 1)
    assert metrics["overlap"]["tweet_posts"] == 1
    assert metrics["overlap"]["media_posts"] == 1
    assert timmytime["monthly_post_counts"]["2025-03"] == 2
    assert timmychain["monthly_post_counts"]["2025-04"] == 1
    assert timmychain["top_cooccurring_hashtags"][0]["tag"] == "bitcoin"
|
||||
|
||||
|
||||
def test_project_doc_mentions_hashtag_metrics_outputs() -> None:
    """The project doc must reference both artifacts and both target hashtags."""
    project = Path("twitter-archive/PROJECT.md").read_text()

    expected_mentions = [
        "hashtag_metrics.json",
        "hashtag_metrics.md",
        "#timmyTime",
        "#TimmyChain",
    ]
    for needle in expected_mentions:
        assert needle in project
|
||||
@@ -121,6 +121,8 @@ Late notes are terse deltas. THAT'S THE PROOF OF GROWTH.
|
||||
media/ # Local-first media understanding
|
||||
manifest.jsonl # one row per video/gif with tweet text + hashtags preserved
|
||||
manifest_summary.json # rollup counts and hashtag families
|
||||
hashtag_metrics.json # machine-readable metrics for #timmyTime / #TimmyChain
|
||||
hashtag_metrics.md # human-readable local report
|
||||
keyframes/ # future extracted frames
|
||||
audio/ # future demuxed audio
|
||||
style_cards/ # future per-video aesthetic summaries
|
||||
|
||||
Reference in New Issue
Block a user