diff --git a/skills/research/DESCRIPTION.md b/skills/research/DESCRIPTION.md new file mode 100644 index 000000000..8bcf33023 --- /dev/null +++ b/skills/research/DESCRIPTION.md @@ -0,0 +1,3 @@ +--- +description: Skills for academic research, paper discovery, literature review, and scientific knowledge retrieval. +--- diff --git a/skills/research/arxiv/SKILL.md b/skills/research/arxiv/SKILL.md new file mode 100644 index 000000000..f6b90d2d5 --- /dev/null +++ b/skills/research/arxiv/SKILL.md @@ -0,0 +1,235 @@ +--- +name: arxiv +description: Search and retrieve academic papers from arXiv using their free REST API. No API key needed. Search by keyword, author, category, or ID. Combine with web_extract or the ocr-and-documents skill to read full paper content. +version: 1.0.0 +author: Hermes Agent +license: MIT +metadata: + hermes: + tags: [Research, Arxiv, Papers, Academic, Science, API] + related_skills: [ocr-and-documents] +--- + +# arXiv Research + +Search and retrieve academic papers from arXiv via their free REST API. No API key, no dependencies — just curl. + +## Quick Reference + +| Action | Command | +|--------|---------| +| Search papers | `curl "https://export.arxiv.org/api/query?search_query=all:QUERY&max_results=5"` | +| Get specific paper | `curl "https://export.arxiv.org/api/query?id_list=2402.03300"` | +| Read abstract (web) | `web_extract(urls=["https://arxiv.org/abs/2402.03300"])` | +| Read full paper (PDF) | `web_extract(urls=["https://arxiv.org/pdf/2402.03300"])` | + +## Searching Papers + +The API returns Atom XML. Parse with `grep`/`sed` or pipe through `python3` for clean output. 
+ +### Basic search + +```bash +curl -s "https://export.arxiv.org/api/query?search_query=all:GRPO+reinforcement+learning&max_results=5" +``` + +### Clean output (parse XML to readable format) + +```bash +curl -s "https://export.arxiv.org/api/query?search_query=all:GRPO+reinforcement+learning&max_results=5&sortBy=submittedDate&sortOrder=descending" | python3 -c " +import sys, xml.etree.ElementTree as ET +ns = {'a': 'http://www.w3.org/2005/Atom'} +root = ET.parse(sys.stdin).getroot() +for i, entry in enumerate(root.findall('a:entry', ns)): + title = entry.find('a:title', ns).text.strip().replace('\n', ' ') + arxiv_id = entry.find('a:id', ns).text.strip().split('/abs/')[-1] + published = entry.find('a:published', ns).text[:10] + authors = ', '.join(a.find('a:name', ns).text for a in entry.findall('a:author', ns)) + summary = entry.find('a:summary', ns).text.strip()[:200] + cats = ', '.join(c.get('term') for c in entry.findall('a:category', ns)) + print(f'{i+1}. [{arxiv_id}] {title}') + print(f' Authors: {authors}') + print(f' Published: {published} | Categories: {cats}') + print(f' Abstract: {summary}...') + print(f' PDF: https://arxiv.org/pdf/{arxiv_id}') + print() +" +``` + +## Search Query Syntax + +| Prefix | Searches | Example | +|--------|----------|---------| +| `all:` | All fields | `all:transformer+attention` | +| `ti:` | Title | `ti:large+language+models` | +| `au:` | Author | `au:vaswani` | +| `abs:` | Abstract | `abs:reinforcement+learning` | +| `cat:` | Category | `cat:cs.AI` | +| `co:` | Comment | `co:accepted+NeurIPS` | + +### Boolean operators + +``` +# AND (default when using +) +search_query=all:transformer+attention + +# OR +search_query=all:GPT+OR+all:BERT + +# AND NOT +search_query=all:language+model+ANDNOT+all:vision + +# Exact phrase +search_query=ti:"chain+of+thought" + +# Combined +search_query=au:hinton+AND+cat:cs.LG +``` + +## Sort and Pagination + +| Parameter | Options | +|-----------|---------| +| `sortBy` | `relevance`, 
`lastUpdatedDate`, `submittedDate` | +| `sortOrder` | `ascending`, `descending` | +| `start` | Result offset (0-based) | +| `max_results` | Number of results (default 10, max 30000) | + +```bash +# Latest 10 papers in cs.AI +curl -s "https://export.arxiv.org/api/query?search_query=cat:cs.AI&sortBy=submittedDate&sortOrder=descending&max_results=10" +``` + +## Fetching Specific Papers + +```bash +# By arXiv ID +curl -s "https://export.arxiv.org/api/query?id_list=2402.03300" + +# Multiple papers +curl -s "https://export.arxiv.org/api/query?id_list=2402.03300,2401.12345,2403.00001" +``` + +## Reading Paper Content + +After finding a paper, read it: + +``` +# Abstract page (fast, metadata + abstract) +web_extract(urls=["https://arxiv.org/abs/2402.03300"]) + +# Full paper (PDF → markdown via Firecrawl) +web_extract(urls=["https://arxiv.org/pdf/2402.03300"]) +``` + +For local PDF processing, see the `ocr-and-documents` skill. + +## Common Categories + +| Category | Field | +|----------|-------| +| `cs.AI` | Artificial Intelligence | +| `cs.CL` | Computation and Language (NLP) | +| `cs.CV` | Computer Vision | +| `cs.LG` | Machine Learning | +| `cs.CR` | Cryptography and Security | +| `stat.ML` | Machine Learning (Statistics) | +| `math.OC` | Optimization and Control | +| `physics.comp-ph` | Computational Physics | + +Full list: https://arxiv.org/category_taxonomy + +## Helper Script + +The `scripts/search_arxiv.py` script handles XML parsing and provides clean output: + +```bash +python scripts/search_arxiv.py "GRPO reinforcement learning" +python scripts/search_arxiv.py "transformer attention" --max 10 --sort date +python scripts/search_arxiv.py --author "Yann LeCun" --max 5 +python scripts/search_arxiv.py --category cs.AI --sort date +python scripts/search_arxiv.py --id 2402.03300 +python scripts/search_arxiv.py --id 2402.03300,2401.12345 +``` + +No dependencies — uses only Python stdlib. 
+ +--- + +## Semantic Scholar (Citations, Related Papers, Author Profiles) + +arXiv doesn't provide citation data or recommendations. Use the **Semantic Scholar API** for that — free, no key needed for basic use (1 req/sec), returns JSON. + +### Get paper details + citations + +```bash +# By arXiv ID +curl -s "https://api.semanticscholar.org/graph/v1/paper/arXiv:2402.03300?fields=title,authors,citationCount,referenceCount,influentialCitationCount,year,abstract" | python3 -m json.tool + +# By Semantic Scholar paper ID or DOI +curl -s "https://api.semanticscholar.org/graph/v1/paper/DOI:10.1234/example?fields=title,citationCount" +``` + +### Get citations OF a paper (who cited it) + +```bash +curl -s "https://api.semanticscholar.org/graph/v1/paper/arXiv:2402.03300/citations?fields=title,authors,year,citationCount&limit=10" | python3 -m json.tool +``` + +### Get references FROM a paper (what it cites) + +```bash +curl -s "https://api.semanticscholar.org/graph/v1/paper/arXiv:2402.03300/references?fields=title,authors,year,citationCount&limit=10" | python3 -m json.tool +``` + +### Search papers (alternative to arXiv search, returns JSON) + +```bash +curl -s "https://api.semanticscholar.org/graph/v1/paper/search?query=GRPO+reinforcement+learning&limit=5&fields=title,authors,year,citationCount,externalIds" | python3 -m json.tool +``` + +### Get paper recommendations + +```bash +curl -s -X POST "https://api.semanticscholar.org/recommendations/v1/papers/" \ + -H "Content-Type: application/json" \ + -d '{"positivePaperIds": ["arXiv:2402.03300"], "negativePaperIds": []}' | python3 -m json.tool +``` + +### Author profile + +```bash +curl -s "https://api.semanticscholar.org/graph/v1/author/search?query=Yann+LeCun&fields=name,hIndex,citationCount,paperCount" | python3 -m json.tool +``` + +### Useful Semantic Scholar fields + +`title`, `authors`, `year`, `abstract`, `citationCount`, `referenceCount`, `influentialCitationCount`, `isOpenAccess`, `openAccessPdf`, `fieldsOfStudy`, 
`publicationVenue`, `externalIds` (contains arXiv ID, DOI, etc.) + +--- + +## Complete Research Workflow + +1. **Discover**: `python scripts/search_arxiv.py "your topic" --sort date --max 10` +2. **Assess impact**: `curl -s "https://api.semanticscholar.org/graph/v1/paper/arXiv:ID?fields=citationCount,influentialCitationCount"` +3. **Read abstract**: `web_extract(urls=["https://arxiv.org/abs/ID"])` +4. **Read full paper**: `web_extract(urls=["https://arxiv.org/pdf/ID"])` +5. **Find related work**: `curl -s "https://api.semanticscholar.org/graph/v1/paper/arXiv:ID/references?fields=title,citationCount&limit=20"` +6. **Get recommendations**: POST to Semantic Scholar recommendations endpoint +7. **Track authors**: `curl -s "https://api.semanticscholar.org/graph/v1/author/search?query=NAME"` + +## Rate Limits + +| API | Rate | Auth | +|-----|------|------| +| arXiv | ~1 req / 3 seconds | None needed | +| Semantic Scholar | 1 req / second | None (100/sec with API key) | + +## Notes + +- arXiv returns Atom XML — use the helper script or parsing snippet for clean output +- Semantic Scholar returns JSON — pipe through `python3 -m json.tool` for readability +- arXiv IDs: old format (`hep-th/0601001`) vs new (`2402.03300`) +- PDF: `https://arxiv.org/pdf/{id}` — Abstract: `https://arxiv.org/abs/{id}` +- HTML (when available): `https://arxiv.org/html/{id}` +- For local PDF processing, see the `ocr-and-documents` skill diff --git a/skills/research/arxiv/scripts/search_arxiv.py b/skills/research/arxiv/scripts/search_arxiv.py new file mode 100644 index 000000000..dede870f5 --- /dev/null +++ b/skills/research/arxiv/scripts/search_arxiv.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +"""Search arXiv and display results in a clean format. 
"""Search arXiv and display results in a clean format.

Usage:
    python search_arxiv.py "GRPO reinforcement learning"
    python search_arxiv.py "GRPO reinforcement learning" --max 10
    python search_arxiv.py "GRPO reinforcement learning" --sort date
    python search_arxiv.py --author "Yann LeCun" --max 5
    python search_arxiv.py --category cs.AI --sort date --max 10
    python search_arxiv.py --id 2402.03300
    python search_arxiv.py --id 2402.03300,2401.12345
"""
import re
import sys
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

# Atom namespace used by every element in an arXiv API response.
NS = {'a': 'http://www.w3.org/2005/Atom'}
# The <totalResults> element lives in the OpenSearch namespace, not Atom.
OPENSEARCH_TOTAL = '{http://a9.com/-/spec/opensearch/1.1/}totalResults'
API_BASE = "https://export.arxiv.org/api/query?"
# Friendly CLI sort names -> arXiv API sortBy values; unknown values pass through.
SORT_MAP = {"relevance": "relevance", "date": "submittedDate", "updated": "lastUpdatedDate"}


def strip_version(arxiv_id):
    """Drop a trailing version suffix, e.g. '2402.03300v2' -> '2402.03300'.

    Uses a regex anchored at the end of the string so old-style IDs whose
    archive name contains a literal 'v' (e.g. 'solv-int/9701001') survive
    intact — a plain split('v')[0] would truncate them to 'sol'.
    """
    return re.sub(r'v\d+$', '', arxiv_id)


def build_query_url(query=None, author=None, category=None, ids=None,
                    max_results=5, sort="relevance"):
    """Build the arXiv API request URL for the given criteria.

    Pure function (no I/O) so it can be unit-tested.

    Raises:
        ValueError: when no search criterion at all is supplied.
    """
    params = {}
    if ids:
        # id_list takes a comma-separated list of arXiv IDs verbatim.
        params['id_list'] = ids
    else:
        parts = []
        if query:
            parts.append(f'all:{urllib.parse.quote(query)}')
        if author:
            parts.append(f'au:{urllib.parse.quote(author)}')
        if category:
            parts.append(f'cat:{category}')
        if not parts:
            raise ValueError("provide a query, --author, --category, or --id")
        # arXiv's query language joins terms with '+AND+'.
        params['search_query'] = '+AND+'.join(parts)

    params['max_results'] = str(max_results)
    params['sortBy'] = SORT_MAP.get(sort, sort)
    params['sortOrder'] = 'descending'
    # Values are already URL-safe; urlencode() would escape the intentional
    # '+' separators inside search_query, changing the query's meaning.
    return API_BASE + "&".join(f"{k}={v}" for k, v in params.items())


def _text(elem, tag, default=""):
    """Return the stripped text of a child element, or *default* if the
    child is absent or empty (arXiv entries may omit optional fields)."""
    node = elem.find(tag, NS)
    if node is None or node.text is None:
        return default
    return node.text.strip()


def format_entry(index, entry):
    """Render one Atom <entry> as a human-readable multi-line string."""
    title = _text(entry, 'a:title').replace('\n', ' ')
    raw_id = _text(entry, 'a:id')
    arxiv_id = strip_version(raw_id.split('/abs/')[-1]) if '/abs/' in raw_id else raw_id
    published = _text(entry, 'a:published')[:10]
    updated = _text(entry, 'a:updated')[:10]
    authors = ', '.join(_text(a, 'a:name') for a in entry.findall('a:author', NS))
    summary = _text(entry, 'a:summary').replace('\n', ' ')
    cats = ', '.join(c.get('term', '') for c in entry.findall('a:category', NS))

    return '\n'.join([
        f"{index + 1}. {title}",
        f"   ID: {arxiv_id} | Published: {published} | Updated: {updated}",
        f"   Authors: {authors}",
        f"   Categories: {cats}",
        f"   Abstract: {summary[:300]}{'...' if len(summary) > 300 else ''}",
        f"   Links: https://arxiv.org/abs/{arxiv_id} | https://arxiv.org/pdf/{arxiv_id}",
    ])


def search(query=None, author=None, category=None, ids=None, max_results=5, sort="relevance"):
    """Query the arXiv API and print matching papers to stdout.

    Exits with status 1 when no search criterion is given.
    """
    try:
        url = build_query_url(query=query, author=author, category=category,
                              ids=ids, max_results=max_results, sort=sort)
    except ValueError as err:
        print(f"Error: {err}")
        sys.exit(1)

    req = urllib.request.Request(url, headers={'User-Agent': 'HermesAgent/1.0'})
    with urllib.request.urlopen(req, timeout=15) as resp:
        data = resp.read()

    root = ET.fromstring(data)
    entries = root.findall('a:entry', NS)

    if not entries:
        print("No results found.")
        return

    total = root.find(OPENSEARCH_TOTAL)
    if total is not None:
        print(f"Found {total.text} results (showing {len(entries)})\n")

    for i, entry in enumerate(entries):
        print(format_entry(i, entry))
        print()


def parse_cli_args(args):
    """Translate raw CLI arguments into keyword arguments for search()."""
    opts = {"query": None, "author": None, "category": None, "ids": None,
            "max_results": 5, "sort": "relevance"}
    # Flags that simply store the token that follows them.
    value_flags = {"--author": "author", "--category": "category",
                   "--id": "ids", "--sort": "sort"}
    positional = []
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == "--max" and i + 1 < len(args):
            try:
                opts["max_results"] = int(args[i + 1])
            except ValueError:
                # Fail with a clear message instead of an uncaught traceback.
                print(f"Error: --max expects an integer, got {args[i + 1]!r}")
                sys.exit(1)
            i += 2
        elif arg in value_flags and i + 1 < len(args):
            opts[value_flags[arg]] = args[i + 1]
            i += 2
        else:
            positional.append(arg)
            i += 1
    if positional:
        opts["query"] = " ".join(positional)
    return opts


if __name__ == "__main__":
    argv = sys.argv[1:]
    if not argv or argv[0] in ("-h", "--help"):
        print(__doc__)
        sys.exit(0)
    search(**parse_cli_args(argv))