diff --git a/skills/research/arxiv/SKILL.md b/skills/research/arxiv/SKILL.md index f6b90d2d5..248f91dc5 100644 --- a/skills/research/arxiv/SKILL.md +++ b/skills/research/arxiv/SKILL.md @@ -110,6 +110,36 @@ curl -s "https://export.arxiv.org/api/query?id_list=2402.03300" curl -s "https://export.arxiv.org/api/query?id_list=2402.03300,2401.12345,2403.00001" ``` +## BibTeX Generation + +After fetching metadata for a paper, generate a BibTeX entry: + +```bash +curl -s "https://export.arxiv.org/api/query?id_list=1706.03762" | python3 -c " +import sys, xml.etree.ElementTree as ET +ns = {'a': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'} +root = ET.parse(sys.stdin).getroot() +entry = root.find('a:entry', ns) +if entry is None: sys.exit('Paper not found') +title = entry.find('a:title', ns).text.strip().replace('\n', ' ') +authors = ' and '.join(a.find('a:name', ns).text for a in entry.findall('a:author', ns)) +year = entry.find('a:published', ns).text[:4] +raw_id = entry.find('a:id', ns).text.strip().split('/abs/')[-1] +cat = entry.find('arxiv:primary_category', ns) +primary = cat.get('term') if cat is not None else 'cs.LG' +last_name = entry.find('a:author', ns).find('a:name', ns).text.split()[-1] +print(f'@article{{{last_name}{year}_{raw_id.replace(\".\", \"\")},') +print(f' title = {{{title}}},') +print(f' author = {{{authors}}},') +print(f' year = {{{year}}},') +print(f' eprint = {{{raw_id}}},') +print(f' archivePrefix = {{arXiv}},') +print(f' primaryClass = {{{primary}}},') +print(f' url = {{https://arxiv.org/abs/{raw_id}}}') +print('}') +" +``` + ## Reading Paper Content After finding a paper, read it: @@ -233,3 +263,17 @@ curl -s "https://api.semanticscholar.org/graph/v1/author/search?query=Yann+LeCun - PDF: `https://arxiv.org/pdf/{id}` — Abstract: `https://arxiv.org/abs/{id}` - HTML (when available): `https://arxiv.org/html/{id}` - For local PDF processing, see the `ocr-and-documents` skill + +## ID Versioning + +- 
`arxiv.org/abs/1706.03762` always resolves to the **latest** version +- `arxiv.org/abs/1706.03762v1` points to a **specific** immutable version +- When generating citations, preserve the version suffix you actually read to prevent citation drift (a later version may substantially change content) +- The API `<id>` field returns the versioned URL (e.g., `http://arxiv.org/abs/1706.03762v7`) + +## Withdrawn Papers + +Papers can be withdrawn after submission. When this happens: +- The `<summary>` field contains a withdrawal notice (look for "withdrawn" or "retracted") +- Metadata fields may be incomplete +- Always check the summary before treating a result as a valid paper diff --git a/skills/research/arxiv/scripts/search_arxiv.py b/skills/research/arxiv/scripts/search_arxiv.py index dede870f5..9acd8b97e 100644 --- a/skills/research/arxiv/scripts/search_arxiv.py +++ b/skills/research/arxiv/scripts/search_arxiv.py @@ -61,15 +61,17 @@ def search(query=None, author=None, category=None, ids=None, max_results=5, sort for i, entry in enumerate(entries): title = entry.find('a:title', NS).text.strip().replace('\n', ' ') raw_id = entry.find('a:id', NS).text.strip() - arxiv_id = raw_id.split('/abs/')[-1].split('v')[0] if '/abs/' in raw_id else raw_id + full_id = raw_id.split('/abs/')[-1] if '/abs/' in raw_id else raw_id + arxiv_id = full_id.split('v')[0] # base ID for links published = entry.find('a:published', NS).text[:10] updated = entry.find('a:updated', NS).text[:10] authors = ', '.join(a.find('a:name', NS).text for a in entry.findall('a:author', NS)) summary = entry.find('a:summary', NS).text.strip().replace('\n', ' ') cats = ', '.join(c.get('term') for c in entry.findall('a:category', NS)) + version = full_id[len(arxiv_id):] if full_id != arxiv_id else "" print(f"{i+1}. 
{title}") - print(f" ID: {arxiv_id} | Published: {published} | Updated: {updated}") + print(f" ID: {arxiv_id}{version} | Published: {published} | Updated: {updated}") print(f" Authors: {authors}") print(f" Categories: {cats}") print(f" Abstract: {summary[:300]}{'...' if len(summary) > 300 else ''}")