Sync all skills and memories 2026-04-14 07:27
This commit is contained in:
114
skills/research/arxiv/scripts/search_arxiv.py
Normal file
114
skills/research/arxiv/scripts/search_arxiv.py
Normal file
@@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Search arXiv and display results in a clean format.
|
||||
|
||||
Usage:
|
||||
python search_arxiv.py "GRPO reinforcement learning"
|
||||
python search_arxiv.py "GRPO reinforcement learning" --max 10
|
||||
python search_arxiv.py "GRPO reinforcement learning" --sort date
|
||||
python search_arxiv.py --author "Yann LeCun" --max 5
|
||||
python search_arxiv.py --category cs.AI --sort date --max 10
|
||||
python search_arxiv.py --id 2402.03300
|
||||
python search_arxiv.py --id 2402.03300,2401.12345
|
||||
"""
|
||||
import sys
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
NS = {'a': 'http://www.w3.org/2005/Atom'}
|
||||
|
||||
def search(query=None, author=None, category=None, ids=None, max_results=5, sort="relevance"):
|
||||
params = {}
|
||||
|
||||
if ids:
|
||||
params['id_list'] = ids
|
||||
else:
|
||||
parts = []
|
||||
if query:
|
||||
parts.append(f'all:{urllib.parse.quote(query)}')
|
||||
if author:
|
||||
parts.append(f'au:{urllib.parse.quote(author)}')
|
||||
if category:
|
||||
parts.append(f'cat:{category}')
|
||||
if not parts:
|
||||
print("Error: provide a query, --author, --category, or --id")
|
||||
sys.exit(1)
|
||||
params['search_query'] = '+AND+'.join(parts)
|
||||
|
||||
params['max_results'] = str(max_results)
|
||||
|
||||
sort_map = {"relevance": "relevance", "date": "submittedDate", "updated": "lastUpdatedDate"}
|
||||
params['sortBy'] = sort_map.get(sort, sort)
|
||||
params['sortOrder'] = 'descending'
|
||||
|
||||
url = "https://export.arxiv.org/api/query?" + "&".join(f"{k}={v}" for k, v in params.items())
|
||||
|
||||
req = urllib.request.Request(url, headers={'User-Agent': 'HermesAgent/1.0'})
|
||||
with urllib.request.urlopen(req, timeout=15) as resp:
|
||||
data = resp.read()
|
||||
|
||||
root = ET.fromstring(data)
|
||||
entries = root.findall('a:entry', NS)
|
||||
|
||||
if not entries:
|
||||
print("No results found.")
|
||||
return
|
||||
|
||||
total = root.find('{http://a9.com/-/spec/opensearch/1.1/}totalResults')
|
||||
if total is not None:
|
||||
print(f"Found {total.text} results (showing {len(entries)})\n")
|
||||
|
||||
for i, entry in enumerate(entries):
|
||||
title = entry.find('a:title', NS).text.strip().replace('\n', ' ')
|
||||
raw_id = entry.find('a:id', NS).text.strip()
|
||||
full_id = raw_id.split('/abs/')[-1] if '/abs/' in raw_id else raw_id
|
||||
arxiv_id = full_id.split('v')[0] # base ID for links
|
||||
published = entry.find('a:published', NS).text[:10]
|
||||
updated = entry.find('a:updated', NS).text[:10]
|
||||
authors = ', '.join(a.find('a:name', NS).text for a in entry.findall('a:author', NS))
|
||||
summary = entry.find('a:summary', NS).text.strip().replace('\n', ' ')
|
||||
cats = ', '.join(c.get('term') for c in entry.findall('a:category', NS))
|
||||
|
||||
version = full_id[len(arxiv_id):] if full_id != arxiv_id else ""
|
||||
print(f"{i+1}. {title}")
|
||||
print(f" ID: {arxiv_id}{version} | Published: {published} | Updated: {updated}")
|
||||
print(f" Authors: {authors}")
|
||||
print(f" Categories: {cats}")
|
||||
print(f" Abstract: {summary[:300]}{'...' if len(summary) > 300 else ''}")
|
||||
print(f" Links: https://arxiv.org/abs/{arxiv_id} | https://arxiv.org/pdf/{arxiv_id}")
|
||||
print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = sys.argv[1:]
|
||||
if not args or args[0] in ("-h", "--help"):
|
||||
print(__doc__)
|
||||
sys.exit(0)
|
||||
|
||||
query = None
|
||||
author = None
|
||||
category = None
|
||||
ids = None
|
||||
max_results = 5
|
||||
sort = "relevance"
|
||||
|
||||
i = 0
|
||||
positional = []
|
||||
while i < len(args):
|
||||
if args[i] == "--max" and i + 1 < len(args):
|
||||
max_results = int(args[i + 1]); i += 2
|
||||
elif args[i] == "--sort" and i + 1 < len(args):
|
||||
sort = args[i + 1]; i += 2
|
||||
elif args[i] == "--author" and i + 1 < len(args):
|
||||
author = args[i + 1]; i += 2
|
||||
elif args[i] == "--category" and i + 1 < len(args):
|
||||
category = args[i + 1]; i += 2
|
||||
elif args[i] == "--id" and i + 1 < len(args):
|
||||
ids = args[i + 1]; i += 2
|
||||
else:
|
||||
positional.append(args[i]); i += 1
|
||||
|
||||
if positional:
|
||||
query = " ".join(positional)
|
||||
|
||||
search(query=query, author=author, category=category, ids=ids, max_results=max_results, sort=sort)
|
||||
Reference in New Issue
Block a user