#!/usr/bin/env python3 """ papers.py — Standalone academic paper toolkit (Skill-mode port of papers-mcp). Original MCP project: https://github.com/xwmxcz/papers-mcp Usage: python papers.py search [--limit 10] python papers.py detail python papers.py citations [--limit 10] python papers.py arxiv [--max-results 5] python papers.py download [--save-dir .] python papers.py read [--max-pages 10] Dependencies: httpx, arxiv, PyMuPDF """ from __future__ import annotations import argparse import sys import time from pathlib import Path # Force UTF-8 stdout on Windows so Chinese strings render correctly when # called via Bash / cmd / cron (Python 3.7+). if hasattr(sys.stdout, "reconfigure"): sys.stdout.reconfigure(encoding="utf-8") sys.stderr.reconfigure(encoding="utf-8") import httpx S2_BASE = "https://api.semanticscholar.org/graph/v1" S2_FIELDS = "paperId,title,abstract,year,citationCount,authors,externalIds,url" S2_RETRIES = 3 S2_WAIT = 2 # seconds, exponential backoff base # ---------- HTTP helpers ---------- def _s2_get(url: str, params: dict) -> dict: """GET with rate-limit retry. Returns parsed JSON or {'error': ...}.""" for attempt in range(S2_RETRIES): try: r = httpx.get( url, params=params, timeout=30.0, headers={"User-Agent": "papers-skill/1.0"}, ) if r.status_code == 429: time.sleep(S2_WAIT * (attempt + 1)) continue r.raise_for_status() return r.json() except httpx.HTTPError as e: if attempt == S2_RETRIES - 1: return {"error": f"HTTP error: {e}"} time.sleep(S2_WAIT * (attempt + 1)) return {"error": "rate limit, retries exhausted"} def _fmt_authors(authors: list, n: int = 3) -> str: if not authors: return "(unknown)" names = [a.get("name", "?") for a in authors[:n]] suffix = " et al." if len(authors) > n else "" return ", ".join(names) + suffix # ---------- Commands ---------- def cmd_search(args) -> str: data = _s2_get( f"{S2_BASE}/paper/search", {"query": args.query, "limit": min(args.limit, 20), "fields": S2_FIELDS}, ) if "error" in data: return f"搜索失败: {data['error']}" papers = data.get("data", []) if not papers: return f"没有找到与 '{args.query}' 相关的论文" out = [f"# 搜索结果 ({len(papers)} 篇)\n"] for i, p in enumerate(papers, 1): title = p.get("title", "无标题") year = p.get("year", "?") citations = p.get("citationCount", 0) authors = _fmt_authors(p.get("authors", [])) abstract = (p.get("abstract") or "").strip()[:200] ext = p.get("externalIds") or {} arxiv_id = ext.get("ArXiv", "") out.append( f"## {i}. {title}\n" f"**Authors:** {authors} \n" f"**Year:** {year} | **Citations:** {citations} \n" f"**S2 ID:** `{p.get('paperId')}`" + (f" | **arXiv:** `{arxiv_id}`" if arxiv_id else "") + " \n" f"**Abstract:** {abstract}{'...' if abstract else '(无摘要)'}\n" ) return "\n".join(out) def cmd_detail(args) -> str: pid = args.paper_id # Auto-detect ID type if pid.startswith(("10.", "ARXIV:", "DOI:", "MAG:", "PMID:", "PMCID:")): lookup = pid elif pid.isdigit() and len(pid) >= 10: lookup = f"ARXIV:{pid}" else: lookup = pid # assume raw S2 paperId fields = S2_FIELDS + ",references.title,references.year,tldr" data = _s2_get(f"{S2_BASE}/paper/{lookup}", {"fields": fields}) if "error" in data: return f"查询失败: {data['error']}" title = data.get("title", "无标题") authors = _fmt_authors(data.get("authors", []), n=5) year = data.get("year", "?") citations = data.get("citationCount", 0) abstract = data.get("abstract") or "(无摘要)" tldr = (data.get("tldr") or {}).get("text") or "(无 TL;DR)" refs = (data.get("references") or [])[:10] out = [ f"# {title}", f"**Authors:** {authors} ", f"**Year:** {year} | **Citations:** {citations} ", f"**ID:** `{data.get('paperId')}` ", f"**URL:** {data.get('url', '')}", "", "## TL;DR", tldr, "", "## Abstract", abstract, "", f"## Top {len(refs)} References", ] for i, r in enumerate(refs, 1): out.append(f"{i}. {r.get('title', '?')} ({r.get('year', '?')})") return "\n".join(out) def cmd_citations(args) -> str: data = _s2_get( f"{S2_BASE}/paper/{args.paper_id}/citations", { "limit": min(args.limit, 20), "fields": "title,year,authors", }, ) if "error" in data: return f"查询失败: {data['error']}" cites = data.get("data", []) if not cites: return "没有找到引用此论文的记录" out = [f"# 引用此论文的论文 ({len(cites)} 篇)\n"] for i, item in enumerate(cites, 1): p = item.get("citingPaper", {}) title = p.get("title", "?") year = p.get("year", "?") authors = _fmt_authors(p.get("authors", []), n=2) out.append(f"{i}. **{title}** ({year}) — {authors}") return "\n".join(out) def cmd_arxiv(args) -> str: try: import arxiv except ImportError: return "需要安装 arxiv: pip install arxiv" search = arxiv.Search( query=args.query, max_results=min(args.max_results, 10), sort_by=arxiv.SortCriterion.Relevance, ) results = list(arxiv.Client().results(search)) if not results: return f"没有找到与 '{args.query}' 相关的 arXiv 论文" out = [f"# arXiv 搜索结果 ({len(results)} 篇)\n"] for i, p in enumerate(results, 1): arxiv_id = p.entry_id.rsplit("/", 1)[-1] out.append( f"## {i}. {p.title}\n" f"**Authors:** {', '.join(a.name for a in p.authors[:3])} \n" f"**arXiv ID:** `{arxiv_id}` \n" f"**Published:** {p.published.strftime('%Y-%m-%d')} \n" f"**Summary:** {p.summary[:200].strip()}...\n" ) return "\n".join(out) def cmd_download(args) -> str: try: import arxiv except ImportError: return "需要安装 arxiv: pip install arxiv" save_dir = Path(args.save_dir).resolve() save_dir.mkdir(parents=True, exist_ok=True) search = arxiv.Search(id_list=[args.arxiv_id]) paper = next(arxiv.Client().results(search), None) if paper is None: return f"找不到 arXiv ID: {args.arxiv_id}" path = paper.download_pdf(dirpath=str(save_dir)) return f"已下载: {path}" def cmd_read(args) -> str: try: import fitz # PyMuPDF except ImportError: return "需要安装 PyMuPDF: pip install PyMuPDF" pdf = Path(args.pdf_path) if not pdf.exists(): return f"PDF 不存在: {pdf}" doc = fitz.open(str(pdf)) pages = min(args.max_pages, doc.page_count) chunks = [] for i in range(pages): text = doc.load_page(i).get_text().strip() if text: chunks.append(f"--- Page {i + 1} ---\n{text}") doc.close() if not chunks: return "PDF无法提取文本(可能是扫描件)" return "\n\n".join(chunks) # ---------- CLI ---------- def main(): parser = argparse.ArgumentParser(prog="papers", description=__doc__) sub = parser.add_subparsers(dest="cmd", required=True) p = sub.add_parser("search", help="Semantic Scholar 搜索") p.add_argument("query") p.add_argument("--limit", type=int, default=10) p.set_defaults(fn=cmd_search) p = sub.add_parser("detail", help="论文详情 (支持 DOI / ARXIV:id / S2 paperId)") p.add_argument("paper_id") p.set_defaults(fn=cmd_detail) p = sub.add_parser("citations", help="该论文的引用列表") p.add_argument("paper_id") p.add_argument("--limit", type=int, default=10) p.set_defaults(fn=cmd_citations) p = sub.add_parser("arxiv", help="arXiv 搜索") p.add_argument("query") p.add_argument("--max-results", type=int, default=5) p.set_defaults(fn=cmd_arxiv) p = sub.add_parser("download", help="下载 arXiv PDF") p.add_argument("arxiv_id") p.add_argument("--save-dir", default=".") p.set_defaults(fn=cmd_download) p = sub.add_parser("read", help="提取 PDF 文本 (PyMuPDF)") p.add_argument("pdf_path") p.add_argument("--max-pages", type=int, default=10) p.set_defaults(fn=cmd_read) args = parser.parse_args() try: print(args.fn(args)) except Exception as e: print(f"错误: {type(e).__name__}: {e}", file=sys.stderr) sys.exit(1) if __name__ == "__main__": main()