#!/usr/bin/env -S uv run # /// script # requires-python = ">=3.10" # dependencies = [ # "huggingface_hub", # "pyyaml", # "requests", # "python-dotenv", # ] # /// """ Paper Manager for Hugging Face Hub Manages paper indexing, linking, authorship, and article creation. """ import argparse import os import sys import re import json from pathlib import Path from typing import Optional, List, Dict, Any from datetime import datetime try: from huggingface_hub import HfApi, hf_hub_download, get_token import yaml import requests from dotenv import load_dotenv except ImportError as e: print(f"Error: Missing required dependency: {e}") print("Tip: run this script with `uv run scripts/paper_manager.py ...`.") sys.exit(1) # Load environment variables load_dotenv() class PaperManager: """Manages paper publishing operations on Hugging Face Hub.""" def __init__(self, hf_token: Optional[str] = None): """Initialize Paper Manager with HF token.""" self.token = hf_token or os.getenv("HF_TOKEN") or get_token() if not self.token: print("Warning: No HF_TOKEN found. Some operations will fail.") self.api = HfApi(token=self.token) def index_paper(self, arxiv_id: str) -> Dict[str, Any]: """ Index a paper on Hugging Face from arXiv. Args: arxiv_id: arXiv identifier (e.g., "2301.12345") Returns: dict: Status information """ # Clean and validate arXiv ID try: arxiv_id = self._clean_arxiv_id(arxiv_id) except ValueError as e: print(f"Error: {e}") return {"status": "error", "message": str(e)} print(f"Indexing paper {arxiv_id} on Hugging Face...") # Check if paper exists paper_url = f"https://huggingface.co/papers/{arxiv_id}" try: response = requests.get(paper_url, timeout=10) if response.status_code == 200: print(f"✓ Paper already indexed at {paper_url}") return {"status": "exists", "url": paper_url} else: print(f"Paper not indexed. Visit {paper_url} to trigger indexing.") print("The paper will be automatically indexed when you first visit the URL.") return {"status": "not_indexed", "url": paper_url, "action": "visit_url"} except requests.RequestException as e: print(f"Error checking paper status: {e}") return {"status": "error", "message": str(e)} def check_paper(self, arxiv_id: str) -> Dict[str, Any]: """ Check if a paper exists on Hugging Face. Args: arxiv_id: arXiv identifier Returns: dict: Paper status and metadata """ try: arxiv_id = self._clean_arxiv_id(arxiv_id) except ValueError as e: return {"exists": False, "error": str(e)} paper_url = f"https://huggingface.co/papers/{arxiv_id}" try: response = requests.get(paper_url, timeout=10) if response.status_code == 200: return { "exists": True, "url": paper_url, "arxiv_id": arxiv_id, "arxiv_url": f"https://arxiv.org/abs/{arxiv_id}" } else: return { "exists": False, "arxiv_id": arxiv_id, "index_url": paper_url, "message": f"Visit {paper_url} to index this paper" } except requests.RequestException as e: return {"exists": False, "error": str(e)} def link_paper_to_repo( self, repo_id: str, arxiv_id: str, repo_type: str = "model", citation: Optional[str] = None, create_pr: bool = False ) -> Dict[str, Any]: """ Link a paper to a model/dataset/space repository. Args: repo_id: Repository identifier (e.g., "username/repo-name") arxiv_id: arXiv identifier repo_type: Type of repository ("model", "dataset", or "space") citation: Optional full citation text create_pr: Create a PR instead of direct commit Returns: dict: Operation status """ try: arxiv_id = self._clean_arxiv_id(arxiv_id) except ValueError as e: print(f"Error: {e}") return {"status": "error", "message": str(e)} print(f"Linking paper {arxiv_id} to {repo_type} {repo_id}...") try: # Download current README readme_path = hf_hub_download( repo_id=repo_id, filename="README.md", repo_type=repo_type, token=self.token ) with open(readme_path, 'r', encoding='utf-8') as f: content = f.read() # Parse or create YAML frontmatter updated_content = self._add_paper_to_readme(content, arxiv_id, citation) # Upload updated README commit_message = f"Add paper reference: arXiv:{arxiv_id}" if create_pr: # Create PR (not implemented in basic version) print("PR creation not yet implemented. Committing directly.") self.api.upload_file( path_or_fileobj=updated_content.encode('utf-8'), path_in_repo="README.md", repo_id=repo_id, repo_type=repo_type, commit_message=commit_message, token=self.token ) paper_url = f"https://huggingface.co/papers/{arxiv_id}" repo_url = f"https://huggingface.co/{repo_id}" print(f"✓ Successfully linked paper to repository") print(f" Paper: {paper_url}") print(f" Repo: {repo_url}") return { "status": "success", "paper_url": paper_url, "repo_url": repo_url, "arxiv_id": arxiv_id } except Exception as e: print(f"Error linking paper: {e}") return {"status": "error", "message": str(e)} def _add_paper_to_readme( self, content: str, arxiv_id: str, citation: Optional[str] = None ) -> str: """ Add paper reference to README content. Args: content: Current README content arxiv_id: arXiv identifier citation: Optional citation text Returns: str: Updated README content """ arxiv_url = f"https://arxiv.org/abs/{arxiv_id}" hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}" # Check if YAML frontmatter exists yaml_pattern = r'^---\s*\n(.*?)\n---\s*\n' match = re.match(yaml_pattern, content, re.DOTALL) if match: # YAML exists, check if paper already referenced if arxiv_id in content: print(f"Paper {arxiv_id} already referenced in README") return content # Add to existing content (after YAML) yaml_end = match.end() before = content[:yaml_end] after = content[yaml_end:] else: # No YAML, add minimal frontmatter yaml_content = "---\n---\n\n" before = yaml_content after = content # Add paper reference section with boundary markers paper_section = "\n\n" paper_section += f"## Paper\n\n" paper_section += f"This {'model' if 'model' in content.lower() else 'work'} is based on research presented in:\n\n" paper_section += f"**[View on arXiv]({arxiv_url})** | " paper_section += f"**[View on Hugging Face]({hf_paper_url})**\n\n" if citation: safe_citation = self._sanitize_text(citation) paper_section += f"### Citation\n\n```bibtex\n{safe_citation}\n```\n\n" paper_section += "\n" # Insert after YAML, before main content updated_content = before + paper_section + after return updated_content def create_research_article( self, template: str, title: str, output: str, authors: Optional[str] = None, abstract: Optional[str] = None ) -> Dict[str, Any]: """ Create a research article from template. Args: template: Template name ("standard", "modern", "arxiv", "ml-report") title: Paper title output: Output filename authors: Comma-separated author names abstract: Abstract text Returns: dict: Creation status """ print(f"Creating research article with '{template}' template...") # Load template template_dir = Path(__file__).parent.parent / "templates" template_file = template_dir / f"{template}.md" if not template_file.exists(): return { "status": "error", "message": f"Template '{template}' not found at {template_file}" } with open(template_file, 'r', encoding='utf-8') as f: template_content = f.read() # Prepare safe values for different contexts date_str = datetime.now().strftime("%Y-%m-%d") safe_title_body = self._sanitize_text(title) authors_val = authors if authors else "Your Name" safe_authors_body = self._sanitize_text(authors_val) abstract_val = abstract if abstract else "Abstract to be written..." safe_abstract_body = self._sanitize_text(abstract_val) # Split frontmatter from body for context-aware escaping fm_pattern = r'^(---\s*\n)(.*?\n)(---\s*\n)' fm_match = re.match(fm_pattern, template_content, re.DOTALL) if fm_match: fm_open, fm_body, fm_close = fm_match.group(1), fm_match.group(2), fm_match.group(3) body = template_content[fm_match.end():] # YAML-escape values in frontmatter fm_body = fm_body.replace("{{TITLE}}", self._escape_yaml_value(title)) fm_body = fm_body.replace("{{AUTHORS}}", self._escape_yaml_value(authors_val)) fm_body = fm_body.replace("{{DATE}}", date_str) # Sanitize values in body body = body.replace("{{TITLE}}", safe_title_body) body = body.replace("{{AUTHORS}}", safe_authors_body) body = body.replace("{{ABSTRACT}}", safe_abstract_body) body = body.replace("{{DATE}}", date_str) content = fm_open + fm_body + fm_close + body else: # No frontmatter — sanitize everything content = template_content.replace("{{TITLE}}", safe_title_body) content = content.replace("{{DATE}}", date_str) content = content.replace("{{AUTHORS}}", safe_authors_body) content = content.replace("{{ABSTRACT}}", safe_abstract_body) # Write output with open(output, 'w', encoding='utf-8') as f: f.write(content) print(f"✓ Research article created at {output}") return { "status": "success", "output": output, "template": template } def get_arxiv_info(self, arxiv_id: str) -> Dict[str, Any]: """ Fetch paper information from arXiv API. Args: arxiv_id: arXiv identifier Returns: dict: Paper metadata """ try: arxiv_id = self._clean_arxiv_id(arxiv_id) except ValueError as e: return {"error": str(e)} api_url = f"https://export.arxiv.org/api/query?id_list={arxiv_id}" try: response = requests.get(api_url, timeout=10) response.raise_for_status() # Parse XML response (simplified) content = response.text # Extract basic info with regex (proper XML parsing would be better) title_match = re.search(r'(.*?)', content, re.DOTALL) authors_matches = re.findall(r'(.*?)', content) summary_match = re.search(r'(.*?)', content, re.DOTALL) # Sanitize all text extracted from the external API raw_title = title_match.group(1).strip() if title_match else None raw_authors = authors_matches[1:] if len(authors_matches) > 1 else [] raw_abstract = summary_match.group(1).strip() if summary_match else None return { "arxiv_id": arxiv_id, "title": self._sanitize_text(raw_title) if raw_title else None, "authors": [self._sanitize_text(a) for a in raw_authors], "abstract": self._sanitize_text(raw_abstract) if raw_abstract else None, "arxiv_url": f"https://arxiv.org/abs/{arxiv_id}", "pdf_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf" } except Exception as e: return {"error": str(e)} def generate_citation( self, arxiv_id: str, format: str = "bibtex" ) -> str: """ Generate citation for a paper. Args: arxiv_id: arXiv identifier format: Citation format ("bibtex", "apa", "mla") Returns: str: Formatted citation """ try: arxiv_id = self._clean_arxiv_id(arxiv_id) except ValueError as e: return f"Error: {e}" info = self.get_arxiv_info(arxiv_id) if "error" in info: return f"Error fetching paper info: {info['error']}" if format == "bibtex": # Generate BibTeX citation key = f"arxiv{arxiv_id.replace('.', '_')}" raw_authors = " and ".join(info.get("authors", ["Unknown"])) raw_title = info.get("title", "Untitled") year = arxiv_id.split(".")[0][:2] # Extract year from ID (simplified) year = f"20{year}" if int(year) < 50 else f"19{year}" # Escape BibTeX structural characters in untrusted values safe_title = raw_title.replace('{', r'\{').replace('}', r'\}') safe_authors = raw_authors.replace('{', r'\{').replace('}', r'\}') citation = f"""@article{{{key}, title={{{safe_title}}}, author={{{safe_authors}}}, journal={{arXiv preprint arXiv:{arxiv_id}}}, year={{{year}}} }}""" return citation return f"Format '{format}' not yet implemented" # Patterns for valid arXiv IDs _ARXIV_ID_MODERN = re.compile(r'^\d{4}\.\d{4,5}(v\d+)?$') _ARXIV_ID_LEGACY = re.compile(r'^[a-zA-Z\-]+/\d{7}(v\d+)?$') @staticmethod def _clean_arxiv_id(arxiv_id: str) -> str: """Clean, normalize, and validate arXiv ID. Raises: ValueError: If the cleaned ID does not match a valid arXiv format. """ # Remove common prefixes and whitespace arxiv_id = arxiv_id.strip() arxiv_id = re.sub(r'^(arxiv:|arXiv:)', '', arxiv_id, flags=re.IGNORECASE) arxiv_id = re.sub(r'https?://arxiv\.org/(abs|pdf)/', '', arxiv_id) arxiv_id = arxiv_id.replace('.pdf', '') # Validate format if not (PaperManager._ARXIV_ID_MODERN.match(arxiv_id) or PaperManager._ARXIV_ID_LEGACY.match(arxiv_id)): raise ValueError( f"Invalid arXiv ID: {arxiv_id!r}. " "Expected format: YYMM.NNNNN[vN] or category/YYMMNNN[vN]" ) return arxiv_id @staticmethod def _escape_yaml_value(value: str) -> str: """Escape a string for safe use as a YAML scalar value. Wraps in double quotes and escapes internal quotes and backslashes to prevent YAML injection via crafted titles/authors. """ value = value.replace('\\', '\\\\').replace('"', '\\"') return f'"{value}"' @staticmethod def _sanitize_text(text: str) -> str: """Sanitize untrusted text for safe inclusion in Markdown/YAML output. Normalizes whitespace, strips control characters, and neutralizes markdown code-fence breakout and YAML document delimiters. """ # Remove control characters (keep newlines and tabs) text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text) # Normalize whitespace runs (collapse multiple spaces/tabs, preserve single newlines) text = re.sub(r'[^\S\n]+', ' ', text) text = re.sub(r'\n{3,}', '\n\n', text) # Neutralize markdown code fence breakout text = text.replace('```', r'\`\`\`') # Neutralize YAML document delimiters at line start text = re.sub(r'^---', r'\\---', text, flags=re.MULTILINE) return text.strip() def main(): """Main CLI entry point.""" parser = argparse.ArgumentParser( description="Paper Manager for Hugging Face Hub", formatter_class=argparse.RawDescriptionHelpFormatter ) subparsers = parser.add_subparsers(dest="command", help="Command to execute") # Index command index_parser = subparsers.add_parser("index", help="Index a paper from arXiv") index_parser.add_argument("--arxiv-id", required=True, help="arXiv paper ID") # Check command check_parser = subparsers.add_parser("check", help="Check if paper exists") check_parser.add_argument("--arxiv-id", required=True, help="arXiv paper ID") # Link command link_parser = subparsers.add_parser("link", help="Link paper to repository") link_parser.add_argument("--repo-id", required=True, help="Repository ID") link_parser.add_argument("--repo-type", default="model", choices=["model", "dataset", "space"]) link_parser.add_argument("--arxiv-id", help="Single arXiv ID") link_parser.add_argument("--arxiv-ids", help="Comma-separated arXiv IDs") link_parser.add_argument("--citation", help="Full citation text") link_parser.add_argument("--create-pr", action="store_true", help="Create PR instead of direct commit") # Create command create_parser = subparsers.add_parser("create", help="Create research article") create_parser.add_argument("--template", required=True, help="Template name") create_parser.add_argument("--title", required=True, help="Paper title") create_parser.add_argument("--output", required=True, help="Output filename") create_parser.add_argument("--authors", help="Comma-separated authors") create_parser.add_argument("--abstract", help="Abstract text") # Info command info_parser = subparsers.add_parser("info", help="Get paper information") info_parser.add_argument("--arxiv-id", required=True, help="arXiv paper ID") info_parser.add_argument("--format", default="json", choices=["json", "text"]) # Citation command citation_parser = subparsers.add_parser("citation", help="Generate citation") citation_parser.add_argument("--arxiv-id", required=True, help="arXiv paper ID") citation_parser.add_argument("--format", default="bibtex", choices=["bibtex", "apa", "mla"]) # Search command search_parser = subparsers.add_parser("search", help="Search papers") search_parser.add_argument("--query", required=True, help="Search query") args = parser.parse_args() if not args.command: parser.print_help() sys.exit(1) # Initialize manager manager = PaperManager() # Execute command if args.command == "index": result = manager.index_paper(args.arxiv_id) print(json.dumps(result, indent=2)) elif args.command == "check": result = manager.check_paper(args.arxiv_id) print(json.dumps(result, indent=2)) elif args.command == "link": arxiv_ids = [] if args.arxiv_id: arxiv_ids.append(args.arxiv_id) if args.arxiv_ids: arxiv_ids.extend([id.strip() for id in args.arxiv_ids.split(",")]) if not arxiv_ids: print("Error: Must provide --arxiv-id or --arxiv-ids") sys.exit(1) for arxiv_id in arxiv_ids: result = manager.link_paper_to_repo( repo_id=args.repo_id, arxiv_id=arxiv_id, repo_type=args.repo_type, citation=args.citation, create_pr=args.create_pr ) print(json.dumps(result, indent=2)) elif args.command == "create": result = manager.create_research_article( template=args.template, title=args.title, output=args.output, authors=args.authors, abstract=args.abstract ) print(json.dumps(result, indent=2)) elif args.command == "info": result = manager.get_arxiv_info(args.arxiv_id) if args.format == "json": print(json.dumps(result, indent=2)) else: if "error" in result: print(f"Error: {result['error']}") else: print(f"Title: {result.get('title')}") print(f"Authors: {', '.join(result.get('authors', []))}") print(f"arXiv URL: {result.get('arxiv_url')}") print(f"\nAbstract:\n{result.get('abstract')}") elif args.command == "citation": citation = manager.generate_citation(args.arxiv_id, args.format) print(citation) elif args.command == "search": print(f"Searching for: {args.query}") print("Search functionality coming soon!") print(f"Visit: https://huggingface.co/papers?search={args.query}") if __name__ == "__main__": main()