#!/usr/bin/env -S uv run # /// script # requires-python = ">=3.10" # dependencies = [ # "huggingface_hub", # "pyyaml", # "requests", # "python-dotenv", # ] # /// """ Paper Manager for Hugging Face Hub Manages paper indexing, linking, authorship, and article creation. """ import argparse import os import sys import re import json from pathlib import Path from typing import Optional, List, Dict, Any from datetime import datetime try: from huggingface_hub import HfApi, hf_hub_download, get_token import yaml import requests from dotenv import load_dotenv except ImportError as e: print(f"Error: Missing required dependency: {e}") print("Tip: run this script with `uv run scripts/paper_manager.py ...`.") sys.exit(1) # Load environment variables load_dotenv() class PaperManager: """Manages paper publishing operations on Hugging Face Hub.""" def __init__(self, hf_token: Optional[str] = None): """Initialize Paper Manager with HF token.""" self.token = hf_token or os.getenv("HF_TOKEN") or get_token() if not self.token: print("Warning: No HF_TOKEN found. Some operations will fail.") self.api = HfApi(token=self.token) def index_paper(self, arxiv_id: str) -> Dict[str, Any]: """ Index a paper on Hugging Face from arXiv. Args: arxiv_id: arXiv identifier (e.g., "2301.12345") Returns: dict: Status information """ # Clean and validate arXiv ID try: arxiv_id = self._clean_arxiv_id(arxiv_id) except ValueError as e: print(f"Error: {e}") return {"status": "error", "message": str(e)} print(f"Indexing paper {arxiv_id} on Hugging Face...") # Check if paper exists paper_url = f"https://huggingface.co/papers/{arxiv_id}" try: response = requests.get(paper_url, timeout=10) if response.status_code == 200: print(f"✓ Paper already indexed at {paper_url}") return {"status": "exists", "url": paper_url} else: print(f"Paper not indexed. Visit {paper_url} to trigger indexing.") print("The paper will be automatically indexed when you first visit the URL.") return {"status": "not_indexed", "url": paper_url, "action": "visit_url"} except requests.RequestException as e: print(f"Error checking paper status: {e}") return {"status": "error", "message": str(e)} def check_paper(self, arxiv_id: str) -> Dict[str, Any]: """ Check if a paper exists on Hugging Face. Args: arxiv_id: arXiv identifier Returns: dict: Paper status and metadata """ try: arxiv_id = self._clean_arxiv_id(arxiv_id) except ValueError as e: return {"exists": False, "error": str(e)} paper_url = f"https://huggingface.co/papers/{arxiv_id}" try: response = requests.get(paper_url, timeout=10) if response.status_code == 200: return { "exists": True, "url": paper_url, "arxiv_id": arxiv_id, "arxiv_url": f"https://arxiv.org/abs/{arxiv_id}" } else: return { "exists": False, "arxiv_id": arxiv_id, "index_url": paper_url, "message": f"Visit {paper_url} to index this paper" } except requests.RequestException as e: return {"exists": False, "error": str(e)} def link_paper_to_repo( self, repo_id: str, arxiv_id: str, repo_type: str = "model", citation: Optional[str] = None, create_pr: bool = False ) -> Dict[str, Any]: """ Link a paper to a model/dataset/space repository. Args: repo_id: Repository identifier (e.g., "username/repo-name") arxiv_id: arXiv identifier repo_type: Type of repository ("model", "dataset", or "space") citation: Optional full citation text create_pr: Create a PR instead of direct commit Returns: dict: Operation status """ try: arxiv_id = self._clean_arxiv_id(arxiv_id) except ValueError as e: print(f"Error: {e}") return {"status": "error", "message": str(e)} print(f"Linking paper {arxiv_id} to {repo_type} {repo_id}...") try: # Download current README readme_path = hf_hub_download( repo_id=repo_id, filename="README.md", repo_type=repo_type, token=self.token ) with open(readme_path, 'r', encoding='utf-8') as f: content = f.read() # Parse or create YAML frontmatter updated_content = self._add_paper_to_readme(content, arxiv_id, citation) # Upload updated README commit_message = f"Add paper reference: arXiv:{arxiv_id}" if create_pr: # Create PR (not implemented in basic version) print("PR creation not yet implemented. Committing directly.") self.api.upload_file( path_or_fileobj=updated_content.encode('utf-8'), path_in_repo="README.md", repo_id=repo_id, repo_type=repo_type, commit_message=commit_message, token=self.token ) paper_url = f"https://huggingface.co/papers/{arxiv_id}" repo_url = f"https://huggingface.co/{repo_id}" print(f"✓ Successfully linked paper to repository") print(f" Paper: {paper_url}") print(f" Repo: {repo_url}") return { "status": "success", "paper_url": paper_url, "repo_url": repo_url, "arxiv_id": arxiv_id } except Exception as e: print(f"Error linking paper: {e}") return {"status": "error", "message": str(e)} def _add_paper_to_readme( self, content: str, arxiv_id: str, citation: Optional[str] = None ) -> str: """ Add paper reference to README content. Args: content: Current README content arxiv_id: arXiv identifier citation: Optional citation text Returns: str: Updated README content """ arxiv_url = f"https://arxiv.org/abs/{arxiv_id}" hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}" # Check if YAML frontmatter exists yaml_pattern = r'^---\s*\n(.*?)\n---\s*\n' match = re.match(yaml_pattern, content, re.DOTALL) if match: # YAML exists, check if paper already referenced if arxiv_id in content: print(f"Paper {arxiv_id} already referenced in README") return content # Add to existing content (after YAML) yaml_end = match.end() before = content[:yaml_end] after = content[yaml_end:] else: # No YAML, add minimal frontmatter yaml_content = "---\n---\n\n" before = yaml_content after = content # Add paper reference section with boundary markers paper_section = "\n\n" paper_section += f"## Paper\n\n" paper_section += f"This {'model' if 'model' in content.lower() else 'work'} is based on research presented in:\n\n" paper_section += f"**[View on arXiv]({arxiv_url})** | " paper_section += f"**[View on Hugging Face]({hf_paper_url})**\n\n" if citation: safe_citation = self._sanitize_text(citation) paper_section += f"### Citation\n\n```bibtex\n{safe_citation}\n```\n\n" paper_section += "\n" # Insert after YAML, before main content updated_content = before + paper_section + after return updated_content def create_research_article( self, template: str, title: str, output: str, authors: Optional[str] = None, abstract: Optional[str] = None ) -> Dict[str, Any]: """ Create a research article from template. Args: template: Template name ("standard", "modern", "arxiv", "ml-report") title: Paper title output: Output filename authors: Comma-separated author names abstract: Abstract text Returns: dict: Creation status """ print(f"Creating research article with '{template}' template...") # Load template template_dir = Path(__file__).parent.parent / "templates" template_file = template_dir / f"{template}.md" if not template_file.exists(): return { "status": "error", "message": f"Template '{template}' not found at {template_file}" } with open(template_file, 'r', encoding='utf-8') as f: template_content = f.read() # Prepare safe values for different contexts date_str = datetime.now().strftime("%Y-%m-%d") safe_title_body = self._sanitize_text(title) authors_val = authors if authors else "Your Name" safe_authors_body = self._sanitize_text(authors_val) abstract_val = abstract if abstract else "Abstract to be written..." safe_abstract_body = self._sanitize_text(abstract_val) # Split frontmatter from body for context-aware escaping fm_pattern = r'^(---\s*\n)(.*?\n)(---\s*\n)' fm_match = re.match(fm_pattern, template_content, re.DOTALL) if fm_match: fm_open, fm_body, fm_close = fm_match.group(1), fm_match.group(2), fm_match.group(3) body = template_content[fm_match.end():] # YAML-escape values in frontmatter fm_body = fm_body.replace("{{TITLE}}", self._escape_yaml_value(title)) fm_body = fm_body.replace("{{AUTHORS}}", self._escape_yaml_value(authors_val)) fm_body = fm_body.replace("{{DATE}}", date_str) # Sanitize values in body body = body.replace("{{TITLE}}", safe_title_body) body = body.replace("{{AUTHORS}}", safe_authors_body) body = body.replace("{{ABSTRACT}}", safe_abstract_body) body = body.replace("{{DATE}}", date_str) content = fm_open + fm_body + fm_close + body else: # No frontmatter — sanitize everything content = template_content.replace("{{TITLE}}", safe_title_body) content = content.replace("{{DATE}}", date_str) content = content.replace("{{AUTHORS}}", safe_authors_body) content = content.replace("{{ABSTRACT}}", safe_abstract_body) # Write output with open(output, 'w', encoding='utf-8') as f: f.write(content) print(f"✓ Research article created at {output}") return { "status": "success", "output": output, "template": template } def get_arxiv_info(self, arxiv_id: str) -> Dict[str, Any]: """ Fetch paper information from arXiv API. Args: arxiv_id: arXiv identifier Returns: dict: Paper metadata """ try: arxiv_id = self._clean_arxiv_id(arxiv_id) except ValueError as e: return {"error": str(e)} api_url = f"https://export.arxiv.org/api/query?id_list={arxiv_id}" try: response = requests.get(api_url, timeout=10) response.raise_for_status() # Parse XML response (simplified) content = response.text # Extract basic info with regex (proper XML parsing would be better) title_match = re.search(r'