playbook/antigravity-awesome-skills/skills/hugging-face-paper-publisher/scripts/paper_manager.py

#!/usr/bin/env -S uv run
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "huggingface_hub",
#     "pyyaml",
#     "requests",
#     "python-dotenv",
# ]
# ///
"""
Paper Manager for Hugging Face Hub
Manages paper indexing, linking, authorship, and article creation.
"""

import argparse
import os
import sys
import re
import json
from pathlib import Path
from typing import Optional, List, Dict, Any
from datetime import datetime

try:
    from huggingface_hub import HfApi, hf_hub_download, get_token
    import yaml
    import requests
    from dotenv import load_dotenv
except ImportError as e:
    print(f"Error: Missing required dependency: {e}")
    print("Tip: run this script with `uv run scripts/paper_manager.py ...`.")
    sys.exit(1)

# Load environment variables
load_dotenv()


class PaperManager:
    """Manages paper publishing operations on Hugging Face Hub."""

    def __init__(self, hf_token: Optional[str] = None):
        """Resolve an access token and build the Hub API client.

        The token is taken from the first truthy source, in this order: the
        ``hf_token`` argument, the ``HF_TOKEN`` environment variable, then
        ``get_token()`` from ``huggingface_hub``. A missing token is not
        fatal here; only a warning is printed.

        Args:
            hf_token: Explicit access token, or None to fall back to the
                environment / locally stored token.
        """
        resolved = hf_token
        if not resolved:
            resolved = os.getenv("HF_TOKEN")
        if not resolved:
            resolved = get_token()

        self.token = resolved
        if not resolved:
            print("Warning: No HF_TOKEN found. Some operations will fail.")

        self.api = HfApi(token=resolved)

    def index_paper(self, arxiv_id: str) -> Dict[str, Any]:
        """
        Report whether a paper page exists on Hugging Face and how to index it.

        The method normalizes the identifier, issues a GET request to the
        paper's Hugging Face URL, and interprets the HTTP status code.

        Args:
            arxiv_id: arXiv identifier (e.g., "2301.12345")

        Returns:
            dict: ``{"status": "exists", "url": ...}`` on HTTP 200,
            ``{"status": "not_indexed", "url": ..., "action": "visit_url"}``
            on any other status code, or
            ``{"status": "error", "message": ...}`` when the identifier is
            invalid or the request fails.
        """
        # Guard: reject malformed identifiers before any network traffic
        try:
            normalized = self._clean_arxiv_id(arxiv_id)
        except ValueError as err:
            print(f"Error: {err}")
            return {"status": "error", "message": str(err)}

        print(f"Indexing paper {normalized} on Hugging Face...")
        paper_url = f"https://huggingface.co/papers/{normalized}"

        # Only the HTTP round-trip can raise a RequestException
        try:
            status_code = requests.get(paper_url, timeout=10).status_code
        except requests.RequestException as err:
            print(f"Error checking paper status: {err}")
            return {"status": "error", "message": str(err)}

        if status_code == 200:
            print(f"✓ Paper already indexed at {paper_url}")
            return {"status": "exists", "url": paper_url}

        # Every non-200 answer is reported as "not indexed"
        print(f"Paper not indexed. Visit {paper_url} to trigger indexing.")
        print("The paper will be automatically indexed when you first visit the URL.")
        return {"status": "not_indexed", "url": paper_url, "action": "visit_url"}

    def check_paper(self, arxiv_id: str) -> Dict[str, Any]:
        """
        Look up a paper page on Hugging Face without printing anything.

        Args:
            arxiv_id: arXiv identifier

        Returns:
            dict: On HTTP 200, ``exists`` is True and the dict carries
            ``url``, ``arxiv_id`` and ``arxiv_url``. On any other status
            code, ``exists`` is False and the dict carries ``arxiv_id``,
            ``index_url`` and a ``message``. When the identifier is invalid
            or the request fails, the dict is
            ``{"exists": False, "error": ...}``.
        """
        try:
            normalized = self._clean_arxiv_id(arxiv_id)
        except ValueError as err:
            return {"exists": False, "error": str(err)}

        paper_url = f"https://huggingface.co/papers/{normalized}"

        try:
            found = requests.get(paper_url, timeout=10).status_code == 200
        except requests.RequestException as err:
            return {"exists": False, "error": str(err)}

        if not found:
            return {
                "exists": False,
                "arxiv_id": normalized,
                "index_url": paper_url,
                "message": f"Visit {paper_url} to index this paper"
            }

        return {
            "exists": True,
            "url": paper_url,
            "arxiv_id": normalized,
            "arxiv_url": f"https://arxiv.org/abs/{normalized}"
        }

    def link_paper_to_repo(
        self,
        repo_id: str,
        arxiv_id: str,
        repo_type: str = "model",
        citation: Optional[str] = None,
        create_pr: bool = False
    ) -> Dict[str, Any]:
        """
        Link a paper to a model/dataset/space repository.

        Downloads the repository's README.md, inserts a paper section via
        ``_add_paper_to_readme``, and uploads the result back to the Hub.

        Args:
            repo_id: Repository identifier (e.g., "username/repo-name")
            arxiv_id: arXiv identifier
            repo_type: Type of repository ("model", "dataset", or "space")
            citation: Optional full citation text
            create_pr: When True, the README change is proposed as a pull
                request instead of being committed directly.

        Returns:
            dict: ``{"status": "success", "paper_url", "repo_url",
            "arxiv_id"}`` on success (plus ``"pr_url"`` when a pull request
            was opened and the Hub reported its URL), or
            ``{"status": "error", "message": ...}`` on any failure.
        """
        try:
            arxiv_id = self._clean_arxiv_id(arxiv_id)
        except ValueError as e:
            print(f"Error: {e}")
            return {"status": "error", "message": str(e)}

        print(f"Linking paper {arxiv_id} to {repo_type} {repo_id}...")

        try:
            # Download current README
            readme_path = hf_hub_download(
                repo_id=repo_id,
                filename="README.md",
                repo_type=repo_type,
                token=self.token
            )

            with open(readme_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Parse or create YAML frontmatter and insert the paper section
            updated_content = self._add_paper_to_readme(content, arxiv_id, citation)

            # Forward create_pr so a requested PR is never silently turned
            # into a direct commit on the default branch.
            commit_info = self.api.upload_file(
                path_or_fileobj=updated_content.encode('utf-8'),
                path_in_repo="README.md",
                repo_id=repo_id,
                repo_type=repo_type,
                commit_message=f"Add paper reference: arXiv:{arxiv_id}",
                token=self.token,
                create_pr=create_pr
            )

            paper_url = f"https://huggingface.co/papers/{arxiv_id}"
            # Dataset and Space pages live under a type-specific path prefix;
            # model repositories sit directly under the site root.
            type_prefix = {"dataset": "datasets/", "space": "spaces/"}.get(repo_type, "")
            repo_url = f"https://huggingface.co/{type_prefix}{repo_id}"
            # getattr keeps this working if upload_file returns a plain value
            # without PR metadata.
            pr_url = getattr(commit_info, "pr_url", None) if create_pr else None

            print("✓ Successfully linked paper to repository")
            print(f"  Paper: {paper_url}")
            print(f"  Repo: {repo_url}")
            if pr_url:
                print(f"  PR: {pr_url}")

            result = {
                "status": "success",
                "paper_url": paper_url,
                "repo_url": repo_url,
                "arxiv_id": arxiv_id
            }
            if pr_url:
                result["pr_url"] = pr_url
            return result

        except Exception as e:
            # Boundary handler: Hub download/upload can fail in many ways
            # (missing README, auth, network); report instead of crashing.
            print(f"Error linking paper: {e}")
            return {"status": "error", "message": str(e)}

    def _add_paper_to_readme(
        self,
        content: str,
        arxiv_id: str,
        citation: Optional[str] = None
    ) -> str:
        """
        Add paper reference to README content.

        The paper section is wrapped in ``<!-- paper-manager:start/end -->``
        markers and placed right after the YAML frontmatter. When the README
        has no frontmatter, an empty one is prepended first. The operation is
        idempotent: if the identifier already appears anywhere in the README
        the content is returned unchanged.

        Args:
            content: Current README content
            arxiv_id: arXiv identifier (already cleaned/validated)
            citation: Optional citation text, emitted in a ``bibtex`` fence

        Returns:
            str: Updated README content (or ``content`` itself if the paper
            is already referenced)
        """
        arxiv_url = f"https://arxiv.org/abs/{arxiv_id}"
        hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"

        # Idempotency guard, applied whether or not frontmatter exists.
        # Digit boundaries stop "2301.12345" from matching "2301.123456";
        # a trailing version suffix ("v2") still counts as the same paper.
        id_pattern = rf'(?<!\d){re.escape(arxiv_id)}(?!\d)'
        if re.search(id_pattern, content):
            print(f"Paper {arxiv_id} already referenced in README")
            return content

        # The closing fence is anchored with a multiline "^" so that an empty
        # frontmatter ("---\n---\n"), which this method itself writes, is
        # recognised on later runs instead of being stacked again.
        frontmatter = re.match(
            r'^---\s*\n(.*?)^---\s*\n', content, re.DOTALL | re.MULTILINE
        )

        if frontmatter:
            # Insert directly after the existing frontmatter
            split_at = frontmatter.end()
            before = content[:split_at]
            after = content[split_at:]
        else:
            # No YAML, add minimal frontmatter
            before = "---\n---\n\n"
            after = content

        # Heuristic wording: "model" if the README mentions it, else "work"
        subject = 'model' if 'model' in content.lower() else 'work'

        # Build the paper reference section with boundary markers
        section_parts = [
            "\n<!-- paper-manager:start -->\n",
            "## Paper\n\n",
            f"This {subject} is based on research presented in:\n\n",
            f"**[View on arXiv]({arxiv_url})** | ",
            f"**[View on Hugging Face]({hf_paper_url})**\n\n",
        ]

        if citation:
            # Sanitizing neutralizes ``` so the citation cannot close the fence
            safe_citation = self._sanitize_text(citation)
            section_parts.append(
                f"### Citation\n\n```bibtex\n{safe_citation}\n```\n\n"
            )

        section_parts.append("<!-- paper-manager:end -->\n")

        # Insert after YAML, before main content
        return before + "".join(section_parts) + after

    def create_research_article(
        self,
        template: str,
        title: str,
        output: str,
        authors: Optional[str] = None,
        abstract: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Render a Markdown research article from a bundled template.

        The template is read from ``<script dir>/../templates/<template>.md``
        and its ``{{TITLE}}``, ``{{AUTHORS}}``, ``{{ABSTRACT}}`` and
        ``{{DATE}}`` placeholders are filled in. Inside YAML frontmatter the
        title and authors are YAML-escaped; everywhere else values are passed
        through ``_sanitize_text``.

        Args:
            template: Template name ("standard", "modern", "arxiv", "ml-report")
            title: Paper title
            output: Output filename
            authors: Comma-separated author names (defaults to "Your Name")
            abstract: Abstract text (defaults to a placeholder sentence)

        Returns:
            dict: ``{"status": "success", "output", "template"}`` or
            ``{"status": "error", "message"}`` when the template is missing
        """
        print(f"Creating research article with '{template}' template...")

        # Locate the template next to the scripts directory
        template_file = Path(__file__).parent.parent / "templates" / f"{template}.md"
        if not template_file.exists():
            return {
                "status": "error",
                "message": f"Template '{template}' not found at {template_file}"
            }

        with open(template_file, 'r', encoding='utf-8') as src:
            template_content = src.read()

        # Values for the Markdown body, with fallbacks for omitted inputs
        today = datetime.now().strftime("%Y-%m-%d")
        author_text = authors if authors else "Your Name"
        abstract_text = abstract if abstract else "Abstract to be written..."
        body_title = self._sanitize_text(title)
        body_authors = self._sanitize_text(author_text)
        body_abstract = self._sanitize_text(abstract_text)

        def fill(text, substitutions):
            # Apply replacements strictly in the order given
            for placeholder, value in substitutions:
                text = text.replace(placeholder, value)
            return text

        # Split frontmatter from body so each gets context-appropriate escaping
        parts = re.match(r'^(---\s*\n)(.*?\n)(---\s*\n)', template_content, re.DOTALL)

        if parts is None:
            # No frontmatter: the whole template is treated as body text
            content = fill(template_content, (
                ("{{TITLE}}", body_title),
                ("{{DATE}}", today),
                ("{{AUTHORS}}", body_authors),
                ("{{ABSTRACT}}", body_abstract),
            ))
        else:
            fm_open, fm_body, fm_close = parts.groups()

            # Frontmatter values are emitted as quoted YAML scalars
            fm_body = fill(fm_body, (
                ("{{TITLE}}", self._escape_yaml_value(title)),
                ("{{AUTHORS}}", self._escape_yaml_value(author_text)),
                ("{{DATE}}", today),
            ))

            body = fill(template_content[parts.end():], (
                ("{{TITLE}}", body_title),
                ("{{AUTHORS}}", body_authors),
                ("{{ABSTRACT}}", body_abstract),
                ("{{DATE}}", today),
            ))

            content = fm_open + fm_body + fm_close + body

        with open(output, 'w', encoding='utf-8') as dst:
            dst.write(content)

        print(f"✓ Research article created at {output}")

        return {
            "status": "success",
            "output": output,
            "template": template
        }

    def get_arxiv_info(self, arxiv_id: str) -> Dict[str, Any]:
        """
        Fetch paper information from arXiv API.

        The Atom feed returned by the API is parsed with ElementTree and only
        the first ``<entry>`` element is inspected, so feed-level metadata
        (such as the feed's own ``<title>``) can never be mistaken for paper
        metadata and no author is dropped. XML entities are decoded by the
        parser. Whitespace in the title (long titles may be wrapped across
        lines) is collapsed to single spaces.

        Args:
            arxiv_id: arXiv identifier

        Returns:
            dict: ``arxiv_id``, ``title`` (or None), ``authors`` (list),
            ``abstract`` (or None), ``arxiv_url`` and ``pdf_url`` on success;
            ``{"error": ...}`` when the identifier is invalid, the request or
            XML parsing fails, or the feed contains no entry.
        """
        # Local import: this is the only place in the file that parses XML.
        # NOTE(review): the standard-library parser is not hardened against
        # maliciously constructed XML; the input here comes from arxiv.org.
        import xml.etree.ElementTree as ET

        try:
            arxiv_id = self._clean_arxiv_id(arxiv_id)
        except ValueError as e:
            return {"error": str(e)}
        api_url = f"https://export.arxiv.org/api/query?id_list={arxiv_id}"
        atom = {"atom": "http://www.w3.org/2005/Atom"}

        try:
            response = requests.get(api_url, timeout=10)
            response.raise_for_status()

            # Parse the raw bytes so the XML declaration decides the encoding
            feed = ET.fromstring(response.content)
            entry = feed.find("atom:entry", atom)
            if entry is None:
                return {"error": f"No arXiv entry found for {arxiv_id}"}

            raw_title = entry.findtext("atom:title", default="", namespaces=atom)
            raw_abstract = entry.findtext("atom:summary", default="", namespaces=atom)
            raw_authors = [
                node.text
                for node in entry.findall("atom:author/atom:name", atom)
                if node.text and node.text.strip()
            ]

            # Titles are single-line by nature: fold any wrapping whitespace
            title = " ".join(raw_title.split())
            abstract = raw_abstract.strip()

            # Sanitize all text extracted from the external API
            return {
                "arxiv_id": arxiv_id,
                "title": self._sanitize_text(title) if title else None,
                "authors": [self._sanitize_text(a) for a in raw_authors],
                "abstract": self._sanitize_text(abstract) if abstract else None,
                "arxiv_url": f"https://arxiv.org/abs/{arxiv_id}",
                "pdf_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf"
            }
        except Exception as e:
            # Boundary handler: network, HTTP and XML errors all become a
            # uniform error dict for the caller.
            return {"error": str(e)}

    def generate_citation(
        self,
        arxiv_id: str,
        format: str = "bibtex"
    ) -> str:
        """
        Generate citation for a paper.

        Metadata is fetched through ``get_arxiv_info``. Only the "bibtex"
        format is implemented; other formats return an explanatory string.

        Args:
            arxiv_id: arXiv identifier (modern "YYMM.NNNNN" or legacy
                "archive/YYMMNNN", optionally with a version suffix)
            format: Citation format ("bibtex", "apa", "mla")

        Returns:
            str: Formatted citation, or a string starting with "Error" when
            the identifier is invalid or metadata could not be fetched
        """
        try:
            arxiv_id = self._clean_arxiv_id(arxiv_id)
        except ValueError as e:
            return f"Error: {e}"

        info = self.get_arxiv_info(arxiv_id)

        if "error" in info:
            return f"Error fetching paper info: {info['error']}"

        if format != "bibtex":
            return f"Format '{format}' not yet implemented"

        # "or" fallbacks cover both a missing key and an empty/None value
        raw_authors = " and ".join(info.get("authors") or ["Unknown"])
        raw_title = info.get("title") or "Untitled"

        # The two-digit year leads the numeric part of the identifier, which
        # for legacy IDs comes after the "archive/" prefix.
        is_legacy = "/" in arxiv_id
        yy = int(arxiv_id.rsplit("/", 1)[-1][:2])
        # Only legacy identifiers can date from the 1990s (arXiv began in
        # 1991); every modern-format identifier belongs to the 2000s.
        century = "19" if is_legacy and yy >= 91 else "20"
        year = f"{century}{yy:02d}"

        # Keep the BibTeX key to letters, digits and underscores
        key = "arxiv" + re.sub(r'[^A-Za-z0-9]+', '_', arxiv_id)

        # Escape BibTeX structural characters in untrusted values
        safe_title = raw_title.replace('{', r'\{').replace('}', r'\}')
        safe_authors = raw_authors.replace('{', r'\{').replace('}', r'\}')

        return "\n".join([
            f"@article{{{key},",
            f"  title={{{safe_title}}},",
            f"  author={{{safe_authors}}},",
            f"  journal={{arXiv preprint arXiv:{arxiv_id}}},",
            f"  year={{{year}}}",
            "}",
        ])

    # Patterns for valid arXiv IDs (used by _clean_arxiv_id for validation).
    # Modern form: four digits, a dot, four or five digits, and an optional
    # "vN" version suffix, e.g. "2301.12345" or "2301.12345v2".
    _ARXIV_ID_MODERN = re.compile(r'^\d{4}\.\d{4,5}(v\d+)?$')
    # Legacy form: an archive name made of letters/hyphens, a slash, exactly
    # seven digits, and an optional "vN" suffix, e.g. "hep-th/9901001".
    # Archive names containing a dot (e.g. "math.GT/...") are not matched.
    _ARXIV_ID_LEGACY = re.compile(r'^[a-zA-Z\-]+/\d{7}(v\d+)?$')

    @staticmethod
    def _clean_arxiv_id(arxiv_id: str) -> str:
        """Clean, normalize, and validate arXiv ID.

        Raises:
            ValueError: If the cleaned ID does not match a valid arXiv format.
        """
        # Remove common prefixes and whitespace
        arxiv_id = arxiv_id.strip()
        arxiv_id = re.sub(r'^(arxiv:|arXiv:)', '', arxiv_id, flags=re.IGNORECASE)
        arxiv_id = re.sub(r'https?://arxiv\.org/(abs|pdf)/', '', arxiv_id)
        arxiv_id = arxiv_id.replace('.pdf', '')

        # Validate format
        if not (PaperManager._ARXIV_ID_MODERN.match(arxiv_id)
                or PaperManager._ARXIV_ID_LEGACY.match(arxiv_id)):
            raise ValueError(
                f"Invalid arXiv ID: {arxiv_id!r}. "
                "Expected format: YYMM.NNNNN[vN] or category/YYMMNNN[vN]"
            )

        return arxiv_id

    @staticmethod
    def _escape_yaml_value(value: str) -> str:
        """Escape a string for safe use as a YAML scalar value.

        Wraps in double quotes and escapes internal quotes and backslashes
        to prevent YAML injection via crafted titles/authors.
        """
        value = value.replace('\\', '\\\\').replace('"', '\\"')
        return f'"{value}"'

    @staticmethod
    def _sanitize_text(text: str) -> str:
        """Sanitize untrusted text for safe inclusion in Markdown/YAML output.

        Normalizes whitespace, strips control characters, and neutralizes
        markdown code-fence breakout and YAML document delimiters.
        """
        # Remove control characters (keep newlines and tabs)
        text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
        # Normalize whitespace runs (collapse multiple spaces/tabs, preserve single newlines)
        text = re.sub(r'[^\S\n]+', ' ', text)
        text = re.sub(r'\n{3,}', '\n\n', text)
        # Neutralize markdown code fence breakout
        text = text.replace('```', r'\`\`\`')
        # Neutralize YAML document delimiters at line start
        text = re.sub(r'^---', r'\\---', text, flags=re.MULTILINE)
        return text.strip()


def main():
    """Main CLI entry point.

    Parses the sub-command from ``sys.argv``, builds a ``PaperManager`` and
    dispatches to it. Results of ``index``, ``check``, ``link``, ``create``
    and ``info --format json`` are printed as JSON. Exits with status 1 when
    no sub-command is given or when ``link`` receives no arXiv ID.
    """
    # Local import: only the "search" command needs URL encoding.
    from urllib.parse import quote_plus

    parser = argparse.ArgumentParser(
        description="Paper Manager for Hugging Face Hub",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    subparsers = parser.add_subparsers(dest="command", help="Command to execute")

    # Index command
    index_parser = subparsers.add_parser("index", help="Index a paper from arXiv")
    index_parser.add_argument("--arxiv-id", required=True, help="arXiv paper ID")

    # Check command
    check_parser = subparsers.add_parser("check", help="Check if paper exists")
    check_parser.add_argument("--arxiv-id", required=True, help="arXiv paper ID")

    # Link command
    link_parser = subparsers.add_parser("link", help="Link paper to repository")
    link_parser.add_argument("--repo-id", required=True, help="Repository ID")
    link_parser.add_argument("--repo-type", default="model", choices=["model", "dataset", "space"])
    link_parser.add_argument("--arxiv-id", help="Single arXiv ID")
    link_parser.add_argument("--arxiv-ids", help="Comma-separated arXiv IDs")
    link_parser.add_argument("--citation", help="Full citation text")
    link_parser.add_argument("--create-pr", action="store_true", help="Create PR instead of direct commit")

    # Create command
    create_parser = subparsers.add_parser("create", help="Create research article")
    create_parser.add_argument("--template", required=True, help="Template name")
    create_parser.add_argument("--title", required=True, help="Paper title")
    create_parser.add_argument("--output", required=True, help="Output filename")
    create_parser.add_argument("--authors", help="Comma-separated authors")
    create_parser.add_argument("--abstract", help="Abstract text")

    # Info command
    info_parser = subparsers.add_parser("info", help="Get paper information")
    info_parser.add_argument("--arxiv-id", required=True, help="arXiv paper ID")
    info_parser.add_argument("--format", default="json", choices=["json", "text"])

    # Citation command
    citation_parser = subparsers.add_parser("citation", help="Generate citation")
    citation_parser.add_argument("--arxiv-id", required=True, help="arXiv paper ID")
    citation_parser.add_argument("--format", default="bibtex", choices=["bibtex", "apa", "mla"])

    # Search command
    search_parser = subparsers.add_parser("search", help="Search papers")
    search_parser.add_argument("--query", required=True, help="Search query")

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        sys.exit(1)

    # Initialize manager
    manager = PaperManager()

    # Execute command
    if args.command == "index":
        result = manager.index_paper(args.arxiv_id)
        print(json.dumps(result, indent=2))

    elif args.command == "check":
        result = manager.check_paper(args.arxiv_id)
        print(json.dumps(result, indent=2))

    elif args.command == "link":
        arxiv_ids = []
        if args.arxiv_id:
            arxiv_ids.append(args.arxiv_id)
        if args.arxiv_ids:
            # Skip blanks so "a,,b" or a trailing comma does not yield an
            # empty identifier that is guaranteed to fail validation.
            arxiv_ids.extend(
                part.strip() for part in args.arxiv_ids.split(",") if part.strip()
            )

        if not arxiv_ids:
            print("Error: Must provide --arxiv-id or --arxiv-ids")
            sys.exit(1)

        for arxiv_id in arxiv_ids:
            result = manager.link_paper_to_repo(
                repo_id=args.repo_id,
                arxiv_id=arxiv_id,
                repo_type=args.repo_type,
                citation=args.citation,
                create_pr=args.create_pr
            )
            print(json.dumps(result, indent=2))

    elif args.command == "create":
        result = manager.create_research_article(
            template=args.template,
            title=args.title,
            output=args.output,
            authors=args.authors,
            abstract=args.abstract
        )
        print(json.dumps(result, indent=2))

    elif args.command == "info":
        result = manager.get_arxiv_info(args.arxiv_id)
        if args.format == "json":
            print(json.dumps(result, indent=2))
        else:
            if "error" in result:
                print(f"Error: {result['error']}")
            else:
                print(f"Title: {result.get('title')}")
                print(f"Authors: {', '.join(result.get('authors', []))}")
                print(f"arXiv URL: {result.get('arxiv_url')}")
                print(f"\nAbstract:\n{result.get('abstract')}")

    elif args.command == "citation":
        citation = manager.generate_citation(args.arxiv_id, args.format)
        print(citation)

    elif args.command == "search":
        print(f"Searching for: {args.query}")
        print("Search functionality coming soon!")
        # Percent-encode the query so spaces, "&", "#" etc. produce a
        # usable link instead of a truncated or malformed one.
        print(f"Visit: https://huggingface.co/papers?search={quote_plus(args.query)}")


if __name__ == "__main__":
    main()