# playbook/antigravity-awesome-skills/skills/hugging-face-jobs/scripts/cot-self-instruct.py

# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "datasets",
#     "transformers",
#     "vllm>=0.6.5",
#     "huggingface-hub[hf_transfer]",
#     "torch",
#     "numpy",
#     "tqdm",
#     "scikit-learn",
# ]
# ///
"""
Generate high-quality synthetic data using Chain-of-Thought Self-Instruct methodology.

This script implements the CoT-Self-Instruct approach from the paper "CoT-Self-Instruct:
Building high-quality synthetic prompts for reasoning and non-reasoning tasks" (2025).

It supports two modes:
1. Reasoning tasks: Generates both questions and answers with Chain-of-Thought
2. Instruction tasks: Generates diverse prompts for general instruction following

Example usage:
    # Reasoning tasks with Answer-Consistency filtering
    uv run cot-self-instruct.py \\
        --seed-dataset davanstrien/s1k-reasoning \\
        --output-dataset username/synthetic-math \\
        --task-type reasoning \\
        --num-samples 5000 \\
        --filter-method answer-consistency

    # Instruction tasks with RIP filtering
    uv run cot-self-instruct.py \\
        --seed-dataset wildchat-filtered \\
        --output-dataset username/synthetic-prompts \\
        --task-type instruction \\
        --filter-method rip \\
        --reward-model Nexusflow/Athene-RM-8B

    # HF Jobs execution
    hf jobs uv run --flavor l4x4 \\
        --image vllm/vllm-openai \\
        -e HF_TOKEN=$(python3 -c "from huggingface_hub import get_token; print(get_token())") \\
        https://huggingface.co/datasets/uv-scripts/synthetic-data/raw/main/cot-self-instruct.py \\
        [args...]
"""

import argparse
import json
import logging
import os
import random
import re
import sys
from collections import Counter
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import DatasetCard, login
from sklearn.cluster import KMeans
from tqdm.auto import tqdm
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# Enable HF Transfer for faster downloads
# NOTE(review): this assignment runs after `huggingface_hub` has already been
# imported above. If the library reads this flag at import time (presumably it
# does - TODO confirm), the setting only takes effect when the variable is
# exported before the interpreter starts or set before the imports.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Root logging configuration: INFO level, "<timestamp> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)

# Prompt templates from the paper
#
# Both templates are filled with `str.format`, so single-brace fields are
# placeholders and doubled braces (`{{` / `}}`) render as literal braces.

# Reasoning template. Placeholders: {seed1}, {seed2}.
# The bracketed markers ("[New Question Begin]" ... "[Final Answer to New
# Question End]") are what `extract_reasoning_output` searches for, so the
# marker text here and the regexes there must stay in sync.
REASONING_PROMPT_TEMPLATE = """You are a reasoning question generator assistant. Your goal is to create a novel, and challenging reasoning question. You are provided the following seed questions:
Seed Question 1: {seed1}
Seed Question 2: {seed2}
Your task is to:
1. Write a brand-new, self-contained reasoning question that meets the following requirements:
(a) The question draws inspiration from the seed question without copying it verbatim, remaining novel and of comparable difficulty.
(b) The question's final answer should be a single, unambiguous scalar value (e.g., an integer, reduced fraction, exact radical), or another answer type that can be verified in one step (e.g., 'yes/no,' a choice from A to D).
2. Then reason step by step, solve the new question and format your output as follows:
[New Question Begin]{{your_generated_question}}[New Question End]
[Final Answer to New Question Begin]\\boxed{{your_final_answer}}[Final Answer to New Question End]"""

# Instruction template. Placeholders: {prompt1}, {prompt2}.
# `extract_instruction_output` keys on the literal
# "Step 3 #Synthetic Prompt#:" header requested by this template.
INSTRUCTION_PROMPT_TEMPLATE = """You are a prompt generator assistant. Your goal is to create diverse and creative synthetic prompts.
Please follow the steps below to create synthetic prompts.
Step 1: Carefully read #Prompt 1# and #Prompt 2#. Identify and list all the common elements between these two prompts. If no common elements are found, list the main elements from each prompt.
Step 2: Develop a comprehensive plan based on the #Common Elements List# or #Main Elements List# from Step 1. This plan will guide the generation of new synthetic prompts that are similar to the original prompts.
Step 3: Execute the plan step by step and provide one #Synthetic Prompt#.
Please reply strictly in the following format:
- Step 1 #Common Elements List# or #Main Elements List#:
- Step 2 #Plan#:
- Step 3 #Synthetic Prompt#:
#Prompt 1#:
{prompt1}
#Prompt 2#:
{prompt2}"""


def check_gpu_availability() -> int:
    """Verify that CUDA is usable and report every visible GPU.

    When no CUDA device is available, two error messages are logged and the
    process exits with status 1. Otherwise the name and total memory (in
    GiB, labelled "GB" in the log) of each device is logged.

    Returns:
        The number of CUDA devices reported by torch.
    """
    if torch.cuda.is_available():
        device_count = torch.cuda.device_count()
        bytes_per_gib = 1024**3
        for device_id in range(device_count):
            device_name = torch.cuda.get_device_name(device_id)
            properties = torch.cuda.get_device_properties(device_id)
            total_gib = properties.total_memory / bytes_per_gib
            logger.info(f"GPU {device_id}: {device_name} with {total_gib:.1f} GB memory")
        return device_count

    # No GPU: explain why and abort, since vLLM generation cannot proceed.
    logger.error("CUDA is not available. This script requires a GPU.")
    logger.error(
        "Please run on a machine with NVIDIA GPU or use HF Jobs with GPU flavor."
    )
    sys.exit(1)


def parse_thinking_output(text: str) -> str:
    """Remove thinking tokens from model output.

    Two shapes of output are handled:

    * Complete ``<think>...</think>`` blocks, which are deleted wherever they
      occur (the match spans newlines).
    * An orphan ``</think>`` with no opening tag. Some thinking models emit
      only the closing tag because the opening tag belongs to the prompt
      side; everything up to and including the last such tag is treated as
      reasoning and dropped.

    Args:
        text: Raw text produced by the model.

    Returns:
        The text with thinking content removed and surrounding whitespace
        stripped.
    """
    # Remove well-formed <think>...</think> blocks first.
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)

    # Any closing tag that survives has no matching opener: keep only what
    # follows the last one.
    _, orphan_closer, remainder = text.rpartition('</think>')
    if orphan_closer:
        text = remainder

    return text.strip()


def extract_reasoning_output(text: str) -> Tuple[Optional[str], Optional[str]]:
    r"""Extract question and answer from reasoning task output.

    The model is asked to wrap its result in ``[New Question Begin]`` /
    ``[New Question End]`` and ``[Final Answer to New Question Begin]`` /
    ``[Final Answer to New Question End]`` markers, with the answer inside
    ``\boxed{...}``.

    The answer is unwrapped from ``\boxed{...}`` even when the model puts
    whitespace, newlines or ``$`` math delimiters between the markers and the
    box. Without that tolerance the literal ``\boxed{...}`` wrapper would be
    returned as the answer and could never equal an answer extracted from
    inside a box later on.

    Args:
        text: Raw model output (thinking tokens are stripped first).

    Returns:
        ``(None, None)`` when no question markers are found,
        ``(question, None)`` when the question is found but no answer markers
        are, otherwise ``(question, answer)`` with both values stripped.
    """
    text = parse_thinking_output(text)

    # Extract question
    question_match = re.search(
        r'\[New Question Begin\](.*?)\[New Question End\]', text, re.DOTALL
    )
    if not question_match:
        return None, None
    question = question_match.group(1).strip()

    # Collect the content of every answer-marker pair.
    answer_spans = [
        match.group(1).strip()
        for match in re.finditer(
            r'\[Final Answer to New Question Begin\](.*?)\[Final Answer to New Question End\]',
            text,
            re.DOTALL,
        )
    ]
    if not answer_spans:
        return question, None

    # Prefer the first span that is a boxed expression (optionally wrapped in
    # `$`). The greedy group keeps nested braces such as \frac{1}{2} intact.
    boxed_pattern = re.compile(r'\$?\s*\\?boxed\{(.*)\}\s*\$?', re.DOTALL)
    for span in answer_spans:
        boxed = boxed_pattern.fullmatch(span)
        if boxed:
            return question, boxed.group(1).strip()

    # No boxed answer anywhere: fall back to the raw content of the first pair.
    return question, answer_spans[0]


def extract_instruction_output(text: str) -> Optional[str]:
    """Pull the generated prompt out of an instruction-task completion.

    Thinking tokens are removed first. Everything that follows the first
    "Step 3 #Synthetic Prompt#:" header, through the end of the text, is
    returned with surrounding whitespace stripped.

    Returns:
        The synthetic prompt, or None when the header (followed by at least
        one non-whitespace character) is not present.
    """
    cleaned = parse_thinking_output(text)
    found = re.search(r'Step 3 #Synthetic Prompt#:\s*(.+)', cleaned, re.DOTALL)
    return found.group(1).strip() if found else None


def categorize_prompts(prompts: List[str], num_categories: int = 8) -> Dict[int, List[int]]:
    """Group prompts into clusters of similar content.

    Each prompt is encoded with "sentence-transformers/all-MiniLM-L6-v2"
    (inputs truncated to 512 tokens); the last hidden state is averaged over
    dimension 1 to give one vector per prompt. The vectors are clustered with
    KMeans (random_state=42).

    Args:
        prompts: Prompt texts to cluster.
        num_categories: Number of KMeans clusters.

    Returns:
        Mapping from cluster label to the list of positions in `prompts`
        assigned to that cluster, in ascending position order.
    """
    from transformers import AutoModel

    logger.info(f"Categorizing {len(prompts)} prompts into {num_categories} categories...")

    # Small encoder used only for embeddings.
    encoder_name = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(encoder_name)
    encoder = AutoModel.from_pretrained(encoder_name)

    def embed(prompt):
        # One prompt at a time; gradients are not needed for inference.
        encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            hidden_states = encoder(**encoded).last_hidden_state
            pooled = hidden_states.mean(dim=1).numpy()
        return pooled[0]

    vectors = [embed(prompt) for prompt in tqdm(prompts, desc="Computing embeddings")]

    # Cluster the embeddings.
    clusterer = KMeans(n_clusters=num_categories, random_state=42)
    cluster_ids = clusterer.fit_predict(vectors)

    # Invert "position -> label" into "label -> positions".
    grouped = {}
    for position, cluster_id in enumerate(cluster_ids):
        grouped.setdefault(cluster_id, []).append(position)

    return grouped


def _seed_text(seed: Dict, primary: str, fallback: str) -> str:
    """Return ``seed[primary]``, else ``seed[fallback]``, else an empty string."""
    return seed.get(primary, seed.get(fallback, ""))


def generate_synthetic_data(
    llm: LLM,
    seed_data: List[Dict],
    task_type: str,
    num_samples: int,
    categories: Optional[Dict[int, List[int]]] = None,
    max_attempts: Optional[int] = None,
) -> List[Dict]:
    """Generate synthetic data using CoT-Self-Instruct.

    Each attempt samples up to two seed examples, formats the task-specific
    prompt template, generates one completion and tries to parse it. Only
    completions that parse successfully are kept, so the number of model
    calls can exceed `num_samples`.

    Args:
        llm: vLLM engine used for generation.
        seed_data: Seed examples. Reasoning tasks read the "question" key
            (falling back to "prompt"); instruction tasks read "prompt"
            (falling back to "question").
        task_type: "reasoning" selects the reasoning template; any other
            value selects the instruction template.
        num_samples: Number of successfully parsed examples to produce.
        categories: Optional mapping of category label to seed indices. When
            given (instruction tasks only) both seeds of an attempt are drawn
            from one randomly chosen category.
        max_attempts: Optional cap on the number of generation attempts.
            ``None`` (the default) keeps trying until `num_samples` examples
            have been produced.

    Returns:
        A list of dicts with "question" and "answer" (reasoning) or "prompt"
        (instruction), plus "seed_indices" giving the positions in
        `seed_data` of the seeds that were used.

    Raises:
        ValueError: If examples are requested but `seed_data` is empty.
    """
    if num_samples > 0 and not seed_data:
        raise ValueError("seed_data is empty; at least one seed example is required")

    is_reasoning = task_type == "reasoning"

    # Sampling settings depend only on the task type, so build them once.
    sampling_params = SamplingParams(
        temperature=0.7 if is_reasoning else 0.8,
        top_p=0.95 if is_reasoning else 0.9,
        max_tokens=2048,
    )
    category_keys = list(categories.keys()) if categories else None

    synthetic_data = []
    attempts = 0
    consecutive_failures = 0

    # The context manager closes the bar even if generation raises.
    with tqdm(total=num_samples, desc="Generating synthetic data") as pbar:
        while len(synthetic_data) < num_samples:
            if max_attempts is not None and attempts >= max_attempts:
                logger.warning(
                    f"Stopping after {attempts} attempts with "
                    f"{len(synthetic_data)}/{num_samples} examples generated"
                )
                break
            attempts += 1

            # Sample seed *indices* rather than items: looking items up again
            # with list.index() is O(n) per seed and returns the wrong
            # position when the dataset contains duplicate rows.
            if not is_reasoning and category_keys:
                # Category-aware sampling for instruction tasks
                pool = categories[random.choice(category_keys)]
                seed_indices = random.sample(pool, min(2, len(pool)))
            else:
                seed_indices = random.sample(
                    range(len(seed_data)), min(2, len(seed_data))
                )

            # With a single available seed it is used for both slots.
            first_seed = seed_data[seed_indices[0]]
            second_seed = seed_data[seed_indices[-1]]

            if is_reasoning:
                prompt = REASONING_PROMPT_TEMPLATE.format(
                    seed1=_seed_text(first_seed, "question", "prompt"),
                    seed2=_seed_text(second_seed, "question", "prompt"),
                )
            else:
                prompt = INSTRUCTION_PROMPT_TEMPLATE.format(
                    prompt1=_seed_text(first_seed, "prompt", "question"),
                    prompt2=_seed_text(second_seed, "prompt", "question"),
                )

            # Generate
            outputs = llm.generate([prompt], sampling_params)
            output_text = outputs[0].outputs[0].text

            # Parse output
            record = None
            if is_reasoning:
                question, answer = extract_reasoning_output(output_text)
                if question and answer:
                    record = {"question": question, "answer": answer}
            else:
                synthetic_prompt = extract_instruction_output(output_text)
                if synthetic_prompt:
                    record = {"prompt": synthetic_prompt}

            if record is None:
                # Make a stalled run visible instead of looping silently.
                consecutive_failures += 1
                if consecutive_failures % 50 == 0:
                    logger.warning(
                        f"{consecutive_failures} consecutive outputs could not be "
                        f"parsed ({len(synthetic_data)}/{num_samples} examples so far)"
                    )
                continue

            consecutive_failures = 0
            record["seed_indices"] = seed_indices
            synthetic_data.append(record)
            pbar.update(1)

    return synthetic_data


def _extract_boxed_answer(text: str) -> Optional[str]:
    r"""Return the content of the last brace-balanced ``\boxed{...}`` in `text`.

    Braces are counted so that nested groups such as ``\boxed{\frac{1}{2}}``
    are returned whole. Occurrences are tried from last to first, because the
    final answer normally comes at the end of a response; an occurrence whose
    braces never close (e.g. a truncated response) is skipped.

    Returns:
        The stripped content of the box, or None when no balanced box exists.
    """
    marker = "\\boxed{"
    start = text.rfind(marker)
    while start != -1:
        body_start = start + len(marker)
        depth = 1
        for pos in range(body_start, len(text)):
            char = text[pos]
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth == 0:
                    return text[body_start:pos].strip()
        # Unbalanced: try the previous occurrence.
        start = text.rfind(marker, 0, start)
    return None


def answer_consistency_filter(
    llm: LLM,
    synthetic_data: List[Dict],
    k_responses: int = 16,
    threshold: float = 0.5,
) -> List[Dict]:
    r"""Filter reasoning tasks using Answer-Consistency.

    For every example the question is sent to `llm` `k_responses` times. The
    ``\boxed{...}`` answer of each response is collected and the most common
    one is compared with the example's own answer.

    Args:
        llm: vLLM engine used to answer the questions.
        synthetic_data: Examples with "question" and "answer" keys.
        k_responses: Number of responses sampled per question.
        threshold: Minimum share of the extracted answers that must agree
            with the majority answer.

    Returns:
        The examples whose majority answer equals their stored answer and
        meets `threshold`. Kept examples are mutated in place to carry a
        "consistency_score" key. Examples for which no response contains a
        boxed answer are dropped.
    """
    logger.info(f"Applying Answer-Consistency filter with K={k_responses}")

    # Identical for every question, so build once.
    sampling_params = SamplingParams(
        temperature=0.6,
        top_p=0.95,
        max_tokens=1024,
    )

    filtered_data = []

    for item in tqdm(synthetic_data, desc="Answer-Consistency filtering"):
        # Generate K responses
        outputs = llm.generate([item["question"]] * k_responses, sampling_params)

        # Extract answers, ignoring responses without a boxed answer.
        candidates = (
            _extract_boxed_answer(output.outputs[0].text) for output in outputs
        )
        answers = [answer for answer in candidates if answer is not None]
        if not answers:
            continue

        # The score is the majority's share of the *extracted* answers.
        majority_answer, count = Counter(answers).most_common(1)[0]
        consistency = count / len(answers)

        # Check if majority answer matches original and meets threshold
        if majority_answer == item["answer"] and consistency >= threshold:
            item["consistency_score"] = consistency
            filtered_data.append(item)

    logger.info(f"Answer-Consistency: kept {len(filtered_data)}/{len(synthetic_data)} examples")
    return filtered_data


def rip_filter(
    llm: LLM,
    synthetic_data: List[Dict],
    reward_model_id: str,
    k_responses: int = 32,
    threshold: float = 0.5,
) -> List[Dict]:
    """Filter examples with a placeholder for Rejecting Instruction Preferences (RIP).

    No reward model is loaded: `reward_model_id` only appears in the log
    message. Each prompt is answered `k_responses` times and the character
    length of the shortest response, divided by 1000, is used as the score.

    Args:
        llm: vLLM engine used to generate responses.
        synthetic_data: Examples; the "prompt" key is read, falling back to
            "question", then to an empty string.
        reward_model_id: Reward model identifier (logged only).
        k_responses: Number of responses sampled per prompt.
        threshold: Minimum score an example needs in order to be kept.

    Returns:
        The kept examples, each mutated in place to carry a "rip_score" key.
    """
    logger.info(f"Applying RIP filter with K={k_responses} and reward model {reward_model_id}")

    # A full implementation would score responses with the reward model;
    # this one substitutes a length-based stand-in.
    logger.warning("RIP filtering requires a reward model implementation - using placeholder")

    kept = []

    for example in tqdm(synthetic_data, desc="RIP filtering"):
        text = example.get("prompt", example.get("question", ""))

        # Sample K responses for the same prompt.
        params = SamplingParams(
            temperature=1.0,
            top_p=1.0,
            max_tokens=1024,
        )
        completions = llm.generate([text] * k_responses, params)

        # Stand-in for reward scores: response length in characters.
        lengths = [len(completion.outputs[0].text) for completion in completions]

        # The worst (shortest) response decides the example's score.
        shortest = min(lengths, default=0)
        score = shortest / 1000

        if score >= threshold:
            example["rip_score"] = score
            kept.append(example)

    logger.info(f"RIP filter: kept {len(kept)}/{len(synthetic_data)} examples")
    return kept


def create_dataset_card(
    task_type: str,
    source_dataset: str,
    generation_model: str,
    filter_method: str,
    num_generated: int,
    num_filtered: int,
    generation_time: str,
    additional_info: Optional[Dict] = None,
) -> str:
    """Create a comprehensive dataset card.

    Args:
        task_type: Task type; used as a tag and in the description.
        source_dataset: Hub ID of the seed dataset.
        generation_model: Hub ID of the generation model.
        filter_method: "answer-consistency", "rip", "both" or "none". "both"
            documents RIP, plus Answer-Consistency when `task_type` is
            "reasoning". Any other value produces no filter section.
        num_generated: Number of examples before filtering.
        num_filtered: Number of examples after filtering.
        generation_time: Preformatted timestamp string.
        additional_info: Accepted for interface compatibility; not used.

    Returns:
        The dataset card as Markdown with YAML front matter. The acceptance
        rate is reported as 0.0% when `num_generated` is 0.
    """
    answer_consistency_info = """
### Answer-Consistency Filtering

This dataset was filtered using Answer-Consistency:
- Generated K responses for each synthetic question
- Kept only examples where majority answer matched the generated answer
- Ensures high-quality, correctly solved problems"""
    rip_info = """
### RIP (Rejecting Instruction Preferences) Filtering

This dataset was filtered using RIP:
- Generated K responses for each synthetic prompt
- Scored responses using a reward model
- Kept only prompts with high minimum scores"""

    sections = []
    if filter_method == "answer-consistency" or (
        filter_method == "both" and task_type == "reasoning"
    ):
        sections.append(answer_consistency_info)
    if filter_method in ("rip", "both"):
        sections.append(rip_info)
    # Each section starts with a newline, so joining with "\n" leaves one
    # blank line between two sections.
    filter_info = "\n".join(sections)

    # Guard the division: nothing generated means nothing accepted.
    acceptance_rate = (num_filtered / num_generated) * 100 if num_generated else 0.0

    return f"""---
tags:
- synthetic-data
- cot-self-instruct
- {task_type}
- uv-script
---

# CoT-Self-Instruct Synthetic Data

This dataset contains synthetic {task_type} data generated using the Chain-of-Thought Self-Instruct methodology.

## Generation Details

- **Source Dataset**: [{source_dataset}](https://huggingface.co/datasets/{source_dataset})
- **Generation Model**: [{generation_model}](https://huggingface.co/{generation_model})
- **Task Type**: {task_type}
- **Filter Method**: {filter_method}
- **Generated Examples**: {num_generated:,}
- **After Filtering**: {num_filtered:,} ({acceptance_rate:.1f}% acceptance rate)
- **Generation Date**: {generation_time}
{filter_info}

## Methodology

Generated using CoT-Self-Instruct, which:
1. Uses Chain-of-Thought reasoning to analyze seed examples
2. Generates new synthetic examples of similar quality and complexity
3. Applies quality filtering to ensure high-quality outputs

Based on the paper: "CoT-Self-Instruct: Building high-quality synthetic prompts for reasoning and non-reasoning tasks" (2025)

## Generation Script

Generated using the CoT-Self-Instruct script from [uv-scripts/synthetic-data](https://huggingface.co/datasets/uv-scripts/synthetic-data).

To reproduce:
```bash
uv run https://huggingface.co/datasets/uv-scripts/synthetic-data/raw/main/cot-self-instruct.py \\
    --seed-dataset {source_dataset} \\
    --output-dataset <your-dataset> \\
    --task-type {task_type} \\
    --generation-model {generation_model} \\
    --filter-method {filter_method}
```
"""


def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the script."""
    parser = argparse.ArgumentParser(
        description="Generate synthetic data using CoT-Self-Instruct",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # Dataset arguments
    parser.add_argument(
        "--seed-dataset",
        type=str,
        required=True,
        help="HuggingFace dataset ID containing seed examples",
    )
    parser.add_argument(
        "--output-dataset",
        type=str,
        required=True,
        help="HuggingFace dataset ID for output",
    )

    # Task configuration
    parser.add_argument(
        "--task-type",
        type=str,
        choices=["reasoning", "instruction", "auto"],
        default="auto",
        help="Type of task (reasoning generates Q&A, instruction generates prompts)",
    )
    parser.add_argument(
        "--task-column",
        type=str,
        default=None,
        help="Column name containing tasks (auto-detected if not specified)",
    )

    # Model configuration
    parser.add_argument(
        "--generation-model",
        type=str,
        default="Qwen/Qwen3-30B-A3B-Thinking-2507",
        help="Model for synthetic data generation",
    )
    parser.add_argument(
        "--filter-model",
        type=str,
        default=None,
        help="Model for filtering (defaults to generation model)",
    )
    parser.add_argument(
        "--reward-model",
        type=str,
        default="Nexusflow/Athene-RM-8B",
        help="Reward model for RIP filtering",
    )

    # Generation parameters
    parser.add_argument(
        "--num-samples",
        type=int,
        default=5000,
        help="Number of synthetic examples to generate",
    )
    # NOTE: --batch-size is parsed but not read anywhere in this script.
    parser.add_argument(
        "--batch-size",
        type=int,
        default=1,
        help="Batch size for generation",
    )

    # Filtering parameters
    parser.add_argument(
        "--filter-method",
        type=str,
        choices=["answer-consistency", "rip", "both", "none"],
        default="answer-consistency",
        help="Quality filtering method",
    )
    parser.add_argument(
        "--k-responses",
        type=int,
        default=16,
        help="Number of responses for filtering",
    )
    parser.add_argument(
        "--quality-threshold",
        type=float,
        default=0.5,
        help="Minimum quality threshold for filtering",
    )

    # GPU configuration
    parser.add_argument(
        "--tensor-parallel-size",
        type=int,
        default=None,
        help="Number of GPUs for tensor parallelism (auto-detected if not set)",
    )
    parser.add_argument(
        "--gpu-memory-utilization",
        type=float,
        default=0.9,
        help="GPU memory utilization",
    )

    # Other arguments
    parser.add_argument(
        "--hf-token",
        type=str,
        default=None,
        help="HuggingFace API token",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed",
    )

    return parser


def main():
    """Run the pipeline: load seeds, generate, filter, push to the Hub.

    Steps:
        1. Parse arguments, seed the RNGs, check for GPUs, log in to the Hub
           when a token is available.
        2. Load the seed dataset, resolve the task type and task column, and
           copy the task column into the key the generator reads.
        3. Generate synthetic examples with the generation model.
        4. Apply the filters that are applicable to the task type, optionally
           with a separate filter model.
        5. Push the resulting dataset and its card to the Hub.

    Exits with status 1 when no usable task column is found or when no
    example survives filtering.
    """
    # Function-scope imports keep this block self-contained.
    import gc
    from datetime import timezone

    args = _build_arg_parser().parse_args()

    # Set random seeds
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Check GPU
    num_gpus = check_gpu_availability()
    tensor_parallel_size = args.tensor_parallel_size or num_gpus

    # Authentication
    hf_token = args.hf_token or os.environ.get("HF_TOKEN")
    if hf_token:
        login(token=hf_token)

    # Load seed dataset
    logger.info(f"Loading seed dataset: {args.seed_dataset}")
    seed_dataset = load_dataset(args.seed_dataset, split="train")
    columns = seed_dataset.column_names

    # Auto-detect task type if needed
    if args.task_type == "auto":
        if "question" in columns and "answer" in columns:
            args.task_type = "reasoning"
        else:
            args.task_type = "instruction"
        logger.info(f"Auto-detected task type: {args.task_type}")

    # Key that generate_synthetic_data reads first for this task type.
    seed_key = "question" if args.task_type == "reasoning" else "prompt"

    # Auto-detect the task column. The candidates include the fallback key
    # the generator reads, so datasets that worked before still resolve.
    if not args.task_column:
        if args.task_type == "reasoning":
            candidates = ["question", "prompt"]
        else:
            candidates = ["prompt", "instruction", "text", "input", "question"]
        args.task_column = next((col for col in candidates if col in columns), None)

    # Without a real column every seed would be an empty string and the model
    # would be prompted with nothing; fail loudly instead.
    if args.task_column is None or args.task_column not in columns:
        logger.error(
            f"Could not find task column {args.task_column!r} in dataset columns "
            f"{columns}. Use --task-column to select one."
        )
        sys.exit(1)

    logger.info(f"Using task column: {args.task_column}")

    # Convert to list of dicts
    seed_data = seed_dataset.to_list()

    # The generator looks seeds up under fixed keys, so expose the selected
    # column under that key. This only touches the in-memory seed copies.
    if args.task_column != seed_key:
        for item in seed_data:
            item[seed_key] = item[args.task_column]

    # Categorize prompts for instruction tasks
    categories = None
    if args.task_type == "instruction" and len(seed_data) > 100:
        prompts = [item.get(args.task_column, "") for item in seed_data]
        categories = categorize_prompts(prompts)

    # Initialize generation model
    logger.info(f"Loading generation model: {args.generation_model}")
    generation_llm = LLM(
        model=args.generation_model,
        tensor_parallel_size=tensor_parallel_size,
        gpu_memory_utilization=args.gpu_memory_utilization,
    )

    # Generate synthetic data. The timestamp is taken in UTC because the
    # dataset card labels it "UTC".
    start_time = datetime.now(timezone.utc)
    synthetic_data = generate_synthetic_data(
        generation_llm,
        seed_data,
        args.task_type,
        args.num_samples,
        categories,
    )

    # Decide which filters actually apply to this task type.
    wants_answer_consistency = args.filter_method in ("answer-consistency", "both")
    use_answer_consistency = wants_answer_consistency and args.task_type == "reasoning"
    use_rip = args.filter_method in ("rip", "both")
    if wants_answer_consistency and not use_answer_consistency:
        logger.warning(
            "Answer-Consistency filtering only applies to reasoning tasks; "
            f"skipping it for task type '{args.task_type}'"
        )

    # Name of what really ran, so the dataset card does not claim a filter
    # that was skipped.
    if use_answer_consistency and use_rip:
        applied_filter = "both"
    elif use_answer_consistency:
        applied_filter = "answer-consistency"
    elif use_rip:
        applied_filter = "rip"
    else:
        applied_filter = "none"

    # Load a separate filter model only when a filter will run.
    needs_separate_model = (
        applied_filter != "none"
        and args.filter_model
        and args.filter_model != args.generation_model
    )
    if needs_separate_model:
        logger.info(f"Loading filter model: {args.filter_model}")
        # Drop the only reference to the generation engine *before* loading
        # the next one; keeping an alias alive would keep it from being
        # collected. NOTE(review): whether vLLM returns all GPU memory after
        # this is not verifiable from this file - confirm on target hardware.
        del generation_llm
        gc.collect()
        torch.cuda.empty_cache()

        filter_llm = LLM(
            model=args.filter_model,
            tensor_parallel_size=tensor_parallel_size,
            gpu_memory_utilization=args.gpu_memory_utilization,
        )
    else:
        filter_llm = generation_llm

    # Apply filtering: Answer-Consistency first, then RIP on what survived.
    filtered_data = synthetic_data
    if use_answer_consistency:
        filtered_data = answer_consistency_filter(
            filter_llm,
            filtered_data,
            args.k_responses,
            args.quality_threshold,
        )
    if use_rip:
        filtered_data = rip_filter(
            filter_llm,
            filtered_data,
            args.reward_model,
            args.k_responses,
            args.quality_threshold,
        )

    if not filtered_data:
        logger.error(
            f"No examples left to publish (generated {len(synthetic_data)}, "
            "kept 0); nothing was pushed."
        )
        sys.exit(1)

    # Create HuggingFace dataset
    logger.info(f"Creating dataset with {len(filtered_data)} examples")
    dataset = Dataset.from_list(filtered_data)

    # Create dataset card
    generation_time = start_time.strftime("%Y-%m-%d %H:%M:%S UTC")
    dataset_card = create_dataset_card(
        args.task_type,
        args.seed_dataset,
        args.generation_model,
        applied_filter,
        len(synthetic_data),
        len(filtered_data),
        generation_time,
    )

    # Push to hub: data first, then the card.
    logger.info(f"Pushing dataset to: {args.output_dataset}")
    card = DatasetCard(dataset_card)
    dataset.push_to_hub(args.output_dataset)
    card.push_to_hub(args.output_dataset)

    logger.info("Done! Dataset available at: https://huggingface.co/datasets/" + args.output_dataset)

    # Print example HF Jobs command
    if len(sys.argv) > 1:
        print("\nTo run on HF Jobs:")
        print(f"""hf jobs uv run --flavor l4x4 \\
    --image vllm/vllm-openai \\
    -e HF_TOKEN=$(python3 -c "from huggingface_hub import get_token; print(get_token())") \\
    https://huggingface.co/datasets/uv-scripts/synthetic-data/raw/main/cot-self-instruct.py \\
    --seed-dataset {args.seed_dataset} \\
    --output-dataset {args.output_dataset} \\
    --task-type {args.task_type} \\
    --generation-model {args.generation_model} \\
    --filter-method {args.filter_method} \\
    --num-samples {args.num_samples}""")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()