547 lines
17 KiB
Python
547 lines
17 KiB
Python
# /// script
|
|
# requires-python = ">=3.12"
|
|
# dependencies = [
|
|
# "polars>=1.31.0",
|
|
# "huggingface-hub",
|
|
# "datasets",
|
|
# "ascii-graph",
|
|
# ]
|
|
# ///
|
|
"""
|
|
Analyze educational quality trends across CommonCrawl dumps using Polars streaming.
|
|
|
|
Answers: "Is the web getting more educational over time?"
|
|
|
|
Demonstrates Polars HF Hub integration - process 50M+ docs without downloading 300GB+.
|
|
|
|
Example usage:
|
|
# Analyze English PDFs (running with no arguments only prints usage examples)
uv run finepdfs-stats.py --lang eng_Latn
|
|
|
|
# Analyze all 70+ languages
|
|
uv run finepdfs-stats.py --all-languages
|
|
|
|
# Quick test
|
|
uv run finepdfs-stats.py --limit 10000 --show-plan
|
|
|
|
# Save results to HF Hub
|
|
uv run finepdfs-stats.py --output-repo username/finepdfs-temporal-stats
|
|
|
|
# Run on HF Jobs
|
|
hf jobs uv run \\
|
|
-s HF_TOKEN \\
|
|
-e HF_XET_HIGH_PERFORMANCE=1 \\
|
|
https://huggingface.co/datasets/uv-scripts/dataset-stats/raw/main/finepdfs-stats.py \\
|
|
-- --output-repo username/stats
|
|
"""
|
|
|
|
import argparse
|
|
import logging
|
|
import os
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import polars as pl
|
|
from ascii_graph import Pyasciigraph
|
|
from datasets import Dataset
|
|
from huggingface_hub import HfApi, create_repo, list_repo_tree, login
|
|
|
|
# Progress messages are emitted at INFO level with a timestamp and level prefix.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
|
|
|
|
# Frequently used language+script codes for finepdfs-edu, mapped to a
# human-readable label. Insertion order is the order shown by --list-languages
# and the order of the fallback list when the Hub lookup fails.
COMMON_LANGUAGES: dict[str, str] = {
    # Latin script
    "eng_Latn": "English (Latin script)",
    "fra_Latn": "French (Latin script)",
    "deu_Latn": "German (Latin script)",
    "spa_Latn": "Spanish (Latin script)",
    "por_Latn": "Portuguese (Latin script)",
    "ita_Latn": "Italian (Latin script)",
    "nld_Latn": "Dutch (Latin script)",
    "pol_Latn": "Polish (Latin script)",
    # Other scripts
    "rus_Cyrl": "Russian (Cyrillic script)",
    "zho_Hans": "Chinese (Simplified)",
    "zho_Hant": "Chinese (Traditional)",
    "jpn_Jpan": "Japanese",
    "kor_Hang": "Korean",
    "ara_Arab": "Arabic",
    "hin_Deva": "Hindi (Devanagari)",
}
|
|
|
|
|
|
def list_available_languages(dataset_id: str) -> list[str]:
    """Return the sorted language subset names found under ``data/`` in a dataset repo.

    Lists the entries below ``data/`` on the Hub, strips the leading ``data/``
    from each path and keeps only direct children (names without a further
    ``/``). The lookup is best-effort: on any failure a warning is logged and
    the keys of ``COMMON_LANGUAGES`` are returned instead.

    Args:
        dataset_id: Hub dataset repository id, e.g. ``HuggingFaceFW/finepdfs-edu``.

    Returns:
        Sorted subset names, or the built-in common codes if the lookup failed.
    """
    prefix = "data/"
    try:
        entries = list_repo_tree(dataset_id, path_in_repo="data", repo_type="dataset")
        # removeprefix() strips only the leading "data/"; str.replace() would
        # also delete any later "data/" occurrence inside the name itself.
        names = (
            entry.path.removeprefix(prefix)
            for entry in entries
            if entry.path.startswith(prefix)
        )
        # The listing is consumed here, inside the try, so errors raised while
        # iterating the Hub response are also covered by the fallback.
        return sorted(name for name in names if "/" not in name)
    except Exception as e:  # deliberate best-effort: fall back to known codes
        logger.warning(f"Could not list languages: {e}")
        return list(COMMON_LANGUAGES.keys())
|
|
|
|
|
|
def compute_temporal_stats(df: pl.LazyFrame, output_path: Path) -> pl.DataFrame:
    """Aggregate per-dump statistics, write them to parquet, and return them.

    The lazy query groups by ``dump`` and is sunk to ``output_path`` with the
    streaming engine; the written file is then read back as a DataFrame.

    Args:
        df: Lazy frame providing ``dump``, ``token_count`` and ``fw_edu_scores``.
        output_path: Parquet file the aggregated result is written to.

    Returns:
        One row per dump with ``doc_count``, ``total_tokens``,
        ``avg_edu_score`` and ``high_edu_count``.
    """
    # Per-document score: the mean of that document's ``fw_edu_scores`` list.
    doc_score = pl.col("fw_edu_scores").list.mean()

    per_dump = df.group_by("dump").agg(
        pl.len().alias("doc_count"),
        pl.col("token_count").sum().alias("total_tokens"),
        doc_score.mean().alias("avg_edu_score"),
        # Number of documents whose mean score is at least 3.
        (doc_score >= 3).sum().alias("high_edu_count"),
    )

    per_dump.sink_parquet(output_path, engine="streaming")
    return pl.read_parquet(output_path)
|
|
|
|
|
|
def compute_global_stats(temporal: pl.DataFrame) -> pl.DataFrame:
    """Collapse the per-dump breakdown into a single-row summary.

    Args:
        temporal: Per-dump frame with ``doc_count``, ``total_tokens``,
            ``avg_edu_score`` and ``high_edu_count`` columns.

    Returns:
        A one-row frame with ``total_docs``, ``total_tokens``,
        ``avg_edu_score`` (per-dump averages weighted by document count),
        ``high_edu_rate`` and ``num_dumps``.
    """
    doc_counts = temporal["doc_count"]
    total = doc_counts.sum()
    token_total = temporal["total_tokens"].sum()

    # Weight each dump's average by its document count so the result is the
    # mean over documents rather than the mean over dumps.
    weighted_score_sum = (temporal["avg_edu_score"] * doc_counts).sum()
    high_edu_total = temporal["high_edu_count"].sum()

    summary = {
        "total_docs": [total],
        "total_tokens": [token_total],
        "avg_edu_score": [weighted_score_sum / total],
        "high_edu_rate": [high_edu_total / total],
        "num_dumps": [len(temporal)],
    }
    return pl.DataFrame(summary)
|
|
|
|
|
|
def format_temporal_stats(temporal: pl.DataFrame) -> pl.DataFrame:
    """Add ``high_edu_rate`` and return the report columns sorted by dump name.

    Args:
        temporal: Per-dump frame with ``dump``, ``doc_count``,
            ``avg_edu_score`` and ``high_edu_count`` columns.

    Returns:
        Frame with ``dump``, ``doc_count``, ``avg_edu_score`` and
        ``high_edu_rate``, ordered by ``dump``.
    """
    high_edu_rate = (pl.col("high_edu_count") / pl.col("doc_count")).alias(
        "high_edu_rate"
    )
    report_columns = ["dump", "doc_count", "avg_edu_score", "high_edu_rate"]

    with_rate = temporal.with_columns(high_edu_rate)
    # Sorting by name gives chronological order
    # (CC-MAIN-2017-xx comes before CC-MAIN-2024-xx).
    return with_rate.select(report_columns).sort("dump")
|
|
|
|
|
|
def create_ascii_charts(temporal_stats: pl.DataFrame) -> str:
    """Render two ASCII bar charts of the yearly educational-quality trend.

    Dumps are bucketed by the four-digit year in their name
    (CC-MAIN-2024-42 -> 2024). Within a year, document counts are summed and
    the score and high-edu rate are averaged across that year's dumps.

    Args:
        temporal_stats: Per-dump frame with ``dump``, ``doc_count``,
            ``avg_edu_score`` and ``high_edu_rate`` columns.

    Returns:
        The high-edu-rate chart (as percentages), a blank line, and the
        average-score chart, joined with newlines.
    """
    year = pl.col("dump").str.extract(r"CC-MAIN-(\d{4})", 1).alias("year")
    by_year = (
        temporal_stats.with_columns(year)
        .group_by("year")
        .agg(
            pl.col("doc_count").sum(),
            pl.col("avg_edu_score").mean(),
            pl.col("high_edu_rate").mean(),
        )
        .sort("year")
    )
    rows = by_year.to_dicts()

    # Rate is plotted as a percentage (more dramatic differences than the score).
    rate_points = [(row["year"], row["high_edu_rate"] * 100) for row in rows]
    score_points = [(row["year"], row["avg_edu_score"]) for row in rows]

    rate_chart = Pyasciigraph(line_length=60, float_format="{0:.1f}%").graph(
        "High Educational Content (edu >= 3)", rate_points
    )
    score_chart = Pyasciigraph(line_length=60, float_format="{0:.2f}").graph(
        "Average Educational Score", score_points
    )

    return "\n".join([*rate_chart, "", *score_chart])
|
|
|
|
|
|
def create_readme(
    args,
    global_stats: pl.DataFrame,
    temporal_stats: pl.DataFrame,
    scan_time: float,
    ascii_charts: str,
) -> str:
    """Build the README (dataset card) text for the uploaded stats dataset.

    Args:
        args: Parsed command-line namespace; ``all_languages``, ``lang`` and
            ``source_dataset`` are read.
        global_stats: Single-row summary frame; its first row supplies the
            totals shown in the card.
        temporal_stats: Per-dump frame with ``dump``, ``doc_count``,
            ``avg_edu_score`` and ``high_edu_rate`` columns.
        scan_time: Seconds spent on the scan, used for the throughput figure.
        ascii_charts: Pre-rendered charts embedded verbatim in a code fence.

    Returns:
        Markdown with a YAML front-matter header.
    """
    stats = global_stats.to_dicts()[0]
    total_docs = stats.get("total_docs", 0)
    docs_per_sec = total_docs / scan_time if scan_time > 0 else 0

    # Compare first and last *year* averages for the trend table (more
    # representative than comparing two single dumps).
    yearly = (
        temporal_stats.with_columns(
            pl.col("dump").str.extract(r"CC-MAIN-(\d{4})", 1).alias("year")
        )
        .group_by("year")
        .agg(
            pl.col("doc_count").sum(),
            pl.col("avg_edu_score").mean(),
            pl.col("high_edu_rate").mean(),
        )
        .sort("year")
    )
    first_year = yearly.head(1).to_dicts()[0]
    last_year = yearly.tail(1).to_dicts()[0]

    scope = (
        "all languages"
        if args.all_languages
        else COMMON_LANGUAGES.get(args.lang, args.lang)
    )
    # Flag that reproduces this run's language selection.
    lang_flag = "--all-languages" if args.all_languages else f"--lang {args.lang}"

    # In the front matter, each ``data_files`` key must be indented under its
    # ``- config_name`` entry; at column 0 YAML would read it as a top-level
    # key and the configs would have no files attached.
    return f"""---
tags:
  - uv-script
  - statistics
  - polars
  - finepdfs-edu
  - temporal-analysis
license: odc-by
configs:
  - config_name: global_stats
    data_files: global_stats/train-*.parquet
  - config_name: temporal_stats
    data_files: temporal_stats/train-*.parquet
default_viewer_config: temporal_stats
---

# Is the Web Getting More Educational?

Temporal analysis of educational quality in **{scope}** across {stats.get("num_dumps", 0)} CommonCrawl dumps.

## Trend

```
{ascii_charts}
```

## Key Finding

| Year | Avg Edu Score | High Edu Rate |
|------|---------------|---------------|
| {first_year["year"]} | {first_year["avg_edu_score"]:.2f} | {first_year["high_edu_rate"] * 100:.1f}% |
| {last_year["year"]} | {last_year["avg_edu_score"]:.2f} | {last_year["high_edu_rate"] * 100:.1f}% |

## Performance

- **{total_docs:,} documents** processed in **{scan_time:.0f} seconds**
- **{docs_per_sec:,.0f} docs/sec** using Polars streaming
- Single scan, no full dataset download required

## Summary

| Metric | Value |
|--------|-------|
| Scope | {scope} |
| Total Documents | {total_docs:,} |
| Total Tokens | {stats.get("total_tokens", 0):,} |
| Avg Edu Score | {stats.get("avg_edu_score", 0):.3f} |
| High Edu Rate | {stats.get("high_edu_rate", 0) * 100:.1f}% |
| CommonCrawl Dumps | {stats.get("num_dumps", 0)} |

## Files

- `global_stats` - Overall summary
- `temporal_stats` - Per-dump breakdown (sorted chronologically)

## Reproduce

```bash
uv run https://huggingface.co/datasets/uv-scripts/dataset-stats/raw/main/finepdfs-stats.py \\
    {lang_flag} --output-repo your-username/stats
```

## Source

- **Dataset**: [{args.source_dataset}](https://huggingface.co/datasets/{args.source_dataset})
- **Script**: [uv-scripts/dataset-stats](https://huggingface.co/datasets/uv-scripts/dataset-stats)
"""
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser; the module docstring is shown as the epilog."""
    parser = argparse.ArgumentParser(
        description="Analyze educational quality trends across CommonCrawl dumps",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--source-dataset",
        type=str,
        default="HuggingFaceFW/finepdfs-edu",
        help="Source dataset (default: HuggingFaceFW/finepdfs-edu)",
    )
    parser.add_argument(
        "--lang",
        type=str,
        default="eng_Latn",
        help="Language+script code (default: eng_Latn)",
    )
    parser.add_argument(
        "--all-languages",
        action="store_true",
        help="Analyze all languages (70+) instead of single language",
    )
    parser.add_argument(
        "--show-plan",
        action="store_true",
        help="Show Polars query plan (demonstrates optimization)",
    )
    parser.add_argument(
        "--list-languages",
        action="store_true",
        help="List available languages and exit",
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Limit to first N rows (for testing)",
    )
    parser.add_argument(
        "--output-repo",
        type=str,
        help="HuggingFace dataset repository to upload results",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./stats_output",
        help="Local directory for output files",
    )
    parser.add_argument(
        "--hf-token",
        type=str,
        help="HuggingFace API token (or set HF_TOKEN env var)",
    )
    parser.add_argument(
        "--private",
        action="store_true",
        help="Make the output dataset private",
    )
    return parser


def _print_available_languages(source_dataset: str) -> None:
    """Print the built-in common codes, then up to 30 codes fetched from the Hub."""
    print(f"Available language+script codes for {source_dataset}:\n")
    print("Common languages:")
    for code, name in COMMON_LANGUAGES.items():
        print(f" {code:12} - {name}")
    print("\nFetching full list from HF Hub...")
    all_langs = list_available_languages(source_dataset)
    print(f"\nAll available ({len(all_langs)} total):")
    for lang in all_langs[:30]:  # Show first 30
        name = COMMON_LANGUAGES.get(lang, "")
        print(f" {lang:12} {name}")
    if len(all_langs) > 30:
        print(f" ... and {len(all_langs) - 30} more")


def _print_query_plan(df: pl.LazyFrame) -> None:
    """Print the plan of a small sample aggregation; the query is not executed."""
    sample_query = df.select(
        pl.len(),
        pl.col("token_count").sum(),
        pl.col("language").n_unique(),
    )
    print("\nQuery Plan (showing Polars optimization):")
    print("=" * 60)
    print(sample_query.explain())
    print("=" * 60)
    print("\nNote: Polars uses projection pushdown - only reads columns needed!")
    print("The 'text' column is never loaded, making this very fast.\n")


def _print_banner(title: str, rule: str = "-") -> None:
    """Print a section title framed by two 70-character rules, after a blank line."""
    print("\n" + rule * 70)
    print(title)
    print(rule * 70)


def _upload_results(
    args: argparse.Namespace,
    global_stats: pl.DataFrame,
    temporal_stats: pl.DataFrame,
    scan_time: float,
    ascii_charts: str,
) -> None:
    """Push both stats tables as dataset configs and upload the generated README."""
    hf_token = args.hf_token or os.environ.get("HF_TOKEN")
    if hf_token:
        login(token=hf_token)

    api = HfApi(token=hf_token)

    logger.info(f"Creating/updating dataset repository: {args.output_repo}")
    create_repo(
        args.output_repo,
        repo_type="dataset",
        private=args.private,
        token=hf_token,
        exist_ok=True,
    )

    # Upload each as a dataset config
    configs = [
        ("global_stats", global_stats),
        ("temporal_stats", temporal_stats),
    ]
    for config_name, stats_df in configs:
        logger.info(f"Uploading {config_name}...")
        ds = Dataset.from_polars(stats_df)
        ds.push_to_hub(
            args.output_repo,
            config_name=config_name,
            token=hf_token,
            private=args.private,
        )
        time.sleep(1)  # Avoid 409 conflicts

    # Upload README
    readme_content = create_readme(
        args, global_stats, temporal_stats, scan_time, ascii_charts
    )
    api.upload_file(
        path_or_fileobj=readme_content.encode(),
        path_in_repo="README.md",
        repo_id=args.output_repo,
        repo_type="dataset",
        token=hf_token,
    )

    dataset_url = f"https://huggingface.co/datasets/{args.output_repo}"
    logger.info(f"Dataset uploaded: {dataset_url}")
    print(f"\nResults uploaded to: {dataset_url}")


def main():
    """Run the analysis: parse arguments, scan, report, and optionally upload.

    Steps, in order: parse the command line; handle ``--list-languages`` (prints
    and exits); build a lazy parquet scan over the selected language(s);
    optionally cap it with ``--limit`` and print a sample plan; compute the
    per-dump stats in one scan; write ``global_stats.parquet`` and
    ``temporal_stats.parquet`` to the output directory; print the report; and,
    when ``--output-repo`` is given, upload the results to the Hub.
    """
    parser = _build_parser()
    args = parser.parse_args()

    # A truthiness check would treat ``--limit 0`` as "no limit" and silently
    # start a scan of the full dataset, so reject non-positive limits up front.
    if args.limit is not None and args.limit <= 0:
        parser.error("--limit must be a positive integer")

    # Check for high-performance mode
    if os.environ.get("HF_XET_HIGH_PERFORMANCE"):
        logger.info("High-performance mode enabled (HF_XET_HIGH_PERFORMANCE=1)")

    # List languages mode
    if args.list_languages:
        _print_available_languages(args.source_dataset)
        sys.exit(0)

    # Build the parquet path
    if args.all_languages:
        source_path = f"hf://datasets/{args.source_dataset}/data/*/train/*.parquet"
        scope_desc = "all languages"
    else:
        source_path = (
            f"hf://datasets/{args.source_dataset}/data/{args.lang}/train/*.parquet"
        )
        scope_desc = f"{args.lang} ({COMMON_LANGUAGES.get(args.lang, 'unknown')})"

    logger.info(f"Scanning: {source_path}")
    logger.info(f"Scope: {scope_desc}")

    # Create lazy frame - this doesn't load any data yet!
    logger.info("Creating lazy query plan...")
    df = pl.scan_parquet(source_path)

    # Apply limit if specified
    if args.limit is not None:
        logger.info(f"Limiting to first {args.limit:,} rows")
        df = df.head(args.limit)

    # Show query plan if requested
    if args.show_plan:
        _print_query_plan(df)

    # Create output directory
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Single scan: compute temporal stats
    logger.info("Computing temporal stats (single scan)...")
    start = time.perf_counter()
    temporal_path = output_dir / "temporal_stats.parquet"
    temporal_raw = compute_temporal_stats(df, temporal_path)
    scan_time = time.perf_counter() - start
    logger.info(f"Scan complete in {scan_time:.2f}s - {len(temporal_raw)} dumps")

    # Compute stats
    global_stats = compute_global_stats(temporal_raw)
    temporal_stats = format_temporal_stats(temporal_raw)

    # Save. The formatted table replaces the raw aggregation written during
    # the scan, since both use the same temporal_stats.parquet path.
    global_stats.write_parquet(output_dir / "global_stats.parquet")
    temporal_stats.write_parquet(output_dir / "temporal_stats.parquet")

    # Print results
    total_docs = global_stats["total_docs"][0]
    docs_per_sec = total_docs / scan_time if scan_time > 0 else 0

    _print_banner("IS THE WEB GETTING MORE EDUCATIONAL?", rule="=")
    print(f"\nScope: {scope_desc}")
    print(f"Dataset: {args.source_dataset}")

    _print_banner("GLOBAL STATS")
    print(global_stats)

    _print_banner(f"TEMPORAL TREND ({len(temporal_stats)} CommonCrawl dumps)")
    # Show first 5 and last 5
    if len(temporal_stats) > 10:
        print("Earliest dumps:")
        print(temporal_stats.head(5))
        print("\n...")
        print("\nLatest dumps:")
        print(temporal_stats.tail(5))
    else:
        print(temporal_stats)

    # Create ASCII charts
    ascii_charts = create_ascii_charts(temporal_stats)
    _print_banner("TREND VISUALIZATION")
    print(ascii_charts)

    _print_banner("PERFORMANCE")
    print(f"Scan time: {scan_time:.2f}s")
    print(f"Documents: {total_docs:,}")
    print(f"Throughput: {docs_per_sec:,.0f} docs/sec")

    logger.info(f"Results saved to: {output_dir}")

    # Upload to HF Hub if requested
    if args.output_repo:
        _upload_results(args, global_stats, temporal_stats, scan_time, ascii_charts)
|
|
|
|
|
|
if __name__ == "__main__":
    # With no arguments, print a short usage primer and exit instead of
    # starting a scan. Every example below therefore has to carry at least one
    # argument: a bare "uv run finepdfs-stats.py" would only reprint this text.
    if len(sys.argv) == 1:
        print("Is the Web Getting More Educational?")
        print("=" * 40)
        print("\nAnalyze educational quality trends across CommonCrawl dumps")
        print("using Polars streaming - no download needed!\n")
        print("Example commands:\n")
        print("# Quick test:")
        print("uv run finepdfs-stats.py --limit 10000\n")
        print("# Analyze English PDFs:")
        print("uv run finepdfs-stats.py --lang eng_Latn\n")
        print("# Analyze ALL 70+ languages:")
        print("uv run finepdfs-stats.py --all-languages\n")
        print("# Show query plan (see Polars optimization):")
        print("uv run finepdfs-stats.py --show-plan --limit 1000\n")
        print("# Save results to HF Hub:")
        print("uv run finepdfs-stats.py --output-repo username/temporal-stats\n")
        print("# Run on HF Jobs:")
        print("hf jobs uv run \\")
        print(" -s HF_TOKEN \\")
        print(" -e HF_XET_HIGH_PERFORMANCE=1 \\")
        print(
            " https://huggingface.co/datasets/uv-scripts/dataset-stats/raw/main/finepdfs-stats.py \\"
        )
        print(" -- --output-repo username/stats")
        sys.exit(0)

    main()
|