254 lines
7.9 KiB
Python
254 lines
7.9 KiB
Python
"""
|
|
BigQuery Iceberg — Metadata Collection (collect only)
|
|
=====================================================
|
|
Collects table schemas, row counts, byte sizes, and freshness for BigQuery
|
|
Iceberg (BigLake-managed) tables using INFORMATION_SCHEMA.TABLE_STORAGE and
|
|
INFORMATION_SCHEMA.COLUMNS. Standard BigQuery collection uses __TABLES__ which
|
|
does not include Iceberg tables — this template fills that gap.
|
|
|
|
Can be run standalone via CLI or imported (use the ``collect()`` function).
|
|
|
|
Supports a ``--only-freshness-and-volume`` flag to skip the COLUMNS query for
|
|
fast periodic pushes after the initial full metadata push.
|
|
|
|
Substitution points (search for "← SUBSTITUTE"):
|
|
- BIGQUERY_PROJECT_ID : GCP project ID to collect from
|
|
- GOOGLE_APPLICATION_CREDENTIALS : path to service-account JSON key file
|
|
- REGION : BigQuery region (default "us")
|
|
|
|
Prerequisites:
|
|
pip install google-cloud-bigquery
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import os
|
|
from datetime import datetime, timezone
|
|
|
|
from google.cloud import bigquery
|
|
|
|
# Configure root logging once at import time; every message in this module
# goes through the module-level ``log`` logger below.
_LOG_FORMAT = "%(asctime)s %(levelname)s %(message)s"
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
log = logging.getLogger(__name__)

# Value written to the manifest's "resource_type" key.
RESOURCE_TYPE = "bigquery"

# BigQuery type → Monte Carlo canonical type.
# Keys are upper-case BigQuery type names with no length/precision suffix.
BQ_TYPE_MAP: dict[str, str] = {
    # Numeric
    "INT64": "INTEGER",
    "INTEGER": "INTEGER",
    "FLOAT64": "FLOAT",
    "FLOAT": "FLOAT",
    # Boolean
    "BOOL": "BOOLEAN",
    "BOOLEAN": "BOOLEAN",
    # Text / binary
    "STRING": "VARCHAR",
    "BYTES": "BINARY",
    # Date and time
    "DATE": "DATE",
    "DATETIME": "DATETIME",
    "TIMESTAMP": "TIMESTAMP",
    "TIME": "TIME",
    # Fixed-point decimals
    "NUMERIC": "DECIMAL",
    "BIGNUMERIC": "DECIMAL",
    # Nested / repeated
    "RECORD": "STRUCT",
    "STRUCT": "STRUCT",
    "REPEATED": "ARRAY",
    # Other
    "JSON": "JSON",
    "GEOGRAPHY": "GEOGRAPHY",
}
|
|
|
|
|
|
def map_bq_type(bq_type: str) -> str:
    """Translate a BigQuery type string into its canonical type name.

    Anything from the first ``(`` onwards (e.g. the ``(10, 2)`` in
    ``NUMERIC(10, 2)``) is ignored for the lookup, and the remainder is
    stripped and upper-cased before consulting ``BQ_TYPE_MAP``.

    Args:
        bq_type: Type string as reported by BigQuery.

    Returns:
        The mapped canonical type, or the whole input upper-cased when the
        base type has no entry in ``BQ_TYPE_MAP``.
    """
    head, _, _ = bq_type.partition("(")
    key = head.strip().upper()
    if key in BQ_TYPE_MAP:
        return BQ_TYPE_MAP[key]
    # Unknown type: pass it through (upper-cased) rather than dropping it.
    return bq_type.upper()
|
|
|
|
|
|
def _fetch_iceberg_tables(
    client: bigquery.Client,
    project_id: str,
    datasets: list[str] | None = None,
    tables: list[str] | None = None,
    region: str = "us",  # ← SUBSTITUTE: change region if needed
) -> list[dict]:
    """Query TABLE_STORAGE for BigLake (Iceberg) tables.

    Args:
        client: BigQuery client used to run the query.
        project_id: GCP project whose regional INFORMATION_SCHEMA is queried.
        datasets: Optional dataset names to restrict the scan to. ``None`` or
            an empty list scans every dataset.
        tables: Optional table names to restrict the scan to. ``None`` or an
            empty list applies no table filter.
        region: BigQuery region qualifier without the ``region-`` prefix
            (e.g. ``"us"``, ``"eu"``).

    Returns:
        One dict per non-deleted BigLake table, ordered by dataset then table
        name, with the keys ``table_schema``, ``table_name``, ``total_rows``,
        ``current_physical_bytes``, ``storage_last_modified_time`` and
        ``creation_time``.
    """
    conditions = [
        "managed_table_type = 'BIGLAKE'",
        "deleted = FALSE",
    ]
    # Dataset/table filters come from user input (CLI flags), so they are
    # bound as query parameters instead of being spliced into the SQL text.
    query_params = []
    if datasets:
        conditions.append("table_schema IN UNNEST(@datasets)")
        query_params.append(
            bigquery.ArrayQueryParameter("datasets", "STRING", list(datasets))
        )
    if tables:
        conditions.append("table_name IN UNNEST(@tables)")
        query_params.append(
            bigquery.ArrayQueryParameter("tables", "STRING", list(tables))
        )

    where = " AND ".join(conditions)
    # Identifiers (project, region) cannot be bound as parameters; they are
    # interpolated into the backtick-quoted path.
    query = f"""
        SELECT
            table_schema,
            table_name,
            total_rows,
            current_physical_bytes,
            storage_last_modified_time,
            creation_time
        FROM `{project_id}.region-{region}`.INFORMATION_SCHEMA.TABLE_STORAGE
        WHERE {where}
        ORDER BY table_schema, table_name
    """
    job_config = bigquery.QueryJobConfig(query_parameters=query_params)

    log.info("Querying TABLE_STORAGE for Iceberg tables ...")
    rows = list(client.query(query, job_config=job_config).result())
    log.info("Found %d Iceberg table(s).", len(rows))
    return [dict(row) for row in rows]
|
|
|
|
|
|
def _fetch_columns(
    client: bigquery.Client,
    project_id: str,
    dataset: str,
    table_name: str,
) -> list[dict]:
    """Fetch column metadata for a specific table.

    Args:
        client: BigQuery client used to run the query.
        project_id: GCP project that owns the dataset.
        dataset: Dataset whose INFORMATION_SCHEMA.COLUMNS view is queried.
        table_name: Table to return columns for.

    Returns:
        A list of ``{"name": ..., "type": ...}`` dicts in ordinal-position
        order, where ``type`` has been passed through ``map_bq_type``.
    """
    # The table name is bound as a query parameter so that quotes or other
    # special characters in it cannot alter the SQL statement. The project
    # and dataset are identifiers and cannot be parameterized.
    query = f"""
        SELECT column_name, data_type, ordinal_position, is_nullable, column_default
        FROM `{project_id}.{dataset}.INFORMATION_SCHEMA.COLUMNS`
        WHERE table_name = @table_name
        ORDER BY ordinal_position
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("table_name", "STRING", table_name),
        ]
    )
    return [
        {
            "name": row["column_name"],
            "type": map_bq_type(row["data_type"]),
        }
        for row in client.query(query, job_config=job_config).result()
    ]
|
|
|
|
|
|
def _resolve_freshness(row: dict) -> str:
    """Pick the freshness timestamp for a TABLE_STORAGE row, as ISO 8601.

    Prefers the row's ``storage_last_modified_time``. When that value is
    missing or falsy (Google's population of this column for Iceberg tables
    was expected in early April 2026), a warning naming the table is logged
    and the current UTC time is used instead.

    Args:
        row: A TABLE_STORAGE result row; ``table_schema`` and ``table_name``
            are read only on the fallback path.

    Returns:
        The chosen timestamp formatted with ``isoformat()``.
    """
    modified = row.get("storage_last_modified_time")
    if not modified:
        log.warning(
            "storage_last_modified_time is NULL for %s.%s — "
            "falling back to current time. Google's TABLE_STORAGE update "
            "for Iceberg tables may not have shipped yet.",
            row["table_schema"],
            row["table_name"],
        )
        modified = datetime.now(timezone.utc)
    return modified.isoformat()
|
|
|
|
|
|
def collect(
    project_id: str,
    datasets: list[str] | None = None,
    tables: list[str] | None = None,
    only_freshness_and_volume: bool = False,
    output_file: str = "metadata_output.json",
) -> dict:
    """Gather Iceberg table metadata and persist it as a JSON manifest.

    Args:
        project_id: GCP project to collect from.
        datasets: Optional dataset names to restrict collection to.
        tables: Optional table names to restrict collection to.
        only_freshness_and_volume: When True, the per-table COLUMNS query is
            skipped and the ``description``/``fields`` keys are left out of
            each asset. Intended for periodic hourly pushes after the initial
            full metadata push.
        output_file: Path the manifest is written to.

    Returns:
        The manifest dict. When no Iceberg tables match, a manifest with an
        empty ``assets`` list is returned and nothing is written to disk.
    """
    client = bigquery.Client(project=project_id)  # ← SUBSTITUTE: adjust auth if needed

    with_fields = not only_freshness_and_volume
    if not with_fields:
        log.info("Running in freshness+volume only mode (skipping fields).")

    storage_rows = _fetch_iceberg_tables(client, project_id, datasets, tables)
    if not storage_rows:
        log.warning("No Iceberg tables found matching the criteria.")
        return {"resource_type": RESOURCE_TYPE, "assets": []}

    assets: list[dict] = []
    for record in storage_rows:
        dataset, name = record["table_schema"], record["table_name"]
        row_count = record["total_rows"]
        byte_count = record["current_physical_bytes"]

        asset = {
            "name": name,
            "database": project_id,
            "schema": dataset,
            "type": "TABLE",
            "volume": {
                "row_count": row_count,
                "byte_count": byte_count,
            },
            "freshness": {
                "last_updated_time": _resolve_freshness(record),
            },
        }

        # Schema details cost one extra query per table, so they are only
        # fetched for a full metadata collection.
        if with_fields:
            asset.update(
                description=None,
                fields=_fetch_columns(client, project_id, dataset, name),
            )

        assets.append(asset)
        log.info(
            "Collected %s.%s.%s — rows=%s, bytes=%s",
            project_id, dataset, name,
            row_count, byte_count,
        )

    manifest = {
        "resource_type": RESOURCE_TYPE,
        "collected_at": datetime.now(timezone.utc).isoformat(),
        "assets": assets,
    }
    with open(output_file, "w") as fh:
        json.dump(manifest, fh, indent=2)
    log.info("Manifest written to %s (%d assets)", output_file, len(assets))

    return manifest
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: parse arguments and run ``collect()``.

    The project ID comes from ``--project-id`` or, failing that, the
    ``BIGQUERY_PROJECT_ID`` environment variable; argparse reports an error
    and exits when neither is provided.
    """
    cli = argparse.ArgumentParser(
        description="Collect BigQuery Iceberg table metadata into a JSON manifest",
    )
    cli.add_argument(
        "--project-id",
        default=os.environ.get("BIGQUERY_PROJECT_ID"),  # ← SUBSTITUTE
        help="GCP project ID (or set BIGQUERY_PROJECT_ID env var)",
    )
    # The two filter options share the same shape: one or more names, no default.
    filter_options = (
        ("--datasets", "Limit to specific dataset(s). Omit to scan all datasets."),
        ("--tables", "Limit to specific table name(s) within the datasets."),
    )
    for flag, help_text in filter_options:
        cli.add_argument(flag, nargs="+", default=None, help=help_text)
    cli.add_argument(
        "--only-freshness-and-volume",
        action="store_true",
        help=(
            "Skip field/schema collection — only collect freshness and volume. "
            "Use for periodic hourly pushes after the initial full metadata push."
        ),
    )
    cli.add_argument("--output-file", default="metadata_output.json")

    opts = cli.parse_args()
    if not opts.project_id:
        cli.error("--project-id or BIGQUERY_PROJECT_ID env var is required")

    collect(
        project_id=opts.project_id,
        datasets=opts.datasets,
        tables=opts.tables,
        only_freshness_and_volume=opts.only_freshness_and_volume,
        output_file=opts.output_file,
    )


if __name__ == "__main__":
    main()
|