165 lines
5.5 KiB
Python
165 lines
5.5 KiB
Python
"""
|
|
BigQuery — Query Log Collection (collect only)
|
|
================================================
|
|
Collects completed job query logs from BigQuery job history and writes them to
|
|
a JSON manifest file for later push to Monte Carlo.
|
|
|
|
Can be run standalone via CLI or imported (use the ``collect()`` function).
|
|
|
|
Substitution points (search for "← SUBSTITUTE"):
|
|
- BIGQUERY_PROJECT_ID : GCP project ID to collect query logs from
|
|
- GOOGLE_APPLICATION_CREDENTIALS : path to service-account JSON key file
|
|
- LOOKBACK_HOURS : how many hours back to collect (default 25, skip last 1 h)
|
|
- STATEMENT_TYPE_FILTER : restrict to specific statement types, or leave empty for all
|
|
- MAX_JOBS : cap on number of jobs to collect per run
|
|
|
|
Prerequisites:
|
|
pip install google-cloud-bigquery
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import os
|
|
from datetime import datetime, timedelta, timezone
|
|
|
|
from google.cloud import bigquery
|
|
|
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)

LOG_TYPE = "bigquery"


def _env_int(name: str, default: int) -> int:
    """Return environment variable *name* parsed as an int.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset or blank.

    Returns:
        The parsed integer, or *default*.

    Raises:
        ValueError: The variable is set to something that is not an integer.
            The message names the offending variable.
    """
    raw = os.getenv(name)
    # A variable exported as an empty string (e.g. ``MAX_JOBS=``) means
    # "not configured", not "crash at import time".
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError:
        raise ValueError(
            f"Environment variable {name} must be an integer, got {raw!r}"
        ) from None


# Default collection window, as computed by collect():
#   [now - LOOKBACK_LAG_HOURS - LOOKBACK_HOURS, now - LOOKBACK_LAG_HOURS]
# i.e. LOOKBACK_HOURS is the window length and LOOKBACK_LAG_HOURS is how far
# before "now" the window ends.
# The lag avoids collecting in-flight jobs that have not yet completed.
LOOKBACK_HOURS: int = _env_int("LOOKBACK_HOURS", 25)  # ← SUBSTITUTE
LOOKBACK_LAG_HOURS: int = _env_int("LOOKBACK_LAG_HOURS", 1)  # ← SUBSTITUTE

# Limit statement types — e.g. ["SELECT", "CREATE_TABLE_AS_SELECT", "INSERT"]
# Set to an empty list to collect all statement types.
STATEMENT_TYPE_FILTER: list[str] = []  # ← SUBSTITUTE

# Maximum number of jobs to collect in a single run to avoid runaway costs
MAX_JOBS: int = _env_int("MAX_JOBS", 10000)  # ← SUBSTITUTE
|
|
|
|
|
|
def _safe_isoformat(dt: datetime | None) -> str | None:
    """Format *dt* as an ISO-8601 string, passing ``None`` straight through.

    A naive datetime (``tzinfo`` is ``None``) is labelled as UTC before
    formatting; its clock value is kept as-is, not converted. A datetime that
    already has a ``tzinfo`` is formatted unchanged.
    """
    if dt is None:
        return None
    stamped = dt if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)
    return stamped.isoformat()
|
|
|
|
|
|
def _collect_query_logs(
    bq_client: bigquery.Client,
    project_id: str,
    start_dt: datetime,
    end_dt: datetime,
    max_jobs: int | None = None,
) -> list[dict]:
    """Collect query logs from BigQuery job history and return as a list of dicts.

    Lists jobs of all users in *project_id* whose creation time lies between
    *start_dt* and *end_dt*, keeps those that carry non-blank SQL text and
    pass ``STATEMENT_TYPE_FILTER`` (an empty filter accepts everything), and
    converts each into a plain dict.

    Args:
        bq_client: Client used to call ``list_jobs``.
        project_id: Project whose job history is listed.
        start_dt: Passed to ``list_jobs`` as ``min_creation_time``.
        end_dt: Passed to ``list_jobs`` as ``max_creation_time``.
        max_jobs: Maximum number of entries to return. ``None`` (default)
            uses the module-level ``MAX_JOBS``. A value <= 0 returns an empty
            list without calling the API.

    Returns:
        One dict per kept job with keys ``query_id``, ``query_text``,
        ``start_time``, ``end_time``, ``user``, ``total_bytes_billed`` and
        ``statement_type``. Attributes missing on a job are emitted as
        ``None``.

    NOTE(review): the listing is not filtered by job state, so a job that is
    still running at collection time is emitted with ``end_time`` of ``None``.
    """
    job_cap = MAX_JOBS if max_jobs is None else max_jobs
    if job_cap <= 0:
        # Checking the cap only after the first append would still collect
        # one entry (and hit the API) for a cap of zero.
        log.warning("max_jobs=%d is not positive — returning no entries", job_cap)
        return []

    # Build the membership set once instead of scanning the list per job.
    allowed_types = frozenset(STATEMENT_TYPE_FILTER)
    entries: list[dict] = []

    log.info(
        "Listing jobs for project=%s from %s to %s",
        project_id, start_dt.isoformat(), end_dt.isoformat(),
    )

    for job in bq_client.list_jobs(
        project=project_id,
        all_users=True,
        min_creation_time=start_dt,
        max_creation_time=end_dt,
    ):
        # Only process query jobs that have SQL text; other job objects may
        # not expose a ``query`` attribute at all, hence getattr.
        sql: str = getattr(job, "query", None) or ""
        if not sql.strip():
            continue

        statement_type: str = getattr(job, "statement_type", None) or ""
        if allowed_types and statement_type not in allowed_types:
            continue  # ← SUBSTITUTE: adjust filter as needed

        entries.append(
            {
                "query_id": job.job_id,
                "query_text": sql,
                "start_time": _safe_isoformat(getattr(job, "created", None)),
                "end_time": _safe_isoformat(getattr(job, "ended", None)),
                "user": getattr(job, "user_email", None),
                "total_bytes_billed": getattr(job, "total_bytes_billed", None),
                "statement_type": statement_type or None,
            }
        )

        if len(entries) >= job_cap:
            log.warning("Reached MAX_JOBS=%d — stopping early", job_cap)
            break

    return entries
|
|
|
|
|
|
def collect(
    project_id: str,
    lookback_hours: int = LOOKBACK_HOURS,
    lookback_lag_hours: int = LOOKBACK_LAG_HOURS,
    output_file: str = "query_logs_output.json",
) -> dict:
    """
    Connect to BigQuery, collect query logs, and write a JSON manifest.

    The collection window ends ``lookback_lag_hours`` before the current UTC
    time and is ``lookback_hours`` long, i.e.
    ``[now - lag - lookback, now - lag]``.

    Args:
        project_id: GCP project to collect from; also used to construct the
            BigQuery client.
        lookback_hours: Length of the window in hours. Must be positive.
        lookback_lag_hours: Hours before now at which the window ends. Must
            not be negative.
        output_file: Path the manifest is written to (overwritten if present).

    Returns:
        The manifest dict that was written, with keys ``log_type``,
        ``collected_at``, ``window_start``, ``window_end``,
        ``query_log_count`` and ``queries``.

    Raises:
        ValueError: ``lookback_hours`` is not positive or
            ``lookback_lag_hours`` is negative.
    """
    # Reject windows that are empty, inverted, or reach into the future
    # before authenticating, rather than silently writing a misleading manifest.
    if lookback_hours <= 0:
        raise ValueError(f"lookback_hours must be positive, got {lookback_hours}")
    if lookback_lag_hours < 0:
        raise ValueError(
            f"lookback_lag_hours must not be negative, got {lookback_lag_hours}"
        )

    bq_client = bigquery.Client(project=project_id)  # ← SUBSTITUTE: adjust auth if needed

    end_dt = datetime.now(timezone.utc) - timedelta(hours=lookback_lag_hours)
    start_dt = end_dt - timedelta(hours=lookback_hours)

    entries = _collect_query_logs(bq_client, project_id, start_dt, end_dt)
    log.info("Collected %d query log entries.", len(entries))

    manifest = {
        "log_type": LOG_TYPE,
        # Taken after collection finishes, so it marks completion time.
        "collected_at": datetime.now(timezone.utc).isoformat(),
        "window_start": start_dt.isoformat(),
        "window_end": end_dt.isoformat(),
        "query_log_count": len(entries),
        "queries": entries,
    }
    # Explicit encoding: do not depend on the platform's locale default.
    with open(output_file, "w", encoding="utf-8") as fh:
        json.dump(manifest, fh, indent=2)
    log.info("Query log manifest written to %s", output_file)

    return manifest
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: parse arguments and run ``collect()``.

    Args:
        argv: Argument list to parse. ``None`` (default) makes argparse read
            ``sys.argv[1:]``, which is the behaviour of a plain ``main()``.

    Exits with a usage error (via ``parser.error``) when no project ID is
    available from ``--project-id`` / ``BIGQUERY_PROJECT_ID``, or when the
    lookback arguments describe an empty or future window.
    """
    parser = argparse.ArgumentParser(
        description="Collect BigQuery query logs and write to a manifest file",
    )
    parser.add_argument(
        "--project-id",
        default=os.getenv("BIGQUERY_PROJECT_ID"),  # ← SUBSTITUTE
        help="GCP project ID to collect query logs from "
             "(default: BIGQUERY_PROJECT_ID environment variable)",
    )
    parser.add_argument(
        "--lookback-hours",
        type=int,
        default=LOOKBACK_HOURS,
        help="length of the collection window in hours (default: %(default)s)",
    )
    parser.add_argument(
        "--lookback-lag-hours",
        type=int,
        default=LOOKBACK_LAG_HOURS,
        help="hours before now at which the window ends (default: %(default)s)",
    )
    parser.add_argument(
        "--output-file",
        default="query_logs_output.json",
        help="path of the JSON manifest to write (default: %(default)s)",
    )
    args = parser.parse_args(argv)

    # Truthiness, not ``is None``: an exported-but-empty BIGQUERY_PROJECT_ID
    # must be treated as missing too.
    if not (args.project_id or "").strip():
        parser.error(
            "--project-id is required "
            "(or set the BIGQUERY_PROJECT_ID environment variable)"
        )
    if args.lookback_hours <= 0:
        parser.error("--lookback-hours must be a positive integer")
    if args.lookback_lag_hours < 0:
        parser.error("--lookback-lag-hours must not be negative")

    collect(
        project_id=args.project_id,
        lookback_hours=args.lookback_hours,
        lookback_lag_hours=args.lookback_lag_hours,
        output_file=args.output_file,
    )
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script; importing the
# module (e.g. to call ``collect()``) must not trigger a collection run.
if __name__ == "__main__":
    main()
|