# playbook/antigravity-awesome-skills/skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py
#
# 205 lines
# 6.9 KiB
# Python

"""
Databricks — Query Log Collection (collect-only)
==================================================
Collects finished query execution records from the Databricks system table
system.query.history and writes a JSON manifest file that can be consumed
by push_query_logs.py.
Substitution points (search for "← SUBSTITUTE"):
- DATABRICKS_HOST : workspace hostname
- DATABRICKS_HTTP_PATH : SQL warehouse HTTP path
- DATABRICKS_TOKEN : PAT or service-principal secret
- LOOKBACK_HOURS : hours back from [now - LOOKBACK_LAG_HOURS] to collect (default 25)
- LOOKBACK_LAG_HOURS : hours to lag behind now to avoid in-flight queries (default 1)
- MAX_ROWS : maximum query rows to collect per run (default 10000)
Prerequisites:
pip install databricks-sql-connector
"""
from __future__ import annotations
import argparse
import json
import logging
import os
from datetime import datetime, timezone
from typing import Any
from databricks import sql
# Configure root logging once for the script: timestamped INFO-level records.
logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)
log = logging.getLogger(__name__)

# Identifier stored under the "log_type" key of the manifest.
LOG_TYPE = "databricks"

# Collection defaults. Each one is read from the environment variable of the
# same name and falls back to the literal shown here when the variable is unset.
LOOKBACK_HOURS: int = int(os.environ.get("LOOKBACK_HOURS", "25"))  # ← SUBSTITUTE
LOOKBACK_LAG_HOURS: int = int(os.environ.get("LOOKBACK_LAG_HOURS", "1"))  # ← SUBSTITUTE
MAX_ROWS: int = int(os.environ.get("MAX_ROWS", "10000"))  # ← SUBSTITUTE
_QUERY_LOG_SQL = """\
SELECT
statement_id AS query_id,
statement_text AS query_text,
start_time,
end_time,
executed_by AS user_name,
produced_rows AS returned_rows,
total_task_duration_ms,
read_rows,
read_bytes
FROM system.query.history
WHERE start_time >= DATEADD(HOUR, -{lookback_hours}, NOW())
AND start_time < DATEADD(HOUR, -{lag_hours}, NOW())
AND status = 'FINISHED'
ORDER BY start_time
LIMIT {max_rows}
""" # ← SUBSTITUTE: adjust status filter or add warehouse_id filter as needed
def _check_available_memory(min_gb: float = 2.0) -> None:
"""Warn if available memory is below the threshold."""
try:
if hasattr(os, "sysconf"): # Linux / macOS
page_size = os.sysconf("SC_PAGE_SIZE")
avail_pages = os.sysconf("SC_AVPHYS_PAGES")
avail_gb = (page_size * avail_pages) / (1024 ** 3)
else:
return # Windows — skip check
except (ValueError, OSError):
return
if avail_gb < min_gb:
log.warning(
"Only %.1f GB of memory available (minimum recommended: %.1f GB). "
"Consider reducing the collection scope or increasing available memory.",
avail_gb,
min_gb,
)
def _safe_isoformat(dt: Any) -> str | None:
if dt is None:
return None
if hasattr(dt, "isoformat"):
if dt.tzinfo is None:
dt = dt.replace(tzinfo=timezone.utc)
return dt.isoformat()
return str(dt)
def _query(cursor: Any, sql_text: str) -> list[dict[str, Any]]:
cursor.execute(sql_text)
cols = [d[0] for d in cursor.description]
rows = []
while True:
chunk = cursor.fetchmany(1000)
if not chunk:
break
rows.extend(dict(zip(cols, row)) for row in chunk)
return rows
def collect_query_logs(
    cursor: Any,
    lookback_hours: int,
    lag_hours: int,
    max_rows: int,
) -> list[dict[str, Any]]:
    """Query system.query.history and shape the rows into manifest entries.

    The rendered query selects rows whose ``start_time`` falls in
    ``[NOW - (lookback_hours + lag_hours), NOW - lag_hours)``, ordered by
    ``start_time`` and capped at ``max_rows``.

    Args:
        cursor: Open DB-API style cursor used to run the query.
        lookback_hours: Length of the collection window, in hours.
        lag_hours: Hours to stay behind NOW() at the recent end of the window.
        max_rows: Upper bound passed to the query's LIMIT clause.

    Returns:
        One dict per row with non-blank query text, holding ``query_id``,
        ``query_text``, ``start_time``, ``end_time``, ``user``,
        ``returned_rows``, ``total_task_duration_ms``, ``read_rows`` and
        ``read_bytes``. Timestamps are converted with ``_safe_isoformat``.

    Raises:
        ValueError: If an argument cannot be converted by ``int()`` from a
            string, or if any of the three values is negative.
    """
    # These values are spliced into the SQL text with str.format(), so force
    # them to plain ints first: that rules out arbitrary text reaching the
    # statement, and rejecting negatives avoids rendering "--N", which SQL
    # would parse as the start of a line comment.
    lookback_hours = int(lookback_hours)
    lag_hours = int(lag_hours)
    max_rows = int(max_rows)
    if lookback_hours < 0 or lag_hours < 0 or max_rows < 0:
        raise ValueError(
            "lookback_hours, lag_hours and max_rows must be non-negative "
            f"(got {lookback_hours}, {lag_hours}, {max_rows})"
        )

    rendered_sql = _QUERY_LOG_SQL.format(
        lookback_hours=lookback_hours + lag_hours,  # offset from NOW() to cover the window
        lag_hours=lag_hours,
        max_rows=max_rows,
    )
    rows = _query(cursor, rendered_sql)
    log.info("Retrieved %d query log rows from system.query.history", len(rows))

    entries: list[dict[str, Any]] = []
    for row in rows:
        query_text: str = row.get("query_text") or ""
        if not query_text.strip():
            continue  # ← SUBSTITUTE: decide whether to skip empty-text rows
        entries.append(
            {
                "query_id": row.get("query_id"),
                "query_text": query_text,
                "start_time": _safe_isoformat(row.get("start_time")),
                "end_time": _safe_isoformat(row.get("end_time")),
                "user": row.get("user_name"),
                "returned_rows": row.get("returned_rows"),
                "total_task_duration_ms": row.get("total_task_duration_ms"),
                "read_rows": row.get("read_rows"),
                "read_bytes": row.get("read_bytes"),
            }
        )
    return entries
def collect(
    host: str,
    http_path: str,
    token: str,
    manifest_path: str = "manifest_query_logs.json",
    lookback_hours: int = LOOKBACK_HOURS,
    lookback_lag_hours: int = LOOKBACK_LAG_HOURS,
    max_rows: int = MAX_ROWS,
) -> list[dict[str, Any]]:
    """Collect query logs from Databricks and persist them as a JSON manifest.

    Steps: run the memory check, open a connection and cursor, gather entries
    through ``collect_query_logs``, then write the manifest to
    ``manifest_path``.

    Args:
        host: Passed to ``sql.connect`` as ``server_hostname``.
        http_path: Passed to ``sql.connect`` as ``http_path``.
        token: Passed to ``sql.connect`` as ``access_token``.
        manifest_path: Destination file for the JSON manifest.
        lookback_hours: Window length forwarded to ``collect_query_logs``.
        lookback_lag_hours: Lag forwarded to ``collect_query_logs``.
        max_rows: Row cap forwarded to ``collect_query_logs``.

    Returns:
        The list of collected entries (the same list stored in the manifest).
    """
    _check_available_memory(min_gb=2.0)

    # Timestamp is taken before connecting, so it marks the start of the run.
    collected_at = datetime.now(timezone.utc).isoformat()

    with sql.connect(
        server_hostname=host,  # ← SUBSTITUTE
        http_path=http_path,  # ← SUBSTITUTE
        access_token=token,  # ← SUBSTITUTE
    ) as conn, conn.cursor() as cursor:
        entries = collect_query_logs(
            cursor,
            lookback_hours=lookback_hours,
            lag_hours=lookback_lag_hours,
            max_rows=max_rows,
        )

    entry_count = len(entries)
    log.info("Collected %d query log entries", entry_count)

    manifest = {
        "log_type": LOG_TYPE,
        "collected_at": collected_at,
        "lookback_hours": lookback_hours,
        "lookback_lag_hours": lookback_lag_hours,
        "query_log_count": entry_count,
        "entries": entries,
    }
    with open(manifest_path, "w") as manifest_file:
        json.dump(manifest, manifest_file, indent=2)

    log.info("Manifest written to %s (%d entries)", manifest_path, entry_count)
    return entries
def main(argv: list[str] | None = None) -> None:
    """Parse command-line options and run ``collect``.

    Connection settings default to the DATABRICKS_HOST, DATABRICKS_HTTP_PATH
    and DATABRICKS_TOKEN environment variables. The parser exits with a usage
    error when any of the three is absent or empty.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Collect Databricks query logs to a manifest file")
    parser.add_argument("--host", default=os.getenv("DATABRICKS_HOST"))  # ← SUBSTITUTE
    parser.add_argument("--http-path", default=os.getenv("DATABRICKS_HTTP_PATH"))  # ← SUBSTITUTE
    parser.add_argument("--token", default=os.getenv("DATABRICKS_TOKEN"))  # ← SUBSTITUTE
    parser.add_argument("--lookback-hours", type=int, default=LOOKBACK_HOURS)
    parser.add_argument("--lookback-lag-hours", type=int, default=LOOKBACK_LAG_HOURS)
    parser.add_argument("--max-rows", type=int, default=MAX_ROWS)
    parser.add_argument("--manifest", default="manifest_query_logs.json")
    args = parser.parse_args(argv)

    required = ["host", "http_path", "token"]
    # An environment variable that is set but empty yields "" rather than None;
    # treat it as missing so the failure is reported here instead of surfacing
    # later as a connection error.
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    collect(
        host=args.host,
        http_path=args.http_path,
        token=args.token,
        manifest_path=args.manifest,
        lookback_hours=args.lookback_hours,
        lookback_lag_hours=args.lookback_lag_hours,
        max_rows=args.max_rows,
    )


if __name__ == "__main__":
    main()