201 lines
7.2 KiB
Python
201 lines
7.2 KiB
Python
"""
|
|
Databricks — Query Log Push (push-only)
=========================================

Reads a JSON manifest file produced by collect_query_logs.py and pushes the query
log entries to Monte Carlo via the push ingestion API, with configurable batching
to keep compressed payloads under 1 MB.

Substitution points (search for "← SUBSTITUTE"):
  - MCD_INGEST_ID / MCD_INGEST_TOKEN  : Monte Carlo API credentials
  - MCD_RESOURCE_UUID                 : UUID of the Databricks connection in Monte Carlo
  - DEFAULT_BATCH_SIZE / --batch-size : number of entries per API call (default 100)

Prerequisites:
  pip install pycarlo python-dateutil
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import os
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from datetime import datetime, timezone
|
|
from typing import Any
|
|
|
|
from dateutil.parser import isoparse
|
|
from pycarlo.core import Client, Session
|
|
from pycarlo.features.ingestion import IngestionService
|
|
from pycarlo.features.ingestion.models import QueryLogEntry
|
|
|
|
# Logging is configured at import time so the script prints timestamped
# INFO-level progress lines without any extra setup by the caller.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
log = logging.getLogger(__name__)

# Value sent as ``log_type`` with every push call.
LOG_TYPE = "databricks"

# ← SUBSTITUTE: conservative default to stay under 1 MB compressed
DEFAULT_BATCH_SIZE = 100

# Upper bound on the number of query_text characters kept per entry.
# Some SQL statements (e.g., generated by BI tools) can be 100KB+ and blow up
# compressed payloads even at small batch sizes, which leads to 413 errors.
_MAX_QUERY_TEXT_LEN = 10_000
|
|
|
|
|
|
def _build_query_log_entries(entry_dicts: list[dict[str, Any]]) -> list[QueryLogEntry]:
    """Turn raw manifest rows into ``QueryLogEntry`` objects.

    For each row:
      * ``query_text`` longer than ``_MAX_QUERY_TEXT_LEN`` characters is cut to
        that length and suffixed with a truncation marker (a missing or null
        text becomes the empty string);
      * ``start_time`` / ``end_time`` are parsed with ``isoparse`` when they are
        truthy, otherwise passed through as ``None``;
      * the optional metrics ``total_task_duration_ms``, ``read_rows`` and
        ``read_bytes`` are gathered into ``extra`` when they are not ``None``
        (``extra`` is ``None`` if none of them is present).

    Returns the entries in the same order as ``entry_dicts``.
    """
    metric_keys = ("total_task_duration_ms", "read_rows", "read_bytes")

    def _to_datetime(raw):
        # Falsy values (missing key, None, empty string) are not parsed.
        return isoparse(raw) if raw else None

    built = []
    clipped_count = 0
    for row in entry_dicts:
        sql = row.get("query_text") or ""

        # Cap very long SQL to prevent 413 Request Too Large.
        if len(sql) > _MAX_QUERY_TEXT_LEN:
            sql = sql[:_MAX_QUERY_TEXT_LEN] + "... [TRUNCATED]"
            clipped_count += 1

        # Only forward metrics that actually carry a value.
        metrics = {key: row[key] for key in metric_keys if row.get(key) is not None}

        built.append(
            QueryLogEntry(
                query_id=row.get("query_id"),
                query_text=sql,
                start_time=_to_datetime(row.get("start_time")),
                end_time=_to_datetime(row.get("end_time")),
                user=row.get("user"),
                returned_rows=row.get("returned_rows"),
                extra=metrics or None,
            )
        )

    if clipped_count:
        log.info("Truncated %d query text(s) exceeding %d chars", clipped_count, _MAX_QUERY_TEXT_LEN)
    return built
|
|
|
|
|
|
def push(
    manifest_path: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> dict[str, Any]:
    """Read a collect manifest and push query log entries to Monte Carlo in batches.

    The manifest must be a JSON object with an ``entries`` list; its optional
    ``lookback_hours`` / ``lookback_lag_hours`` values are copied into the
    result. Batches are sent concurrently on up to four worker threads. A
    summary is written next to the manifest as
    ``<manifest path without .json>_push_result.json``.

    Args:
        manifest_path: Path of the JSON manifest to read.
        resource_uuid: Passed as ``resource_uuid`` on every push call.
        key_id: Ingestion key id used to open each pycarlo session.
        key_token: Ingestion key token used to open each pycarlo session.
        batch_size: Maximum number of entries per push call. Must be >= 1
            whenever there is at least one entry to push.

    Returns:
        A summary dict with invocation IDs (in batch order) and counts.

    Raises:
        KeyError: If the manifest has no ``entries`` key.
        ValueError: If there are entries to push and ``batch_size`` is < 1.
        Exception: The first batch failure observed is logged and re-raised;
            no result file is written in that case.
    """
    with open(manifest_path, encoding="utf-8") as fh:
        manifest = json.load(fh)

    entries = _build_query_log_entries(manifest["entries"])
    log.info("Loaded %d query log entries from %s", len(entries), manifest_path)

    # Derive the result path from the file extension only. A plain
    # str.replace(".json", ...) rewrites every ".json" in the path and, when the
    # path contains no ".json" at all, returns the manifest path unchanged —
    # which would overwrite the input manifest with the summary.
    root, ext = os.path.splitext(manifest_path)
    if ext.lower() != ".json":
        root = manifest_path
    push_manifest_path = f"{root}_push_result.json"

    def _write_summary(invocation_ids: list[str | None], batch_count: int) -> dict[str, Any]:
        """Build the summary dict, persist it to the result file and return it."""
        summary = {
            "resource_uuid": resource_uuid,
            "log_type": LOG_TYPE,
            "invocation_ids": invocation_ids,
            "pushed_at": datetime.now(timezone.utc).isoformat(),
            "query_log_count": len(entries),
            "batch_count": batch_count,
            "batch_size": batch_size,
            "lookback_hours": manifest.get("lookback_hours"),
            "lookback_lag_hours": manifest.get("lookback_lag_hours"),
        }
        with open(push_manifest_path, "w", encoding="utf-8") as out:
            json.dump(summary, out, indent=2)
        log.info("Push result written to %s", push_manifest_path)
        return summary

    if not entries:
        log.info("No query log entries to push.")
        return _write_summary([], 0)

    # Fail with a clear message instead of the opaque errors that range() or
    # ThreadPoolExecutor would raise for a zero or negative batch size.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Split into batches
    batches = [entries[i : i + batch_size] for i in range(0, len(entries), batch_size)]
    total_batches = len(batches)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push a single batch through a Client/Session created just for this call."""
        # A fresh session per call means no session object is shared between
        # worker threads.
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_query_logs(
            resource_uuid=resource_uuid,
            log_type=LOG_TYPE,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        log.info("Pushed batch %d/%d (%d entries) — invocation_id=%s", batch_num, total_batches, len(batch), invocation_id)
        return invocation_id

    # Push batches in parallel; results are stored by batch index so the
    # returned invocation IDs line up with batch order, not completion order.
    max_workers = min(4, total_batches)
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                log.error("ERROR pushing batch %d: %s", idx + 1, exc)
                raise

    log.info("All %d batches pushed (%d workers)", total_batches, max_workers)

    return _write_summary(invocation_ids, total_batches)
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: parse arguments and push the manifest.

    ``--resource-uuid``, ``--key-id`` and ``--key-token`` default to the
    ``MCD_RESOURCE_UUID``, ``MCD_INGEST_ID`` and ``MCD_INGEST_TOKEN``
    environment variables. The parser exits with a usage error when any of
    them is missing or empty, or when ``--batch-size`` is not positive.
    """
    parser = argparse.ArgumentParser(description="Push Databricks query logs to Monte Carlo from manifest")
    parser.add_argument("--manifest", default="manifest_query_logs.json")
    parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID"))
    parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID"))
    parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN"))
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    args = parser.parse_args()

    required = ["resource_uuid", "key_id", "key_token"]
    # Truthiness (rather than `is None`) also rejects empty strings, e.g. an
    # environment variable that is exported but blank.
    missing = [k for k in required if not getattr(args, k)]
    if missing:
        parser.error(f"Missing required arguments/env vars: {missing}")

    # Reject a non-positive batch size here so the user gets a usage error
    # instead of a traceback from the batching code.
    if args.batch_size < 1:
        parser.error(f"--batch-size must be a positive integer, got {args.batch_size}")

    push(
        manifest_path=args.manifest,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
    )


if __name__ == "__main__":
    main()
|