playbook/antigravity-awesome-skills/skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py

#!/usr/bin/env python3
"""
Push a collected Hive query log manifest to Monte Carlo — push only.

Reads a JSON manifest produced by ``collect_query_logs.py``, builds
QueryLogEntry objects, and calls ``send_query_logs`` in batches.  The manifest
is updated in-place with ``resource_uuid`` and ``invocation_id`` after a
successful push.

Can be run standalone via CLI or imported (use the ``push()`` function).

Substitution points
-------------------
- MCD_INGEST_ID    (env) / --key-id        (CLI) : Monte Carlo ingestion key ID
- MCD_INGEST_TOKEN (env) / --key-token      (CLI) : Monte Carlo ingestion key token
- MCD_RESOURCE_UUID    (env) / --resource-uuid  (CLI) : MC resource UUID (optional for query logs)

Prerequisites
-------------
    pip install pycarlo python-dateutil python-dotenv

Usage
-----
    python push_query_logs.py \\
        --key-id  <MCD_INGEST_ID> \\
        --key-token <MCD_INGEST_TOKEN> \\
        --resource-uuid <MCD_RESOURCE_UUID> \\
        --input-file query_logs_output.json
"""

import argparse
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone

from dateutil.parser import isoparse

from pycarlo.core import Client, Session
from pycarlo.features.ingestion import IngestionService
from pycarlo.features.ingestion.models import QueryLogEntry

# ← SUBSTITUTE: default batch size for query log push (events per request)
# Query logs include full SQL text — keep batches small to stay under the 1 MB
# compressed payload limit.  Even 50 entries can trigger 413 on active warehouses.
# NOTE(review): the default below (100) is larger than the 50-entry figure cited
# above — lower it here, or pass a smaller --batch-size, if pushes fail with 413.
DEFAULT_BATCH_SIZE = 100

# ← SUBSTITUTE: HTTP timeout for MC ingestion requests (seconds)
# Used as the default for push(timeout_seconds=...) and the --timeout CLI flag.
DEFAULT_TIMEOUT_SECONDS = 120

# Truncate query_text longer than this (in characters) to prevent 413 errors.
# Some SQL statements (e.g., generated by BI tools) can be 100KB+ and blow up
# compressed payloads even at small batch sizes.  Truncated text gets a
# "... [TRUNCATED]" suffix appended, so the sent text is slightly longer than
# this limit.
_MAX_QUERY_TEXT_LEN = 10_000


def _build_events(manifest: dict) -> list[QueryLogEntry]:
    """
    Turn the ``queries`` list of a collected manifest into QueryLogEntry objects.

    For every record:

    - ``start_time`` / ``end_time`` are parsed from ISO strings; a parsed value
      without tzinfo is stamped as UTC.
    - records whose non-empty ``query_id`` was already seen are skipped
      (records with no ``query_id`` are never deduplicated).
    - ``query`` text longer than ``_MAX_QUERY_TEXT_LEN`` characters is cut and
      suffixed with ``"... [TRUNCATED]"``; a summary line is printed when at
      least one query was truncated.

    Args:
        manifest: Dict with an optional ``queries`` list of per-query dicts.

    Returns:
        The QueryLogEntry objects, in manifest order.

    Raises:
        KeyError: If a record has no ``start_time`` or ``end_time`` key.
    """

    def _parse_ts(raw: str) -> datetime:
        """Parse an ISO timestamp string, attaching UTC when it is naive."""
        parsed = isoparse(raw)
        return parsed if parsed.tzinfo else parsed.replace(tzinfo=timezone.utc)

    entries = []
    known_ids: set[str] = set()
    n_truncated = 0

    for record in manifest.get("queries", []):
        query_id = record.get("query_id")
        if query_id:
            # Only records that carry an ID take part in deduplication.
            if query_id in known_ids:
                continue
            known_ids.add(query_id)

        started = _parse_ts(record["start_time"])
        finished = _parse_ts(record["end_time"])

        sql = record.get("query") or ""
        if len(sql) > _MAX_QUERY_TEXT_LEN:
            # Cut very long SQL to prevent 413 Request Too Large
            sql = sql[:_MAX_QUERY_TEXT_LEN] + "... [TRUNCATED]"
            n_truncated += 1

        entry = QueryLogEntry(
            start_time=started,
            end_time=finished,
            query_text=sql,
            query_id=query_id or None,
            user=record.get("user", "hadoop"),  # ← SUBSTITUTE: set the user appropriate for your cluster
            returned_rows=record.get("returned_rows"),
        )
        entries.append(entry)

    if n_truncated:
        print(f"  Truncated {n_truncated} query text(s) exceeding {_MAX_QUERY_TEXT_LEN} chars")
    return entries


def push(
    manifest: dict,
    key_id: str,
    key_token: str,
    resource_uuid: str | None = None,
    batch_size: int = DEFAULT_BATCH_SIZE,
    timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
) -> str | None:
    """
    Push collected query logs to Monte Carlo and update the manifest in-place.

    Events are built from the manifest, split into batches of ``batch_size``
    and sent with up to 4 worker threads.  On success the manifest receives
    ``log_type``, ``resource_uuid`` (only when one was given), ``invocation_id``
    (the ID of the last batch) and — when more than one batch returned an ID —
    ``invocation_ids`` (one slot per batch, in batch order).

    Args:
        manifest: Dict loaded from a ``collect_query_logs.py`` output file.
        key_id: MC ingestion key ID.
        key_token: MC ingestion key token.
        resource_uuid: Optional MC resource UUID.
        batch_size: Events per POST request (default ``DEFAULT_BATCH_SIZE``).
            Must be >= 1.
        timeout_seconds: HTTP timeout per request (default
            ``DEFAULT_TIMEOUT_SECONDS``).  NOTE(review): this value is accepted
            for interface compatibility but is not forwarded to the pycarlo
            client anywhere in this function, so it currently has no effect.

    Returns:
        The invocation ID of the last batch if returned by MC, otherwise None
        (also None when the manifest contains no query log entries).

    Raises:
        ValueError: If there are entries to push and ``batch_size`` is < 1.
        Exception: Any error raised while pushing a batch is printed and
            re-raised; the manifest is left untouched in that case.
    """
    log_type = manifest.get("log_type", "hive-s3")

    events = _build_events(manifest)
    n = len(events)
    print(f"Loaded {n} query log entry/entries from manifest")

    if not events:
        print("No query log entries to push.")
        manifest["log_type"] = log_type
        if resource_uuid is not None:
            manifest["resource_uuid"] = resource_uuid
        manifest["invocation_id"] = None
        # Drop IDs left over from an earlier push so the manifest does not mix
        # a fresh ``invocation_id`` of None with a stale ``invocation_ids`` list.
        manifest.pop("invocation_ids", None)
        return None

    # A zero step makes range() fail and a negative one yields no batches
    # (and then a zero-worker pool); reject both with a clear message instead.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Split into batches
    batch_list = [events[i : i + batch_size] for i in range(0, n, batch_size)]
    total_batches = len(batch_list)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push one batch with its own Session/Client so none is shared across threads."""
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_query_logs(
            resource_uuid=resource_uuid,
            log_type=log_type,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        print(f"  Pushed batch {batch_num}/{total_batches} ({len(batch)} entries) — invocation_id={invocation_id}")
        return invocation_id

    # Push batches in parallel (each thread gets its own pycarlo Session)
    max_workers = min(4, total_batches)
    # Pre-sized so results land at their batch index even though futures
    # complete out of order.
    invocation_ids: list[str | None] = [None] * total_batches

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batch_list)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                # Report which batch failed, then propagate to the caller.
                print(f"    ERROR pushing batch {idx + 1}: {exc}")
                raise

    print(f"  All {total_batches} batches pushed ({max_workers} workers)")

    manifest["log_type"] = log_type
    if resource_uuid is not None:
        manifest["resource_uuid"] = resource_uuid
    # At least one batch exists here, so the list is never empty.
    manifest["invocation_id"] = invocation_ids[-1]
    if sum(1 for inv in invocation_ids if inv) > 1:
        manifest["invocation_ids"] = invocation_ids
    else:
        # A single ID is fully described by ``invocation_id``; remove any
        # list left over from a previous multi-batch push.
        manifest.pop("invocation_ids", None)

    return manifest["invocation_id"]


def main() -> None:
    """
    CLI entry point: load a manifest, push it, and write it back in-place.

    Credentials and the resource UUID default to the ``MCD_INGEST_ID``,
    ``MCD_INGEST_TOKEN`` and ``MCD_RESOURCE_UUID`` environment variables.
    Missing credentials or a non-positive ``--batch-size`` end the program
    through ``parser.error`` (usage message, exit status 2) before the
    manifest file is opened.
    """
    parser = argparse.ArgumentParser(
        description="Push a collected Hive query log manifest to Monte Carlo",
    )
    parser.add_argument(
        "--key-id",
        default=os.environ.get("MCD_INGEST_ID"),
        help="Monte Carlo ingestion key ID (env: MCD_INGEST_ID)",
    )
    parser.add_argument(
        "--key-token",
        default=os.environ.get("MCD_INGEST_TOKEN"),
        help="Monte Carlo ingestion key token (env: MCD_INGEST_TOKEN)",
    )
    parser.add_argument(
        "--resource-uuid",
        default=os.environ.get("MCD_RESOURCE_UUID"),
        help="Monte Carlo resource UUID (optional for query logs) (env: MCD_RESOURCE_UUID)",
    )
    parser.add_argument(
        "--input-file",
        default="query_logs_output.json",
        help="Path to the JSON manifest written by collect_query_logs.py (default: query_logs_output.json)",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=DEFAULT_BATCH_SIZE,
        metavar="N",
        help=f"Max events per POST (default: {DEFAULT_BATCH_SIZE})",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=DEFAULT_TIMEOUT_SECONDS,
        metavar="SEC",
        help=f"HTTP timeout per request in seconds (default: {DEFAULT_TIMEOUT_SECONDS})",
    )
    args = parser.parse_args()

    # The env-var defaults may be None, so the credentials are checked by hand
    # rather than with argparse's required=True.
    if not args.key_id or not args.key_token:
        parser.error("--key-id and --key-token are required (or set MCD_INGEST_ID / MCD_INGEST_TOKEN)")

    # Reject a zero or negative batch size here with a usage error instead of
    # letting it surface as a traceback from the batching code.
    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")

    with open(args.input_file) as fh:
        manifest = json.load(fh)

    # push() mutates ``manifest`` in-place; its return value is not needed here.
    push(
        manifest=manifest,
        key_id=args.key_id,
        key_token=args.key_token,
        resource_uuid=args.resource_uuid,
        batch_size=args.batch_size,
        timeout_seconds=args.timeout,
    )

    # Only reached when push() did not raise, so a failed push leaves the
    # file on disk unchanged.
    with open(args.input_file, "w") as fh:
        json.dump(manifest, fh, indent=2)
    print(f"Manifest updated in-place: {args.input_file}")
    print("Done.")


# Run the CLI only when executed as a script, so importing this module
# (e.g. to call push() directly) does not start a push.
if __name__ == "__main__":
    main()