#!/usr/bin/env python3
"""
Push query logs to Monte Carlo from a JSON manifest — push only.

Reads a manifest file produced by ``collect_query_logs.py`` and sends the query
log entries to Monte Carlo using the pycarlo push ingestion API. Large payloads
are split into batches to stay under the 1 MB compressed limit.

Can be run standalone via CLI or imported (use the ``push()`` function).

Substitution points
-------------------
- MCD_INGEST_ID (env) / --key-id (CLI) : Monte Carlo ingestion key ID
- MCD_INGEST_TOKEN (env) / --key-token (CLI) : Monte Carlo ingestion key token
- MCD_RESOURCE_UUID (env) / --resource-uuid (CLI) : MC resource UUID for this connection

Prerequisites
-------------
    pip install pycarlo python-dateutil

Usage
-----
    python push_query_logs.py \\
        --key-id <MCD_INGEST_ID> \\
        --key-token <MCD_INGEST_TOKEN> \\
        --resource-uuid <MCD_RESOURCE_UUID> \\
        --input-file query_logs_output.json

Optional flags: ``--output-file`` (where the push result is written) and
``--batch-size`` (maximum entries per push batch).
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from datetime import datetime, timezone
|
|
|
|
from dateutil.parser import isoparse
|
|
from pycarlo.core import Client, Session
|
|
from pycarlo.features.ingestion import IngestionService
|
|
from pycarlo.features.ingestion.models import QueryLogEntry
|
|
|
|
# ← SUBSTITUTE: set LOG_TYPE to match your warehouse type (query logs use log_type, not resource_type)
# Used as the fallback when the manifest has no "log_type" key.
LOG_TYPE = "snowflake"

# Default maximum number of entries per push batch.
# Query logs include full SQL text, so batches must stay small enough to keep
# the compressed payload under the 1 MB limit; oversized requests fail with 413.
# NOTE(review): the previous note here warned that even 50 entries can trigger
# 413 on active warehouses, yet the default is 100 — confirm the intended
# default, and lower it (--batch-size / batch_size) if 413 errors appear.
# ← SUBSTITUTE: tune based on average query length
_BATCH_SIZE = 100

# Truncate query_text longer than this to prevent 413 errors.
# Some SQL statements (e.g., generated by BI tools) can be 100KB+ and blow up
# compressed payloads even at small batch sizes.
_MAX_QUERY_TEXT_LEN = 10_000
|
|
|
|
|
|
def _build_query_log_entries(queries: list[dict]) -> list[QueryLogEntry]:
    """Translate manifest query records into ``QueryLogEntry`` objects.

    Each record is read with ``dict.get``, so absent keys become ``None``
    (or an empty string for ``query_text``). SQL text longer than
    ``_MAX_QUERY_TEXT_LEN`` is cut at that length and suffixed with
    ``"... [TRUNCATED]"``; the number of cut entries is printed once at the end.

    Args:
        queries: Query dicts taken from the manifest's ``"queries"`` list.

    Returns:
        One ``QueryLogEntry`` per input record, in input order.
    """
    result = []
    clipped_count = 0

    for record in queries:
        sql = record.get("query_text") or ""
        # Cap very long SQL so a single statement cannot cause 413 Request Too Large.
        if len(sql) > _MAX_QUERY_TEXT_LEN:
            sql = sql[:_MAX_QUERY_TEXT_LEN] + "... [TRUNCATED]"
            clipped_count += 1

        # Optional attributes are only included when present in the record.
        details = {}
        warehouse = record.get("warehouse")
        if warehouse is not None:
            details["warehouse_name"] = warehouse
        scanned = record.get("bytes_scanned")
        if scanned is not None:
            details["bytes_scanned"] = int(scanned)

        # Timestamps are parsed only when a non-empty value is present.
        raw_start = record.get("start_time")
        started_at = isoparse(raw_start) if raw_start else None
        raw_end = record.get("end_time")
        ended_at = isoparse(raw_end) if raw_end else None

        row_count = record.get("rows_produced")
        returned = None if row_count is None else int(row_count)

        result.append(
            QueryLogEntry(
                start_time=started_at,
                end_time=ended_at,
                query_text=sql,
                query_id=record.get("query_id"),
                user=record.get("user"),
                returned_rows=returned,
                extra=details or None,
            )
        )

    if clipped_count:
        print(f" Truncated {clipped_count} query text(s) exceeding {_MAX_QUERY_TEXT_LEN} chars")
    return result
|
|
|
|
|
|
def _write_push_result(
    output_file: str,
    *,
    resource_uuid: str,
    log_type: str,
    invocation_ids: list,
    total_entries: int,
    batch_count: int,
    batch_size: int,
    failed_batches: list[int] | None = None,
) -> dict:
    """Build the push result dict, write it to *output_file* as JSON, and return it."""
    push_result = {
        "resource_uuid": resource_uuid,
        "log_type": log_type,
        "invocation_ids": invocation_ids,
        "pushed_at": datetime.now(tz=timezone.utc).isoformat(),
        "total_entries": total_entries,
        "batch_count": batch_count,
        "batch_size": batch_size,
    }
    # Only present when at least one batch failed, so the success shape is unchanged.
    if failed_batches:
        push_result["failed_batches"] = failed_batches
    with open(output_file, "w", encoding="utf-8") as fh:
        json.dump(push_result, fh, indent=2)
    return push_result


def push(
    input_file: str,
    resource_uuid: str,
    key_id: str,
    key_token: str,
    batch_size: int = _BATCH_SIZE,
    output_file: str = "query_logs_push_result.json",
) -> dict:
    """
    Read a query log manifest and push entries to Monte Carlo in batches.

    The manifest's ``"queries"`` list is converted to query log entries, split
    into chunks of at most ``batch_size``, and the chunks are sent concurrently
    (up to 4 worker threads). A JSON summary is written to ``output_file``.

    Args:
        input_file: Path to the JSON manifest to read.
        resource_uuid: Monte Carlo resource UUID the logs belong to.
        key_id: Monte Carlo ingestion key ID.
        key_token: Monte Carlo ingestion key token.
        batch_size: Maximum number of entries per push request; must be >= 1.
        output_file: Path the JSON push result is written to.

    Returns:
        A result dict with the invocation ID of each batch (in batch order)
        plus resource, log type, timestamp, and batch bookkeeping.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
        Exception: The first error raised while pushing a batch. Before it is
            re-raised, a partial result (including a ``"failed_batches"`` list
            of 1-based batch numbers) is written to ``output_file`` so the
            invocation IDs of batches that did succeed are not lost.
    """
    # Fail fast with a clear message instead of an unrelated error from
    # range() or ThreadPoolExecutor further down.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # JSON is UTF-8; do not depend on the platform default encoding.
    with open(input_file, encoding="utf-8") as fh:
        manifest = json.load(fh)

    queries = manifest.get("queries", [])
    log_type = manifest.get("log_type", LOG_TYPE)
    entries = _build_query_log_entries(queries)
    print(f"Loaded {len(entries)} query log entry/entries from {input_file}")

    if not entries:
        print("No query log entries to push.")
        return _write_push_result(
            output_file,
            resource_uuid=resource_uuid,
            log_type=log_type,
            invocation_ids=[],
            total_entries=0,
            batch_count=0,
            batch_size=batch_size,
        )

    # Split into batches
    batches = [entries[i : i + batch_size] for i in range(0, len(entries), batch_size)]
    total_batches = len(batches)

    def _push_batch(batch: list, batch_num: int) -> str | None:
        """Push one batch through a Client/Session created for this call only."""
        client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion"))
        service = IngestionService(mc_client=client)
        result = service.send_query_logs(
            resource_uuid=resource_uuid,
            log_type=log_type,
            events=batch,
        )
        invocation_id = service.extract_invocation_id(result)
        print(f" Pushed batch {batch_num}/{total_batches} ({len(batch)} entries) — invocation_id={invocation_id}")
        return invocation_id

    # Push batches in parallel; every batch builds its own pycarlo Session, so
    # no Session object is shared between worker threads.
    max_workers = min(4, total_batches)
    invocation_ids: list[str | None] = [None] * total_batches
    failed_batches: list[int] = []
    first_error: Exception | None = None

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_push_batch, batch, i + 1): i
            for i, batch in enumerate(batches)
        }
        for future in as_completed(futures):
            idx = futures[future]
            try:
                invocation_ids[idx] = future.result()
            except Exception as exc:
                print(f" ERROR pushing batch {idx + 1}: {exc}")
                # Keep draining: the executor runs the remaining batches anyway,
                # and their invocation IDs are worth recording.
                failed_batches.append(idx + 1)
                if first_error is None:
                    first_error = exc

    if first_error is not None:
        # total_entries counts the entries attempted, not the entries accepted.
        _write_push_result(
            output_file,
            resource_uuid=resource_uuid,
            log_type=log_type,
            invocation_ids=invocation_ids,
            total_entries=len(entries),
            batch_count=total_batches,
            batch_size=batch_size,
            failed_batches=sorted(failed_batches),
        )
        print(f"Partial push result written to {output_file}")
        raise first_error

    print(f" All {total_batches} batches pushed ({max_workers} workers)")

    push_result = _write_push_result(
        output_file,
        resource_uuid=resource_uuid,
        log_type=log_type,
        invocation_ids=invocation_ids,
        total_entries=len(entries),
        batch_count=total_batches,
        batch_size=batch_size,
    )
    print(f"Push result written to {output_file}")

    return push_result
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: parse arguments and run ``push()``.

    Credentials and the resource UUID default to the ``MCD_INGEST_ID``,
    ``MCD_INGEST_TOKEN`` and ``MCD_RESOURCE_UUID`` environment variables.
    Exits through ``parser.error`` (usage message, status 2) when any of them
    is missing or when ``--batch-size`` is not a positive integer.
    """
    parser = argparse.ArgumentParser(
        description="Push Snowflake query logs from a manifest to Monte Carlo",
    )
    parser.add_argument(
        "--key-id",
        default=os.environ.get("MCD_INGEST_ID"),
        help="Monte Carlo ingestion key ID (env: MCD_INGEST_ID)",
    )
    parser.add_argument(
        "--key-token",
        default=os.environ.get("MCD_INGEST_TOKEN"),
        help="Monte Carlo ingestion key token (env: MCD_INGEST_TOKEN)",
    )
    parser.add_argument(
        "--resource-uuid",
        default=os.environ.get("MCD_RESOURCE_UUID"),
        help="Monte Carlo resource UUID for this Snowflake connection (env: MCD_RESOURCE_UUID)",
    )
    parser.add_argument(
        "--input-file",
        default="query_logs_output.json",
        help="Path to the collect manifest to read (default: query_logs_output.json)",
    )
    parser.add_argument(
        "--output-file",
        default="query_logs_push_result.json",
        help="Path to write the push result (default: query_logs_push_result.json)",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=_BATCH_SIZE,
        help=f"Max entries per push batch (default: {_BATCH_SIZE})",
    )
    args = parser.parse_args()

    # These three have env-var defaults, so argparse's ``required=True`` cannot
    # be used; check for empty/missing values by hand.
    missing = [
        name
        for name, val in [
            ("--key-id", args.key_id),
            ("--key-token", args.key_token),
            ("--resource-uuid", args.resource_uuid),
        ]
        if not val
    ]
    if missing:
        parser.error(f"Missing required arguments: {', '.join(missing)}")

    # A zero or negative batch size cannot produce batches; reject it here with
    # a usage error rather than a traceback from inside push().
    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")

    push(
        input_file=args.input_file,
        resource_uuid=args.resource_uuid,
        key_id=args.key_id,
        key_token=args.key_token,
        batch_size=args.batch_size,
        output_file=args.output_file,
    )
    print("Done.")


if __name__ == "__main__":
    main()
|