""" Databricks — Lineage Push (push-only) ======================================= Reads a JSON manifest file produced by collect_lineage.py and pushes the lineage events to Monte Carlo via the push ingestion API, with configurable batching to keep compressed payloads under 1 MB. Substitution points (search for "← SUBSTITUTE"): - MCD_INGEST_ID / MCD_INGEST_TOKEN : Monte Carlo API credentials - MCD_RESOURCE_UUID : UUID of the Databricks connection in Monte Carlo - PUSH_BATCH_SIZE : number of events per API call (default 500) Prerequisites: pip install pycarlo """ from __future__ import annotations import argparse import json import logging import os from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime, timezone from typing import Any from pycarlo.core import Client, Session from pycarlo.features.ingestion import IngestionService from pycarlo.features.ingestion.models import ( ColumnLineageField, ColumnLineageSourceField, LineageAssetRef, LineageEvent, ) logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") log = logging.getLogger(__name__) RESOURCE_TYPE = "databricks" DEFAULT_BATCH_SIZE = 500 # ← SUBSTITUTE: conservative default to stay under 1 MB compressed def _ref_from_dict(d: dict[str, Any]) -> LineageAssetRef: database = d.get("database", "") schema = d.get("schema", "") name = d["asset_name"] return LineageAssetRef( type="TABLE", name=name, database=database, schema=schema, asset_id=f"{database}__{schema}__{name}", ) def _event_from_dict(d: dict[str, Any]) -> LineageEvent: """Reconstruct a LineageEvent from a manifest dict.""" sources = [_ref_from_dict(s) for s in d.get("sources", [])] destination = _ref_from_dict(d["destination"]) fields: list[ColumnLineageField] | None = None if d.get("column_lineage"): fields = [] for cl in d["column_lineage"]: src_fields = [] for s in cl.get("sources", []): asset_id = f"{s.get('database', '')}__{s.get('schema', '')}__{s['asset_name']}" src_fields.append( ColumnLineageSourceField( asset_id=asset_id, field_name=s["field"], ) ) fields.append( ColumnLineageField( name=cl["destination_field"], source_fields=src_fields, ) ) return LineageEvent( sources=sources, destination=destination, fields=fields, ) def push( manifest_path: str, resource_uuid: str, key_id: str, key_token: str, batch_size: int = DEFAULT_BATCH_SIZE, ) -> dict[str, Any]: """Read a collect manifest and push lineage events to Monte Carlo in batches. Returns a summary dict with invocation IDs and counts. """ with open(manifest_path) as fh: manifest = json.load(fh) event_dicts: list[dict[str, Any]] = manifest["events"] events = [_event_from_dict(d) for d in event_dicts] log.info("Loaded %d lineage events from %s", len(events), manifest_path) # Split into batches batches = [] for i in range(0, len(events), batch_size): batches.append(events[i : i + batch_size]) total_batches = len(batches) def _push_batch(batch: list, batch_num: int) -> str | None: """Push a single batch using a dedicated Session (thread-safe).""" log.info("Pushing batch %d/%d (%d events) ...", batch_num, total_batches, len(batch)) client = Client(session=Session(mcd_id=key_id, mcd_token=key_token, scope="Ingestion")) service = IngestionService(mc_client=client) result = service.send_lineage( resource_uuid=resource_uuid, resource_type=RESOURCE_TYPE, events=batch, ) invocation_id = service.extract_invocation_id(result) if invocation_id: log.info("Batch %d: invocation_id=%s", batch_num, invocation_id) return invocation_id # Push batches in parallel (each thread gets its own pycarlo Session) max_workers = min(4, total_batches) invocation_ids: list[str | None] = [None] * total_batches with ThreadPoolExecutor(max_workers=max_workers) as pool: futures = { pool.submit(_push_batch, batch, i + 1): i for i, batch in enumerate(batches) } for future in as_completed(futures): idx = futures[future] try: invocation_ids[idx] = future.result() except Exception as exc: log.error("ERROR pushing batch %d: %s", idx + 1, exc) raise log.info("All %d batches pushed (%d workers)", total_batches, max_workers) pushed_at = datetime.now(timezone.utc).isoformat() summary = { "resource_uuid": resource_uuid, "resource_type": RESOURCE_TYPE, "invocation_ids": invocation_ids, "pushed_at": pushed_at, "event_count": len(events), "batch_count": total_batches, "batch_size": batch_size, "lookback_days": manifest.get("lookback_days"), "table_lineage_events": manifest.get("table_lineage_events"), "column_lineage_events": manifest.get("column_lineage_events"), } push_manifest_path = manifest_path.replace(".json", "_push_result.json") with open(push_manifest_path, "w") as fh: json.dump(summary, fh, indent=2) log.info("Push result written to %s", push_manifest_path) return summary def main() -> None: parser = argparse.ArgumentParser(description="Push Databricks lineage to Monte Carlo from manifest") parser.add_argument("--manifest", default="manifest_lineage.json") parser.add_argument("--resource-uuid", default=os.getenv("MCD_RESOURCE_UUID")) parser.add_argument("--key-id", default=os.getenv("MCD_INGEST_ID")) parser.add_argument("--key-token", default=os.getenv("MCD_INGEST_TOKEN")) parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE) args = parser.parse_args() required = ["resource_uuid", "key_id", "key_token"] missing = [k for k in required if getattr(args, k) is None] if missing: parser.error(f"Missing required arguments/env vars: {missing}") push( manifest_path=args.manifest, resource_uuid=args.resource_uuid, key_id=args.key_id, key_token=args.key_token, batch_size=args.batch_size, ) if __name__ == "__main__": main()