playbook/antigravity-awesome-skills/skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py

266 lines
9.5 KiB
Python

#!/usr/bin/env python3
"""
Extract table and column lineage from a local HiveServer2 log file — collection only.

Reads a plain-text Hive log file (not compressed), extracts SQL query blocks
from "Executing command" / "Starting command" entries, detects CTAS and
INSERT INTO ... SELECT patterns to build lineage edges, then writes a JSON
manifest file.

Can be run standalone via CLI or imported (use the ``collect()`` function).

Substitution points
-------------------
- --log-file : path to local HiveServer2 log (default: /tmp/root/hive.log)

Prerequisites
-------------
    pip install python-dotenv

Usage
-----
    python collect_lineage.py \\
        --log-file /tmp/root/hive.log \\
        --output-file lineage_output.json
"""
from __future__ import annotations
import argparse
import json
import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
# ← SUBSTITUTE: set RESOURCE_TYPE to match your Monte Carlo connection type
# collect() copies this value into the manifest's "resource_type" field.
RESOURCE_TYPE = "data-lake"

# Regex for CTAS: CREATE TABLE [IF NOT EXISTS] db.table AS SELECT ... FROM db.table
# Destination and source must both be written as db.table; an unqualified
# table name does not match. The lazy quantifiers make select_cols end at the
# earliest FROM that is followed by a db.table name, so a single source table
# is captured per match.
# Named groups: dest_db, dest_table, select_cols, src_db, src_table.
_CTAS_RE = re.compile(
    r"CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?"
    r"(?P<dest_db>\w+)\.(?P<dest_table>\w+)"
    r".*?AS\s+SELECT\s+(?P<select_cols>.+?)\s+FROM\s+(?P<src_db>\w+)\.(?P<src_table>\w+)",
    re.IGNORECASE | re.DOTALL,
)

# Regex for INSERT INTO/OVERWRITE db.table SELECT ... FROM db.table
# The TABLE keyword after INTO/OVERWRITE is optional. Exposes the same named
# groups as _CTAS_RE so both patterns can be handled by the same code path.
_INSERT_RE = re.compile(
    r"INSERT\s+(?:INTO|OVERWRITE)\s+(?:TABLE\s+)?(?P<dest_db>\w+)\.(?P<dest_table>\w+)"
    r".*?SELECT\s+(?P<select_cols>.+?)\s+FROM\s+(?P<src_db>\w+)\.(?P<src_table>\w+)",
    re.IGNORECASE | re.DOTALL,
)

# Regex to detect additional JOIN sources beyond the primary FROM clause.
# Matches any "JOIN db.table" regardless of join type; the table must be
# db-qualified.
_JOIN_RE = re.compile(r"JOIN\s+(?P<src_db>\w+)\.(?P<src_table>\w+)", re.IGNORECASE)

# Simple column alias extraction: [alias.]col [AS dest]
# group(1) = optional qualifier, group(2) = column token, group(3) = optional
# AS alias. \w+ matches any word-like token (keywords, function names,
# numbers), so callers must filter the matches.
_COL_RE = re.compile(r"(?:(\w+)\.)?(\w+)(?:\s+AS\s+(\w+))?", re.IGNORECASE)

# Hive string literals — strip before scanning so words inside 'status' AS ...
# are not treated as column refs.
# Only single-quoted literals are matched; a doubled '' inside the quotes is
# consumed as part of the literal.
_STR_LITERAL_RE = re.compile(r"'(?:''|[^'])*'")
# Ranking window expressions such as ``ROW_NUMBER() OVER (...) AS alias`` or
# ``NTILE(4) OVER (...) AS alias`` have no single source column; removing the
# whole expression avoids bogus tokens in col_mappings.
# - The call's argument list may be non-empty (NTILE always takes a bucket
#   count), as long as it contains no nested parentheses.
# - The OVER (...) body may contain one level of nested parentheses, e.g.
#   ``PARTITION BY COALESCE(a, b)``. The alternation consumes either a single
#   non-paren character or one balanced inner group, so the two branches never
#   compete for the same character.
_WINDOW_AS_ALIAS_RE = re.compile(
    r"\b(?:ROW_NUMBER|RANK|DENSE_RANK|NTILE)\s*\([^()]*\)"
    r"\s+OVER\s*\((?:[^()]|\([^()]*\))*\)\s+AS\s+\w+",
    re.IGNORECASE,
)
# Regex to pull query text out of Hive log "Executing/Starting command" lines.
# The `query` group holds everything after "...command(queryId=<id>): " up to,
# but not including, the next newline that is followed by a YYYY-MM-DD-shaped
# date, or the end of the text. DOTALL lets one query span several physical
# lines. No IGNORECASE here: the log phrases are matched case-sensitively.
_COMMAND_START_RE = re.compile(
    r"(?:Executing|Starting)\s+command\(queryId=\S*\):\s+(?P<query>.+?)(?=\n\d{4}-\d{2}-\d{2}|\Z)",
    re.DOTALL,
)

# Tokens that are almost never real column names — SQL keywords, functions, casts, etc.
# Entries are upper-case; _parse_select_cols upper-cases each candidate token
# before testing membership, which makes the filter case-insensitive.
_SQL_SCAN_NOISE = frozenset(
    {
        # window / ordering keywords
        "ROW_NUMBER", "RANK", "DENSE_RANK", "NTILE", "OVER", "PARTITION",
        # conditionals and boolean operators
        "ORDER", "BY", "CASE", "WHEN", "THEN", "ELSE", "END", "AND", "OR",
        "NOT", "IN", "IS", "DISTINCT", "CAST", "CONVERT", "CURRENT_TIMESTAMP",
        "CURRENT_DATE", "TRUE", "FALSE", "NULL", "BETWEEN", "LIKE", "EXISTS",
        "ASC", "DESC", "LIMIT", "OFFSET", "GROUP", "HAVING", "UNION", "ALL",
        # join and clause keywords
        "INNER", "LEFT", "RIGHT", "FULL", "OUTER", "CROSS", "JOIN", "ON",
        # type names (seen inside CAST(... AS <type>))
        "WHERE", "SELECT", "FROM", "AS", "STRING", "BIGINT", "INT", "SMALLINT",
        "TINYINT", "DOUBLE", "FLOAT", "REAL", "DECIMAL", "BOOLEAN", "DATE",
        "TIMESTAMP", "VARCHAR", "CHAR", "BINARY", "ARRAY", "MAP", "STRUCT",
        # aggregate, scalar and value window functions
        "SUM", "AVG", "COUNT", "MIN", "MAX", "STDDEV", "VARIANCE", "VAR_POP",
        "COALESCE", "IF", "SUBSTRING", "YEAR", "MONTH", "DAY", "LEAD", "LAG",
        "FIRST_VALUE", "LAST_VALUE",
    }
)
@dataclass
class _LineageEdge:
    """Lineage accumulated for a single destination table.

    ``_parse_edges`` keeps one instance per ``dest_db.dest_table`` key and
    appends to ``sources`` and ``col_mappings`` as matching statements are
    found in the log.
    """

    # Destination database name (stored lower-cased by _parse_edges).
    dest_db: str
    # Destination table name (stored lower-cased by _parse_edges).
    dest_table: str
    # (database, table) pairs that feed the destination, in first-seen order.
    sources: list[tuple[str, str]] = field(default_factory=list)
    # col_mappings: (dest_col, src_table, src_col)
    col_mappings: list[tuple[str, str, str]] = field(default_factory=list)
def _prepare_select_for_col_scan(select_clause: str) -> str:
    """Blank out spans of a SELECT list that would confuse the column scan.

    String literals (``_STR_LITERAL_RE``) are replaced first, then window
    expressions matched by ``_WINDOW_AS_ALIAS_RE``; every removed span becomes
    a single space so neighbouring tokens stay separated.
    """
    without_literals = _STR_LITERAL_RE.sub(" ", select_clause)
    return _WINDOW_AS_ALIAS_RE.sub(" ", without_literals)
def _dedupe_col_mappings(mappings: list[tuple[str, str, str]]) -> list[tuple[str, str, str]]:
    """Return a new list holding each mapping once, in first-occurrence order.

    The input list is not modified.
    """
    # dict keys are unique and keep insertion order, which yields exactly the
    # "first occurrence wins" sequence.
    return list(dict.fromkeys(mappings))
def _extract_query_blocks(log_text: str) -> list[str]:
    """Return the SQL text of every command entry found in *log_text*.

    One string is produced per ``_COMMAND_START_RE`` match, in log order, with
    surrounding whitespace stripped.
    """
    blocks: list[str] = []
    for hit in _COMMAND_START_RE.finditer(log_text):
        query_text = hit.group("query")
        blocks.append(query_text.strip())
    return blocks
def _parse_select_cols(select_clause: str, src_table: str) -> list[tuple[str, str, str]]:
    """
    Lightweight column mapping for the SELECT list of one statement.

    Scans the clause for ``[qualifier.]col [AS dest]`` tokens and returns a
    ``(dest_col, src_table, src_col)`` tuple for each token that survives
    filtering. ``dest_col`` falls back to ``src_col`` when no ``AS`` alias
    follows the token.

    String literals and window-function headers are stripped first to reduce
    false positives; SQL keywords/noise tokens and bare numeric literals are
    skipped.

    Args:
        select_clause: Text found between SELECT and FROM.
        src_table: Table name recorded as the source of every mapping. The
            qualifier written in the SQL (``t.col``) is not consulted.

    Returns:
        De-duplicated mappings in first-seen order.
    """
    prepared = _prepare_select_for_col_scan(select_clause)
    mappings: list[tuple[str, str, str]] = []
    for m in _COL_RE.finditer(prepared):
        src_col = m.group(2)
        dest_col = m.group(3) or src_col
        # \w+ also matches bare numbers (``1 AS flag``, ``COUNT(1)``, and both
        # halves of ``0.5``); a numeric literal is never a source column.
        if src_col.isdigit():
            continue
        # The noise set already contains FROM/SELECT/WHERE/JOIN/ON/AS, so no
        # separate keyword check is needed ("*" can never match \w+).
        if src_col.upper() in _SQL_SCAN_NOISE or dest_col.upper() in _SQL_SCAN_NOISE:
            continue
        # After stripping 'literal' AS col, we get " AS col" — skip bare (col, col) with no source expr.
        if dest_col == src_col:
            prefix = prepared[: m.start()].rstrip()
            if prefix.upper().endswith("AS"):
                continue
        mappings.append((dest_col, src_table, src_col))
    return _dedupe_col_mappings(mappings)
def _parse_edges(queries: list[str]) -> list[_LineageEdge]:
    """Turn SQL statements into lineage edges, one edge per destination table.

    Each statement is tested against the CTAS pattern first and the INSERT
    pattern second; statements matching neither are ignored. Repeated writes
    to the same ``db.table`` are merged into a single edge whose sources and
    column mappings are accumulated without duplicates.
    """
    by_dest: dict[str, _LineageEdge] = {}

    for statement in queries:
        # Blank out string literals (so quoted text cannot look like a table
        # or column) and collapse all whitespace runs to single spaces.
        flattened = re.sub(r"\s+", " ", _STR_LITERAL_RE.sub(" ", statement)).strip()

        # Match objects are always truthy, so `or` only falls through to the
        # INSERT pattern when the CTAS pattern found nothing.
        hit = _CTAS_RE.search(flattened) or _INSERT_RE.search(flattened)
        if hit is None:
            continue

        dest_db, dest_table, src_db, src_table = (
            hit.group(name).lower()
            for name in ("dest_db", "dest_table", "src_db", "src_table")
        )

        dest_key = f"{dest_db}.{dest_table}"
        edge = by_dest.get(dest_key)
        if edge is None:
            edge = _LineageEdge(dest_db=dest_db, dest_table=dest_table)
            by_dest[dest_key] = edge

        # Primary FROM table first, then every JOINed table in textual order.
        candidates = [(src_db, src_table)]
        candidates.extend(
            (joined.group("src_db").lower(), joined.group("src_table").lower())
            for joined in _JOIN_RE.finditer(flattened)
        )
        for pair in candidates:
            if pair not in edge.sources:
                edge.sources.append(pair)

        edge.col_mappings.extend(_parse_select_cols(hit.group("select_cols"), src_table))

    # The same INSERT may appear many times in HS2 logs; collapse repeated
    # column mappings once per edge at the end.
    for edge in by_dest.values():
        edge.col_mappings = _dedupe_col_mappings(edge.col_mappings)
    return list(by_dest.values())
def collect(log_file: str) -> dict:
    """
    Parse lineage edges from a HiveServer2 log file and return a manifest dict.

    Progress messages are printed to stdout while the file is read and parsed.

    Args:
        log_file: Path to a local HiveServer2 log file.

    Returns:
        Manifest dict with keys: resource_type, collected_at, edges.
        Each edge has destination, sources, and col_mappings lists.

    Raises:
        OSError: If the log file cannot be opened or read.
    """
    print(f"Reading Hive log file: {log_file} ...")
    # Decode explicitly as UTF-8 instead of the platform's locale encoding so
    # the same log parses identically on every host; errors="replace" keeps
    # undecodable bytes from aborting the run.
    with open(log_file, encoding="utf-8", errors="replace") as fh:
        log_text = fh.read()
    queries = _extract_query_blocks(log_text)
    print(f" Extracted {len(queries)} query block(s).")
    edges = _parse_edges(queries)
    print(f" Parsed {len(edges)} lineage edge(s).")
    manifest = {
        "resource_type": RESOURCE_TYPE,
        # Timezone-aware UTC timestamp in ISO 8601 form.
        "collected_at": datetime.now(tz=timezone.utc).isoformat(),
        "edges": [
            {
                "destination": {"database": e.dest_db, "table": e.dest_table},
                "sources": [{"database": sdb, "table": stbl} for sdb, stbl in e.sources],
                "col_mappings": [
                    {"dest_col": dc, "src_table": st, "src_col": sc}
                    for dc, st, sc in e.col_mappings
                ],
            }
            for e in edges
        ],
    }
    return manifest
def main() -> None:
    """Command-line entry point: parse arguments, collect lineage, write the manifest.

    When no lineage edges are found, a notice is printed and no output file is
    written.
    """
    parser = argparse.ArgumentParser(
        description="Extract Hive lineage from a local log file and write a JSON manifest",
    )
    # (flag, default, help text), registered in this order.
    cli_options = (
        (
            "--log-file",
            "/tmp/root/hive.log",
            "Path to local HiveServer2 log file (default: /tmp/root/hive.log)",  # ← SUBSTITUTE: your log path
        ),
        (
            "--output-file",
            "lineage_output.json",
            "Path to write the lineage manifest (default: lineage_output.json)",
        ),
    )
    for flag, default_value, help_text in cli_options:
        parser.add_argument(flag, default=default_value, help=help_text)
    options = parser.parse_args()

    manifest = collect(log_file=options.log_file)
    if manifest["edges"]:
        with open(options.output_file, "w") as out:
            json.dump(manifest, out, indent=2)
        print(f"Lineage manifest written to {options.output_file}")
        print("Done.")
    else:
        print("No lineage edges detected — no CTAS or INSERT INTO ... SELECT patterns found.")


if __name__ == "__main__":
    main()