#!/usr/bin/env python3 """ Extract table and column lineage from a local HiveServer2 log file — collection only. Reads a plain-text Hive log file (not compressed), extracts SQL query blocks from "Executing command" / "Starting command" entries, detects CTAS and INSERT INTO ... SELECT patterns to build lineage edges, then writes a JSON manifest file. Can be run standalone via CLI or imported (use the ``collect()`` function). Substitution points ------------------- - --log-file path to local HiveServer2 log (default: /tmp/root/hive.log) Prerequisites ------------- pip install python-dotenv Usage ----- python collect_lineage.py \\ --log-file /tmp/root/hive.log \\ --output-file lineage_output.json """ from __future__ import annotations import argparse import json import re from dataclasses import dataclass, field from datetime import datetime, timezone # ← SUBSTITUTE: set RESOURCE_TYPE to match your Monte Carlo connection type RESOURCE_TYPE = "data-lake" # Regex for CTAS: CREATE TABLE [IF NOT EXISTS] db.table AS SELECT ... FROM db.table _CTAS_RE = re.compile( r"CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?" r"(?P\w+)\.(?P\w+)" r".*?AS\s+SELECT\s+(?P.+?)\s+FROM\s+(?P\w+)\.(?P\w+)", re.IGNORECASE | re.DOTALL, ) # Regex for INSERT INTO/OVERWRITE db.table SELECT ... FROM db.table _INSERT_RE = re.compile( r"INSERT\s+(?:INTO|OVERWRITE)\s+(?:TABLE\s+)?(?P\w+)\.(?P\w+)" r".*?SELECT\s+(?P.+?)\s+FROM\s+(?P\w+)\.(?P\w+)", re.IGNORECASE | re.DOTALL, ) # Regex to detect additional JOIN sources beyond the primary FROM clause _JOIN_RE = re.compile(r"JOIN\s+(?P\w+)\.(?P\w+)", re.IGNORECASE) # Simple column alias extraction: [alias.]col [AS dest] _COL_RE = re.compile(r"(?:(\w+)\.)?(\w+)(?:\s+AS\s+(\w+))?", re.IGNORECASE) # Hive string literals — strip before scanning so words inside 'status' AS ... # are not treated as column refs _STR_LITERAL_RE = re.compile(r"'(?:''|[^'])*'") # ROW_NUMBER() OVER (...) AS alias — whole expression has no single source column; # removing it avoids bogus tokens in col_mappings _WINDOW_AS_ALIAS_RE = re.compile( r"\b(?:ROW_NUMBER|RANK|DENSE_RANK|NTILE)\s*\(\s*\)\s+OVER\s*\([^)]*\)\s+AS\s+\w+", re.IGNORECASE, ) # Regex to pull query text out of Hive log "Executing/Starting command" lines _COMMAND_START_RE = re.compile( r"(?:Executing|Starting)\s+command\(queryId=\S*\):\s+(?P.+?)(?=\n\d{4}-\d{2}-\d{2}|\Z)", re.DOTALL, ) # Tokens that are almost never real column names — SQL keywords, functions, casts, etc. _SQL_SCAN_NOISE = frozenset( { "ROW_NUMBER", "RANK", "DENSE_RANK", "NTILE", "OVER", "PARTITION", "ORDER", "BY", "CASE", "WHEN", "THEN", "ELSE", "END", "AND", "OR", "NOT", "IN", "IS", "DISTINCT", "CAST", "CONVERT", "CURRENT_TIMESTAMP", "CURRENT_DATE", "TRUE", "FALSE", "NULL", "BETWEEN", "LIKE", "EXISTS", "ASC", "DESC", "LIMIT", "OFFSET", "GROUP", "HAVING", "UNION", "ALL", "INNER", "LEFT", "RIGHT", "FULL", "OUTER", "CROSS", "JOIN", "ON", "WHERE", "SELECT", "FROM", "AS", "STRING", "BIGINT", "INT", "SMALLINT", "TINYINT", "DOUBLE", "FLOAT", "REAL", "DECIMAL", "BOOLEAN", "DATE", "TIMESTAMP", "VARCHAR", "CHAR", "BINARY", "ARRAY", "MAP", "STRUCT", "SUM", "AVG", "COUNT", "MIN", "MAX", "STDDEV", "VARIANCE", "VAR_POP", "COALESCE", "IF", "SUBSTRING", "YEAR", "MONTH", "DAY", "LEAD", "LAG", "FIRST_VALUE", "LAST_VALUE", } ) @dataclass class _LineageEdge: dest_db: str dest_table: str sources: list[tuple[str, str]] = field(default_factory=list) # col_mappings: (dest_col, src_table, src_col) col_mappings: list[tuple[str, str, str]] = field(default_factory=list) def _prepare_select_for_col_scan(select_clause: str) -> str: """Remove literals and window headers so _COL_RE sees fewer false positives.""" s = _STR_LITERAL_RE.sub(" ", select_clause) s = _WINDOW_AS_ALIAS_RE.sub(" ", s) return s def _dedupe_col_mappings(mappings: list[tuple[str, str, str]]) -> list[tuple[str, str, str]]: seen: set[tuple[str, str, str]] = set() out: list[tuple[str, str, str]] = [] for t in mappings: if t in seen: continue seen.add(t) out.append(t) return out def _extract_query_blocks(log_text: str) -> list[str]: """Extract individual SQL query strings from a Hive log file.""" return [m.group("query").strip() for m in _COMMAND_START_RE.finditer(log_text)] def _parse_select_cols(select_clause: str, src_table: str) -> list[tuple[str, str, str]]: """ Lightweight column mapping: for each `alias.col AS dest` or `col AS dest` in the SELECT clause, return (dest_col, src_table, src_col). Strips string literals and window function headers first to reduce false positives, and filters out SQL keywords/noise tokens. """ prepared = _prepare_select_for_col_scan(select_clause) mappings = [] for m in _COL_RE.finditer(prepared): src_col = m.group(2) dest_col = m.group(3) or src_col if src_col.upper() in ("FROM", "SELECT", "WHERE", "JOIN", "ON", "AS", "*"): continue if src_col.upper() in _SQL_SCAN_NOISE or dest_col.upper() in _SQL_SCAN_NOISE: continue # After stripping 'literal' AS col, we get " AS col" — skip bare (col, col) with no source expr. if dest_col == src_col: prefix = prepared[: m.start()].rstrip() if prefix.upper().endswith("AS"): continue mappings.append((dest_col, src_table, src_col)) return _dedupe_col_mappings(mappings) def _parse_edges(queries: list[str]) -> list[_LineageEdge]: """Parse SQL query strings into _LineageEdge objects.""" edges: dict[str, _LineageEdge] = {} for sql in queries: # Strip string literals to avoid false table/column matches inside quoted strings sql_clean = re.sub(r"\s+", " ", _STR_LITERAL_RE.sub(" ", sql)).strip() for pattern in (_CTAS_RE, _INSERT_RE): m = pattern.search(sql_clean) if not m: continue dest_db = m.group("dest_db").lower() dest_table = m.group("dest_table").lower() src_db = m.group("src_db").lower() src_table = m.group("src_table").lower() select_cols = m.group("select_cols") key = f"{dest_db}.{dest_table}" if key not in edges: edges[key] = _LineageEdge(dest_db=dest_db, dest_table=dest_table) edge = edges[key] src_pair = (src_db, src_table) if src_pair not in edge.sources: edge.sources.append(src_pair) # Pick up additional JOIN sources for jm in _JOIN_RE.finditer(sql_clean): jp = (jm.group("src_db").lower(), jm.group("src_table").lower()) if jp not in edge.sources: edge.sources.append(jp) edge.col_mappings.extend(_parse_select_cols(select_cols, src_table)) break # matched one pattern, move to next query # Deduplicate column mappings per edge (same INSERT may appear many times in HS2 logs) for e in edges.values(): e.col_mappings = _dedupe_col_mappings(e.col_mappings) return list(edges.values()) def collect(log_file: str) -> dict: """ Parse lineage edges from a HiveServer2 log file and return a manifest dict. Args: log_file: Path to a local HiveServer2 log file. Returns: Manifest dict with keys: resource_type, collected_at, edges. Each edge has destination, sources, and col_mappings lists. """ print(f"Reading Hive log file: {log_file} ...") with open(log_file, errors="replace") as fh: log_text = fh.read() queries = _extract_query_blocks(log_text) print(f" Extracted {len(queries)} query block(s).") edges = _parse_edges(queries) print(f" Parsed {len(edges)} lineage edge(s).") manifest = { "resource_type": RESOURCE_TYPE, "collected_at": datetime.now(tz=timezone.utc).isoformat(), "edges": [ { "destination": {"database": e.dest_db, "table": e.dest_table}, "sources": [{"database": sdb, "table": stbl} for sdb, stbl in e.sources], "col_mappings": [ {"dest_col": dc, "src_table": st, "src_col": sc} for dc, st, sc in e.col_mappings ], } for e in edges ], } return manifest def main() -> None: parser = argparse.ArgumentParser( description="Extract Hive lineage from a local log file and write a JSON manifest", ) parser.add_argument( "--log-file", default="/tmp/root/hive.log", help="Path to local HiveServer2 log file (default: /tmp/root/hive.log)", # ← SUBSTITUTE: your log path ) parser.add_argument( "--output-file", default="lineage_output.json", help="Path to write the lineage manifest (default: lineage_output.json)", ) args = parser.parse_args() manifest = collect(log_file=args.log_file) if not manifest["edges"]: print("No lineage edges detected — no CTAS or INSERT INTO ... SELECT patterns found.") return with open(args.output_file, "w") as fh: json.dump(manifest, fh, indent=2) print(f"Lineage manifest written to {args.output_file}") print("Done.") if __name__ == "__main__": main()