#!/usr/bin/env python3
"""
Collect table metadata from a Hive Metastore — collection only.

Connects to HiveServer2 (default port 10000), discovers all databases and
tables via SHOW DATABASES / SHOW TABLES, reads schema and table statistics via
DESCRIBE FORMATTED, then writes a JSON manifest file.

Can be run standalone via CLI or imported (use the ``collect()`` function).

Substitution points
-------------------
- HIVE_HOST (env) / --hive-host (CLI) : HiveServer2 hostname
- HIVE_PORT (env) / --hive-port (CLI) : HiveServer2 port (default 10000)

Prerequisites
-------------
    pip install pyhive python-dotenv

Usage
-----
    python collect_metadata.py \\
        --hive-host <hostname> \\
        --output-file metadata_output.json
"""

import argparse
import json
import os
import re
from contextlib import closing
from datetime import datetime, timezone

# pyhive is only needed to open the connection.  Guarding the import keeps the
# module importable (``--help``, the pure parsing helpers) when the driver is
# not installed; ``_connect`` reports the original failure when it is needed.
try:
    from pyhive import hive
except ImportError as _import_exc:
    hive = None
    _HIVE_IMPORT_ERROR = _import_exc
else:
    _HIVE_IMPORT_ERROR = None


def _check_available_memory(min_gb: float = 2.0) -> None:
    """Warn if available memory is below the threshold.

    The check is best-effort: it is skipped on platforms without
    ``os.sysconf`` or where the required configuration names are unknown.

    Args:
        min_gb: Minimum recommended amount of free memory, in GiB.
    """
    if not hasattr(os, "sysconf"):
        return  # Windows — skip check

    try:
        # Linux / macOS
        page_size = os.sysconf("SC_PAGE_SIZE")
        avail_pages = os.sysconf("SC_AVPHYS_PAGES")
    except (ValueError, OSError):
        return

    # sysconf reports -1 for values it cannot determine; a negative figure
    # would otherwise be printed as a nonsensical warning.
    if page_size <= 0 or avail_pages < 0:
        return

    avail_gb = (page_size * avail_pages) / (1024 ** 3)
    if avail_gb < min_gb:
        print(
            f"WARNING: Only {avail_gb:.1f} GB of memory available "
            f"(minimum recommended: {min_gb:.1f} GB). "
            f"Consider reducing the number of databases/tables or increasing available memory."
        )


# ← SUBSTITUTE: set RESOURCE_TYPE to match your Monte Carlo connection type
RESOURCE_TYPE = "data-lake"

# Map Hive native types to SQL-standard uppercase types expected by Monte Carlo
_HIVE_TYPE_MAP: dict[str, str] = {
    "tinyint": "TINYINT",
    "smallint": "SMALLINT",
    "int": "INTEGER",
    "integer": "INTEGER",
    "bigint": "BIGINT",
    "float": "FLOAT",
    "double": "DOUBLE",
    "double precision": "DOUBLE",
    "decimal": "DECIMAL",
    "numeric": "DECIMAL",
    "boolean": "BOOLEAN",
    "string": "VARCHAR",
    "varchar": "VARCHAR",
    "char": "CHAR",
    "binary": "BINARY",
    "timestamp": "TIMESTAMP",
    "date": "DATE",
    "interval": "INTERVAL",
    "array": "ARRAY",
    "map": "MAP",
    "struct": "STRUCT",
    "uniontype": "UNION",
}

# ← SUBSTITUTE: add any internal table name prefixes you want to skip
_INTERNAL_TABLE_PREFIXES = ("tmp_", "__", "hive_")

# ← SUBSTITUTE: add any system databases you want to skip
_SKIPPED_DATABASES = ("information_schema",)

# Number of rows requested from the cursor per ``fetchmany`` call.
_FETCH_CHUNK_SIZE = 1000

# Patterns matched against DESCRIBE FORMATTED parameter names.
_NUM_ROWS_RE = re.compile(r"numRows", re.IGNORECASE)
_TOTAL_SIZE_RE = re.compile(r"totalSize", re.IGNORECASE)
_LAST_MODIFIED_RE = re.compile(r"last_modified_time", re.IGNORECASE)
_CREATE_TIME_RE = re.compile(r"^CreateTime")


def _normalize_hive_type(hive_type: str) -> str:
    """Uppercase and normalize a Hive type string to a SQL-standard form.

    Parametrized types like ``decimal(10,2)`` or ``varchar(255)`` keep their
    suffix; the base type is mapped through ``_HIVE_TYPE_MAP``.  Base types
    missing from the map are simply uppercased.

    Args:
        hive_type: Type string as reported by Hive; surrounding whitespace is
            ignored.

    Returns:
        The normalized type string.
    """
    # Split the *stripped* text so the suffix offset cannot be thrown off by
    # leading whitespace in the input.
    base, paren, params = hive_type.strip().partition("(")
    key = base.strip().lower()
    suffix = (paren + params).strip()  # preserve original params, e.g. (10,2)
    return _HIVE_TYPE_MAP.get(key, key.upper()) + suffix


def _quote_identifier(name: str) -> str:
    """Backtick-quote a Hive identifier, doubling any embedded backtick."""
    return "`" + name.replace("`", "``") + "`"


def _connect(host: str, port: int) -> "hive.Connection":
    """Open a HiveServer2 connection.

    Args:
        host: HiveServer2 hostname.
        port: HiveServer2 port.

    Raises:
        ImportError: If pyhive (or one of its dependencies) failed to import.
    """
    if hive is None:
        raise ImportError(
            "pyhive could not be imported; install it with: pip install pyhive"
        ) from _HIVE_IMPORT_ERROR
    # ← SUBSTITUTE: update username/auth if your cluster requires Kerberos or LDAP
    return hive.connect(host=host, port=port, username="hadoop", auth="NONE")


def _fetch_rows(cursor, query: str) -> list[tuple]:
    """Execute a query and return all of its rows.

    Rows are pulled from the cursor in chunks of ``_FETCH_CHUNK_SIZE``; the
    complete result is still accumulated and returned as one list.
    """
    cursor.execute(query)
    rows: list[tuple] = []
    while chunk := cursor.fetchmany(_FETCH_CHUNK_SIZE):
        rows.extend(chunk)
    return rows


def _apply_table_param(result: dict, key: str, value: str) -> None:
    """Store one table-information key/value pair into *result* if recognized.

    Values that fail to parse are ignored, leaving the field at its previous
    value.
    """
    try:
        if _NUM_ROWS_RE.search(key):
            result["row_count"] = int(value)
        elif _TOTAL_SIZE_RE.search(key):
            result["total_size"] = int(value)
        elif _LAST_MODIFIED_RE.search(key):
            result["last_modified"] = datetime.fromtimestamp(
                int(value), tz=timezone.utc
            ).isoformat()
        elif _CREATE_TIME_RE.search(key):
            # e.g. "Wed Mar 18 20:15:40 UTC 2026"
            # NOTE(review): %Z only recognizes UTC, GMT and the local zone
            # names, and the parsed value is labelled UTC regardless of the
            # zone token — confirm the server reports CreateTime in UTC.
            result["created_on"] = (
                datetime.strptime(value, "%a %b %d %H:%M:%S %Z %Y")
                .replace(tzinfo=timezone.utc)
                .isoformat()
            )
        elif key == "comment" and not result["description"] and value:
            result["description"] = value
    except (ValueError, TypeError):
        # Best-effort: unparsable statistics are skipped, not fatal.
        pass


def _parse_describe_formatted(rows: list[tuple]) -> dict:
    """
    Parse DESCRIBE FORMATTED <db>.<table> output into a structured dict:
    columns, row_count, total_size, last_modified, description, created_on

    Args:
        rows: Result rows; up to three cells (name, type, comment) are read
            from each row and missing or ``None`` cells are treated as empty.

    Returns:
        Dict with ``columns`` (list of name/type/description dicts) plus the
        scalar fields listed above, each ``None`` when not found.
    """
    result: dict = {
        "columns": [],
        "row_count": None,
        "total_size": None,
        "last_modified": None,
        "description": None,
        "created_on": None,
    }
    in_col_info = False
    in_table_info = False

    for row in rows:
        # Pad short rows so a malformed line cannot raise IndexError.
        cells = [(cell or "").strip() for cell in tuple(row)[:3]]
        cells += [""] * (3 - len(cells))
        col_name, data_type, comment = cells

        if col_name.startswith("# col_name"):
            in_col_info = True
            in_table_info = False
            continue
        if col_name.startswith("# Detailed Table Information"):
            in_col_info = False
            in_table_info = True
            continue
        if col_name.startswith("#"):
            # Any other section header ends the column listing.
            in_col_info = False
            continue

        if in_col_info and col_name and data_type:
            result["columns"].append(
                {
                    "name": col_name,
                    "type": _normalize_hive_type(data_type),
                    "description": comment or None,
                }
            )

        if in_table_info:
            # Table Parameters rows have an empty col_name; key is in
            # data_type, value in comment
            if col_name:
                param_key, param_val = col_name.rstrip(":"), data_type
            else:
                param_key, param_val = data_type, comment
            _apply_table_param(result, param_key, param_val)

    return result


def _positive_or_none(value):
    """Return *value* when it is a positive number, otherwise ``None``."""
    return value if value and value > 0 else None


def _build_asset(db: str, table: str, info: dict) -> dict:
    """Assemble the manifest entry for one table from its parsed description."""
    return {
        "database": db,
        "schema": db,
        "name": table,
        "description": info["description"],
        "created_on": info["created_on"],
        "row_count": _positive_or_none(info["row_count"]),
        "byte_count": _positive_or_none(info["total_size"]),
        "last_modified": info["last_modified"],
        "fields": [
            {"name": col["name"], "type": col["type"], "description": col["description"]}
            for col in info["columns"]
        ],
    }


def collect(
    hive_host: str,
    hive_port: int = 10000,
) -> dict:
    """
    Connect to HiveServer2, discover all databases and tables, and return
    a manifest dict with collected asset metadata.

    Tables that cannot be described are reported with a warning and skipped.
    The cursor and connection are closed even when collection fails.

    Args:
        hive_host: HiveServer2 hostname.
        hive_port: HiveServer2 port (default 10000).

    Returns:
        Manifest dict with keys: resource_type, collected_at, assets.
    """
    _check_available_memory()

    print(f"Connecting to HiveServer2 at {hive_host}:{hive_port} ...")
    assets: list[dict] = []

    with closing(_connect(hive_host, hive_port)) as conn, closing(conn.cursor()) as cursor:
        print("Collecting table metadata ...")
        databases = [row[0] for row in _fetch_rows(cursor, "SHOW DATABASES")]
        print(f" Found databases: {databases}")

        for db in databases:
            if db in _SKIPPED_DATABASES:
                continue

            # Identifiers are backtick-quoted so reserved words and unusual
            # names do not break the statement.
            quoted_db = _quote_identifier(db)
            tables = _fetch_rows(cursor, f"SHOW TABLES IN {quoted_db}")
            table_names = [row[0] for row in tables]
            print(f" {db}: {len(table_names)} table(s)")

            for table in table_names:
                if any(table.startswith(p) for p in _INTERNAL_TABLE_PREFIXES):
                    continue

                try:
                    desc_rows = _fetch_rows(
                        cursor,
                        f"DESCRIBE FORMATTED {quoted_db}.{_quote_identifier(table)}",
                    )
                except Exception as exc:
                    # Deliberate best-effort: one bad table must not abort the run.
                    print(f" WARNING: could not describe {db}.{table}: {exc}")
                    continue

                info = _parse_describe_formatted(desc_rows)
                assets.append(_build_asset(db, table, info))
                print(
                    f" + {db}.{table} ({len(info['columns'])} columns, "
                    f"desc={info['description']!r}, created={info['created_on']})"
                )

    print(f"\nCollected {len(assets)} table(s).")

    manifest = {
        "resource_type": RESOURCE_TYPE,
        "collected_at": datetime.now(tz=timezone.utc).isoformat(),
        "assets": assets,
    }
    return manifest


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser; defaults are read from the environment."""
    parser = argparse.ArgumentParser(
        description="Collect Hive table metadata and write a JSON manifest",
    )
    parser.add_argument(
        "--hive-host",
        default=os.environ.get("HIVE_HOST"),
        help="HiveServer2 hostname (env: HIVE_HOST)",  # ← SUBSTITUTE: your EMR master DNS or Hive host
    )
    parser.add_argument(
        "--hive-port",
        type=int,
        # Kept as a string so argparse runs it through ``type=int`` and
        # reports a malformed HIVE_PORT as a normal usage error.
        default=os.environ.get("HIVE_PORT") or "10000",
        help="HiveServer2 port (env: HIVE_PORT, default: 10000)",  # ← SUBSTITUTE if your cluster uses a non-standard port
    )
    parser.add_argument(
        "--output-file",
        default="metadata_output.json",
        help="Path to write the output manifest (default: metadata_output.json)",
    )
    return parser


def main() -> None:
    """CLI entry point: collect metadata and write the manifest as JSON."""
    parser = _build_parser()
    args = parser.parse_args()

    if not args.hive_host:
        parser.error("--hive-host is required (or set HIVE_HOST)")

    manifest = collect(
        hive_host=args.hive_host,
        hive_port=args.hive_port,
    )

    with open(args.output_file, "w", encoding="utf-8") as fh:
        json.dump(manifest, fh, indent=2)
    print(f"Asset manifest written to {args.output_file}")
    print("Done.")


if __name__ == "__main__":
    main()