#!/usr/bin/env python3 """ Collect table metadata from Snowflake — collection only. Connects to Snowflake, discovers all accessible databases and schemas, then queries INFORMATION_SCHEMA.TABLES for volume/freshness and INFORMATION_SCHEMA.COLUMNS for field definitions. The collected assets are written to a JSON manifest file. Can be run standalone via CLI or imported (use the ``collect()`` function). Substitution points ------------------- - SNOWFLAKE_ACCOUNT (env) / --account (CLI) : Snowflake account identifier (e.g. xy12345.us-east-1) - SNOWFLAKE_USER (env) / --user (CLI) : Snowflake username - SNOWFLAKE_PASSWORD (env) / --password (CLI) : Snowflake password - SNOWFLAKE_WAREHOUSE (env) / --warehouse (CLI) : Snowflake virtual warehouse Prerequisites ------------- pip install snowflake-connector-python Usage ----- python collect_metadata.py \\ --account \\ --user \\ --password \\ --warehouse """ import argparse import json import os from datetime import datetime, timezone import snowflake.connector # ← SUBSTITUTE: set RESOURCE_TYPE to match your Monte Carlo connection type RESOURCE_TYPE = "snowflake" def _check_available_memory(min_gb: float = 2.0) -> None: """Warn if available memory is below the threshold.""" try: if hasattr(os, "sysconf"): # Linux / macOS page_size = os.sysconf("SC_PAGE_SIZE") avail_pages = os.sysconf("SC_AVPHYS_PAGES") avail_gb = (page_size * avail_pages) / (1024 ** 3) else: return # Windows — skip check except (ValueError, OSError): return if avail_gb < min_gb: print( f"WARNING: Only {avail_gb:.1f} GB of memory available " f"(minimum recommended: {min_gb:.1f} GB). " f"Consider reducing the lookback window or increasing available memory." ) # Databases that are Snowflake system databases — skip them _SKIP_DATABASES = {"SNOWFLAKE", "SNOWFLAKE_SAMPLE_DATA"} # Schemas that are Snowflake system schemas — skip them _SKIP_SCHEMAS = {"INFORMATION_SCHEMA"} # Snowflake TABLE_TYPE → Monte Carlo RelationalAsset.type mapping. # The MC API only accepts "TABLE" or "VIEW" (uppercase). _TABLE_TYPE_MAP = { "BASE TABLE": "TABLE", "TABLE": "TABLE", "DYNAMIC TABLE": "TABLE", "EXTERNAL TABLE": "TABLE", "VIEW": "VIEW", "MATERIALIZED VIEW": "VIEW", "SECURE VIEW": "VIEW", } def _normalize_table_type(raw_type: str | None) -> str: """Map Snowflake's TABLE_TYPE value to MC-accepted 'TABLE' or 'VIEW'.""" if not raw_type: return "TABLE" return _TABLE_TYPE_MAP.get(raw_type.upper(), "TABLE") def _connect(account: str, user: str, password: str, warehouse: str): # ← SUBSTITUTE: add role= or authenticator= kwargs if your org requires them return snowflake.connector.connect( account=account, user=user, password=password, warehouse=warehouse, ) def _collect_assets(conn) -> list[dict]: """Collect table metadata from Snowflake and return as a list of dicts.""" cursor = conn.cursor() assets: list[dict] = [] # --- Discover databases --- cursor.execute("SHOW DATABASES") # SHOW DATABASES returns (created_on, name, …); column index 1 is the name all_db_rows = [] while True: chunk = cursor.fetchmany(1000) if not chunk: break all_db_rows.extend(chunk) databases = [row[1] for row in all_db_rows if row[1] not in _SKIP_DATABASES] print(f" Found {len(databases)} database(s): {databases}") for db in databases: # --- Discover schemas in each database --- try: cursor.execute(f'SHOW SCHEMAS IN DATABASE "{db}"') except Exception as exc: print(f" WARNING: could not list schemas in {db}: {exc}") continue # Column index 1 is the schema name all_schema_rows = [] while True: chunk = cursor.fetchmany(1000) if not chunk: break all_schema_rows.extend(chunk) schemas = [row[1] for row in all_schema_rows if row[1] not in _SKIP_SCHEMAS] # --- Collect tables, volume, and freshness via INFORMATION_SCHEMA --- try: cursor.execute( f""" SELECT TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE, ROW_COUNT, BYTES, LAST_ALTERED, COMMENT FROM "{db}".INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA != 'INFORMATION_SCHEMA' ORDER BY TABLE_SCHEMA, TABLE_NAME """ ) except Exception as exc: print(f" WARNING: could not query INFORMATION_SCHEMA.TABLES in {db}: {exc}") continue table_rows = [] while True: chunk = cursor.fetchmany(1000) if not chunk: break table_rows.extend(chunk) print(f" {db}: {len(table_rows)} table(s)") # Build a set of schema names present in the table result to know which # INFORMATION_SCHEMA.COLUMNS queries to run schemas_with_tables: set[str] = {row[1] for row in table_rows} # Pre-fetch all columns for this database in one query per schema columns_by_table: dict[tuple[str, str], list[dict]] = {} for schema in schemas_with_tables: if schema not in schemas: continue # respect the earlier schema skip list try: cursor.execute( f""" SELECT TABLE_NAME, COLUMN_NAME, DATA_TYPE, COMMENT FROM "{db}".INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = %s ORDER BY TABLE_NAME, ORDINAL_POSITION """, (schema,), ) except Exception as exc: print(f" WARNING: could not fetch columns for {db}.{schema}: {exc}") continue all_col_rows = [] while True: chunk = cursor.fetchmany(1000) if not chunk: break all_col_rows.extend(chunk) for col_row in all_col_rows: table_name, col_name, data_type, col_comment = col_row key = (schema, table_name) if key not in columns_by_table: columns_by_table[key] = [] columns_by_table[key].append( { "name": col_name, "type": data_type, "description": col_comment or None, } ) # Build asset dicts for row in table_rows: tbl_catalog, tbl_schema, tbl_name, tbl_type, row_count, byte_count, last_altered, tbl_comment = row volume = None if row_count is not None or byte_count is not None: volume = { "row_count": int(row_count) if row_count is not None else None, "byte_count": int(byte_count) if byte_count is not None else None, } freshness = None if last_altered is not None: freshness = { "last_update_time": last_altered.isoformat() if hasattr(last_altered, "isoformat") else str(last_altered), } fields = columns_by_table.get((tbl_schema, tbl_name), []) assets.append( { "type": _normalize_table_type(tbl_type), "database": tbl_catalog, "schema": tbl_schema, "name": tbl_name, "description": tbl_comment or None, "fields": fields, "volume": volume, "freshness": freshness, } ) print(f" + {tbl_catalog}.{tbl_schema}.{tbl_name} ({len(fields)} columns)") cursor.close() return assets def collect( account: str, user: str, password: str, warehouse: str, output_file: str = "metadata_output.json", ) -> dict: """ Connect to Snowflake, collect table metadata, and write a JSON manifest. Returns the manifest dict. """ _check_available_memory() print(f"Connecting to Snowflake account: {account} ...") conn = _connect(account, user, password, warehouse) print("Collecting table metadata ...") assets = _collect_assets(conn) conn.close() print(f"\nCollected {len(assets)} table(s).") manifest = { "resource_type": RESOURCE_TYPE, "collected_at": datetime.now(tz=timezone.utc).isoformat(), "assets": assets, } with open(output_file, "w") as fh: json.dump(manifest, fh, indent=2) print(f"Asset manifest written to {output_file}") return manifest def main() -> None: parser = argparse.ArgumentParser( description="Collect Snowflake table metadata and write to a manifest file", ) parser.add_argument( "--account", default=os.environ.get("SNOWFLAKE_ACCOUNT"), help="Snowflake account identifier, e.g. xy12345.us-east-1 (env: SNOWFLAKE_ACCOUNT)", # ← SUBSTITUTE ) parser.add_argument( "--user", default=os.environ.get("SNOWFLAKE_USER"), help="Snowflake username (env: SNOWFLAKE_USER)", # ← SUBSTITUTE ) parser.add_argument( "--password", default=os.environ.get("SNOWFLAKE_PASSWORD"), help="Snowflake password (env: SNOWFLAKE_PASSWORD)", # ← SUBSTITUTE ) parser.add_argument( "--warehouse", default=os.environ.get("SNOWFLAKE_WAREHOUSE"), help="Snowflake virtual warehouse (env: SNOWFLAKE_WAREHOUSE)", # ← SUBSTITUTE ) parser.add_argument( "--output-file", default="metadata_output.json", help="Path to write the output manifest (default: metadata_output.json)", ) args = parser.parse_args() missing = [ name for name, val in [ ("--account", args.account), ("--user", args.user), ("--password", args.password), ("--warehouse", args.warehouse), ] if not val ] if missing: parser.error(f"Missing required arguments: {', '.join(missing)}") collect( account=args.account, user=args.user, password=args.password, warehouse=args.warehouse, output_file=args.output_file, ) print("Done.") if __name__ == "__main__": main()