285 lines
9.0 KiB
Python
285 lines
9.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Collect Hive query logs from a local HiveServer2 log file — collection only.
|
|
|
|
Parses a plain-text HiveServer2 log for "Executing/Starting command" entries
|
|
to extract query text, query ID, start time and end time. Optionally reads
|
|
per-query operation logs to populate ``returned_rows`` from SelectOperator
|
|
``RECORDS_OUT`` counters. Deduplicates entries by query ID.
|
|
|
|
Can be run standalone via CLI or imported (use the ``collect()`` function).
|
|
|
|
Substitution points
|
|
-------------------
|
|
- --log-file path to local HiveServer2 log (default: /tmp/root/hive.log)
|
|
- --op-logs-dir optional directory of per-query <queryId>.log files
|
|
|
|
Prerequisites
|
|
-------------
|
|
pip install python-dateutil python-dotenv
|
|
|
|
Usage
|
|
-----
|
|
python collect_query_logs.py \\
|
|
--log-file /tmp/root/hive.log \\
|
|
[--op-logs-dir /var/log/hive/operation_logs] \\
|
|
--output-file query_logs_output.json
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from io import StringIO
|
|
from pathlib import Path
|
|
|
|
from dateutil.parser import isoparse
|
|
|
|
# NOTE: the normalizer requires "hive-s3" — do not change to "hive" or "data-lake"
|
|
LOG_TYPE = "hive-s3"
|
|
|
|
# Matches the start of a new query block in the Hive log
|
|
_COMMAND_START_RE = re.compile(
|
|
r"(Executing|Starting)\s+command\(queryId=(?P<query_id>\S*)\):\s+(?P<command>.*)$"
|
|
)
|
|
|
|
# Extracts returned row counts from per-query Hive operation logs
|
|
_RECORDS_OUT_RE = re.compile(r"RECORDS_OUT_OPERATOR_SEL_\d+:(\d+)")
|
|
|
|
|
|
def _parse_log_entries(log_text: str) -> list[dict]:
|
|
"""
|
|
Parse a HiveServer2 log file and return a list of dicts:
|
|
query_id, start_time (datetime), end_time (datetime), query (str)
|
|
|
|
Each timestamped "Executing/Starting command" line starts a new entry.
|
|
The previous entry's end_time is set to the timestamp of the next line.
|
|
"""
|
|
entries = []
|
|
query = ""
|
|
query_id = ""
|
|
start_time: datetime | None = None
|
|
last_timestamp: datetime | None = None
|
|
|
|
for line in StringIO(log_text):
|
|
parts = line.split()
|
|
if not parts:
|
|
continue
|
|
|
|
try:
|
|
timestamp = isoparse(parts[0])
|
|
if not timestamp.tzinfo:
|
|
timestamp = timestamp.replace(tzinfo=timezone.utc)
|
|
except ValueError:
|
|
# Continuation line for a multi-line query
|
|
if query:
|
|
query += "\n" + line.rstrip()
|
|
continue
|
|
|
|
command_start = _COMMAND_START_RE.search(line)
|
|
if command_start:
|
|
# Emit the previous entry before starting a new one
|
|
if query and start_time:
|
|
entries.append(
|
|
{
|
|
"query_id": query_id,
|
|
"start_time": start_time,
|
|
"end_time": timestamp,
|
|
"query": query,
|
|
}
|
|
)
|
|
query_id = command_start.group("query_id")
|
|
start_time = timestamp
|
|
query = command_start.group("command").strip()
|
|
elif query and start_time:
|
|
# A timestamped non-command line closes the current entry
|
|
entries.append(
|
|
{
|
|
"query_id": query_id,
|
|
"start_time": start_time,
|
|
"end_time": timestamp,
|
|
"query": query,
|
|
}
|
|
)
|
|
query = ""
|
|
query_id = ""
|
|
start_time = None
|
|
|
|
last_timestamp = timestamp
|
|
|
|
# Flush any trailing entry
|
|
if query and start_time:
|
|
end_time = last_timestamp or start_time
|
|
entries.append(
|
|
{
|
|
"query_id": query_id,
|
|
"start_time": start_time,
|
|
"end_time": end_time,
|
|
"query": query,
|
|
}
|
|
)
|
|
|
|
return entries
|
|
|
|
|
|
def _load_returned_rows(op_logs_dir: str) -> dict[str, int]:
|
|
"""
|
|
Scan a directory of per-query Hive operation logs (named <queryId>.log) and
|
|
return a mapping of query_id -> rows returned.
|
|
|
|
The row count is taken from the last RECORDS_OUT_OPERATOR_SEL_N value in
|
|
each file, which reflects the final number of rows delivered to the client.
|
|
"""
|
|
rows_by_id: dict[str, int] = {}
|
|
for log_file in Path(op_logs_dir).glob("*.log"):
|
|
query_id = log_file.stem
|
|
last_count: int | None = None
|
|
try:
|
|
text = log_file.read_text(errors="replace")
|
|
except OSError:
|
|
continue
|
|
for m in _RECORDS_OUT_RE.finditer(text):
|
|
last_count = int(m.group(1))
|
|
if last_count is not None:
|
|
rows_by_id[query_id] = last_count
|
|
return rows_by_id
|
|
|
|
|
|
def _build_query_log_entries(
|
|
raw_entries: list[dict],
|
|
rows_by_id: dict[str, int] | None = None,
|
|
) -> list[dict]:
|
|
"""
|
|
Deduplicate raw log entries by query_id and enrich with returned_rows.
|
|
|
|
Returns plain dicts so that ``push_query_logs.py`` can reconstruct
|
|
QueryLogEntry objects from the JSON manifest.
|
|
"""
|
|
seen: set[str] = set()
|
|
entries = []
|
|
for r in raw_entries:
|
|
qid = r["query_id"]
|
|
if qid and qid in seen:
|
|
continue
|
|
if qid:
|
|
seen.add(qid)
|
|
|
|
returned_rows: int | None = rows_by_id.get(qid) if rows_by_id and qid else None
|
|
|
|
entries.append(
|
|
{
|
|
"query_id": qid or None,
|
|
"start_time": r["start_time"].isoformat(),
|
|
"end_time": r["end_time"].isoformat(),
|
|
"query_text": r["query"],
|
|
"user": "hadoop", # ← SUBSTITUTE: set the user appropriate for your cluster
|
|
"returned_rows": returned_rows,
|
|
}
|
|
)
|
|
return entries
|
|
|
|
|
|
def collect(
|
|
log_file: str,
|
|
op_logs_dir: str | None = None,
|
|
) -> dict:
|
|
"""
|
|
Parse query log entries from a HiveServer2 log file and return a manifest dict.
|
|
|
|
Args:
|
|
log_file: Path to a local HiveServer2 log file.
|
|
op_logs_dir: Optional directory containing per-query operation logs
|
|
(<queryId>.log). When provided, returned_rows is populated
|
|
from SelectOperator RECORDS_OUT counts.
|
|
|
|
Returns:
|
|
Manifest dict with keys: log_type, collected_at, entry_count,
|
|
window_start, window_end, queries.
|
|
"""
|
|
print(f"Reading Hive log file: {log_file} ...")
|
|
with open(log_file, errors="replace") as fh:
|
|
log_text = fh.read()
|
|
|
|
raw_entries = _parse_log_entries(log_text)
|
|
print(f" Parsed {len(raw_entries)} query log entry/entries.")
|
|
|
|
if not raw_entries:
|
|
print("No query log entries found.")
|
|
return {
|
|
"log_type": LOG_TYPE,
|
|
"collected_at": datetime.now(tz=timezone.utc).isoformat(),
|
|
"entry_count": 0,
|
|
"window_start": None,
|
|
"window_end": None,
|
|
"queries": [],
|
|
}
|
|
|
|
rows_by_id: dict[str, int] | None = None
|
|
if op_logs_dir:
|
|
rows_by_id = _load_returned_rows(op_logs_dir)
|
|
print(f" Loaded row counts for {len(rows_by_id)} query/queries from {op_logs_dir}")
|
|
|
|
queries = _build_query_log_entries(raw_entries, rows_by_id)
|
|
|
|
start_times = [r["start_time"] for r in raw_entries]
|
|
end_times = [r["end_time"] for r in raw_entries]
|
|
|
|
manifest = {
|
|
"log_type": LOG_TYPE,
|
|
"collected_at": datetime.now(tz=timezone.utc).isoformat(),
|
|
"entry_count": len(queries),
|
|
"window_start": min(start_times).isoformat() if start_times else None,
|
|
"window_end": max(end_times).isoformat() if end_times else None,
|
|
"queries": [
|
|
{
|
|
"query_id": q["query_id"],
|
|
"start_time": q["start_time"],
|
|
"end_time": q["end_time"],
|
|
"query": q["query_text"],
|
|
"user": q["user"],
|
|
"returned_rows": q["returned_rows"],
|
|
}
|
|
for q in queries
|
|
],
|
|
}
|
|
return manifest
|
|
|
|
|
|
def main() -> None:
|
|
parser = argparse.ArgumentParser(
|
|
description="Collect Hive query logs from a local log file and write a JSON manifest",
|
|
)
|
|
parser.add_argument(
|
|
"--log-file",
|
|
default="/tmp/root/hive.log",
|
|
help="Path to local HiveServer2 log file (default: /tmp/root/hive.log)", # ← SUBSTITUTE: your log path
|
|
)
|
|
parser.add_argument(
|
|
"--op-logs-dir",
|
|
default=None,
|
|
help=(
|
|
"Directory containing per-query Hive operation logs (<queryId>.log). "
|
|
"When provided, returned_rows is populated from SelectOperator RECORDS_OUT counts."
|
|
),
|
|
# ← SUBSTITUTE: e.g. /var/log/hive/operation_logs or wherever Hive writes op logs
|
|
)
|
|
parser.add_argument(
|
|
"--output-file",
|
|
default="query_logs_output.json",
|
|
help="Path to write the output manifest (default: query_logs_output.json)",
|
|
)
|
|
args = parser.parse_args()
|
|
|
|
manifest = collect(log_file=args.log_file, op_logs_dir=args.op_logs_dir)
|
|
|
|
with open(args.output_file, "w") as fh:
|
|
json.dump(manifest, fh, indent=2)
|
|
print(f"Query log manifest written to {args.output_file}")
|
|
print("Done.")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|