playbook/antigravity-awesome-skills/plugins/antigravity-awesome-skills-.../skills/docx-official/ooxml/scripts/unpack.py

96 lines
3.3 KiB
Python
Executable File

#!/usr/bin/env python3
"""Unpack and format XML contents of Office files (.docx, .pptx, .xlsx)"""
import random
import shutil
import stat
import sys
import zipfile
from pathlib import Path
# Upper bound on the number of entries an archive may contain.
MAX_ARCHIVE_MEMBERS = 5_000
# Largest declared uncompressed size allowed for a single entry (100 MiB).
MAX_MEMBER_SIZE = 100 * 1024**2
# Largest combined declared uncompressed size across all entries (512 MiB).
MAX_TOTAL_UNCOMPRESSED = 512 * 1024**2
# Highest uncompressed-to-compressed size ratio tolerated for one entry.
MAX_COMPRESSION_RATIO = 1_000
def _is_zip_symlink(member: zipfile.ZipInfo) -> bool:
    """Return True when the entry's stored file mode marks it as a symbolic link."""
    # The bits above the low 16 of external_attr are interpreted as an st_mode value.
    unix_mode = member.external_attr >> 16
    return stat.S_ISLNK(unix_mode)
def _is_safe_destination(output_root: Path, member_name: str) -> bool:
    """Report whether *member_name*, joined onto *output_root*, resolves inside it."""
    resolved_root = output_root.resolve()
    resolved_target = (output_root / member_name).resolve()
    return resolved_target.is_relative_to(resolved_root)
def _extract_member(archive: zipfile.ZipFile, member: zipfile.ZipInfo, output_root: Path):
    """Write one archive entry beneath *output_root*.

    Directory entries are created as directories (with any missing parents).
    Every other entry is streamed into a file at the entry's relative path,
    creating its parent directories first.
    """
    target_path = output_root / member.filename
    entry_is_directory = member.is_dir()
    # Directory entries need the directory itself; file entries only its parent.
    folder = target_path if entry_is_directory else target_path.parent
    folder.mkdir(parents=True, exist_ok=True)
    if entry_is_directory:
        return
    with archive.open(member, "r") as reader, open(target_path, "wb") as writer:
        shutil.copyfileobj(reader, writer)
def _validate_archive_members(archive: zipfile.ZipFile, output_root: Path):
    """Vet every entry of *archive* and return the entry list when all pass.

    Raises:
        ValueError: if the archive has more than MAX_ARCHIVE_MEMBERS entries,
            or any entry is a symlink, would land outside *output_root*,
            declares more than MAX_MEMBER_SIZE bytes, pushes the running
            total past MAX_TOTAL_UNCOMPRESSED, or exceeds
            MAX_COMPRESSION_RATIO.
    """
    entries = archive.infolist()
    if len(entries) > MAX_ARCHIVE_MEMBERS:
        raise ValueError("Archive contains too many entries")

    running_total = 0
    for entry in entries:
        name = entry.filename
        declared_size = entry.file_size
        # Symlinks and path-escaping names are rejected with the same message.
        if _is_zip_symlink(entry) or not _is_safe_destination(output_root, name):
            raise ValueError(f"Unsafe archive entry: {name}")
        if declared_size > MAX_MEMBER_SIZE:
            raise ValueError(f"Archive entry too large: {name}")
        running_total += declared_size
        if running_total > MAX_TOTAL_UNCOMPRESSED:
            raise ValueError("Archive uncompressed size is too large")
        packed_size = entry.compress_size
        # A zero compressed size skips the ratio check (and avoids dividing by zero).
        if packed_size and declared_size / packed_size > MAX_COMPRESSION_RATIO:
            raise ValueError(f"Archive entry compression ratio too high: {name}")
    return entries
def extract_archive_safely(input_file: str | Path, output_dir: str | Path):
    """Extract *input_file* into *output_dir* after validating every entry.

    The output directory is created if it does not exist. All entries are
    validated before the first one is written, so a rejected archive leaves
    no extracted files behind.
    """
    destination_dir = Path(output_dir)
    destination_dir.mkdir(parents=True, exist_ok=True)
    # Validation runs against the resolved directory; writes use the path as given.
    resolved_dir = destination_dir.resolve()
    with zipfile.ZipFile(input_file) as archive:
        vetted_entries = _validate_archive_members(archive, resolved_dir)
        for entry in vetted_entries:
            _extract_member(archive, entry, destination_dir)
def pretty_print_xml(output_path: Path):
    """Re-indent every ``*.xml`` and ``*.rels`` file under *output_path* in place.

    Each file is parsed with ``defusedxml.minidom`` and overwritten with the
    pretty-printed document serialized as ASCII.

    Args:
        output_path: Directory that is searched recursively for XML parts.
    """
    # Imported inside the function, so importing this module does not require
    # defusedxml to be installed.
    import defusedxml.minidom

    # Collect the full file list up front, before any file is rewritten.
    xml_files = [*output_path.rglob("*.xml"), *output_path.rglob("*.rels")]
    for xml_file in xml_files:
        # Hand the parser raw bytes so it honours the BOM / XML declaration;
        # decoding as UTF-8 first would fail on UTF-16 encoded parts.
        dom = defusedxml.minidom.parseString(xml_file.read_bytes())
        xml_file.write_bytes(dom.toprettyxml(indent=" ", encoding="ascii"))
def main(argv: list[str] | None = None):
argv = argv or sys.argv[1:]
if len(argv) != 2:
raise SystemExit("Usage: python unpack.py <office_file> <output_dir>")
input_file, output_dir = argv
output_path = Path(output_dir)
extract_archive_safely(input_file, output_path)
pretty_print_xml(output_path)
if input_file.endswith(".docx"):
suggested_rsid = "".join(random.choices("0123456789ABCDEF", k=8))
print(f"Suggested RSID for edit session: {suggested_rsid}")
# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()