# subtitle_attachment_cleanup/subtitle_fonts_scanner.py

#!/usr/bin/env python3
"""
subtitle_fonts_scanner.py
─────────────────────────
Read-only scanner that inspects a single MKV file and reports:
  • Fonts required by ASS/SSA subtitle tracks
  • Fonts currently embedded as attachments
  • Fonts that are missing (required but not embedded)

Usage:
  python subtitle_fonts_scanner.py input.mkv
"""

import json
import os
import re
import subprocess
import sys
import tempfile
from pathlib import Path

# ── Dependency check ──────────────────────────────────────────────────────────
try:
    # fontTools is a third-party package; TTFont is what the scanner uses to
    # read the name table of extracted font attachments.
    from fontTools.ttLib import TTFont
except ImportError:
    # Exit with install hints rather than letting the ImportError traceback
    # surface to the user.
    print("Error: 'fonttools' is required to accurately read internal font names.")
    print("  Install with:  pip install fonttools")
    print("  (Arch Linux):  sudo pacman -S python-fonttools")
    sys.exit(1)

# ── Helpers ───────────────────────────────────────────────────────────────────

# Matches the font name following a \fn override tag, up to the next tag or
# the end of the override block.
_FN_OVERRIDE_RE = re.compile(r"\\fn([^\\}]+)")

# Style section headers (lower-cased): ASS uses "V4+", legacy SSA uses "V4".
_STYLE_SECTIONS = ("[v4+ styles]", "[v4 styles]")


def get_ass_font_names(ass_path: Path) -> set[str]:
    """
    Parse an ASS/SSA file and return a lower-case set of all font family names
    referenced in the style section ([V4+ Styles] or [V4 Styles]) and via \\fn
    inline overrides on Dialogue lines in [Events].

    A leading "@" on a font name (vertical-layout request) is removed so the
    result contains the real family name.  If the file cannot be read, a
    warning is printed and whatever was collected so far is returned.
    """
    fonts: set[str] = set()
    try:
        # "utf-8-sig" drops a leading BOM so it cannot hide the first header.
        with open(ass_path, "r", encoding="utf-8-sig", errors="ignore") as fh:
            in_styles = False
            in_events = False
            fontname_idx = -1

            for raw in fh:
                line = raw.strip()
                if not line:
                    continue

                if line.startswith("["):
                    # Section headers are compared case-insensitively.
                    section = line.lower()
                    in_styles = section in _STYLE_SECTIONS
                    in_events = section == "[events]"
                    continue

                if in_styles:
                    if line.startswith("Format:"):
                        # The Format line tells us which column holds the font.
                        fmt_cols = [c.strip().lower() for c in line[len("Format:"):].split(",")]
                        fontname_idx = fmt_cols.index("fontname") if "fontname" in fmt_cols else -1
                    elif line.startswith("Style:") and fontname_idx != -1:
                        cols = [c.strip() for c in line[len("Style:"):].split(",")]
                        if len(cols) > fontname_idx:
                            fonts.add(cols[fontname_idx])

                if in_events and line.startswith("Dialogue:"):
                    for match in _FN_OVERRIDE_RE.findall(line):
                        fonts.add(match.strip())

    except (OSError, UnicodeError) as exc:
        print(f"  Warning: could not fully parse {ass_path.name}: {exc}")

    normalized: set[str] = set()
    for font in fonts:
        # "@Name" asks the renderer for vertical layout of the font "Name".
        cleaned = font[1:].strip() if font.startswith("@") else font
        if cleaned:
            normalized.add(cleaned.lower())
    return normalized


# Name-table IDs collected: 1 = family, 4 = full name, 16 = typographic family.
_FONT_NAME_IDS = (1, 4, 16)


def get_internal_font_names(font_path: Path) -> set[str]:
    """
    Extract the internal family / full-name / typographic family strings
    from a TTF, OTF, or TTC file using fontTools.

    For a font collection (TTC) every face is inspected, not just the first.
    Returns a lower-case set.  If the file cannot be read or parsed, a warning
    is printed and the names gathered so far (possibly none) are returned.
    """
    names: set[str] = set()
    try:
        # A collection starts with the tag "ttcf", followed by a 4-byte
        # version and a big-endian uint32 face count.
        with open(font_path, "rb") as fh:
            header = fh.read(12)
        face_count = 1
        if header[:4] == b"ttcf":
            face_count = max(1, int.from_bytes(header[8:12], "big"))

        for face_index in range(face_count):
            font = TTFont(str(font_path), fontNumber=face_index)
            try:
                for record in font["name"].names:
                    if record.nameID not in _FONT_NAME_IDS:
                        continue
                    try:
                        text = record.toUnicode()
                    except Exception:
                        # Best effort: skip a record that cannot be decoded.
                        continue
                    text = text.strip().lower()
                    if text:
                        names.add(text)
            finally:
                # Release the file handle even if the name table is unreadable.
                font.close()
    except Exception as exc:
        # Deliberately broad: damaged fonts can fail in many different ways
        # and one bad attachment must not abort the whole scan.
        print(f"  Warning: could not read font metadata for {font_path.name}: {exc}")
    return names


def safe_stem(name: str) -> str:
    """
    Return *name* reduced to characters that are safe inside a temp filename.

    Alphanumeric characters plus space, dot, underscore and hyphen are kept;
    everything else is dropped, and trailing whitespace is removed.
    """
    allowed_punctuation = " ._-"
    kept = [ch for ch in name if ch.isalnum() or ch in allowed_punctuation]
    return "".join(kept).rstrip()


# ── Core scan logic ───────────────────────────────────────────────────────────

# Substrings that identify a font in an attachment's MIME type.
_FONT_MIME_MARKERS = ("font", "truetype", "opentype", "sfnt")

# Extensions accepted as fonts when the MIME type is generic (fonts are often
# attached as "application/octet-stream").
_FONT_EXTENSIONS = (".ttf", ".otf", ".ttc", ".otc")

# MKVToolNix exit codes: 0 = success, 1 = completed with warnings, 2 = error.
_MKVTOOLNIX_OK_CODES = (0, 1)


def _run_mkvtoolnix(cmd: list[str]) -> "subprocess.CompletedProcess[str] | None":
    """Run an MKVToolNix command; report and return None if it cannot start."""
    try:
        return subprocess.run(
            cmd, capture_output=True, text=True,
            encoding="utf-8", errors="replace",
        )
    except OSError as exc:
        print(f"ERROR: could not run '{cmd[0]}': {exc}")
        print("  Make sure MKVToolNix is installed and on your PATH.")
        return None


def _tool_failed(result: "subprocess.CompletedProcess[str]") -> bool:
    """Return True when the tool reported a real error (not just warnings)."""
    return result.returncode not in _MKVTOOLNIX_OK_CODES


def _tool_message(result: "subprocess.CompletedProcess[str]") -> str:
    """Return the tool's diagnostic text, preferring stderr over stdout."""
    return result.stderr.strip() or result.stdout.strip()


def _is_ass_track(track: dict) -> bool:
    """Return True if a mkvmerge track entry is an ASS/SSA subtitle track."""
    if track.get("type") != "subtitles":
        return False
    codec_id = str(track.get("properties", {}).get("codec_id", ""))
    return (
        "S_TEXT/ASS" in codec_id
        or "S_TEXT/SSA" in codec_id
        or "SubStationAlpha" in str(track.get("codec", ""))
    )


def _is_font_attachment(attachment: dict) -> bool:
    """Return True if a mkvmerge attachment entry looks like a font file."""
    mime = str(attachment.get("content_type") or "").lower()
    if any(marker in mime for marker in _FONT_MIME_MARKERS):
        return True
    file_name = str(attachment.get("file_name") or "").lower()
    return file_name.endswith(_FONT_EXTENSIONS)


def scan_mkv(mkv_path: Path) -> None:
    """
    Scan one MKV file and print a report of required, embedded and missing fonts.

    The file is inspected with ``mkvmerge -J``; ASS/SSA tracks and font
    attachments are extracted with ``mkvextract`` into a temporary directory
    that is removed afterwards.  The MKV itself is never modified.  Problems
    are reported on stdout and end the scan early; nothing is raised for tool
    failures.
    """
    print()
    print(f"Scanning: {mkv_path.name}")
    print("─" * 70)

    # ── 1. Read MKV metadata ─────────────────────────────────────────────────
    result = _run_mkvtoolnix(["mkvmerge", "-J", str(mkv_path)])
    if result is None:
        return
    if _tool_failed(result):
        print(f"ERROR: mkvmerge could not read '{mkv_path.name}'.")
        # With -J, mkvmerge may report its errors on stdout instead of stderr.
        print(f"  {_tool_message(result)}")
        return

    try:
        mkv_info = json.loads(result.stdout)
    except json.JSONDecodeError as exc:
        print(f"ERROR: failed to parse mkvmerge JSON output: {exc}")
        return

    tracks      = mkv_info.get("tracks", [])
    attachments = mkv_info.get("attachments", [])

    # ── 2./3. Find ASS/SSA subtitle tracks and font attachments ─────────────
    ass_tracks       = [t for t in tracks if _is_ass_track(t)]
    font_attachments = [a for a in attachments if _is_font_attachment(a)]

    # ── 4. Report basic counts ───────────────────────────────────────────────
    print(f"  ASS/SSA subtitle tracks : {len(ass_tracks)}")
    print(f"  Font attachments        : {len(font_attachments)}")

    if not ass_tracks:
        print("\n  WARNING: No ASS/SSA subtitle tracks found — no font requirements to check.")
        if not font_attachments:
            print("           No font attachments either. Nothing to report.")
        else:
            print(f"           {len(font_attachments)} font attachment(s) present but no subtitles reference them.")
            _list_embedded_fonts_only(font_attachments)
        return

    # ── 5. Extract ASS tracks to a temp directory ────────────────────────────
    required_fonts: set[str] = set()

    with tempfile.TemporaryDirectory(prefix="fonts_scanner_") as tmp:
        tmp_path = Path(tmp)

        extract_sub_cmd = ["mkvextract", "tracks", str(mkv_path)]
        track_outputs: list[tuple[dict, Path]] = []
        for track in ass_tracks:
            tid = track["id"]
            out = tmp_path / f"track_{tid}.ass"
            extract_sub_cmd.append(f"{tid}:{out}")
            track_outputs.append((track, out))

        sub_result = _run_mkvtoolnix(extract_sub_cmd)
        if sub_result is None:
            return
        if _tool_failed(sub_result):
            print(f"ERROR extracting subtitle tracks: {_tool_message(sub_result)}")
            return

        # Collect required font names from each ASS file
        print("\n  ASS tracks parsed:")
        for track, ass_file in track_outputs:
            if not ass_file.exists():
                continue
            found = get_ass_font_names(ass_file)
            props = track.get("properties", {})
            lang = props.get("language", "und")
            name = props.get("track_name", "")
            label = f"Track {track['id']}"
            if name:
                label += f" – {name}"
            label += f" [{lang}]"
            print(f"    {label}: {len(found)} font(s) referenced")
            required_fonts.update(found)

        # ── 6. Extract font attachments ──────────────────────────────────────
        embedded_font_names: dict[str, set[str]] = {}   # label → internal names (lower)
        attachment_stems: dict[str, str] = {}           # label → file-name stem (lower)

        if font_attachments:
            extract_att_cmd = ["mkvextract", "attachments", str(mkv_path)]
            att_outputs: list[tuple[str, str, Path]] = []   # (label, file name, temp path)
            used_labels: set[str] = set()
            for att in font_attachments:
                aid = att["id"]
                file_name = str(att.get("file_name") or f"attachment_{aid}")
                out = tmp_path / f"att_{aid}_{safe_stem(file_name)}"
                extract_att_cmd.append(f"{aid}:{out}")
                # Attachment names are not guaranteed unique; disambiguate
                # repeats so one entry cannot overwrite another in the report.
                label = file_name
                if label in used_labels:
                    label = f"{file_name} (attachment {aid})"
                used_labels.add(label)
                att_outputs.append((label, file_name, out))

            att_result = _run_mkvtoolnix(extract_att_cmd)
            if att_result is None:
                return
            if _tool_failed(att_result):
                print(f"ERROR extracting font attachments: {_tool_message(att_result)}")
            else:
                for label, file_name, temp_p in att_outputs:
                    if temp_p.exists():
                        embedded_font_names[label] = get_internal_font_names(temp_p)
                        attachment_stems[label] = Path(file_name).stem.lower()

        # ── 7. Match required → embedded ─────────────────────────────────────
        #
        # A font is "covered" if any embedded font file reports an internal name
        # that matches a required name (case-insensitive), OR if the attachment
        # filename stem matches the required name.
        #
        covered_required: set[str] = set()
        attachment_match: dict[str, list[str]] = {}  # label → matched font names

        for label, internal_names in embedded_font_names.items():
            stem_lower = attachment_stems[label]
            matched = {
                req for req in required_fonts
                if req in internal_names or req == stem_lower
            }
            if matched:
                attachment_match[label] = sorted(matched)
                covered_required.update(matched)

        missing_fonts  = required_fonts - covered_required
        extra_embedded = set(embedded_font_names) - set(attachment_match)

        # ── 8. Print report ──────────────────────────────────────────────────
        _print_report(required_fonts, embedded_font_names, attachment_match,
                      missing_fonts, extra_embedded)


def _list_embedded_fonts_only(font_attachments: list) -> None:
    """
    Print a bullet list of font attachments (file name and MIME type).

    Used when the MKV has no ASS/SSA tracks, so there is nothing to match the
    attachments against.  A missing content type is shown as "?".
    """
    print("\n  Embedded font attachments:")
    for attachment in font_attachments:
        file_name = attachment["file_name"]
        mime_type = attachment.get("content_type", "?")
        print(f"    • {file_name}  ({mime_type})")


def _print_report(
    required: set[str],
    embedded: dict[str, set[str]],
    matched: dict[str, list[str]],
    missing: set[str],
    extra: set[str],
) -> None:
    """
    Print the four-part scan report to stdout.

    required -- font names referenced by the subtitles
    embedded -- attachment file name → internal font names
    matched  -- attachment file name → required names it covers
    missing  -- required names not covered by any attachment
    extra    -- attachment file names that cover no required name
    """
    rule = "  " + "─" * 70

    def heading(title: str) -> None:
        # Every section opens with a blank line, its title and a rule.
        print(f"\n  {title}")
        print(rule)

    # ── Needed fonts ─────────────────────────────────────────────────────────
    heading(f"FONTS NEEDED BY SUBTITLES  ({len(required)} total)")
    if not required:
        print("    (none)")
    for font_name in sorted(required):
        marker = "[MISSING]" if font_name in missing else "[OK]"
        print(f"    {marker}  {font_name}")

    # ── Embedded fonts ────────────────────────────────────────────────────────
    heading(f"FONTS EMBEDDED IN MKV  ({len(embedded)} file(s))")
    if not embedded:
        print("    (none)")
    for file_name in sorted(embedded):
        face_names = sorted(embedded[file_name])
        covers = matched.get(file_name, [])
        prefix = "[USED]  " if file_name in matched else "[EXTRA] "
        entry = f"    {prefix}{file_name}"
        if covers:
            entry = f"{entry}  →  covers: {', '.join(covers)}"
        print(entry)
        # Show at most six internal names, then summarise the remainder.
        for face_name in face_names[:6]:
            print(f"          internal name: {face_name}")
        hidden = len(face_names) - 6
        if hidden > 0:
            print(f"          … and {hidden} more")

    # ── Missing fonts ─────────────────────────────────────────────────────────
    heading(f"MISSING FONTS  ({len(missing)} font(s) not embedded)")
    if missing:
        for font_name in sorted(missing):
            print(f"    ✘  {font_name}")
    else:
        print("    ✓ All required fonts are present — nothing missing!")

    # ── Extra (unused) embedded fonts ─────────────────────────────────────────
    if extra:
        heading(f"EXTRA / UNUSED EMBEDDINGS  ({len(extra)} file(s) not needed by any subtitle)")
        for file_name in sorted(extra):
            print(f"    ⚠  {file_name}")

    print()


# ── Entry point ───────────────────────────────────────────────────────────────

def main() -> None:
    """
    Command-line entry point: validate the argument and scan the given file.

    Exits with status 1 when no argument is given, when the path does not
    exist, or when it is not a regular file (e.g. a directory).  A path
    without an ``.mkv`` extension only triggers a warning.
    """
    if len(sys.argv) < 2:
        print()
        print("subtitle_fonts_scanner.py")
        print("─" * 40)
        print("  Scans an MKV file and reports which fonts are needed by")
        print("  ASS/SSA subtitles, which are already embedded, and which")
        print("  are missing.")
        print()
        print("Usage:")
        print("  python subtitle_fonts_scanner.py <input.mkv>")
        print()
        print("Example:")
        print("  python subtitle_fonts_scanner.py \"My Show S01E01.mkv\"")
        print()
        sys.exit(1)

    mkv_path = Path(sys.argv[1])

    if not mkv_path.exists():
        print(f"\nError: File not found: {mkv_path}")
        sys.exit(1)

    # exists() is also true for directories; reject them here with a clear
    # message instead of handing them to mkvmerge.
    if not mkv_path.is_file():
        print(f"\nError: Not a regular file: {mkv_path}")
        sys.exit(1)

    if mkv_path.suffix.lower() != ".mkv":
        print(f"\nWarning: '{mkv_path.name}' does not have an .mkv extension.")
        print("  This script is designed for MKV files. Proceeding anyway…")

    scan_mkv(mkv_path)


# Run the scanner only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()