Files
subtitle_attachment_cleanup/subtitle_fonts_scanner.py

344 lines
14 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
subtitle_fonts_scanner.py
─────────────────────────
Read-only scanner that inspects a single MKV file and reports:
• Fonts required by ASS/SSA subtitle tracks
• Fonts currently embedded as attachments
• Fonts that are missing (required but not embedded)
Usage:
python subtitle_fonts_scanner.py input.mkv
"""
import json
import os
import re
import subprocess
import sys
import tempfile
from pathlib import Path
# ── Dependency check ──────────────────────────────────────────────────────────
try:
from fontTools.ttLib import TTFont
except ImportError:
print("Error: 'fonttools' is required to accurately read internal font names.")
print(" Install with: pip install fonttools")
print(" (Arch Linux): sudo pacman -S python-fonttools")
sys.exit(1)
# ── Helpers ───────────────────────────────────────────────────────────────────
def get_ass_font_names(ass_path: Path) -> set[str]:
    """
    Parse an ASS/SSA script and return the font names it references.

    Names are collected from two places:
      * the ``Fontname`` column of every ``Style:`` line in the styles
        section (``[V4 Styles]`` for SSA, ``[V4+ Styles]`` for ASS), using
        the column order announced by that section's ``Format:`` line;
      * ``\\fn`` inline overrides on ``Dialogue:`` lines in ``[Events]``.

    Args:
        ass_path: Path of the subtitle script to read.

    Returns:
        A lower-cased set of font names. A leading ``@`` is removed so that
        ``@Font`` and ``Font`` are reported as the same family. If the file
        cannot be read, a warning is printed and the names collected so far
        (possibly none) are returned.
    """
    # Section headers are compared case-insensitively; SSA and ASS use
    # different names for the styles section.
    style_sections = ("[v4 styles]", "[v4+ styles]", "[v4++ styles]")
    fn_override = re.compile(r"\\fn([^\\}]+)")

    fonts: set[str] = set()
    in_styles = False
    in_events = False
    fontname_idx = -1

    try:
        # utf-8-sig drops a leading BOM so the first line is parsed normally.
        with open(ass_path, "r", encoding="utf-8-sig", errors="ignore") as fh:
            for raw in fh:
                line = raw.strip()
                if not line:
                    continue

                if line.startswith("["):
                    header = line.lower()
                    in_styles = header in style_sections
                    in_events = header == "[events]"
                    continue

                if in_styles:
                    if line.startswith("Format:"):
                        fmt_cols = [
                            c.strip().lower()
                            for c in line[len("Format:"):].split(",")
                        ]
                        fontname_idx = (
                            fmt_cols.index("fontname")
                            if "fontname" in fmt_cols
                            else -1
                        )
                    elif line.startswith("Style:") and fontname_idx != -1:
                        cols = [c.strip() for c in line[len("Style:"):].split(",")]
                        if len(cols) > fontname_idx:
                            fonts.add(cols[fontname_idx])
                elif in_events and line.startswith("Dialogue:"):
                    for match in fn_override.findall(line):
                        fonts.add(match.strip())
    except OSError as exc:
        # Best effort: report the problem and return what was gathered.
        print(f" Warning: could not fully parse {ass_path.name}: {exc}")

    # "@Name" requests vertical layout of the font "Name"; the attachment to
    # look for is therefore "Name".
    cleaned = (name.removeprefix("@").strip().lower() for name in fonts)
    return {name for name in cleaned if name}
def get_internal_font_names(font_path: Path) -> set[str]:
    """
    Extract the internal name strings of a TTF, OTF, or TTC/OTC file.

    The ``name`` table records with nameID 1 (family), 4 (full name) and
    16 (typographic family) are read with fontTools. For a font collection
    every face is inspected, not just the first one.

    Args:
        font_path: Path of the font file to inspect.

    Returns:
        A lower-cased set of the names found. If the file cannot be opened
        or parsed, a warning is printed and the names collected so far
        (possibly none) are returned.
    """
    names: set[str] = set()
    try:
        # A collection starts with the tag 'ttcf', a 4-byte version and a
        # big-endian uint32 face count; anything else is a single font.
        with open(font_path, "rb") as fh:
            header = fh.read(12)
        if len(header) == 12 and header[:4] == b"ttcf":
            face_count = int.from_bytes(header[8:12], "big")
        else:
            face_count = 1

        for face_index in range(face_count):
            font = TTFont(str(font_path), fontNumber=face_index)
            try:
                for record in font["name"].names:
                    if record.nameID not in (1, 4, 16):
                        continue
                    try:
                        text = record.toUnicode()
                    except (UnicodeError, LookupError):
                        # One undecodable record must not hide the others.
                        continue
                    text = text.strip().lower()
                    if text:
                        names.add(text)
            finally:
                # Release the file handle even if the name table is broken.
                font.close()
    except Exception as exc:
        # Boundary around a third-party parser fed with arbitrary
        # attachments: report and carry on rather than abort the scan.
        print(f" Warning: could not read font metadata for {font_path.name}: {exc}")
    return names
def safe_stem(name: str) -> str:
    """Reduce *name* to characters that are safe inside a temp filename.

    Alphanumeric characters plus space, dot, underscore and hyphen are kept;
    everything else is dropped, and trailing whitespace is removed from the
    result.
    """
    extra_allowed = " ._-"
    kept = [ch for ch in name if ch in extra_allowed or ch.isalnum()]
    return "".join(kept).rstrip()
# ── Core scan logic ───────────────────────────────────────────────────────────
def scan_mkv(mkv_path: Path) -> None:
print()
print(f"Scanning: {mkv_path.name}")
print("" * 70)
# ── 1. Read MKV metadata ─────────────────────────────────────────────────
result = subprocess.run(
["mkvmerge", "-J", str(mkv_path)],
capture_output=True, text=True, encoding="utf-8"
)
if result.returncode != 0:
print(f"ERROR: mkvmerge could not read '{mkv_path.name}'.")
print(f" {result.stderr.strip()}")
return
try:
mkv_info = json.loads(result.stdout)
except json.JSONDecodeError as exc:
print(f"ERROR: failed to parse mkvmerge JSON output: {exc}")
return
tracks = mkv_info.get("tracks", [])
attachments = mkv_info.get("attachments", [])
# ── 2. Find ASS/SSA subtitle tracks ─────────────────────────────────────
ass_tracks = [
t for t in tracks
if t.get("type") == "subtitles"
and (
"S_TEXT/ASS" in str(t.get("properties", {}).get("codec_id", ""))
or "S_TEXT/SSA" in str(t.get("properties", {}).get("codec_id", ""))
or "SubStationAlpha" in str(t.get("codec", ""))
)
]
# ── 3. Find font attachments ─────────────────────────────────────────────
font_mimes = ["font", "truetype", "opentype", "sfnt", "application/x-truetype-font"]
font_attachments = [
a for a in attachments
if any(m in a.get("content_type", "").lower() for m in font_mimes)
]
# ── 4. Report basic counts ───────────────────────────────────────────────
print(f" ASS/SSA subtitle tracks : {len(ass_tracks)}")
print(f" Font attachments : {len(font_attachments)}")
if not ass_tracks:
print("\n WARNING: No ASS/SSA subtitle tracks found — no font requirements to check.")
if not font_attachments:
print(" No font attachments either. Nothing to report.")
else:
print(f" {len(font_attachments)} font attachment(s) present but no subtitles reference them.")
_list_embedded_fonts_only(font_attachments)
return
# ── 5. Extract ASS tracks to a temp directory ────────────────────────────
required_fonts: set[str] = set()
with tempfile.TemporaryDirectory(prefix="fonts_scanner_") as tmp:
tmp_path = Path(tmp)
# Extract subtitle tracks
extract_sub_cmd = ["mkvextract", "tracks", str(mkv_path)]
ass_temp_files: list[Path] = []
for t in ass_tracks:
tid = t["id"]
out = tmp_path / f"track_{tid}.ass"
extract_sub_cmd.append(f"{tid}:{out}")
ass_temp_files.append(out)
sub_result = subprocess.run(extract_sub_cmd, capture_output=True, text=True)
if sub_result.returncode != 0:
print(f"ERROR extracting subtitle tracks: {sub_result.stderr.strip()}")
return
# Collect required font names from each ASS file
print(f"\n ASS tracks parsed:")
for ass_file in ass_temp_files:
if not ass_file.exists():
continue
found = get_ass_font_names(ass_file)
tid_str = ass_file.stem.replace("track_", "")
track_info = next(
(t for t in ass_tracks if str(t["id"]) == tid_str), {}
)
lang = track_info.get("properties", {}).get("language", "und")
name = track_info.get("properties", {}).get("track_name", "")
label = f"Track {tid_str}"
if name:
label += f" {name}"
label += f" [{lang}]"
print(f" {label}: {len(found)} font(s) referenced")
required_fonts.update(found)
# ── 6. Extract font attachments ──────────────────────────────────────
embedded_font_names: dict[str, set[str]] = {} # filename → internal names (lower)
if font_attachments:
extract_att_cmd = ["mkvextract", "attachments", str(mkv_path)]
for att in font_attachments:
aid = att["id"]
stem = safe_stem(att["file_name"])
out = tmp_path / f"att_{aid}_{stem}"
extract_att_cmd.append(f"{aid}:{out}")
att["_temp_path"] = out
att_result = subprocess.run(extract_att_cmd, capture_output=True, text=True)
if att_result.returncode != 0:
print(f"ERROR extracting font attachments: {att_result.stderr.strip()}")
else:
for att in font_attachments:
temp_p: Path = att["_temp_path"]
if temp_p.exists():
internal = get_internal_font_names(temp_p)
embedded_font_names[att["file_name"]] = internal
# ── 7. Match required → embedded ─────────────────────────────────────
#
# A font is "covered" if any embedded font file reports an internal name
# that matches a required name (case-insensitive), OR if the attachment
# filename stem matches the required name.
#
covered_required: set[str] = set()
attachment_match: dict[str, list[str]] = {} # file_name → matched font names
for filename, internal_names in embedded_font_names.items():
stem_lower = Path(filename).stem.lower()
matched = set()
for req in required_fonts:
if req in internal_names or req == stem_lower:
matched.add(req)
if matched:
attachment_match[filename] = sorted(matched)
covered_required.update(matched)
missing_fonts = required_fonts - covered_required
extra_embedded = set(embedded_font_names.keys()) - set(attachment_match.keys())
# ── 8. Print report ──────────────────────────────────────────────────
_print_report(required_fonts, embedded_font_names, attachment_match,
missing_fonts, extra_embedded)
def _list_embedded_fonts_only(font_attachments: list) -> None:
"""Called when there are no ASS tracks — just list what's embedded."""
print(f"\n Embedded font attachments:")
for att in font_attachments:
print(f"{att['file_name']} ({att.get('content_type', '?')})")
def _print_report(
required: set[str],
embedded: dict[str, set[str]],
matched: dict[str, list[str]],
missing: set[str],
extra: set[str],
) -> None:
sep = "" * 70
# ── Needed fonts ─────────────────────────────────────────────────────────
print(f"\n FONTS NEEDED BY SUBTITLES ({len(required)} total)")
print(f" {sep}")
if required:
for name in sorted(required):
status = "[OK]" if name not in missing else "[MISSING]"
print(f" {status} {name}")
else:
print(f" (none)")
# ── Embedded fonts ────────────────────────────────────────────────────────
print(f"\n FONTS EMBEDDED IN MKV ({len(embedded)} file(s))")
print(f" {sep}")
if embedded:
for filename, internal_names in sorted(embedded.items()):
is_used = filename in matched
tag = "[USED] " if is_used else "[EXTRA] "
hits = matched.get(filename, [])
line = f" {tag}{filename}"
if hits:
line += f" → covers: {', '.join(hits)}"
print(line)
# Show internal name(s) for transparency
for iname in sorted(internal_names)[:6]:
print(f" internal name: {iname}")
if len(internal_names) > 6:
print(f" … and {len(internal_names) - 6} more")
else:
print(f" (none)")
# ── Missing fonts ─────────────────────────────────────────────────────────
print(f"\n MISSING FONTS ({len(missing)} font(s) not embedded)")
print(f" {sep}")
if missing:
for name in sorted(missing):
print(f"{name}")
else:
print(f" ✓ All required fonts are present — nothing missing!")
# ── Extra (unused) embedded fonts ─────────────────────────────────────────
if extra:
print(f"\n EXTRA / UNUSED EMBEDDINGS ({len(extra)} file(s) not needed by any subtitle)")
print(f" {sep}")
for filename in sorted(extra):
print(f"{filename}")
print()
# ── Entry point ───────────────────────────────────────────────────────────────
def main() -> None:
    """
    Command-line entry point.

    Expects the MKV path as the first argument. Without it, a usage message
    is printed and the process exits with status 1. A path that does not
    exist, or that is not a regular file, is rejected with status 1. A file
    without an ``.mkv`` extension only triggers a warning and is scanned
    anyway.
    """
    if len(sys.argv) < 2:
        print()
        print("subtitle_fonts_scanner.py")
        print("─" * 40)
        print(" Scans an MKV file and reports which fonts are needed by")
        print(" ASS/SSA subtitles, which are already embedded, and which")
        print(" are missing.")
        print()
        print("Usage:")
        print(" python subtitle_fonts_scanner.py <input.mkv>")
        print()
        print("Example:")
        print(" python subtitle_fonts_scanner.py \"My Show S01E01.mkv\"")
        print()
        sys.exit(1)

    mkv_path = Path(sys.argv[1])
    if not mkv_path.exists():
        print(f"\nError: File not found: {mkv_path}")
        sys.exit(1)
    if not mkv_path.is_file():
        # A directory (or other non-file) would otherwise only fail later
        # inside mkvmerge with a less helpful message.
        print(f"\nError: Not a regular file: {mkv_path}")
        sys.exit(1)
    if mkv_path.suffix.lower() != ".mkv":
        print(f"\nWarning: '{mkv_path.name}' does not have an .mkv extension.")
        print(" This script is designed for MKV files. Proceeding anyway…")

    scan_mkv(mkv_path)


if __name__ == "__main__":
    main()