344 lines
14 KiB
Python
344 lines
14 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
subtitle_fonts_scanner.py
|
||
─────────────────────────
|
||
Read-only scanner that inspects a single MKV file and reports:
|
||
• Fonts required by ASS/SSA subtitle tracks
|
||
• Fonts currently embedded as attachments
|
||
• Fonts that are missing (required but not embedded)
|
||
|
||
Usage:
|
||
python subtitle_fonts_scanner.py input.mkv
|
||
"""
|
||
|
||
import json
|
||
import os
|
||
import re
|
||
import subprocess
|
||
import sys
|
||
import tempfile
|
||
from pathlib import Path
|
||
|
||
# ── Dependency check ──────────────────────────────────────────────────────────
|
||
# fontTools is the only third-party dependency. Without it the scanner cannot
# read the names stored inside font files, so stop with install hints.
try:
    from fontTools.ttLib import TTFont
except ImportError:
    for hint in (
        "Error: 'fonttools' is required to accurately read internal font names.",
        " Install with: pip install fonttools",
        " (Arch Linux): sudo pacman -S python-fonttools",
    ):
        print(hint)
    sys.exit(1)
|
||
|
||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||
|
||
# Family name carried by an inline \fn override tag, e.g. {\fnArial\b1}.
# A bare "\fn" (reset to the style's font) has no name and does not match.
_FN_OVERRIDE_RE = re.compile(r"\\fn([^\\}]+)")

# Section headers that hold style definitions: ASS uses "[V4+ Styles]",
# the older SSA format uses "[V4 Styles]". Compared lower-cased.
_STYLE_SECTIONS = frozenset({"[v4+ styles]", "[v4 styles]"})


def get_ass_font_names(ass_path: Path) -> set[str]:
    """
    Collect every font family referenced by an ASS/SSA subtitle file.

    Two sources are scanned:
      * the Fontname column of each ``Style:`` line in the styles section
        (``[V4+ Styles]`` for ASS, ``[V4 Styles]`` for SSA), located via the
        section's ``Format:`` line;
      * inline ``\\fn`` override tags on ``Dialogue:`` lines in ``[Events]``.

    Names are whitespace-trimmed, lower-cased, and stripped of a leading
    ``@`` (the vertical-layout marker, which selects the same font family).

    Args:
        ass_path: Path of the extracted subtitle file.

    Returns:
        A set of normalized family names; empty if the file cannot be read
        or references no fonts.
    """
    fonts: set[str] = set()
    in_styles = False
    in_events = False
    fontname_idx = -1  # column of "Fontname" per the last styles Format: line

    try:
        # utf-8-sig drops the BOM some extractors write at the start of the file.
        with open(ass_path, "r", encoding="utf-8-sig", errors="ignore") as fh:
            for raw in fh:
                line = raw.strip()
                if not line:
                    continue

                if line.startswith("["):
                    section = line.lower()
                    in_styles = section in _STYLE_SECTIONS
                    in_events = section == "[events]"
                    continue

                if in_styles:
                    if line.startswith("Format:"):
                        fmt_cols = [
                            col.strip().lower()
                            for col in line[len("Format:"):].split(",")
                        ]
                        fontname_idx = (
                            fmt_cols.index("fontname") if "fontname" in fmt_cols else -1
                        )
                    elif line.startswith("Style:") and fontname_idx != -1:
                        cols = line[len("Style:"):].split(",")
                        if len(cols) > fontname_idx:
                            fonts.add(cols[fontname_idx])
                elif in_events and line.startswith("Dialogue:"):
                    fonts.update(_FN_OVERRIDE_RE.findall(line))
    except OSError as exc:
        # Best effort: report the problem and return whatever was collected.
        print(f" Warning: could not fully parse {ass_path.name}: {exc}")

    normalized = (name.strip().lstrip("@").strip().lower() for name in fonts)
    return {name for name in normalized if name}
|
||
|
||
|
||
def get_internal_font_names(font_path: Path) -> set[str]:
    """
    Read the family-like names stored inside a font file.

    Name records with nameID 1 (family), 4 (full name) and 16 (typographic
    family) are collected. For a font collection (file starting with the
    ``ttcf`` tag) every face is inspected, not just the first one.

    Args:
        font_path: Path of a TTF, OTF, or TTC/OTC file.

    Returns:
        A set of trimmed, lower-cased names. Empty when the file cannot be
        read or parsed; a warning is printed in that case.
    """
    names: set[str] = set()

    def collect(font) -> None:
        # Gather the wanted records from one face's 'name' table.
        for record in font["name"].names:
            if record.nameID not in (1, 4, 16):
                continue
            try:
                text = record.toUnicode()
            except Exception:
                # Deliberately best effort: one undecodable record (odd
                # platform/encoding IDs) must not discard the other names.
                continue
            text = text.strip().lower()
            if text:
                names.add(text)

    try:
        with open(font_path, "rb") as fh:
            is_collection = fh.read(4) == b"ttcf"

        if is_collection:
            # Imported here because the file's top-level import only brings
            # in TTFont; TTCollection exposes every face of a .ttc/.otc.
            from fontTools.ttLib import TTCollection

            with TTCollection(str(font_path)) as collection:
                for face in collection.fonts:
                    collect(face)
        else:
            # The context manager closes the file even if parsing fails midway.
            with TTFont(str(font_path), fontNumber=0) as font:
                collect(font)
    except Exception as exc:
        # Boundary handler: a damaged attachment should produce a warning,
        # not abort the whole scan.
        print(f" Warning: could not read font metadata for {font_path.name}: {exc}")

    return names
|
||
|
||
|
||
def safe_stem(name: str) -> str:
    """
    Reduce *name* to characters that are safe in a temp file name.

    Keeps alphanumerics plus space, dot, underscore and hyphen, drops
    everything else, and trims trailing whitespace from the result.
    """
    allowed_punctuation = " ._-"
    kept = [ch for ch in name if ch.isalnum() or ch in allowed_punctuation]
    return "".join(kept).rstrip()
|
||
|
||
|
||
# ── Core scan logic ───────────────────────────────────────────────────────────
|
||
|
||
# Substrings that mark an attachment MIME type as a font.
_FONT_MIME_HINTS = ("font", "truetype", "opentype", "sfnt")
# Fallback for fonts muxed with a generic MIME type (e.g. application/octet-stream).
_FONT_FILE_SUFFIXES = (".ttf", ".otf", ".ttc", ".otc")


def _run_mkvtoolnix(cmd: list[str]):
    """Run an MKVToolNix command; return the CompletedProcess, or None if it cannot be started."""
    try:
        return subprocess.run(
            cmd, capture_output=True, text=True, encoding="utf-8", errors="replace"
        )
    except OSError as exc:
        print(f"ERROR: could not run '{cmd[0]}': {exc}")
        print(" Make sure MKVToolNix (mkvmerge / mkvextract) is installed and on PATH.")
        return None


def _tool_output(result) -> str:
    """Return the tool's diagnostics: stderr if present, else stdout (where MKVToolNix usually writes them)."""
    return (result.stderr or "").strip() or (result.stdout or "").strip()


def _is_font_attachment(att: dict) -> bool:
    """Tell whether an attachment entry looks like a font, by MIME type or file extension."""
    mime = str(att.get("content_type") or "").lower()
    if any(hint in mime for hint in _FONT_MIME_HINTS):
        return True
    return str(att.get("file_name") or "").lower().endswith(_FONT_FILE_SUFFIXES)


def _is_ass_track(track: dict) -> bool:
    """Tell whether a track entry is an ASS/SSA subtitle track."""
    if track.get("type") != "subtitles":
        return False
    codec_id = str(track.get("properties", {}).get("codec_id", ""))
    return (
        "S_TEXT/ASS" in codec_id
        or "S_TEXT/SSA" in codec_id
        or "SubStationAlpha" in str(track.get("codec", ""))
    )


def scan_mkv(mkv_path: Path) -> None:
    """
    Scan one MKV file and print a font report to stdout.

    Steps: read the container metadata with ``mkvmerge -J``; extract the
    ASS/SSA subtitle tracks and font attachments into a temporary directory
    with ``mkvextract``; collect the fonts the subtitles reference and the
    names stored in each attached font; print which required fonts are
    covered, which are missing, and which attachments are unused.

    The MKV itself is never modified. Problems (tool not installed, tool
    error, unparsable metadata) are reported as printed messages and end the
    scan early; no exception is raised for them.

    Args:
        mkv_path: Path of the MKV file to inspect.
    """
    print()
    print(f"Scanning: {mkv_path.name}")
    print("─" * 70)

    # ── 1. Read MKV metadata ─────────────────────────────────────────────────
    result = _run_mkvtoolnix(["mkvmerge", "-J", str(mkv_path)])
    if result is None:
        return
    # MKVToolNix exit codes: 0 = success, 1 = finished with warnings, 2 = error.
    if result.returncode >= 2:
        print(f"ERROR: mkvmerge could not read '{mkv_path.name}'.")
        print(f" {_tool_output(result)}")
        return

    try:
        mkv_info = json.loads(result.stdout)
    except json.JSONDecodeError as exc:
        print(f"ERROR: failed to parse mkvmerge JSON output: {exc}")
        return

    tracks = mkv_info.get("tracks", [])
    attachments = mkv_info.get("attachments", [])

    # ── 2. Find ASS/SSA subtitle tracks ─────────────────────────────────────
    ass_tracks = [t for t in tracks if _is_ass_track(t)]

    # ── 3. Find font attachments ─────────────────────────────────────────────
    font_attachments = [a for a in attachments if _is_font_attachment(a)]

    # ── 4. Report basic counts ───────────────────────────────────────────────
    print(f" ASS/SSA subtitle tracks : {len(ass_tracks)}")
    print(f" Font attachments : {len(font_attachments)}")

    if not ass_tracks:
        print("\n WARNING: No ASS/SSA subtitle tracks found — no font requirements to check.")
        if not font_attachments:
            print(" No font attachments either. Nothing to report.")
        else:
            print(f" {len(font_attachments)} font attachment(s) present but no subtitles reference them.")
            _list_embedded_fonts_only(font_attachments)
        return

    # ── 5. Extract ASS tracks to a temp directory ────────────────────────────
    required_fonts: set[str] = set()

    with tempfile.TemporaryDirectory(prefix="fonts_scanner_") as tmp:
        tmp_path = Path(tmp)

        # Pair each track with its output file so no lookup by name is needed later.
        ass_outputs = [(t, tmp_path / f"track_{t['id']}.ass") for t in ass_tracks]
        extract_sub_cmd = ["mkvextract", "tracks", str(mkv_path)]
        extract_sub_cmd += [f"{t['id']}:{out}" for t, out in ass_outputs]

        sub_result = _run_mkvtoolnix(extract_sub_cmd)
        if sub_result is None:
            return
        if sub_result.returncode >= 2:
            print(f"ERROR extracting subtitle tracks: {_tool_output(sub_result)}")
            return

        # Collect required font names from each ASS file
        print("\n ASS tracks parsed:")
        for track, ass_file in ass_outputs:
            if not ass_file.exists():
                continue
            found = get_ass_font_names(ass_file)
            props = track.get("properties", {})
            lang = props.get("language", "und")
            name = props.get("track_name", "")
            label = f"Track {track['id']}"
            if name:
                label += f" – {name}"
            label += f" [{lang}]"
            print(f" {label}: {len(found)} font(s) referenced")
            required_fonts.update(found)

        # ── 6. Extract font attachments ──────────────────────────────────────
        embedded_font_names: dict[str, set[str]] = {}  # report label → internal names (lower)
        attachment_stems: dict[str, str] = {}  # report label → lower-cased file stem

        if font_attachments:
            extract_att_cmd = ["mkvextract", "attachments", str(mkv_path)]
            planned: list[tuple[str, str, Path]] = []  # (label, stem, temp file)
            used_labels: set[str] = set()
            for att in font_attachments:
                aid = att["id"]
                file_name = str(att.get("file_name") or f"attachment_{aid}")
                out = tmp_path / f"att_{aid}_{safe_stem(file_name)}"
                extract_att_cmd.append(f"{aid}:{out}")
                # Attachments may share a file name; keep each one visible in
                # the report instead of letting later ones overwrite earlier ones.
                label = file_name
                if label in used_labels:
                    label = f"{file_name} (attachment {aid})"
                used_labels.add(label)
                planned.append((label, Path(file_name).stem.lower(), out))

            att_result = _run_mkvtoolnix(extract_att_cmd)
            if att_result is None:
                return
            if att_result.returncode >= 2:
                print(f"ERROR extracting font attachments: {_tool_output(att_result)}")
            else:
                for label, stem_lower, temp_p in planned:
                    if temp_p.exists():
                        embedded_font_names[label] = get_internal_font_names(temp_p)
                        attachment_stems[label] = stem_lower

        # ── 7. Match required → embedded ─────────────────────────────────────
        #
        # A font is "covered" if any embedded font file reports an internal name
        # that matches a required name (case-insensitive), OR if the attachment
        # filename stem matches the required name.
        #
        covered_required: set[str] = set()
        attachment_match: dict[str, list[str]] = {}  # report label → matched font names

        for label, internal_names in embedded_font_names.items():
            stem_lower = attachment_stems[label]
            matched = {
                req
                for req in required_fonts
                if req in internal_names or req == stem_lower
            }
            if matched:
                attachment_match[label] = sorted(matched)
                covered_required.update(matched)

        missing_fonts = required_fonts - covered_required
        extra_embedded = set(embedded_font_names) - set(attachment_match)

        # ── 8. Print report ──────────────────────────────────────────────────
        _print_report(required_fonts, embedded_font_names, attachment_match,
                      missing_fonts, extra_embedded)
|
||
|
||
|
||
def _list_embedded_fonts_only(font_attachments: list) -> None:
    """
    Print a plain listing of the font attachments.

    Used when the file has no ASS/SSA tracks, so there is nothing to match
    the attachments against. Each entry shows the attachment's file name and
    its content type ('?' when the content type is absent).
    """
    print("\n Embedded font attachments:")
    entries = (
        f" • {attachment['file_name']} ({attachment.get('content_type', '?')})"
        for attachment in font_attachments
    )
    for entry in entries:
        print(entry)
|
||
|
||
|
||
def _print_report(
    required: set[str],
    embedded: dict[str, set[str]],
    matched: dict[str, list[str]],
    missing: set[str],
    extra: set[str],
) -> None:
    """
    Print the scan results as up to four titled sections.

    Args:
        required: Font names referenced by the subtitles.
        embedded: Attachment file name → names found inside that font file.
        matched: Attachment file name → required names that attachment covers.
        missing: Required names no attachment covers.
        extra: Attachment file names that cover no required name.
    """
    rule = "─" * 70

    def heading(title: str) -> None:
        # Section title followed by a horizontal rule.
        print(title)
        print(f" {rule}")

    # ── Needed fonts ─────────────────────────────────────────────────────────
    heading(f"\n FONTS NEEDED BY SUBTITLES ({len(required)} total)")
    if not required:
        print(" (none)")
    for font_name in sorted(required):
        marker = "[MISSING]" if font_name in missing else "[OK]"
        print(f" {marker} {font_name}")

    # ── Embedded fonts ────────────────────────────────────────────────────────
    heading(f"\n FONTS EMBEDDED IN MKV ({len(embedded)} file(s))")
    if not embedded:
        print(" (none)")
    for file_name in sorted(embedded):
        internal = embedded[file_name]
        covered = matched.get(file_name, [])
        prefix = "[USED] " if file_name in matched else "[EXTRA] "
        entry = f" {prefix}{file_name}"
        if covered:
            entry += f" → covers: {', '.join(covered)}"
        print(entry)

        # List at most six internal names, then summarise the remainder.
        for shown in sorted(internal)[:6]:
            print(f" internal name: {shown}")
        hidden = len(internal) - 6
        if hidden > 0:
            print(f" … and {hidden} more")

    # ── Missing fonts ─────────────────────────────────────────────────────────
    heading(f"\n MISSING FONTS ({len(missing)} font(s) not embedded)")
    if missing:
        for font_name in sorted(missing):
            print(f" ✘ {font_name}")
    else:
        print(" ✓ All required fonts are present — nothing missing!")

    # ── Extra (unused) embedded fonts ─────────────────────────────────────────
    if extra:
        heading(f"\n EXTRA / UNUSED EMBEDDINGS ({len(extra)} file(s) not needed by any subtitle)")
        for file_name in sorted(extra):
            print(f" ⚠ {file_name}")

    print()
|
||
|
||
|
||
# ── Entry point ───────────────────────────────────────────────────────────────
|
||
|
||
def main() -> None:
    """
    Command-line entry point.

    With no argument, prints usage help and exits with status 1. With a path
    that does not exist, prints an error and exits with status 1. A path
    without an .mkv extension only produces a warning. Otherwise the file is
    handed to scan_mkv().
    """
    args = sys.argv[1:]
    if not args:
        usage = (
            "",
            "subtitle_fonts_scanner.py",
            "─" * 40,
            " Scans an MKV file and reports which fonts are needed by",
            " ASS/SSA subtitles, which are already embedded, and which",
            " are missing.",
            "",
            "Usage:",
            " python subtitle_fonts_scanner.py <input.mkv>",
            "",
            "Example:",
            ' python subtitle_fonts_scanner.py "My Show S01E01.mkv"',
            "",
        )
        for text in usage:
            print(text)
        sys.exit(1)

    mkv_path = Path(args[0])

    if not mkv_path.exists():
        print(f"\nError: File not found: {mkv_path}")
        sys.exit(1)

    has_mkv_suffix = mkv_path.suffix.lower() == ".mkv"
    if not has_mkv_suffix:
        print(f"\nWarning: '{mkv_path.name}' does not have an .mkv extension.")
        print(" This script is designed for MKV files. Proceeding anyway…")

    scan_mkv(mkv_path)
|
||
|
||
|
||
# Run the scanner only when executed as a script, not when imported.
# main() returns None, so a normal run still exits with status 0.
if __name__ == "__main__":
    sys.exit(main())
|