diff --git a/README.md b/README.md index f73d458..2230232 100644 --- a/README.md +++ b/README.md @@ -38,11 +38,77 @@ python subtitle_fonts_cleaner.py # If in your PATH, simply execute: subtitle_fonts_cleaner.py ``` +This is the main script and intended default workflow for batch cleanup. + ### Folder Structure Upon execution, the script will create three folders in your working directory: - `temp_subs_fonts/` - A temporary directory used during processing (automatically deleted upon completion). - `original/` - Your original, unmodified `.mkv` files are safely moved here. - `finished/` - The new, lean `.mkv` files containing only the active ASS tracks, required font attachments, and original audio/video streams. +## Supplemental Script: Font Scanner (Read-Only) +This repository also includes `subtitle_fonts_scanner.py`, a companion script for inspection and reporting. + +Use the scanner when you want a dry-run style check before cleaning. +It does not modify files and does not create output folders. 
+ +### What the scanner reports +- Number of ASS/SSA subtitle tracks detected +- Number of embedded font attachments +- Which fonts are required by subtitle styles and inline `\fn` overrides +- Which required fonts are covered by current attachments +- Which fonts are missing +- Which embedded font attachments appear unused + +### Scanner usage +Run it against a single MKV file: + +```bash +python subtitle_fonts_scanner.py "input.mkv" +# If in your PATH, simply execute: subtitle_fonts_scanner.py "input.mkv" +``` + +### Sample output +Example (truncated): + +```text +Scanning: Example Episode 01.mkv +────────────────────────────────────────────────────────────────────── + ASS/SSA subtitle tracks : 2 + Font attachments : 15 + + ASS tracks parsed: + Track 2 [eng]: 1 font(s) referenced + Track 3 [ger]: 3 font(s) referenced + + FONTS NEEDED BY SUBTITLES (4 total) + ────────────────────────────────────────────────────────────────────── + [OK] arial + [OK] gandhi sans + [MISSING] georgia bold + [OK] times new roman bold + + FONTS EMBEDDED IN MKV (15 file(s)) + ────────────────────────────────────────────────────────────────────── + [USED] ARIALNB.TTF → covers: arial + [EXTRA] AdobeArabic-Bold.otf + ... + + MISSING FONTS (1 font(s) not embedded) + ────────────────────────────────────────────────────────────────────── + ✘ georgia bold + + EXTRA / UNUSED EMBEDDINGS (10 file(s) not needed by any subtitle) + ────────────────────────────────────────────────────────────────────── + ⚠ AdobeArabic-Bold.otf + ⚠ comic.ttf + ... +``` + +### Typical workflow +1. Run `subtitle_fonts_scanner.py` on a file to preview needed vs unused fonts. +2. Run `subtitle_fonts_cleaner.py` to process all MKVs in the working directory. +3. Optionally run the scanner again on a cleaned file to verify the result. + ## License MIT License. See the [LICENSE](LICENSE) file for more details. 
#!/usr/bin/env python3
"""
subtitle_fonts_scanner.py
─────────────────────────
Read-only scanner that inspects a single MKV file and reports:
  • Fonts required by ASS/SSA subtitle tracks
  • Fonts currently embedded as attachments
  • Fonts that are missing (required but not embedded)
  • Embedded font attachments that no subtitle appears to use

Subtitle tracks and attachments are extracted into a temporary directory
that is removed when the scan finishes; the input file is never modified.

Usage:
    python subtitle_fonts_scanner.py input.mkv
"""

import json
import os
import re
import subprocess
import sys
import tempfile
from pathlib import Path

# ── Dependency check ──────────────────────────────────────────────────────────
try:
    from fontTools.ttLib import TTFont
except ImportError:
    print("Error: 'fonttools' is required to accurately read internal font names.")
    print(" Install with: pip install fonttools")
    print(" (Arch Linux): sudo pacman -S python-fonttools")
    sys.exit(1)

# ── Constants ─────────────────────────────────────────────────────────────────

# Inline font override: "\fn" followed by the name, up to the next tag or "}".
_FN_OVERRIDE_RE = re.compile(r"\\fn([^\\}]+)")

# Section headers compared lower-cased: ASS uses "V4+", legacy SSA uses "V4".
_STYLE_SECTIONS = frozenset({"[v4+ styles]", "[v4 styles]"})
_EVENTS_SECTION = "[events]"

# Substrings that mark an attachment MIME type as a font.
_FONT_MIME_HINTS = ("font", "truetype", "opentype", "sfnt", "application/x-truetype-font")

# name-table IDs: 1 = family, 4 = full name, 16 = typographic family.
_FONT_NAME_IDS = (1, 4, 16)

# MKVToolNix exit codes: 0 = success, 1 = finished with warnings, 2 = error.
_MKVTOOLNIX_OK_CODES = (0, 1)


# ── Helpers ───────────────────────────────────────────────────────────────────

def yellow(text: str) -> str:
    """Return *text* wrapped in ANSI yellow when stdout is an interactive terminal.

    Colour is skipped for redirected output and when NO_COLOR is set, so
    piped reports stay free of escape sequences.
    """
    if sys.stdout.isatty() and "NO_COLOR" not in os.environ:
        return f"\033[33m{text}\033[0m"
    return text


def _normalise_font_name(raw: str) -> str:
    """Lower-case a font name and drop the '@' vertical-layout prefix."""
    return raw.strip().lstrip("@").strip().lower()


def get_ass_font_names(ass_path: Path) -> set[str]:
    """
    Parse an ASS/SSA file and return a lower-case set of all font family names
    referenced in the styles section ([V4+ Styles] or [V4 Styles]) and via
    \\fn inline overrides on Dialogue lines in [Events].

    A leading '@' (vertical layout marker) is removed so the name can be
    compared with the names stored inside font files. If the file cannot be
    read, a warning is printed and whatever was collected so far is returned.
    """
    fonts: set[str] = set()
    try:
        # utf-8-sig transparently drops a BOM in front of the first section header.
        with open(ass_path, "r", encoding="utf-8-sig", errors="ignore") as fh:
            in_styles = False
            in_events = False
            fontname_idx = -1

            for raw in fh:
                line = raw.strip()
                if not line:
                    continue

                if line.startswith("["):
                    section = line.lower()
                    in_styles = section in _STYLE_SECTIONS
                    in_events = section == _EVENTS_SECTION
                    continue

                if in_styles:
                    if line.startswith("Format:"):
                        # The Format line defines which column holds the font name.
                        fmt_cols = [c.strip().lower() for c in line[len("Format:"):].split(",")]
                        fontname_idx = fmt_cols.index("fontname") if "fontname" in fmt_cols else -1
                    elif line.startswith("Style:") and fontname_idx != -1:
                        cols = [c.strip() for c in line[len("Style:"):].split(",")]
                        if len(cols) > fontname_idx:
                            fonts.add(cols[fontname_idx])
                elif in_events and line.startswith("Dialogue:"):
                    fonts.update(_FN_OVERRIDE_RE.findall(line))

    except OSError as exc:
        print(yellow(f" Warning: could not fully parse {ass_path.name}: {exc}"))

    return {name for name in map(_normalise_font_name, fonts) if name}


def _count_font_faces(font_path: Path) -> int:
    """Return the number of faces in a font file (1 unless it is a TTC collection)."""
    with open(font_path, "rb") as fh:
        header = fh.read(12)
    # TTC header: 'ttcf' tag, 4-byte version, then a big-endian uint32 face count.
    if len(header) == 12 and header[:4] == b"ttcf":
        return max(1, int.from_bytes(header[8:12], "big"))
    return 1


def get_internal_font_names(font_path: Path) -> set[str]:
    """
    Extract the internal family / full-name / typographic family strings
    from a TTF, OTF, or TTC file using fontTools.

    Every face of a TTC collection is read, not only the first one.
    Returns a lower-case set. On failure a warning is printed and the names
    gathered up to that point are returned.
    """
    names: set[str] = set()
    try:
        for face_index in range(_count_font_faces(font_path)):
            font = TTFont(str(font_path), fontNumber=face_index)
            try:
                for record in font["name"].names:
                    if record.nameID not in _FONT_NAME_IDS:
                        continue
                    try:
                        names.add(record.toUnicode().lower())
                    except Exception:
                        # A single undecodable name record must not hide the others.
                        continue
            finally:
                font.close()
    except Exception as exc:
        # fontTools raises several unrelated exception types on damaged fonts.
        print(yellow(f" Warning: could not read font metadata for {font_path.name}: {exc}"))
    return names


def safe_stem(name: str) -> str:
    """Strip unsafe characters for use as a temp filename component."""
    allowed_punctuation = " ._-"
    kept = [ch for ch in name if ch.isalnum() or ch in allowed_punctuation]
    return "".join(kept).rstrip()


def _run_tool(cmd: list[str], **kwargs) -> "subprocess.CompletedProcess | None":
    """Run an MKVToolNix command, capturing text output; None if the binary is missing."""
    try:
        return subprocess.run(cmd, capture_output=True, text=True, errors="replace", **kwargs)
    except FileNotFoundError:
        print(f"ERROR: '{cmd[0]}' was not found. Install MKVToolNix and make sure it is on your PATH.")
        return None


def _tool_message(result: "subprocess.CompletedProcess") -> str:
    """Return the tool's diagnostic text, preferring stderr and falling back to stdout."""
    return (result.stderr or "").strip() or (result.stdout or "").strip()


def _is_ass_track(track: dict) -> bool:
    """Tell whether an mkvmerge track entry is an ASS/SSA subtitle track."""
    if track.get("type") != "subtitles":
        return False
    codec_id = str(track.get("properties", {}).get("codec_id", ""))
    codec = str(track.get("codec", ""))
    return "S_TEXT/ASS" in codec_id or "S_TEXT/SSA" in codec_id or "SubStationAlpha" in codec


def _is_font_attachment(attachment: dict) -> bool:
    """Tell whether an mkvmerge attachment entry has a font-like MIME type."""
    mime = str(attachment.get("content_type") or "").lower()
    return any(hint in mime for hint in _FONT_MIME_HINTS)


def _match_fonts(
    required: set[str],
    embedded: dict[str, set[str]],
) -> tuple[dict[str, list[str]], set[str], set[str]]:
    """
    Match required font names against embedded attachments.

    A font is "covered" if any embedded font file reports an internal name
    equal to the required name, OR if the attachment filename stem equals it.
    Returns (attachment → covered names, missing names, unused attachments).
    """
    attachment_match: dict[str, list[str]] = {}
    covered: set[str] = set()
    for filename, internal_names in embedded.items():
        stem_lower = Path(filename).stem.lower()
        hits = {req for req in required if req in internal_names or req == stem_lower}
        if hits:
            attachment_match[filename] = sorted(hits)
            covered |= hits
    return attachment_match, required - covered, set(embedded) - set(attachment_match)


# ── Core scan logic ───────────────────────────────────────────────────────────

def scan_mkv(mkv_path: Path) -> None:
    """Scan one MKV file and print the font requirement report to stdout."""
    print()
    print(f"Scanning: {mkv_path.name}")
    print("─" * 70)

    # ── 1. Read MKV metadata ─────────────────────────────────────────────────
    result = _run_tool(["mkvmerge", "-J", str(mkv_path)], encoding="utf-8")
    if result is None:
        return
    if result.returncode not in _MKVTOOLNIX_OK_CODES:
        print(f"ERROR: mkvmerge could not read '{mkv_path.name}'.")
        print(f" {_tool_message(result)}")
        return

    try:
        mkv_info = json.loads(result.stdout)
    except json.JSONDecodeError as exc:
        print(f"ERROR: failed to parse mkvmerge JSON output: {exc}")
        return

    # ── 2./3. Select ASS/SSA tracks and font attachments ─────────────────────
    ass_tracks = [t for t in mkv_info.get("tracks", []) if _is_ass_track(t)]
    font_attachments = [a for a in mkv_info.get("attachments", []) if _is_font_attachment(a)]

    # ── 4. Report basic counts ───────────────────────────────────────────────
    print(f" ASS/SSA subtitle tracks : {len(ass_tracks)}")
    print(f" Font attachments : {len(font_attachments)}")

    if not ass_tracks:
        print("\n WARNING: No ASS/SSA subtitle tracks found — no font requirements to check.")
        if not font_attachments:
            print(" No font attachments either. Nothing to report.")
        else:
            print(f" {len(font_attachments)} font attachment(s) present but no subtitles reference them.")
            _list_embedded_fonts_only(font_attachments)
        return

    required_fonts: set[str] = set()
    embedded_font_names: dict[str, set[str]] = {}  # filename → internal names (lower)

    with tempfile.TemporaryDirectory(prefix="fonts_scanner_") as tmp:
        tmp_path = Path(tmp)

        # ── 5. Extract ASS tracks to the temp directory ──────────────────────
        track_files = [(t, tmp_path / f"track_{t['id']}.ass") for t in ass_tracks]
        extract_sub_cmd = ["mkvextract", "tracks", str(mkv_path)]
        extract_sub_cmd += [f"{t['id']}:{out}" for t, out in track_files]

        sub_result = _run_tool(extract_sub_cmd)
        if sub_result is None:
            return
        if sub_result.returncode not in _MKVTOOLNIX_OK_CODES:
            print(f"ERROR extracting subtitle tracks: {_tool_message(sub_result)}")
            return

        # Collect required font names from each ASS file
        print("\n ASS tracks parsed:")
        for track, ass_file in track_files:
            if not ass_file.exists():
                continue
            found = get_ass_font_names(ass_file)
            props = track.get("properties", {})
            label = f"Track {track['id']}"
            if props.get("track_name", ""):
                label += f" – {props['track_name']}"
            label += f" [{props.get('language', 'und')}]"
            print(f" {label}: {len(found)} font(s) referenced")
            required_fonts.update(found)

        # ── 6. Extract font attachments ──────────────────────────────────────
        if font_attachments:
            att_files = [
                (att, tmp_path / f"att_{att['id']}_{safe_stem(att['file_name'])}")
                for att in font_attachments
            ]
            extract_att_cmd = ["mkvextract", "attachments", str(mkv_path)]
            extract_att_cmd += [f"{att['id']}:{out}" for att, out in att_files]

            att_result = _run_tool(extract_att_cmd)
            if att_result is not None:
                if att_result.returncode not in _MKVTOOLNIX_OK_CODES:
                    print(f"ERROR extracting font attachments: {_tool_message(att_result)}")
                else:
                    # Fonts must be read while the temp directory still exists.
                    for att, temp_p in att_files:
                        if temp_p.exists():
                            embedded_font_names[att["file_name"]] = get_internal_font_names(temp_p)

    # ── 7. Match required → embedded ─────────────────────────────────────────
    attachment_match, missing_fonts, extra_embedded = _match_fonts(
        required_fonts, embedded_font_names
    )

    # ── 8. Print report ──────────────────────────────────────────────────────
    _print_report(required_fonts, embedded_font_names, attachment_match,
                  missing_fonts, extra_embedded)


def _list_embedded_fonts_only(font_attachments: list) -> None:
    """Called when there are no ASS tracks — just list what's embedded."""
    print("\n Embedded font attachments:")
    for att in font_attachments:
        print(f" • {att['file_name']} ({att.get('content_type', '?')})")


def _print_report(
    required: set[str],
    embedded: dict[str, set[str]],
    matched: dict[str, list[str]],
    missing: set[str],
    extra: set[str],
) -> None:
    """
    Print the four report sections: needed fonts, embedded font files,
    missing fonts, and (only when present) unused embedded files.
    """
    sep = "─" * 70

    # ── Needed fonts ─────────────────────────────────────────────────────────
    print(f"\n FONTS NEEDED BY SUBTITLES ({len(required)} total)")
    print(f" {sep}")
    if required:
        for name in sorted(required):
            status = "[MISSING]" if name in missing else "[OK]"
            print(f" {status} {name}")
    else:
        print(" (none)")

    # ── Embedded fonts ───────────────────────────────────────────────────────
    print(f"\n FONTS EMBEDDED IN MKV ({len(embedded)} file(s))")
    print(f" {sep}")
    if embedded:
        for filename in sorted(embedded):
            internal_names = embedded[filename]
            tag = "[USED] " if filename in matched else "[EXTRA] "
            hits = matched.get(filename, [])
            line = f" {tag}{filename}"
            if hits:
                line += f" → covers: {', '.join(hits)}"
            print(line)
            # Show at most six internal names for transparency.
            for iname in sorted(internal_names)[:6]:
                print(f" internal name: {iname}")
            if len(internal_names) > 6:
                print(f" … and {len(internal_names) - 6} more")
    else:
        print(" (none)")

    # ── Missing fonts ────────────────────────────────────────────────────────
    print(f"\n MISSING FONTS ({len(missing)} font(s) not embedded)")
    print(f" {sep}")
    if missing:
        for name in sorted(missing):
            print(f" ✘ {name}")
    else:
        print(" ✓ All required fonts are present — nothing missing!")

    # ── Extra (unused) embedded fonts ────────────────────────────────────────
    if extra:
        print(f"\n EXTRA / UNUSED EMBEDDINGS ({len(extra)} file(s) not needed by any subtitle)")
        print(f" {sep}")
        for filename in sorted(extra):
            print(f" ⚠ {filename}")

    print()


# ── Entry point ───────────────────────────────────────────────────────────────

def main() -> None:
    """Validate the command line and scan the given MKV file."""
    if len(sys.argv) < 2:
        print()
        print("subtitle_fonts_scanner.py")
        print("─" * 40)
        print(" Scans an MKV file and reports which fonts are needed by")
        print(" ASS/SSA subtitles, which are already embedded, and which")
        print(" are missing.")
        print()
        print("Usage:")
        print(" python subtitle_fonts_scanner.py <input.mkv>")
        print()
        print("Example:")
        print(" python subtitle_fonts_scanner.py \"My Show S01E01.mkv\"")
        print()
        sys.exit(1)

    mkv_path = Path(sys.argv[1])

    # is_file() also rejects directories, which exists() would let through.
    if not mkv_path.is_file():
        print(f"\nError: File not found: {mkv_path}")
        sys.exit(1)

    if mkv_path.suffix.lower() != ".mkv":
        print(f"\nWarning: '{mkv_path.name}' does not have an .mkv extension.")
        print(" This script is designed for MKV files. Proceeding anyway…")

    scan_mkv(mkv_path)


if __name__ == "__main__":
    main()