feat: add subtitle font scanner utility and update documentation

This commit is contained in:
2026-04-07 20:28:31 +02:00
parent 8350741f0d
commit 2001bc19dd
2 changed files with 409 additions and 0 deletions

View File

@@ -38,11 +38,77 @@ python subtitle_fonts_cleaner.py
# If in your PATH, simply execute: subtitle_fonts_cleaner.py
```
This is the main script and intended default workflow for batch cleanup.
### Folder Structure
Upon execution, the script will create three folders in your working directory:
- `temp_subs_fonts/` - A temporary directory used during processing (automatically deleted upon completion).
- `original/` - Your original, unmodified `.mkv` files are safely moved here.
- `finished/` - The new, lean `.mkv` files containing only the active ASS tracks, required font attachments, and original audio/video streams.
## Supplemental Script: Font Scanner (Read-Only)
This repository also includes `subtitle_fonts_scanner.py`, a companion script for inspection and reporting.
Use the scanner when you want a dry-run style check before cleaning.
It does not modify files and does not create output folders.
### What the scanner reports
- Number of ASS/SSA subtitle tracks detected
- Number of embedded font attachments
- Which fonts are required by subtitle styles and inline `\fn` overrides
- Which required fonts are covered by current attachments
- Which fonts are missing
- Which embedded font attachments appear unused
### Scanner usage
Run it against a single MKV file:
```bash
python subtitle_fonts_scanner.py "input.mkv"
# If in your PATH, simply execute: subtitle_fonts_scanner.py "input.mkv"
```
### Sample output
Example (truncated):
```text
Scanning: Example Episode 01.mkv
──────────────────────────────────────────────────────────────────────
ASS/SSA subtitle tracks : 2
Font attachments : 15
ASS tracks parsed:
Track 2 [eng]: 1 font(s) referenced
Track 3 [ger]: 3 font(s) referenced
FONTS NEEDED BY SUBTITLES (4 total)
──────────────────────────────────────────────────────────────────────
[OK] arial
[OK] gandhi sans
[MISSING] georgia bold
[OK] times new roman bold
FONTS EMBEDDED IN MKV (15 file(s))
──────────────────────────────────────────────────────────────────────
[USED] ARIALNB.TTF → covers: arial
[EXTRA] AdobeArabic-Bold.otf
...
MISSING FONTS (1 font(s) not embedded)
──────────────────────────────────────────────────────────────────────
✘ georgia bold
EXTRA / UNUSED EMBEDDINGS (10 file(s) not needed by any subtitle)
──────────────────────────────────────────────────────────────────────
⚠ AdobeArabic-Bold.otf
⚠ comic.ttf
...
```
### Typical workflow
1. Run `subtitle_fonts_scanner.py` on a file to preview needed vs unused fonts.
2. Run `subtitle_fonts_cleaner.py` to process all MKVs in the working directory.
3. Optionally run the scanner again on a cleaned file to verify the result.
## License
MIT License. See the [LICENSE](LICENSE) file for more details.

343
subtitle_fonts_scanner.py Normal file
View File

@@ -0,0 +1,343 @@
#!/usr/bin/env python3
"""
subtitle_fonts_scanner.py
─────────────────────────
Read-only scanner that inspects a single MKV file and reports:
• Fonts required by ASS/SSA subtitle tracks
• Fonts currently embedded as attachments
• Fonts that are missing (required but not embedded)
• Embedded fonts that appear unused (not referenced by any subtitle)
Usage:
python subtitle_fonts_scanner.py input.mkv
"""
import json
import os
import re
import subprocess
import sys
import tempfile
from pathlib import Path
# ── Dependency check ──────────────────────────────────────────────────────────
try:
from fontTools.ttLib import TTFont
except ImportError:
print("Error: 'fonttools' is required to accurately read internal font names.")
print(" Install with: pip install fonttools")
print(" (Arch Linux): sudo pacman -S python-fonttools")
sys.exit(1)
# ── Helpers ───────────────────────────────────────────────────────────────────
def get_ass_font_names(ass_path: Path) -> set[str]:
    """
    Parse an ASS/SSA file and return the font family names it references.

    Names are collected from two places:
      • the ``Fontname`` column of ``Style:`` lines in the ``[V4+ Styles]``
        (ASS) or ``[V4 Styles]`` (SSA) section
      • ``\\fn`` inline overrides on ``Dialogue:`` lines in ``[Events]``

    A leading ``@`` (vertical-layout marker) is dropped, because it is not
    part of the font's actual name.

    Args:
        ass_path: Path of the extracted subtitle file.

    Returns:
        A set of lower-cased, whitespace-stripped font names. The set is empty
        when the file cannot be read; parsing is best-effort and a warning is
        printed instead of raising.
    """
    fonts: set[str] = set()
    fn_override = re.compile(r"\\fn([^\\}]+)")
    try:
        # utf-8-sig silently drops a leading BOM if the file has one.
        with open(ass_path, "r", encoding="utf-8-sig", errors="ignore") as fh:
            in_styles = False
            in_events = False
            fontname_idx = -1
            for raw in fh:
                line = raw.strip()
                if not line:
                    continue
                if line.startswith("["):
                    # Section header: decide which parser applies from here on.
                    header = line.lower()
                    in_styles = header in ("[v4+ styles]", "[v4 styles]")
                    in_events = header == "[events]"
                    continue
                if in_styles:
                    if line.startswith("Format:"):
                        # The Format line defines the column order for Style lines.
                        fmt_cols = [
                            c.strip().lower()
                            for c in line[len("Format:"):].split(",")
                        ]
                        fontname_idx = (
                            fmt_cols.index("fontname")
                            if "fontname" in fmt_cols
                            else -1
                        )
                    elif line.startswith("Style:") and fontname_idx != -1:
                        cols = [c.strip() for c in line[len("Style:"):].split(",")]
                        if len(cols) > fontname_idx:
                            fonts.add(cols[fontname_idx])
                elif in_events and line.startswith("Dialogue:"):
                    for match in fn_override.findall(line):
                        fonts.add(match.strip())
    except Exception as exc:
        # Best-effort: report the problem and return whatever was collected.
        print(f" Warning: could not fully parse {ass_path.name}: {exc}")

    cleaned = (f.removeprefix("@").strip().lower() for f in fonts)
    return {f for f in cleaned if f}
def get_internal_font_names(font_path: Path) -> set[str]:
    """
    Extract the internal family / full-name / typographic family strings
    (name IDs 1, 4 and 16) from a TTF, OTF, or TTC file using fontTools.

    The file is opened with ``fontNumber=0``, so for a collection (TTC) only
    the first face is inspected.

    Args:
        font_path: Path of the extracted font attachment.

    Returns:
        A set of lower-cased, whitespace-stripped names. The set is empty when
        the file cannot be opened or has no usable name records; a warning is
        printed instead of raising.
    """
    names: set[str] = set()
    try:
        font = TTFont(str(font_path), fontNumber=0)
        try:
            for record in font["name"].names:
                if record.nameID not in (1, 4, 16):
                    continue
                try:
                    text = record.toUnicode()
                except Exception:
                    # Undecodable record: skip it, other records may be fine.
                    continue
                text = text.strip().lower()
                if text:
                    names.add(text)
        finally:
            # Release the file handle even if the name table is malformed.
            font.close()
    except Exception as exc:
        print(f" Warning: could not read font metadata for {font_path.name}: {exc}")
    return names
def safe_stem(name: str) -> str:
    """
    Return *name* reduced to characters that are safe in a temp filename.

    Alphanumeric characters, spaces, dots, underscores and hyphens are kept;
    everything else is dropped, and trailing whitespace is removed.
    """
    allowed_punctuation = " ._-"
    kept = [ch for ch in name if ch.isalnum() or ch in allowed_punctuation]
    return "".join(kept).rstrip()
# ── Core scan logic ───────────────────────────────────────────────────────────
# MKVToolNix tools exit with 0 on success, 1 when only warnings were issued
# (the output is still usable) and 2 on a real error.
_MKV_EXIT_ERROR = 2


def _run_mkv_tool(cmd: list[str]) -> "subprocess.CompletedProcess[str] | None":
    """Run an MKVToolNix command; report and return None if it cannot start."""
    try:
        return subprocess.run(
            cmd, capture_output=True, text=True, encoding="utf-8", errors="replace"
        )
    except OSError as exc:
        # Typically FileNotFoundError: the tool is not installed or not on PATH.
        print(f"ERROR: could not run '{cmd[0]}': {exc}")
        return None


def scan_mkv(mkv_path: Path) -> None:
    """
    Inspect one MKV file and print a font report to stdout.

    Steps: read the container metadata with ``mkvmerge -J``, extract every
    ASS/SSA subtitle track and every font attachment into a temporary
    directory with ``mkvextract``, collect the font names the subtitles
    reference, and compare them with the internal names (and filename stems)
    of the attached fonts.

    The MKV itself is never modified; the temporary directory is removed
    before the report is printed. All failures are reported with an
    ``ERROR`` line and an early return rather than an exception.

    Args:
        mkv_path: Path of the MKV file to scan.
    """
    print()
    print(f"Scanning: {mkv_path.name}")
    print("─" * 70)

    # ── 1. Read MKV metadata ─────────────────────────────────────────────────
    result = _run_mkv_tool(["mkvmerge", "-J", str(mkv_path)])
    if result is None:
        return
    if result.returncode >= _MKV_EXIT_ERROR:
        print(f"ERROR: mkvmerge could not read '{mkv_path.name}'.")
        print(f" {(result.stderr or result.stdout).strip()}")
        return
    try:
        mkv_info = json.loads(result.stdout)
    except json.JSONDecodeError as exc:
        print(f"ERROR: failed to parse mkvmerge JSON output: {exc}")
        return
    tracks = mkv_info.get("tracks", [])
    attachments = mkv_info.get("attachments", [])

    # ── 2. Find ASS/SSA subtitle tracks ─────────────────────────────────────
    ass_tracks = [
        t for t in tracks
        if t.get("type") == "subtitles"
        and (
            "S_TEXT/ASS" in str(t.get("properties", {}).get("codec_id", ""))
            or "S_TEXT/SSA" in str(t.get("properties", {}).get("codec_id", ""))
            or "SubStationAlpha" in str(t.get("codec", ""))
        )
    ]

    # ── 3. Find font attachments ─────────────────────────────────────────────
    font_mimes = ["font", "truetype", "opentype", "sfnt", "application/x-truetype-font"]
    # Fonts are sometimes muxed with a generic MIME type, so the file
    # extension is accepted as a fallback.
    font_exts = (".ttf", ".otf", ".ttc", ".otc")
    font_attachments = [
        a for a in attachments
        if any(m in str(a.get("content_type") or "").lower() for m in font_mimes)
        or str(a.get("file_name") or "").lower().endswith(font_exts)
    ]

    # ── 4. Report basic counts ───────────────────────────────────────────────
    print(f" ASS/SSA subtitle tracks : {len(ass_tracks)}")
    print(f" Font attachments : {len(font_attachments)}")
    if not ass_tracks:
        print("\n WARNING: No ASS/SSA subtitle tracks found — no font requirements to check.")
        if not font_attachments:
            print(" No font attachments either. Nothing to report.")
        else:
            print(f" {len(font_attachments)} font attachment(s) present but no subtitles reference them.")
            _list_embedded_fonts_only(font_attachments)
        return

    required_fonts: set[str] = set()
    embedded_font_names: dict[str, set[str]] = {}  # filename → internal names (lower)

    with tempfile.TemporaryDirectory(prefix="fonts_scanner_") as tmp:
        tmp_path = Path(tmp)

        # ── 5. Extract ASS tracks to a temp directory ────────────────────────
        extract_sub_cmd = ["mkvextract", "tracks", str(mkv_path)]
        extracted_tracks: list[tuple[dict, Path]] = []
        for track in ass_tracks:
            tid = track["id"]
            out = tmp_path / f"track_{tid}.ass"
            extract_sub_cmd.append(f"{tid}:{out}")
            extracted_tracks.append((track, out))
        sub_result = _run_mkv_tool(extract_sub_cmd)
        if sub_result is None:
            return
        if sub_result.returncode >= _MKV_EXIT_ERROR:
            print(f"ERROR extracting subtitle tracks: {(sub_result.stderr or sub_result.stdout).strip()}")
            return

        # Collect required font names from each ASS file
        print(f"\n ASS tracks parsed:")
        for track, ass_file in extracted_tracks:
            if not ass_file.exists():
                continue
            found = get_ass_font_names(ass_file)
            props = track.get("properties", {})
            lang = props.get("language", "und")
            name = props.get("track_name", "")
            label = f"Track {track['id']}"
            if name:
                label += f" {name}"
            label += f" [{lang}]"
            print(f" {label}: {len(found)} font(s) referenced")
            required_fonts.update(found)

        # ── 6. Extract font attachments ──────────────────────────────────────
        if font_attachments:
            extract_att_cmd = ["mkvextract", "attachments", str(mkv_path)]
            extracted_fonts: list[tuple[str, Path]] = []
            for att in font_attachments:
                aid = att["id"]
                # The attachment id keeps temp names unique even if two
                # attachments share a file name.
                out = tmp_path / f"att_{aid}_{safe_stem(att['file_name'])}"
                extract_att_cmd.append(f"{aid}:{out}")
                extracted_fonts.append((att["file_name"], out))
            att_result = _run_mkv_tool(extract_att_cmd)
            if att_result is None:
                pass  # launch failure was already reported
            elif att_result.returncode >= _MKV_EXIT_ERROR:
                print(f"ERROR extracting font attachments: {(att_result.stderr or att_result.stdout).strip()}")
            else:
                for file_name, temp_p in extracted_fonts:
                    if temp_p.exists():
                        # Merge rather than overwrite so that attachments
                        # sharing a file name do not hide each other's names.
                        embedded_font_names.setdefault(file_name, set()).update(
                            get_internal_font_names(temp_p)
                        )

    # ── 7. Match required → embedded ─────────────────────────────────────────
    #
    # A font is "covered" if any embedded font file reports an internal name
    # that matches a required name (case-insensitive), OR if the attachment
    # filename stem matches the required name.
    #
    covered_required: set[str] = set()
    attachment_match: dict[str, list[str]] = {}  # file_name → matched font names
    for filename, internal_names in embedded_font_names.items():
        stem_lower = Path(filename).stem.lower()
        matched = {
            req for req in required_fonts
            if req in internal_names or req == stem_lower
        }
        if matched:
            attachment_match[filename] = sorted(matched)
            covered_required.update(matched)
    missing_fonts = required_fonts - covered_required
    extra_embedded = set(embedded_font_names) - set(attachment_match)

    # ── 8. Print report ──────────────────────────────────────────────────────
    _print_report(required_fonts, embedded_font_names, attachment_match,
                  missing_fonts, extra_embedded)
def _list_embedded_fonts_only(font_attachments: list) -> None:
"""Called when there are no ASS tracks — just list what's embedded."""
print(f"\n Embedded font attachments:")
for att in font_attachments:
print(f"{att['file_name']} ({att.get('content_type', '?')})")
def _print_report(
required: set[str],
embedded: dict[str, set[str]],
matched: dict[str, list[str]],
missing: set[str],
extra: set[str],
) -> None:
sep = "" * 70
# ── Needed fonts ─────────────────────────────────────────────────────────
print(f"\n FONTS NEEDED BY SUBTITLES ({len(required)} total)")
print(f" {sep}")
if required:
for name in sorted(required):
status = "[OK]" if name not in missing else "[MISSING]"
print(f" {status} {name}")
else:
print(f" (none)")
# ── Embedded fonts ────────────────────────────────────────────────────────
print(f"\n FONTS EMBEDDED IN MKV ({len(embedded)} file(s))")
print(f" {sep}")
if embedded:
for filename, internal_names in sorted(embedded.items()):
is_used = filename in matched
tag = "[USED] " if is_used else "[EXTRA] "
hits = matched.get(filename, [])
line = f" {tag}{filename}"
if hits:
line += f" → covers: {', '.join(hits)}"
print(line)
# Show internal name(s) for transparency
for iname in sorted(internal_names)[:6]:
print(f" internal name: {iname}")
if len(internal_names) > 6:
print(f" … and {len(internal_names) - 6} more")
else:
print(f" (none)")
# ── Missing fonts ─────────────────────────────────────────────────────────
print(f"\n MISSING FONTS ({len(missing)} font(s) not embedded)")
print(f" {sep}")
if missing:
for name in sorted(missing):
print(f"{name}")
else:
print(f" ✓ All required fonts are present — nothing missing!")
# ── Extra (unused) embedded fonts ─────────────────────────────────────────
if extra:
print(f"\n EXTRA / UNUSED EMBEDDINGS ({len(extra)} file(s) not needed by any subtitle)")
print(f" {sep}")
for filename in sorted(extra):
print(f"{filename}")
print()
# ── Entry point ───────────────────────────────────────────────────────────────
def main() -> None:
    """
    Command-line entry point.

    Expects the MKV path as the first argument. Prints usage and exits with
    status 1 when no argument is given; exits with status 1 when the path
    does not exist or is not a regular file. A non-``.mkv`` extension only
    produces a warning, and the scan proceeds.
    """
    if len(sys.argv) < 2:
        print()
        print("subtitle_fonts_scanner.py")
        print("─" * 40)
        print(" Scans an MKV file and reports which fonts are needed by")
        print(" ASS/SSA subtitles, which are already embedded, and which")
        print(" are missing.")
        print()
        print("Usage:")
        print(" python subtitle_fonts_scanner.py <input.mkv>")
        print()
        print("Example:")
        print(" python subtitle_fonts_scanner.py \"My Show S01E01.mkv\"")
        print()
        sys.exit(1)

    mkv_path = Path(sys.argv[1])
    if not mkv_path.exists():
        print(f"\nError: File not found: {mkv_path}")
        sys.exit(1)
    if not mkv_path.is_file():
        # A directory passes exists() but cannot be scanned.
        print(f"\nError: Not a regular file: {mkv_path}")
        sys.exit(1)
    if mkv_path.suffix.lower() != ".mkv":
        print(f"\nWarning: '{mkv_path.name}' does not have an .mkv extension.")
        print(" This script is designed for MKV files. Proceeding anyway…")
    scan_mkv(mkv_path)


if __name__ == "__main__":
    main()