feat: Add Python script to clean unused font attachments from MKV files, along with license and documentation.

This commit is contained in:
2026-03-02 13:50:09 +01:00
commit 5853a58093
3 changed files with 311 additions and 0 deletions

21
LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2026 pat-e
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

41
README.md Normal file
View File

@@ -0,0 +1,41 @@
# Subtitle Attachment Cleanup
A Python script to automatically clean up unnecessary font attachments from MKV video files.
When you remove unwanted subtitle streams (like foreign languages) from an MKV using tools like MKVToolNix, the font files attached to those removed subtitles are typically left behind, needlessly inflating the file size. This script reads your MKV files, thoroughly inspects the remaining SSA/ASS subtitle tracks to discover which fonts are actually being used, and builds a new MKV file—leaving behind any orphaned or unused font attachments.
## Features
- Intelligently extracts and parses `.ass` / `.ssa` subtitle tracks from MKV containers.
- Identifies both top-level font declarations (in `[V4+ Styles]`) and inline font overrides (in `[Events]`).
- Uses deep metadata scanning (`fonttools`) to accurately match requested font families against attached font files, even if the attached files have cryptic filenames (e.g., `arialbd.ED3587CD.ttf`).
- Safely preserves all non-font attachments (like cover images).
- Automatically moves original MKVs to an `original/` backup folder and places the cleaned files in a `finished/` folder.
## Prerequisites
Ensure the following tools and libraries are installed and accessible in your system's PATH:
1. **Python 3.x**
2. **MKVToolNix** (specifically `mkvmerge` and `mkvextract`)
3. **fonttools** (Python library)
Install the required Python dependency via pip:
```bash
pip install fonttools
```
## Usage
Simply place the script inside the directory containing the `.mkv` files you wish to process and run it. You can also place the script in your personal `bin` or `PATH` folder to run it from anywhere.
```bash
python cleanup_fonts.py
# If in your PATH, simply execute: cleanup_fonts.py
```
### Folder Structure
Upon execution, the script will create three folders in your working directory:
- `temp_subs_fonts/` - A temporary directory used during processing (automatically deleted upon completion).
- `original/` - Your original, unmodified `.mkv` files are safely moved here.
- `finished/` - The new, lean `.mkv` files containing only the active ASS tracks, required font attachments, and original audio/video streams.
## License
MIT License. See the [LICENSE](LICENSE) file for more details.

249
cleanup_fonts.py Normal file
View File

@@ -0,0 +1,249 @@
#!/usr/bin/env python3
import os
import re
import sys
import json
import shutil
import subprocess
from pathlib import Path
# Try importing fontTools, which is needed to read internal font names
try:
from fontTools.ttLib import TTFont
except ImportError:
print("Error: 'fonttools' is required to accurately read internal font names.")
print("Please install it by running: pip install fonttools")
sys.exit(1)
def get_ass_font_names(ass_path):
"""
Parse an ASS file and return a set of all font names used.
It checks the [V4+ Styles] section and \fn overrides in [Events].
"""
fonts = set()
try:
with open(ass_path, 'r', encoding='utf-8', errors='ignore') as f:
in_styles = False
in_events = False
fontname_idx = -1
for line in f:
line = line.strip()
if not line:
continue
if line.startswith('['):
in_styles = (line == '[V4+ Styles]')
in_events = (line == '[Events]')
continue
if in_styles:
if line.startswith('Format:'):
format_str = line[len('Format:'):].strip()
format_cols = [col.strip().lower() for col in format_str.split(',')]
if 'fontname' in format_cols:
fontname_idx = format_cols.index('fontname')
elif line.startswith('Style:') and fontname_idx != -1:
style_str = line[len('Style:'):].strip()
style_cols = [col.strip() for col in style_str.split(',')]
if len(style_cols) > fontname_idx:
fonts.add(style_cols[fontname_idx])
if in_events:
if line.startswith('Dialogue:'):
# Find overrides like \fnArial or \fnComic Sans MS
matches = re.findall(r'\\fn([^\\}]+)', line)
for match in matches:
fonts.add(match.strip())
except Exception as e:
print(f"Warning: Failed to read {ass_path.name}: {e}")
return {f.lower() for f in fonts} # case-insensitive set
def get_internal_font_names(font_path):
"""
Extract internal font names (Family, Full Name, Typographic Family) from a TTF/OTF.
"""
names = set()
try:
# fontNumber=0 works for single fonts and the first font in a collection (.ttc)
font = TTFont(str(font_path), fontNumber=0)
for record in font['name'].names:
if record.nameID in (1, 4, 16):
try:
text = record.toUnicode()
names.add(text.lower())
except:
pass
font.close()
except Exception as e:
print(f" Warning: Could not read metadata for {font_path.name}: {e}")
return names
def safe_filename(name):
"""Make string safe for temporary filename."""
return "".join(c for c in name if c.isalpha() or c.isdigit() or c in ' .-_').rstrip()
def main():
root_dir = Path.cwd()
mkv_files = list(root_dir.glob("*.mkv"))
if not mkv_files:
print("No MKV files found in the current directory.")
return
original_dir = root_dir / "original"
finished_dir = root_dir / "finished"
temp_dir = root_dir / "temp_subs_fonts"
original_dir.mkdir(exist_ok=True)
finished_dir.mkdir(exist_ok=True)
temp_dir.mkdir(exist_ok=True)
for mkv_path in mkv_files:
print(f"\nProcessing: {mkv_path.name}")
# 1. Get MKV info via mkvmerge -J
result = subprocess.run(['mkvmerge', '-J', str(mkv_path)], capture_output=True, text=True, encoding='utf-8')
if result.returncode != 0:
print(f" Error reading {mkv_path.name} with mkvmerge. Skipping.")
continue
try:
mkv_info = json.loads(result.stdout)
except json.JSONDecodeError:
print(f" Error parsing JSON output from mkvmerge for {mkv_path.name}. Skipping.")
continue
tracks = mkv_info.get('tracks', [])
attachments = mkv_info.get('attachments', [])
# Fix condition: checking codec or codec_id
ass_tracks = [t for t in tracks if t.get('type') == 'subtitles' and
('S_TEXT/ASS' in str(t.get('properties', {}).get('codec_id', '')) or
'S_TEXT/SSA' in str(t.get('properties', {}).get('codec_id', '')) or
'SubStationAlpha' in str(t.get('codec', '')))]
# Identify fonts vs other attachments. We look at mime types.
font_mimes = ['font', 'truetype', 'opentype', 'sfnt', 'application/x-truetype-font']
font_attachments = [a for a in attachments if any(m in a.get('content_type', '').lower() for m in font_mimes)]
other_attachments = [a for a in attachments if a not in font_attachments]
if not ass_tracks and not font_attachments:
print(" No ASS tracks and no fonts found. Copying without changes.")
shutil.copy2(str(mkv_path), str(finished_dir / mkv_path.name))
shutil.move(str(mkv_path), str(original_dir / mkv_path.name))
continue
# 2. Extract ASS tracks
required_fonts = set()
if ass_tracks:
print(f" Extracting {len(ass_tracks)} ASS track(s)...")
extract_ass_cmd = ['mkvextract', 'tracks', str(mkv_path)]
ass_temp_files = []
for t in ass_tracks:
track_id = t['id']
out_ass = temp_dir / f"{mkv_path.stem}_track_{track_id}.ass"
extract_ass_cmd.append(f"{track_id}:{out_ass}")
ass_temp_files.append(out_ass)
subprocess.run(extract_ass_cmd, check=True)
# Retrieve required names
for ass_file in ass_temp_files:
ass_fonts = get_ass_font_names(ass_file)
print(f" {ass_file.name} references {len(ass_fonts)} distinct font(s): {list(ass_fonts)[:5]}{'...' if len(ass_fonts) > 5 else ''}")
required_fonts.update(ass_fonts)
print(f" Total distinct font name(s) referenced in ASS across MKV: {len(required_fonts)}")
print(f" Required fonts list: {list(required_fonts)}")
# 3. Extract and verify font attachments
fonts_to_keep = []
if font_attachments:
print(f" Extracting {len(font_attachments)} font attachment(s) to verify...")
extract_att_cmd = ['mkvextract', 'attachments', str(mkv_path)]
for att in font_attachments:
att_id = att['id']
out_font = temp_dir / f"att_{att_id}_{safe_filename(att['file_name'])}"
extract_att_cmd.append(f"{att_id}:{out_font}")
att['temp_path'] = out_font
subprocess.run(extract_att_cmd, check=True)
# Check which fonts match the required list
# We must be careful because some MKVs have fonts but no ASS referencing them
for att in font_attachments:
att_path = att['temp_path']
if not att_path.exists():
continue
keep_this_font = False
# Check 1: Exact filename match (minus extension)
filename_no_ext = att_path.stem.lower()
if filename_no_ext in required_fonts:
print(f" [MATCH] '{att['file_name']}' matched exactly via filename.")
keep_this_font = True
# Check 2: Internal TrueType/OpenType name match using fonttools
if not keep_this_font:
internal_names = get_internal_font_names(att_path)
intersect = required_fonts.intersection(internal_names)
if intersect:
print(f" [MATCH] '{att['file_name']}' matched via internal names: {intersect}")
keep_this_font = True
else:
print(f" [SKIP] '{att['file_name']}' did not match any required font. Internal names: {list(internal_names)[:5]}")
if keep_this_font:
fonts_to_keep.append(att)
print(f" Keeping {len(fonts_to_keep)} required font attachment(s).")
# 4. Extract other non-font attachments (like cover.jpg) so we don't lose them!
other_to_keep = []
if other_attachments:
print(f" Extracting {len(other_attachments)} non-font attachment(s) to preserve them...")
extract_other_cmd = ['mkvextract', 'attachments', str(mkv_path)]
for att in other_attachments:
att_id = att['id']
out_other = temp_dir / f"other_{att_id}_{safe_filename(att['file_name'])}"
extract_other_cmd.append(f"{att_id}:{out_other}")
att['temp_path'] = out_other
subprocess.run(extract_other_cmd, check=True)
other_to_keep.extend(other_attachments)
# 5. Remux using mkvmerge
out_mkv = finished_dir / mkv_path.name
remux_cmd = ['mkvmerge', '-o', str(out_mkv), '--no-attachments', str(mkv_path)]
for att in fonts_to_keep + other_to_keep:
remux_cmd.extend([
'--attachment-name', att['file_name'],
'--attachment-mime-type', att['content_type'],
'--attach-file', str(att['temp_path'])
])
print(f" Remuxing to: {out_mkv.name}")
subprocess.run(remux_cmd, check=True)
# 6. Move original processed file
print(f" Moving original file to 'original' folder...")
shutil.move(str(mkv_path), str(original_dir / mkv_path.name))
# Cleanup temp for this MKV
for item in temp_dir.iterdir():
if item.is_file():
item.unlink()
# Final cleanup of temp directory
try:
temp_dir.rmdir()
except OSError:
pass # Not empty, or some other error, keep it for debugging
print("\nAll tasks completed.")
if __name__ == '__main__':
main()