feat: Add Python script to clean unused font attachments from MKV files, along with license and documentation.

2026-03-02 13:50:09 +01:00
commit 5853a58093
3 changed files with 311 additions and 0 deletions
--- a/21
+++ b/21
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 pat-e
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,41 @@
+# Subtitle Attachment Cleanup
+
+A Python script to automatically clean up unnecessary font attachments from MKV video files. 
+
+When you remove unwanted subtitle streams (like foreign languages) from an MKV using tools like MKVToolNix, the font files attached to those removed subtitles are typically left behind, needlessly inflating the file size. This script reads your MKV files, thoroughly inspects the remaining SSA/ASS subtitle tracks to discover which fonts are actually being used, and builds a new MKV file—leaving behind any orphaned or unused font attachments.
+
+## Features
+- Intelligently extracts and parses `.ass` / `.ssa` subtitle tracks from MKV containers.
+- Identifies both top-level font declarations (in `[V4+ Styles]`) and inline font overrides (in `[Events]`).
+- Uses deep metadata scanning (`fonttools`) to accurately match requested font families against attached font files, even if the attached files have cryptic filenames (e.g., `arialbd.ED3587CD.ttf`).
+- Safely preserves all non-font attachments (like cover images).
+- Automatically moves original MKVs to an `original/` backup folder and places the cleaned files in a `finished/` folder.
+
+## Prerequisites
+Ensure the following tools and libraries are installed and accessible in your system's PATH:
+
+1. **Python 3.x**
+2. **MKVToolNix** (specifically `mkvmerge` and `mkvextract`)
+3. **fonttools** (Python library)
+
+Install the required Python dependency via pip:
+```bash
+pip install fonttools
+```
+
+## Usage
+Simply place the script inside the directory containing the `.mkv` files you wish to process and run it. You can also place the script in your personal `bin` or `PATH` folder to run it from anywhere.
+
+```bash
+python cleanup_fonts.py
+# If in your PATH, simply execute: cleanup_fonts.py
+```
+
+### Folder Structure
+Upon execution, the script will create three folders in your working directory:
+- `temp_subs_fonts/` - A temporary directory used during processing (automatically deleted upon completion).
+- `original/` - Your original, unmodified `.mkv` files are safely moved here.
+- `finished/` - The new, lean `.mkv` files containing only the active ASS tracks, required font attachments, and original audio/video streams.
+
+## License
+MIT License. See the [LICENSE](LICENSE) file for more details.
--- a/cleanup_fonts.py
+++ b/cleanup_fonts.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env python3
+
+import os
+import re
+import sys
+import json
+import shutil
+import subprocess
+from pathlib import Path
+
+# Try importing fontTools, which is needed to read internal font names
+try:
+    from fontTools.ttLib import TTFont
+except ImportError:
+    print("Error: 'fonttools' is required to accurately read internal font names.")
+    print("Please install it by running: pip install fonttools")
+    sys.exit(1)
+
+def get_ass_font_names(ass_path):
+    """
+    Parse an ASS file and return a set of all font names used.
+    It checks the [V4+ Styles] section and \fn overrides in [Events].
+    """
+    fonts = set()
+    try:
+        with open(ass_path, 'r', encoding='utf-8', errors='ignore') as f:
+            in_styles = False
+            in_events = False
+            fontname_idx = -1
+            
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                if line.startswith('['):
+                    in_styles = (line == '[V4+ Styles]')
+                    in_events = (line == '[Events]')
+                    continue
+                    
+                if in_styles:
+                    if line.startswith('Format:'):
+                        format_str = line[len('Format:'):].strip()
+                        format_cols = [col.strip().lower() for col in format_str.split(',')]
+                        if 'fontname' in format_cols:
+                            fontname_idx = format_cols.index('fontname')
+                    elif line.startswith('Style:') and fontname_idx != -1:
+                        style_str = line[len('Style:'):].strip()
+                        style_cols = [col.strip() for col in style_str.split(',')]
+                        if len(style_cols) > fontname_idx:
+                            fonts.add(style_cols[fontname_idx])
+                            
+                if in_events:
+                    if line.startswith('Dialogue:'):
+                        # Find overrides like \fnArial or \fnComic Sans MS
+                        matches = re.findall(r'\\fn([^\\}]+)', line)
+                        for match in matches:
+                            fonts.add(match.strip())
+    except Exception as e:
+        print(f"Warning: Failed to read {ass_path.name}: {e}")
+                        
+    return {f.lower() for f in fonts} # case-insensitive set
+
+def get_internal_font_names(font_path):
+    """
+    Extract internal font names (Family, Full Name, Typographic Family) from a TTF/OTF.
+    """
+    names = set()
+    try:
+        # fontNumber=0 works for single fonts and the first font in a collection (.ttc)
+        font = TTFont(str(font_path), fontNumber=0) 
+        for record in font['name'].names:
+            if record.nameID in (1, 4, 16): 
+                try:
+                    text = record.toUnicode()
+                    names.add(text.lower())
+                except:
+                    pass
+        font.close()
+    except Exception as e:
+        print(f"      Warning: Could not read metadata for {font_path.name}: {e}")
+    return names
+
+def safe_filename(name):
+    """Make string safe for temporary filename."""
+    return "".join(c for c in name if c.isalpha() or c.isdigit() or c in ' .-_').rstrip()
+
+def main():
+    root_dir = Path.cwd()
+    mkv_files = list(root_dir.glob("*.mkv"))
+    if not mkv_files:
+        print("No MKV files found in the current directory.")
+        return
+
+    original_dir = root_dir / "original"
+    finished_dir = root_dir / "finished"
+    temp_dir = root_dir / "temp_subs_fonts"
+
+    original_dir.mkdir(exist_ok=True)
+    finished_dir.mkdir(exist_ok=True)
+    temp_dir.mkdir(exist_ok=True)
+
+    for mkv_path in mkv_files:
+        print(f"\nProcessing: {mkv_path.name}")
+        
+        # 1. Get MKV info via mkvmerge -J
+        result = subprocess.run(['mkvmerge', '-J', str(mkv_path)], capture_output=True, text=True, encoding='utf-8')
+        if result.returncode != 0:
+            print(f"  Error reading {mkv_path.name} with mkvmerge. Skipping.")
+            continue
+            
+        try:
+            mkv_info = json.loads(result.stdout)
+        except json.JSONDecodeError:
+            print(f"  Error parsing JSON output from mkvmerge for {mkv_path.name}. Skipping.")
+            continue
+
+        tracks = mkv_info.get('tracks', [])
+        attachments = mkv_info.get('attachments', [])
+
+        # Fix condition: checking codec or codec_id
+        ass_tracks = [t for t in tracks if t.get('type') == 'subtitles' and 
+                      ('S_TEXT/ASS' in str(t.get('properties', {}).get('codec_id', '')) or 
+                       'S_TEXT/SSA' in str(t.get('properties', {}).get('codec_id', '')) or 
+                       'SubStationAlpha' in str(t.get('codec', '')))]
+        
+        # Identify fonts vs other attachments. We look at mime types.
+        font_mimes = ['font', 'truetype', 'opentype', 'sfnt', 'application/x-truetype-font']
+        font_attachments = [a for a in attachments if any(m in a.get('content_type', '').lower() for m in font_mimes)]
+        other_attachments = [a for a in attachments if a not in font_attachments]
+
+        if not ass_tracks and not font_attachments:
+            print("  No ASS tracks and no fonts found. Copying without changes.")
+            shutil.copy2(str(mkv_path), str(finished_dir / mkv_path.name))
+            shutil.move(str(mkv_path), str(original_dir / mkv_path.name))
+            continue
+
+        # 2. Extract ASS tracks
+        required_fonts = set()
+        if ass_tracks:
+            print(f"  Extracting {len(ass_tracks)} ASS track(s)...")
+            extract_ass_cmd = ['mkvextract', 'tracks', str(mkv_path)]
+            ass_temp_files = []
+            for t in ass_tracks:
+                track_id = t['id']
+                out_ass = temp_dir / f"{mkv_path.stem}_track_{track_id}.ass"
+                extract_ass_cmd.append(f"{track_id}:{out_ass}")
+                ass_temp_files.append(out_ass)
+                
+            subprocess.run(extract_ass_cmd, check=True)
+            
+            # Retrieve required names
+            for ass_file in ass_temp_files:
+                ass_fonts = get_ass_font_names(ass_file)
+                print(f"    {ass_file.name} references {len(ass_fonts)} distinct font(s): {list(ass_fonts)[:5]}{'...' if len(ass_fonts) > 5 else ''}")
+                required_fonts.update(ass_fonts)
+                
+            print(f"  Total distinct font name(s) referenced in ASS across MKV: {len(required_fonts)}")
+            print(f"  Required fonts list: {list(required_fonts)}")
+
+        # 3. Extract and verify font attachments
+        fonts_to_keep = [] 
+        if font_attachments:
+            print(f"  Extracting {len(font_attachments)} font attachment(s) to verify...")
+            extract_att_cmd = ['mkvextract', 'attachments', str(mkv_path)]
+            for att in font_attachments:
+                att_id = att['id']
+                out_font = temp_dir / f"att_{att_id}_{safe_filename(att['file_name'])}"
+                extract_att_cmd.append(f"{att_id}:{out_font}")
+                att['temp_path'] = out_font 
+                
+            subprocess.run(extract_att_cmd, check=True)
+            
+            # Check which fonts match the required list
+            # We must be careful because some MKVs have fonts but no ASS referencing them
+            for att in font_attachments:
+                att_path = att['temp_path']
+                if not att_path.exists():
+                    continue
+                    
+                keep_this_font = False
+                
+                # Check 1: Exact filename match (minus extension)
+                filename_no_ext = att_path.stem.lower()
+                if filename_no_ext in required_fonts:
+                    print(f"    [MATCH] '{att['file_name']}' matched exactly via filename.")
+                    keep_this_font = True
+                
+                # Check 2: Internal TrueType/OpenType name match using fonttools
+                if not keep_this_font:
+                    internal_names = get_internal_font_names(att_path)
+                    intersect = required_fonts.intersection(internal_names)
+                    if intersect:
+                        print(f"    [MATCH] '{att['file_name']}' matched via internal names: {intersect}")
+                        keep_this_font = True
+                    else:
+                        print(f"    [SKIP]  '{att['file_name']}' did not match any required font. Internal names: {list(internal_names)[:5]}")
+                        
+                if keep_this_font:
+                    fonts_to_keep.append(att)
+                    
+        print(f"  Keeping {len(fonts_to_keep)} required font attachment(s).")
+        
+        # 4. Extract other non-font attachments (like cover.jpg) so we don't lose them!
+        other_to_keep = []
+        if other_attachments:
+            print(f"  Extracting {len(other_attachments)} non-font attachment(s) to preserve them...")
+            extract_other_cmd = ['mkvextract', 'attachments', str(mkv_path)]
+            for att in other_attachments:
+                att_id = att['id']
+                out_other = temp_dir / f"other_{att_id}_{safe_filename(att['file_name'])}"
+                extract_other_cmd.append(f"{att_id}:{out_other}")
+                att['temp_path'] = out_other
+            
+            subprocess.run(extract_other_cmd, check=True)
+            other_to_keep.extend(other_attachments)
+            
+        # 5. Remux using mkvmerge
+        out_mkv = finished_dir / mkv_path.name
+        remux_cmd = ['mkvmerge', '-o', str(out_mkv), '--no-attachments', str(mkv_path)]
+        
+        for att in fonts_to_keep + other_to_keep:
+            remux_cmd.extend([
+                '--attachment-name', att['file_name'],
+                '--attachment-mime-type', att['content_type'],
+                '--attach-file', str(att['temp_path'])
+            ])
+            
+        print(f"  Remuxing to: {out_mkv.name}")
+        subprocess.run(remux_cmd, check=True)
+        
+        # 6. Move original processed file
+        print(f"  Moving original file to 'original' folder...")
+        shutil.move(str(mkv_path), str(original_dir / mkv_path.name))
+        
+        # Cleanup temp for this MKV
+        for item in temp_dir.iterdir():
+            if item.is_file():
+                item.unlink()
+                
+    # Final cleanup of temp directory
+    try:
+        temp_dir.rmdir()
+    except OSError:
+        pass # Not empty, or some other error, keep it for debugging
+        
+    print("\nAll tasks completed.")
+
+if __name__ == '__main__':
+    main()