refactor: rename cleanup_fonts.py to subtitle_fonts_cleaner.py and update README references.

2026-03-02 21:42:13 +01:00
parent cc5e5359e5
commit fd0497e174
2 changed files with 2 additions and 2 deletions
--- a/subtitle_fonts_cleaner.py
+++ b/subtitle_fonts_cleaner.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+
+import os
+import re
+import sys
+import json
+import shutil
+import subprocess
+from pathlib import Path
+
+# Try importing fontTools, which is needed to read internal font names
+try:
+    from fontTools.ttLib import TTFont
+except ImportError:
+    print("Error: 'fonttools' is required to accurately read internal font names.")
+    print("Please install it by running: pip install fonttools (for Windows/Ubuntu)")
+    print("Or: pipx install fonttools (for ArchLinux)")
+    sys.exit(1)
+
+def get_ass_font_names(ass_path):
+    r"""
+    Parse an ASS/SSA subtitle file and return the set of font names it uses.
+
+    Names are collected from two places:
+      * the ``Fontname`` column of every ``Style:`` line in the
+        ``[V4+ Styles]`` (ASS) or ``[V4 Styles]`` (SSA) section, located via
+        that section's ``Format:`` line;
+      * ``\fn`` override tags inside ``Dialogue:`` lines of ``[Events]``.
+
+    Args:
+        ass_path: Path (``str`` or ``pathlib.Path``) of the subtitle file.
+
+    Returns:
+        A set of lower-cased font names, so callers can compare
+        case-insensitively. A leading ``@`` (vertical-layout marker) is
+        removed and empty names are dropped. The set is empty when the file
+        cannot be opened or read; a warning is printed in that case.
+    """
+    style_sections = ('[v4+ styles]', '[v4 styles]')
+    fn_override = re.compile(r'\\fn([^\\}]+)')
+    fonts = set()
+    try:
+        # utf-8-sig drops a leading BOM so the first section header is
+        # recognised even when it is the very first line of the file.
+        with open(ass_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
+            in_styles = False
+            in_events = False
+            fontname_idx = -1
+
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                if line.startswith('['):
+                    section = line.lower()
+                    in_styles = section in style_sections
+                    in_events = section == '[events]'
+                    continue
+
+                if in_styles:
+                    if line.startswith('Format:'):
+                        format_str = line[len('Format:'):].strip()
+                        format_cols = [col.strip().lower() for col in format_str.split(',')]
+                        if 'fontname' in format_cols:
+                            fontname_idx = format_cols.index('fontname')
+                    elif line.startswith('Style:') and fontname_idx != -1:
+                        style_str = line[len('Style:'):].strip()
+                        style_cols = [col.strip() for col in style_str.split(',')]
+                        if len(style_cols) > fontname_idx:
+                            fonts.add(style_cols[fontname_idx])
+                elif in_events and line.startswith('Dialogue:'):
+                    # Overrides look like \fnArial or \fnComic Sans MS and end
+                    # at the next backslash or closing brace.
+                    fonts.update(fn_override.findall(line))
+    except OSError as e:
+        print(f"Warning: Failed to read {Path(ass_path).name}: {e}")
+
+    normalized = set()
+    for name in fonts:
+        # "@Font" selects the vertical variant of "Font"; both refer to the
+        # same font, so compare on the bare name.
+        name = name.strip().lstrip('@').strip().lower()
+        if name:
+            normalized.add(name)
+    return normalized
+
+def get_internal_font_names(font_path):
+    """
+    Extract the internal names of a font file for matching against subtitles.
+
+    Reads the ``name`` table records with nameID 1 (Family), 4 (Full Name)
+    and 16 (Typographic Family). For a font collection (``.ttc``/``.otc``)
+    every face is inspected, not only the first one, because a subtitle may
+    reference any face stored in the collection.
+
+    Args:
+        font_path: Path (``str`` or ``pathlib.Path``) of the font file.
+
+    Returns:
+        A set of lower-cased, whitespace-stripped names. The set is empty
+        (and a warning is printed) when the file cannot be read or parsed.
+    """
+    names = set()
+    font_path = Path(font_path)
+    try:
+        # Font collections start with the 'ttcf' tag; anything else is
+        # handed to TTFont as a single font.
+        with open(font_path, 'rb') as fh:
+            is_collection = fh.read(4) == b'ttcf'
+
+        if is_collection:
+            from fontTools.ttLib import TTCollection
+            container = TTCollection(str(font_path))
+            faces = container.fonts
+        else:
+            container = TTFont(str(font_path))
+            faces = [container]
+
+        try:
+            for face in faces:
+                for record in face['name'].names:
+                    if record.nameID not in (1, 4, 16):
+                        continue
+                    try:
+                        text = record.toUnicode()
+                    except (ValueError, LookupError):
+                        # Undecodable record or unknown encoding: skip this
+                        # record but keep the other names of the font.
+                        continue
+                    text = text.strip().lower()
+                    if text:
+                        names.add(text)
+        finally:
+            # Release the file handle even if a name table is malformed.
+            container.close()
+    except Exception as e:
+        # Best effort: a broken attachment must not abort the whole run.
+        print(f"      Warning: Could not read metadata for {font_path.name}: {e}")
+    return names
+
+def safe_filename(name):
+    """
+    Reduce a string to characters that are safe in a temporary filename.
+
+    Letters, digits, spaces, dots, hyphens and underscores are kept; every
+    other character is removed. Trailing whitespace is stripped from the
+    result.
+    """
+    extra_allowed = ' .-_'
+    kept = []
+    for ch in name:
+        if ch.isalpha() or ch.isdigit() or ch in extra_allowed:
+            kept.append(ch)
+    return "".join(kept).rstrip()
+
+def _run_mkvtoolnix(cmd):
+    """
+    Run an MKVToolNix command and report whether its output is usable.
+
+    mkvmerge and mkvextract exit with 0 on success, 1 when only warnings
+    were issued (the output is still written) and 2 on a real error, so a
+    warning must not be treated as a failure.
+    """
+    result = subprocess.run(cmd)
+    if result.returncode >= 2:
+        print(f"  Error: {cmd[0]} failed with exit code {result.returncode}.")
+        return False
+    if result.returncode == 1:
+        print(f"  Note: {cmd[0]} finished with warnings.")
+    return True
+
+
+def _is_ass_track(track):
+    """Return True if an ``mkvmerge -J`` track entry is an ASS/SSA subtitle track."""
+    if track.get('type') != 'subtitles':
+        return False
+    codec_id = str(track.get('properties', {}).get('codec_id', ''))
+    return ('S_TEXT/ASS' in codec_id
+            or 'S_TEXT/SSA' in codec_id
+            or 'SubStationAlpha' in str(track.get('codec', '')))
+
+
+def _clear_temp_files(temp_dir):
+    """Delete every regular file directly inside *temp_dir*."""
+    for item in temp_dir.iterdir():
+        if item.is_file():
+            item.unlink()
+
+
+def _process_mkv(mkv_path, original_dir, finished_dir, temp_dir):
+    """
+    Remux one MKV into *finished_dir*, keeping only the fonts its ASS tracks use.
+
+    Non-font attachments are always preserved. On success the source file is
+    moved to *original_dir*; on any tool failure the source is left in place
+    and the file is skipped.
+    """
+    # 1. Get MKV info via mkvmerge -J
+    result = subprocess.run(['mkvmerge', '-J', str(mkv_path)], capture_output=True, text=True, encoding='utf-8')
+    if result.returncode != 0:
+        print(f"  Error reading {mkv_path.name} with mkvmerge. Skipping.")
+        return
+
+    try:
+        mkv_info = json.loads(result.stdout)
+    except json.JSONDecodeError:
+        print(f"  Error parsing JSON output from mkvmerge for {mkv_path.name}. Skipping.")
+        return
+
+    tracks = mkv_info.get('tracks', [])
+    attachments = mkv_info.get('attachments', [])
+
+    ass_tracks = [t for t in tracks if _is_ass_track(t)]
+
+    # Identify fonts vs other attachments by MIME type.
+    font_mimes = ('font', 'truetype', 'opentype', 'sfnt', 'application/x-truetype-font')
+    font_attachments = [a for a in attachments
+                        if any(m in a.get('content_type', '').lower() for m in font_mimes)]
+    other_attachments = [a for a in attachments if a not in font_attachments]
+
+    if not ass_tracks and not font_attachments:
+        print("  No ASS tracks and no fonts found. Copying without changes.")
+        shutil.copy2(str(mkv_path), str(finished_dir / mkv_path.name))
+        shutil.move(str(mkv_path), str(original_dir / mkv_path.name))
+        return
+
+    # 2. Extract ASS tracks and collect the font names they reference
+    required_fonts = set()
+    if ass_tracks:
+        print(f"  Extracting {len(ass_tracks)} ASS track(s)...")
+        extract_ass_cmd = ['mkvextract', 'tracks', str(mkv_path)]
+        ass_temp_files = []
+        for t in ass_tracks:
+            track_id = t['id']
+            out_ass = temp_dir / f"{mkv_path.stem}_track_{track_id}.ass"
+            extract_ass_cmd.append(f"{track_id}:{out_ass}")
+            ass_temp_files.append(out_ass)
+
+        if not _run_mkvtoolnix(extract_ass_cmd):
+            print(f"  Could not extract subtitles from {mkv_path.name}. Skipping.")
+            return
+
+        for ass_file in ass_temp_files:
+            ass_fonts = get_ass_font_names(ass_file)
+            print(f"    {ass_file.name} references {len(ass_fonts)} distinct font(s): {list(ass_fonts)[:5]}{'...' if len(ass_fonts) > 5 else ''}")
+            required_fonts.update(ass_fonts)
+
+        print(f"  Total distinct font name(s) referenced in ASS across MKV: {len(required_fonts)}")
+        print(f"  Required fonts list: {list(required_fonts)}")
+
+    # 3. Extract font attachments and keep only those that are referenced
+    fonts_to_keep = []
+    if font_attachments:
+        print(f"  Extracting {len(font_attachments)} font attachment(s) to verify...")
+        extract_att_cmd = ['mkvextract', 'attachments', str(mkv_path)]
+        for att in font_attachments:
+            att_id = att['id']
+            out_font = temp_dir / f"att_{att_id}_{safe_filename(att['file_name'])}"
+            extract_att_cmd.append(f"{att_id}:{out_font}")
+            att['temp_path'] = out_font
+
+        if not _run_mkvtoolnix(extract_att_cmd):
+            print(f"  Could not extract fonts from {mkv_path.name}. Skipping.")
+            return
+
+        # With no ASS track, required_fonts is empty and no font is kept.
+        for att in font_attachments:
+            att_path = att['temp_path']
+            if not att_path.exists():
+                continue
+
+            keep_this_font = False
+
+            # Check 1: exact filename match (minus extension). Use the
+            # attachment's own name: the temp file carries an "att_<id>_"
+            # prefix and could never equal a font name.
+            filename_no_ext = Path(att['file_name']).stem.lower()
+            if filename_no_ext in required_fonts:
+                print(f"    [MATCH] '{att['file_name']}' matched exactly via filename.")
+                keep_this_font = True
+
+            # Check 2: internal TrueType/OpenType name match using fonttools
+            if not keep_this_font:
+                internal_names = get_internal_font_names(att_path)
+                intersect = required_fonts.intersection(internal_names)
+                if intersect:
+                    print(f"    [MATCH] '{att['file_name']}' matched via internal names: {intersect}")
+                    keep_this_font = True
+                else:
+                    print(f"    [SKIP]  '{att['file_name']}' did not match any required font. Internal names: {list(internal_names)[:5]}")
+
+            if keep_this_font:
+                fonts_to_keep.append(att)
+
+    print(f"  Keeping {len(fonts_to_keep)} required font attachment(s).")
+
+    # 4. Extract non-font attachments (like cover.jpg) so they are not lost
+    other_to_keep = []
+    if other_attachments:
+        print(f"  Extracting {len(other_attachments)} non-font attachment(s) to preserve them...")
+        extract_other_cmd = ['mkvextract', 'attachments', str(mkv_path)]
+        for att in other_attachments:
+            att_id = att['id']
+            out_other = temp_dir / f"other_{att_id}_{safe_filename(att['file_name'])}"
+            extract_other_cmd.append(f"{att_id}:{out_other}")
+            att['temp_path'] = out_other
+
+        if not _run_mkvtoolnix(extract_other_cmd):
+            print(f"  Could not extract attachments from {mkv_path.name}. Skipping.")
+            return
+        other_to_keep.extend(other_attachments)
+
+    # 5. Remux: drop all original attachments, then re-attach the kept ones
+    out_mkv = finished_dir / mkv_path.name
+    remux_cmd = ['mkvmerge', '-o', str(out_mkv), '--no-attachments', str(mkv_path)]
+
+    for att in fonts_to_keep + other_to_keep:
+        # These options apply to the next --attach-file, so they come first.
+        remux_cmd.extend(['--attachment-name', att['file_name']])
+        mime_type = att.get('content_type')
+        if mime_type:
+            remux_cmd.extend(['--attachment-mime-type', mime_type])
+        description = att.get('description')
+        if description:
+            remux_cmd.extend(['--attachment-description', description])
+        remux_cmd.extend(['--attach-file', str(att['temp_path'])])
+
+    print(f"  Remuxing to: {out_mkv.name}")
+    if not _run_mkvtoolnix(remux_cmd):
+        # Do not leave a truncated result next to good ones.
+        if out_mkv.exists():
+            out_mkv.unlink()
+        print(f"  Remux of {mkv_path.name} failed. Original left untouched.")
+        return
+
+    # 6. Move original processed file
+    print(f"  Moving original file to 'original' folder...")
+    shutil.move(str(mkv_path), str(original_dir / mkv_path.name))
+
+
+def main():
+    """
+    Clean unused font attachments from every ``*.mkv`` in the current directory.
+
+    Results are written to ``./finished``, processed sources are moved to
+    ``./original``, and ``./temp_subs_fonts`` is used as scratch space that
+    is emptied after each file. A file whose processing fails is reported
+    and skipped; the remaining files are still processed.
+    """
+    root_dir = Path.cwd()
+    # Sorted so files are processed in a predictable order.
+    mkv_files = sorted(root_dir.glob("*.mkv"))
+    if not mkv_files:
+        print("No MKV files found in the current directory.")
+        return
+
+    for tool in ('mkvmerge', 'mkvextract'):
+        if shutil.which(tool) is None:
+            print(f"Error: '{tool}' was not found on PATH. Please install MKVToolNix.")
+            sys.exit(1)
+
+    original_dir = root_dir / "original"
+    finished_dir = root_dir / "finished"
+    temp_dir = root_dir / "temp_subs_fonts"
+
+    original_dir.mkdir(exist_ok=True)
+    finished_dir.mkdir(exist_ok=True)
+    temp_dir.mkdir(exist_ok=True)
+
+    for mkv_path in mkv_files:
+        print(f"\nProcessing: {mkv_path.name}")
+        try:
+            _process_mkv(mkv_path, original_dir, finished_dir, temp_dir)
+        finally:
+            # Always empty the scratch directory so leftovers from a failed
+            # file cannot leak into the next one.
+            _clear_temp_files(temp_dir)
+
+    # Final cleanup of temp directory
+    try:
+        temp_dir.rmdir()
+    except OSError:
+        pass # Not empty, or some other error, keep it for debugging
+
+    print("\nAll tasks completed.")
+
+# Run the cleaner only when executed as a script, not when imported.
+if __name__ == '__main__':
+    main()