refactor: rename cleanup_fonts.py to subtitle_fonts_cleaner.py and update README references.
This commit is contained in:
250
subtitle_fonts_cleaner.py
Normal file
250
subtitle_fonts_cleaner.py
Normal file
@@ -0,0 +1,250 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
# Try importing fontTools, which is needed to read internal font names
|
||||
try:
|
||||
from fontTools.ttLib import TTFont
|
||||
except ImportError:
|
||||
print("Error: 'fonttools' is required to accurately read internal font names.")
|
||||
print("Please install it by running: pip install fonttools (for Windows/Ubuntu)")
|
||||
print("Or: pipx install fonttools (for ArchLinux)")
|
||||
sys.exit(1)
|
||||
|
||||
def get_ass_font_names(ass_path):
    r"""
    Parse an ASS/SSA file and return the set of font names it references.

    Names are collected from two places:

    * the ``Fontname`` column of ``Style:`` lines in the style section
      (``[V4+ Styles]`` for ASS, ``[V4 Styles]`` for SSA), located via the
      section's ``Format:`` line;
    * ``\fn<name>`` override tags on ``Dialogue:`` lines in ``[Events]``.

    A leading ``@`` (vertical-layout marker) is removed so that ``@SimHei``
    and ``SimHei`` resolve to the same font. Names are stripped, lower-cased
    for case-insensitive comparison, and empty names are discarded.

    Args:
        ass_path: Path (``str`` or ``pathlib.Path``) of the subtitle file.

    Returns:
        A set of lower-cased font names. The set is empty (and a warning is
        printed) when the file cannot be read.
    """
    style_sections = ('[v4+ styles]', '[v4 styles]')
    fn_override = re.compile(r'\\fn([^\\}]+)')
    raw_names = set()

    try:
        # 'utf-8-sig' drops a leading BOM, which would otherwise hide a
        # section header on the very first line of the file.
        with open(ass_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
            in_styles = False
            in_events = False
            fontname_idx = -1

            for line in f:
                line = line.strip()
                if not line:
                    continue

                if line.startswith('['):
                    section = line.lower()
                    in_styles = section in style_sections
                    in_events = section == '[events]'
                    continue

                if in_styles:
                    if line.startswith('Format:'):
                        format_str = line[len('Format:'):].strip()
                        format_cols = [col.strip().lower() for col in format_str.split(',')]
                        if 'fontname' in format_cols:
                            fontname_idx = format_cols.index('fontname')
                    elif line.startswith('Style:') and fontname_idx != -1:
                        style_str = line[len('Style:'):].strip()
                        style_cols = [col.strip() for col in style_str.split(',')]
                        if len(style_cols) > fontname_idx:
                            raw_names.add(style_cols[fontname_idx])
                elif in_events and line.startswith('Dialogue:'):
                    # Overrides look like \fnArial or \fnComic Sans MS and end
                    # at the next backslash or closing brace.
                    raw_names.update(fn_override.findall(line))
    except Exception as e:
        # Best effort: an unreadable subtitle only produces a warning.
        print(f"Warning: Failed to read {Path(ass_path).name}: {e}")

    fonts = set()
    for raw in raw_names:
        name = raw.strip()
        if name.startswith('@'):
            # '@Font' requests vertical layout of 'Font'; same font file.
            name = name[1:].strip()
        if name:
            fonts.add(name.lower())  # case-insensitive set
    return fonts
|
||||
|
||||
def get_internal_font_names(font_path):
    """
    Extract internal font names from a TTF/OTF file or a TTC collection.

    The Family (nameID 1), Full Name (nameID 4) and Typographic Family
    (nameID 16) records of the ``name`` table are read for every face in
    the file, so a font referenced through any face of a ``.ttc``
    collection is found, not only the first one.

    Args:
        font_path: Path (``str`` or ``pathlib.Path``) of the font file.

    Returns:
        A set of lower-cased names. If the file cannot be read a warning is
        printed and whatever was collected so far (possibly nothing) is
        returned.
    """
    names = set()
    try:
        # A TrueType Collection starts with the tag 'ttcf', a 4-byte version
        # and a big-endian uint32 holding the number of faces.
        with open(font_path, 'rb') as fh:
            header = fh.read(12)
        if len(header) == 12 and header[:4] == b'ttcf':
            face_count = int.from_bytes(header[8:12], 'big')
        else:
            face_count = 1

        for face_index in range(face_count):
            font = TTFont(str(font_path), fontNumber=face_index)
            try:
                for record in font['name'].names:
                    if record.nameID not in (1, 4, 16):
                        continue
                    try:
                        text = record.toUnicode()
                    except Exception:
                        # A record in an undecodable encoding is skipped;
                        # the remaining records are still usable.
                        continue
                    names.add(text.lower())
            finally:
                # Release the file handle even when the name table is broken.
                font.close()
    except Exception as e:
        print(f" Warning: Could not read metadata for {Path(font_path).name}: {e}")
    return names
|
||||
|
||||
def safe_filename(name):
    """Return *name* reduced to characters that are safe in a temp filename.

    Letters, digits, spaces, dots, hyphens and underscores are kept; every
    other character is dropped, and trailing whitespace is removed.
    """
    allowed_punctuation = ' .-_'
    kept = []
    for ch in name:
        if ch.isalpha() or ch.isdigit() or ch in allowed_punctuation:
            kept.append(ch)
    return "".join(kept).rstrip()
|
||||
|
||||
# MKVToolNix tools exit with 0 on success and 1 when only warnings were
# emitted (output is still usable); anything else is a real failure.
_MKVTOOLNIX_OK_CODES = (0, 1)

# Substrings of an attachment's MIME type that identify it as a font.
_FONT_MIME_HINTS = ('font', 'truetype', 'opentype', 'sfnt', 'application/x-truetype-font')


def _run_mkvtoolnix(cmd):
    """Run an mkvmerge/mkvextract command, raising only on a real error."""
    completed = subprocess.run(cmd)
    if completed.returncode not in _MKVTOOLNIX_OK_CODES:
        raise subprocess.CalledProcessError(completed.returncode, cmd)
    return completed


def _is_font_attachment(attachment):
    """Tell whether an attachment entry from ``mkvmerge -J`` looks like a font."""
    content_type = (attachment.get('content_type') or '').lower()
    return any(hint in content_type for hint in _FONT_MIME_HINTS)


def _clear_temp_dir(temp_dir):
    """Delete every regular file left in the temporary directory."""
    for item in temp_dir.iterdir():
        if item.is_file():
            item.unlink()


def _process_mkv(mkv_path, original_dir, finished_dir, temp_dir):
    """Remux one MKV into ``finished_dir`` keeping only the fonts its ASS/SSA tracks use.

    Raises ``subprocess.CalledProcessError`` or ``OSError`` when an external
    tool fails; the original file is left in place in that case.
    """
    # 1. Get MKV info via mkvmerge -J
    result = subprocess.run(['mkvmerge', '-J', str(mkv_path)], capture_output=True, text=True, encoding='utf-8')
    if result.returncode not in _MKVTOOLNIX_OK_CODES:
        print(f" Error reading {mkv_path.name} with mkvmerge. Skipping.")
        return

    try:
        mkv_info = json.loads(result.stdout)
    except json.JSONDecodeError:
        print(f" Error parsing JSON output from mkvmerge for {mkv_path.name}. Skipping.")
        return

    tracks = mkv_info.get('tracks', [])
    attachments = mkv_info.get('attachments', [])

    # A subtitle track counts as ASS/SSA if either its codec_id or its codec says so.
    ass_tracks = [t for t in tracks if t.get('type') == 'subtitles' and
                  ('S_TEXT/ASS' in str(t.get('properties', {}).get('codec_id', '')) or
                   'S_TEXT/SSA' in str(t.get('properties', {}).get('codec_id', '')) or
                   'SubStationAlpha' in str(t.get('codec', '')))]

    # Identify fonts vs other attachments. We look at mime types.
    font_attachments = []
    other_attachments = []
    for att in attachments:
        (font_attachments if _is_font_attachment(att) else other_attachments).append(att)

    if not ass_tracks and not font_attachments:
        print(" No ASS tracks and no fonts found. Copying without changes.")
        shutil.copy2(str(mkv_path), str(finished_dir / mkv_path.name))
        shutil.move(str(mkv_path), str(original_dir / mkv_path.name))
        return

    # 2. Extract ASS tracks
    required_fonts = set()
    if ass_tracks:
        print(f" Extracting {len(ass_tracks)} ASS track(s)...")
        extract_ass_cmd = ['mkvextract', 'tracks', str(mkv_path)]
        ass_temp_files = []
        for t in ass_tracks:
            track_id = t['id']
            out_ass = temp_dir / f"{mkv_path.stem}_track_{track_id}.ass"
            extract_ass_cmd.append(f"{track_id}:{out_ass}")
            ass_temp_files.append(out_ass)

        _run_mkvtoolnix(extract_ass_cmd)

        # Retrieve required names
        for ass_file in ass_temp_files:
            ass_fonts = get_ass_font_names(ass_file)
            print(f" {ass_file.name} references {len(ass_fonts)} distinct font(s): {list(ass_fonts)[:5]}{'...' if len(ass_fonts) > 5 else ''}")
            required_fonts.update(ass_fonts)

    print(f" Total distinct font name(s) referenced in ASS across MKV: {len(required_fonts)}")
    print(f" Required fonts list: {list(required_fonts)}")

    # 3. Extract and verify font attachments
    fonts_to_keep = []
    if font_attachments:
        print(f" Extracting {len(font_attachments)} font attachment(s) to verify...")
        extract_att_cmd = ['mkvextract', 'attachments', str(mkv_path)]
        for att in font_attachments:
            att_id = att['id']
            out_font = temp_dir / f"att_{att_id}_{safe_filename(att['file_name'])}"
            extract_att_cmd.append(f"{att_id}:{out_font}")
            att['temp_path'] = out_font

        _run_mkvtoolnix(extract_att_cmd)

        # Check which fonts match the required list
        # We must be careful because some MKVs have fonts but no ASS referencing them
        for att in font_attachments:
            att_path = att['temp_path']
            if not att_path.exists():
                continue

            keep_this_font = False

            # Check 1: Exact filename match (minus extension). The ORIGINAL
            # attachment name is used: the temp file carries an "att_<id>_"
            # prefix and could never equal a font name.
            filename_no_ext = Path(att['file_name']).stem.lower()
            if filename_no_ext in required_fonts:
                print(f" [MATCH] '{att['file_name']}' matched exactly via filename.")
                keep_this_font = True

            # Check 2: Internal TrueType/OpenType name match using fonttools
            if not keep_this_font:
                internal_names = get_internal_font_names(att_path)
                intersect = required_fonts.intersection(internal_names)
                if intersect:
                    print(f" [MATCH] '{att['file_name']}' matched via internal names: {intersect}")
                    keep_this_font = True
                else:
                    print(f" [SKIP] '{att['file_name']}' did not match any required font. Internal names: {list(internal_names)[:5]}")

            if keep_this_font:
                fonts_to_keep.append(att)

    print(f" Keeping {len(fonts_to_keep)} required font attachment(s).")

    # 4. Extract other non-font attachments (like cover.jpg) so we don't lose them!
    other_to_keep = []
    if other_attachments:
        print(f" Extracting {len(other_attachments)} non-font attachment(s) to preserve them...")
        extract_other_cmd = ['mkvextract', 'attachments', str(mkv_path)]
        for att in other_attachments:
            att_id = att['id']
            out_other = temp_dir / f"other_{att_id}_{safe_filename(att['file_name'])}"
            extract_other_cmd.append(f"{att_id}:{out_other}")
            att['temp_path'] = out_other

        _run_mkvtoolnix(extract_other_cmd)
        other_to_keep.extend(other_attachments)

    # 5. Remux using mkvmerge
    out_mkv = finished_dir / mkv_path.name
    remux_cmd = ['mkvmerge', '-o', str(out_mkv), '--no-attachments', str(mkv_path)]

    for att in fonts_to_keep + other_to_keep:
        # Per-attachment options apply to the next --attach-file.
        if att.get('description'):
            remux_cmd.extend(['--attachment-description', att['description']])
        remux_cmd.extend(['--attachment-name', att['file_name']])
        if att.get('content_type'):
            # Without an explicit type mkvmerge detects one itself.
            remux_cmd.extend(['--attachment-mime-type', att['content_type']])
        remux_cmd.extend(['--attach-file', str(att['temp_path'])])

    print(f" Remuxing to: {out_mkv.name}")
    try:
        _run_mkvtoolnix(remux_cmd)
    except (subprocess.CalledProcessError, OSError):
        # Do not leave a truncated output behind that looks finished.
        if out_mkv.exists():
            out_mkv.unlink()
        raise

    # 6. Move original processed file
    print(f" Moving original file to 'original' folder...")
    shutil.move(str(mkv_path), str(original_dir / mkv_path.name))


def main():
    """Strip unused font attachments from every ``*.mkv`` in the current directory.

    For each file the ASS/SSA subtitle tracks are extracted and scanned for
    font names, the attached fonts are compared against those names, and the
    file is remuxed into ``finished/`` with only the matching fonts (non-font
    attachments are preserved). The untouched input is then moved to
    ``original/``. A file whose processing fails is reported and left in
    place; the remaining files are still processed.

    Requires the ``mkvmerge`` and ``mkvextract`` executables on ``PATH``.
    """
    root_dir = Path.cwd()
    mkv_files = list(root_dir.glob("*.mkv"))
    if not mkv_files:
        print("No MKV files found in the current directory.")
        return

    original_dir = root_dir / "original"
    finished_dir = root_dir / "finished"
    temp_dir = root_dir / "temp_subs_fonts"

    original_dir.mkdir(exist_ok=True)
    finished_dir.mkdir(exist_ok=True)
    temp_dir.mkdir(exist_ok=True)

    for mkv_path in mkv_files:
        print(f"\nProcessing: {mkv_path.name}")
        try:
            _process_mkv(mkv_path, original_dir, finished_dir, temp_dir)
        except (subprocess.CalledProcessError, OSError) as e:
            # One broken file must not abort the whole batch.
            print(f" Error while processing {mkv_path.name}: {e}. Skipping.")
        finally:
            # Cleanup temp for this MKV, also after a failure
            _clear_temp_dir(temp_dir)

    # Final cleanup of temp directory
    try:
        temp_dir.rmdir()
    except OSError:
        pass  # Not empty, or some other error, keep it for debugging

    print("\nAll tasks completed.")
|
||||
|
||||
# Run the cleanup only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user