Files
dualshock-tools.github.io/scripts/check_translations.py
2025-12-24 13:44:39 +01:00

538 lines
21 KiB
Python
Executable File
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# (C) 2025 dualshock-tools
#
# This script analyzes the source files to find translation strings and compares
# them with the language files to identify:
# - Strings that are used in code but missing from translation files
# - Strings that are in translation files but no longer used in code
#
# The script extracts translation strings from:
# - HTML files: elements with ds-i18n class
# - JavaScript files: l() function calls
# - JavaScript files: HTML embedded in strings with ds-i18n class
#
# The script automatically ignores commented-out code:
# - HTML comments (<!-- ... -->)
# - JavaScript single-line comments (// ...)
# - JavaScript multi-line comments (/* ... */)
#
# Usage:
# python3 scripts/check_translations.py # Normal output
# python3 scripts/check_translations.py --verbose # Show excluded strings
# python3 scripts/check_translations.py --compact # Compact output (no language details)
# python3 scripts/check_translations.py --json # Output in JSON format
import os
import re
import json
import sys
from pathlib import Path
# Check for flags (simple sys.argv membership; no argparse needed for a dev script)
VERBOSE = '--verbose' in sys.argv or '-v' in sys.argv   # also list excluded strings
JSON_OUTPUT = '--json' in sys.argv                       # machine-readable report
COMPACT = '--compact' in sys.argv                        # omit per-string details

# Directories to scan. Paths are relative, so the script is expected to be
# run from the repository root (e.g. `python3 scripts/check_translations.py`).
ROOT_DIR = Path(".")
LANG_DIR = ROOT_DIR / "lang"
JS_DIR = ROOT_DIR / "js"
TEMPLATES_DIR = ROOT_DIR / "templates"

# Special keys that are not in source code (kept in every language file but
# never matched against extracted strings).
SPECIAL_KEYS = {".authorMsg", ".title"}

# Patterns to exclude from translation checks (CSS selectors, technical strings, etc.)
EXCLUDE_PATTERNS = [
    r'^\.[\w-]+$',        # CSS class selectors like .alert, .hide
    r'^#[\w-]+$',         # CSS ID selectors
    r'^[\w-]+\.[\w-]+$',  # CSS compound selectors like circle.ds-touch
    r'^path,rect,circle', # SVG element lists
    r'^\\x[0-9a-fA-F]+$', # Hex escape sequences
]

# Whitelist of strings that are in language files but should be ignored by unused check
# These strings may be used dynamically, in comments, or reserved for future use
# (controller color names, model names, calibration UI states, hardware parts).
WHITELIST_UNUSED = {
    "(beta)",
    "30th Anniversary",
    "Astro Bot",
    "Chroma Indigo",
    "Chroma Pearl",
    "Chroma Teal",
    "Cobalt Blue",
    "Cosmic Red",
    "Fortnite",
    "Galactic Purple",
    "God of War Ragnarok",
    "Grey Camouflage",
    "Midnight Black",
    "Nova Pink",
    "Spider-Man 2",
    "Starlight Blue",
    "Sterling Silver",
    "The Last of Us",
    "Volcanic Red",
    "White",
    "Sony DualSense",
    "Sony DualSense Edge",
    "Sony DualShock 4 V1",
    "Sony DualShock 4 V2",
    "Calibration in progress",
    "Continue",
    "Start",
    "Initializing...",
    "Sampling...",
    "left module",
    "right module",
    "Your device might not be a genuine Sony controller. If it is not a clone then please report this issue.",
    "Adaptive Trigger",
    "Buttons",
    "Haptic Vibration",
    "Headphone Jack",
    "Lights",
    "Microphone",
    "Speaker",
    "USB Connector",
}
def should_exclude_string(text):
    """Return True when *text* matches any pattern in EXCLUDE_PATTERNS.

    Used to drop technical, non-translatable strings (CSS selectors,
    escape sequences, ...) from the comparison.
    """
    return any(re.match(pattern, text) for pattern in EXCLUDE_PATTERNS)
def find_html_files():
    """Collect HTML files from the project root and the templates directory."""
    return [*ROOT_DIR.glob("*.html"), *TEMPLATES_DIR.glob("*.html")]
def find_js_files():
    """Collect all JavaScript files under the js/ directory, recursively."""
    return list(JS_DIR.glob("**/*.js"))
def extract_ds_i18n_strings(html_files):
    """Extract strings from elements with the ds-i18n class in HTML files.

    HTML comments (<!-- ... -->) are ignored. Each stripped comment is
    replaced by same-length whitespace (newlines preserved) so that match
    offsets still map to the original file's line and column numbers; the
    previous approach of deleting comments outright shifted every reported
    line number after a multi-line comment.

    Args:
        html_files: iterable of paths to HTML files.

    Returns:
        dict: extracted string -> list of {'file', 'line', 'col'} dicts.
    """
    strings = {}
    # Opening tag carrying a ds-i18n class, capturing content up to the
    # matching closing tag (re.DOTALL lets the content span lines).
    pattern = r'<(\w+)[^>]*class="[^"]*ds-i18n[^"]*"[^>]*>(.*?)</\1>'
    for html_file in html_files:
        try:
            with open(html_file, 'r', encoding='utf-8') as f:
                content = f.read()
            # Blank out HTML comments: every non-newline character becomes a
            # space, keeping the 1:1 offset mapping to the original content.
            content = re.sub(
                r'<!--.*?-->',
                lambda m: re.sub(r'[^\n]', ' ', m.group(0)),
                content,
                flags=re.DOTALL,
            )
            for match in re.finditer(pattern, content, re.DOTALL):
                text = match.group(2)
                # Skip complex nested HTML; allow only simple formatting
                # tags (<b>, <i>, <em>, <strong>, <span>) inside the string.
                if '<' in text and '>' in text:
                    without_simple_tags = re.sub(r'</?(?:b|i|em|strong|span)>', '', text)
                    if '<' in without_simple_tags:
                        continue
                if text:
                    prefix = content[:match.start()]
                    line_num = prefix.count('\n') + 1
                    # rfind returns -1 on the first line, which still yields
                    # a correct 1-based column.
                    col_num = match.start() - prefix.rfind('\n')
                    strings.setdefault(text, []).append({
                        'file': str(html_file),
                        'line': line_num,
                        'col': col_num
                    })
        except Exception as e:
            print(f"Error reading {html_file}: {e}")
    return strings
def extract_l_function_strings(js_files):
    """Extract strings passed to the l() translation function in JS files.

    JavaScript comments (// and /* ... */) are ignored. Each stripped
    comment is replaced by same-length whitespace (newlines preserved) so
    that match offsets still map to the original file's line and column
    numbers; deleting multi-line comments outright shifted the reported
    line numbers of every match that followed them.

    Args:
        js_files: iterable of paths to JavaScript files.

    Returns:
        dict: extracted string -> list of {'file', 'line', 'col'} dicts.
    """
    strings = {}
    # l("..."), l('...'), l(`...`), optionally prefixed by "this.".
    # \b ensures 'l' is not the tail of a longer identifier (e.g. .html).
    pattern = r'(?:this\.)?\bl\s*\(\s*["\'`]([^"\'`]+)["\'`]\s*\)'
    for js_file in js_files:
        try:
            with open(js_file, 'r', encoding='utf-8') as f:
                content = f.read()
            # Blank single-line comments. NOTE(review): this also blanks
            # '//' inside string literals (e.g. URLs); assumed rare in this
            # codebase — confirm if strings go missing unexpectedly.
            content = re.sub(r'//[^\n]*', lambda m: ' ' * len(m.group(0)), content)
            # Blank /* ... */ comments, preserving embedded newlines.
            content = re.sub(
                r'/\*.*?\*/',
                lambda m: re.sub(r'[^\n]', ' ', m.group(0)),
                content,
                flags=re.DOTALL,
            )
            for match in re.finditer(pattern, content):
                text = match.group(1)
                if text:
                    prefix = content[:match.start()]
                    line_num = prefix.count('\n') + 1
                    col_num = match.start() - prefix.rfind('\n')
                    strings.setdefault(text, []).append({
                        'file': str(js_file),
                        'line': line_num,
                        'col': col_num
                    })
        except Exception as e:
            print(f"Error reading {js_file}: {e}")
    return strings
def extract_html_strings_from_js(js_files):
    """Extract translatable strings from HTML embedded in JavaScript files.

    Looks for elements carrying the ds-i18n class inside JS string literals.
    Comments are blanked out with same-length whitespace (newlines kept)
    before matching, so offsets into the scanned text map 1:1 to the
    original file. This fixes the previous implementation, which slid
    offsets from the comment-stripped text onto the unstripped text and
    therefore reported wrong line numbers whenever a comment preceded a
    match. Template-literal calls like ${l('...')} are now reported at
    their own position rather than at the enclosing element's position.

    Args:
        js_files: iterable of paths to JavaScript files.

    Returns:
        dict: extracted string -> list of {'file', 'line', 'col'} dicts.
    """
    strings = {}
    # ds-i18n element inside a JS string (any of the three quote styles).
    pattern = r'<(\w+)[^>]*class=["\'`][^"\'`]*ds-i18n[^"\'`]*["\'`][^>]*>(.*?)</\1>'
    # Template literal function call such as ${l('string')} or ${l("string")}.
    template_literal_pattern = r'\$\{l\s*\(\s*["\'`]([^"\'`]+)["\'`]\s*\)\}'

    def _record(text, js_file, content, offset):
        # Append a {'file','line','col'} location for *text* at *offset*.
        prefix = content[:offset]
        strings.setdefault(text, []).append({
            'file': str(js_file),
            'line': prefix.count('\n') + 1,
            'col': offset - prefix.rfind('\n')
        })

    for js_file in js_files:
        try:
            with open(js_file, 'r', encoding='utf-8') as f:
                content = f.read()
            # Blank out comments while preserving every character position.
            content = re.sub(r'//[^\n]*', lambda m: ' ' * len(m.group(0)), content)
            content = re.sub(
                r'/\*.*?\*/',
                lambda m: re.sub(r'[^\n]', ' ', m.group(0)),
                content,
                flags=re.DOTALL,
            )
            for match in re.finditer(pattern, content, re.DOTALL):
                text = match.group(2)
                # Skip complex nested HTML; allow only simple formatting
                # tags (<b>, <i>, <em>, <strong>, <span>).
                if '<' in text and '>' in text:
                    without_simple_tags = re.sub(r'</?(?:b|i|em|strong|span)>', '', text)
                    if '<' in without_simple_tags:
                        continue
                if not text:
                    continue
                # Record each ${l('...')} call at its own exact position.
                found_template = False
                for tmpl in re.finditer(template_literal_pattern, text):
                    found_template = True
                    if tmpl.group(1):
                        _record(tmpl.group(1), js_file, content,
                                match.start(2) + tmpl.start())
                # Plain element content (no template literal calls inside).
                if not found_template:
                    _record(text, js_file, content, match.start())
        except Exception as e:
            print(f"Error reading {js_file}: {e}")
    return strings
def load_translation_keys():
    """Load all translation keys from the JSON files in LANG_DIR.

    Returns:
        tuple: (all_keys, keys_by_language)
        - all_keys: set of all unique keys across all language files
        - keys_by_language: dict mapping language code (the file stem,
          e.g. "en_us") to the set of keys in that language
    """
    all_keys = set()
    keys_by_language = {}
    lang_files = list(LANG_DIR.glob("*.json"))
    if not lang_files:
        print(f"Warning: No language files found in {LANG_DIR}")
        return all_keys, keys_by_language
    for lang_file in lang_files:
        try:
            with open(lang_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Drop the empty-string key some files carry as metadata.
            keys = set(data) - {""}
            keys_by_language[lang_file.stem] = keys
            all_keys |= keys
        except Exception as e:
            print(f"Error reading {lang_file}: {e}")
    # Defensive: the union should already be free of the empty key.
    all_keys.discard("")
    return all_keys, keys_by_language
def main():
    """Scan sources, compare used strings against translation keys, report.

    Output format is controlled by the module-level flags JSON_OUTPUT,
    COMPACT and VERBOSE (parsed from sys.argv at import time).

    Returns:
        int: 0 when translations are in sync, 1 when there are missing or
        unused translations (used as the process exit code).
    """
    if not JSON_OUTPUT:
        print("=" * 80)
        print("Translation String Checker")
        print("=" * 80)
        print()
    # Find all source files
    if not JSON_OUTPUT:
        print("Scanning source files...")
    html_files = find_html_files()
    js_files = find_js_files()
    if not JSON_OUTPUT:
        print(f"Found {len(html_files)} HTML files")
        print(f"Found {len(js_files)} JavaScript files")
        print()
    # Extract strings from source files
    if not JSON_OUTPUT:
        print("Extracting translation strings from source files...")
    ds_i18n_strings = extract_ds_i18n_strings(html_files)
    l_function_strings = extract_l_function_strings(js_files)
    html_in_js_strings = extract_html_strings_from_js(js_files)
    if not JSON_OUTPUT:
        print(f"Found {len(ds_i18n_strings)} strings with ds-i18n class in HTML files")
        print(f"Found {len(l_function_strings)} strings in l() function calls")
        print(f"Found {len(html_in_js_strings)} strings with ds-i18n class in JavaScript files")
        print()
    # Combine all used strings and filter out excluded patterns
    # Merge the three dictionaries, combining location lists for duplicate strings
    all_used_strings_with_locations = {}
    for text, locations in ds_i18n_strings.items():
        all_used_strings_with_locations[text] = locations.copy()
    for text, locations in l_function_strings.items():
        if text in all_used_strings_with_locations:
            all_used_strings_with_locations[text].extend(locations)
        else:
            all_used_strings_with_locations[text] = locations.copy()
    for text, locations in html_in_js_strings.items():
        if text in all_used_strings_with_locations:
            all_used_strings_with_locations[text].extend(locations)
        else:
            all_used_strings_with_locations[text] = locations.copy()
    # Partition into translatable strings and technical (excluded) strings.
    excluded_strings = {s for s in all_used_strings_with_locations.keys() if should_exclude_string(s)}
    used_strings_with_locations = {k: v for k, v in all_used_strings_with_locations.items() if k not in excluded_strings}
    used_strings = set(used_strings_with_locations.keys())
    if not JSON_OUTPUT and excluded_strings:
        print(f"Excluded {len(excluded_strings)} non-translatable strings (CSS selectors, etc.)")
        if VERBOSE:
            for s in sorted(excluded_strings):
                print(f" - \"{s}\"")
        print()
    # Load translation keys
    if not JSON_OUTPUT:
        print("Loading translation keys from language files...")
    translation_keys, keys_by_language = load_translation_keys()
    if not JSON_OUTPUT:
        print(f"Found {len(translation_keys)} keys in translation files")
        print(f"Found {len(keys_by_language)} language files")
        print()
    # Remove special keys from comparison
    translation_keys_for_comparison = translation_keys - SPECIAL_KEYS
    # Remove special keys from each language's key set
    keys_by_language_filtered = {}
    for lang_code, keys in keys_by_language.items():
        keys_by_language_filtered[lang_code] = keys - SPECIAL_KEYS
    # Find missing translations (used in code but not in translation files)
    missing_translations = used_strings - translation_keys_for_comparison
    # For each missing translation, find which languages are missing it
    missing_by_language = {}
    for string in missing_translations:
        missing_langs = []
        for lang_code, keys in keys_by_language_filtered.items():
            if string not in keys:
                missing_langs.append(lang_code)
        missing_by_language[string] = sorted(missing_langs)
    # Find unused translations (in translation files but not used in code)
    # Exclude whitelisted strings from unused check
    unused_translations = (translation_keys_for_comparison - used_strings) - WHITELIST_UNUSED
    # Output results
    if JSON_OUTPUT:
        # Build missing translations with locations and missing languages
        missing_with_locations = []
        for string in sorted(missing_translations):
            entry = {
                "string": string,
                "missing_from_languages": missing_by_language.get(string, [])
            }
            if string in used_strings_with_locations:
                entry["locations"] = used_strings_with_locations[string]
            missing_with_locations.append(entry)
        result = {
            "summary": {
                "total_strings_used": len(used_strings),
                "total_translation_keys": len(translation_keys_for_comparison),
                "total_languages": len(keys_by_language),
                "missing_count": len(missing_translations),
                "unused_count": len(unused_translations),
                "excluded_count": len(excluded_strings),
                "whitelisted_count": len(WHITELIST_UNUSED)
            },
            "missing_translations": missing_with_locations,
            "unused_translations": sorted(unused_translations),
            "excluded_strings": sorted(excluded_strings),
            "whitelisted_strings": sorted(WHITELIST_UNUSED)
        }
        print(json.dumps(result, indent=2, ensure_ascii=False))
        # Non-zero exit signals CI that the translation files need work.
        return 1 if (missing_translations or unused_translations) else 0
    # Print results (text format)
    print("=" * 80)
    print("RESULTS")
    print("=" * 80)
    print()
    if missing_translations:
        print(f"⚠️ MISSING TRANSLATIONS ({len(missing_translations)} strings)")
        print("These strings are used in code but not found in translation files:")
        print("-" * 80)
        for string in sorted(missing_translations):
            print(f" - \"{string}\"")
            # Show first location where this string was found (skip in compact mode)
            if not COMPACT and string in used_strings_with_locations:
                locations = used_strings_with_locations[string]
                if locations:
                    loc = locations[0]
                    print(f"{loc['file']}:{loc['line']}:{loc['col']}")
                    if len(locations) > 1:
                        print(f" (and {len(locations) - 1} more location{'s' if len(locations) > 2 else ''})")
            # Show which languages are missing this translation (skip in compact mode)
            if not COMPACT and string in missing_by_language:
                missing_langs = missing_by_language[string]
                if len(missing_langs) == len(keys_by_language):
                    print(f" Missing from: ALL languages ({len(missing_langs)})")
                else:
                    # Show first few languages, then count
                    if len(missing_langs) <= 5:
                        print(f" Missing from: {', '.join(missing_langs)}")
                    else:
                        print(f" Missing from: {', '.join(missing_langs[:5])} (and {len(missing_langs) - 5} more)")
        print()
    else:
        print("✅ No missing translations found!")
        print()
    if unused_translations:
        print(f" UNUSED TRANSLATIONS ({len(unused_translations)} strings)")
        print("These strings are in translation files but not used in code:")
        print("-" * 80)
        for string in sorted(unused_translations):
            print(f" - \"{string}\"")
        print()
    else:
        print("✅ No unused translations found!")
        print()
    # Summary
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total strings used in code: {len(used_strings)}")
    print(f"Total keys in translation files: {len(translation_keys_for_comparison)}")
    print(f"Missing translations: {len(missing_translations)}")
    print(f"Unused translations: {len(unused_translations)}")
    print(f"Whitelisted strings: {len(WHITELIST_UNUSED)}")
    print()
    if missing_translations or unused_translations:
        print("⚠️ Translation files need updates!")
        return 1
    else:
        print("✅ All translations are in sync!")
        return 0
if __name__ == "__main__":
    # sys.exit is the canonical way to set a script's exit code; the builtin
    # exit() is a site-module convenience not guaranteed to be available.
    sys.exit(main())