Files
dualshock-tools.github.io/scripts/check_translations.py
2025-12-24 13:44:39 +01:00

538 lines
21 KiB
Python
Executable File
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# (C) 2025 dualshock-tools
#
# This script analyzes the source files to find translation strings and compares
# them with the language files to identify:
# - Strings that are used in code but missing from translation files
# - Strings that are in translation files but no longer used in code
#
# The script extracts translation strings from:
# - HTML files: elements with ds-i18n class
# - JavaScript files: l() function calls
# - JavaScript files: HTML embedded in strings with ds-i18n class
#
# The script automatically ignores commented-out code:
# - HTML comments (<!-- ... -->)
# - JavaScript single-line comments (// ...)
# - JavaScript multi-line comments (/* ... */)
#
# Usage:
# python3 scripts/check_translations.py # Normal output
# python3 scripts/check_translations.py --verbose # Show excluded strings
# python3 scripts/check_translations.py --compact # Compact output (no language details)
# python3 scripts/check_translations.py --json # Output in JSON format
import os
import re
import json
import sys
from pathlib import Path
# Check for flags (simple sys.argv membership; no argparse needed for a dev script)
VERBOSE = '--verbose' in sys.argv or '-v' in sys.argv   # also list excluded strings
JSON_OUTPUT = '--json' in sys.argv                       # machine-readable report
COMPACT = '--compact' in sys.argv                        # omit per-string details

# Directories to scan. Paths are relative, so the script is expected to be
# run from the repository root (e.g. `python3 scripts/check_translations.py`).
ROOT_DIR = Path(".")
LANG_DIR = ROOT_DIR / "lang"
JS_DIR = ROOT_DIR / "js"
TEMPLATES_DIR = ROOT_DIR / "templates"

# Special keys that are not in source code (kept in every language file but
# never matched against extracted strings).
SPECIAL_KEYS = {".authorMsg", ".title"}

# Patterns to exclude from translation checks (CSS selectors, technical strings, etc.)
EXCLUDE_PATTERNS = [
    r'^\.[\w-]+$',        # CSS class selectors like .alert, .hide
    r'^#[\w-]+$',         # CSS ID selectors
    r'^[\w-]+\.[\w-]+$',  # CSS compound selectors like circle.ds-touch
    r'^path,rect,circle', # SVG element lists
    r'^\\x[0-9a-fA-F]+$', # Hex escape sequences
]

# Whitelist of strings that are in language files but should be ignored by unused check
# These strings may be used dynamically, in comments, or reserved for future use
# (controller color names, model names, calibration UI states, hardware parts).
WHITELIST_UNUSED = {
    "(beta)",
    "30th Anniversary",
    "Astro Bot",
    "Chroma Indigo",
    "Chroma Pearl",
    "Chroma Teal",
    "Cobalt Blue",
    "Cosmic Red",
    "Fortnite",
    "Galactic Purple",
    "God of War Ragnarok",
    "Grey Camouflage",
    "Midnight Black",
    "Nova Pink",
    "Spider-Man 2",
    "Starlight Blue",
    "Sterling Silver",
    "The Last of Us",
    "Volcanic Red",
    "White",
    "Sony DualSense",
    "Sony DualSense Edge",
    "Sony DualShock 4 V1",
    "Sony DualShock 4 V2",
    "Calibration in progress",
    "Continue",
    "Start",
    "Initializing...",
    "Sampling...",
    "left module",
    "right module",
    "Your device might not be a genuine Sony controller. If it is not a clone then please report this issue.",
    "Adaptive Trigger",
    "Buttons",
    "Haptic Vibration",
    "Headphone Jack",
    "Lights",
    "Microphone",
    "Speaker",
    "USB Connector",
}
def should_exclude_string(text):
    """Return True when *text* matches any pattern in EXCLUDE_PATTERNS.

    Used to drop technical, non-translatable strings (CSS selectors,
    escape sequences, ...) from the comparison.
    """
    return any(re.match(pattern, text) for pattern in EXCLUDE_PATTERNS)
def find_html_files():
    """Collect HTML files from the project root and the templates directory."""
    return [*ROOT_DIR.glob("*.html"), *TEMPLATES_DIR.glob("*.html")]
def find_js_files():
    """Collect all JavaScript files under the js/ directory, recursively."""
    return list(JS_DIR.glob("**/*.js"))
def extract_ds_i18n_strings(html_files):
    """Extract strings from elements with the ds-i18n class in HTML files.

    HTML comments (<!-- ... -->) are ignored. Each stripped comment is
    replaced by same-length whitespace (newlines preserved) so that match
    offsets still map to the original file's line and column numbers; the
    previous approach of deleting comments outright shifted every reported
    line number after a multi-line comment.

    Args:
        html_files: iterable of paths to HTML files.

    Returns:
        dict: extracted string -> list of {'file', 'line', 'col'} dicts.
    """
    strings = {}
    # Opening tag carrying a ds-i18n class, capturing content up to the
    # matching closing tag (re.DOTALL lets the content span lines).
    pattern = r'<(\w+)[^>]*class="[^"]*ds-i18n[^"]*"[^>]*>(.*?)</\1>'
    for html_file in html_files:
        try:
            with open(html_file, 'r', encoding='utf-8') as f:
                content = f.read()
            # Blank out HTML comments: every non-newline character becomes a
            # space, keeping the 1:1 offset mapping to the original content.
            content = re.sub(
                r'<!--.*?-->',
                lambda m: re.sub(r'[^\n]', ' ', m.group(0)),
                content,
                flags=re.DOTALL,
            )
            for match in re.finditer(pattern, content, re.DOTALL):
                text = match.group(2)
                # Skip complex nested HTML; allow only simple formatting
                # tags (<b>, <i>, <em>, <strong>, <span>) inside the string.
                if '<' in text and '>' in text:
                    without_simple_tags = re.sub(r'</?(?:b|i|em|strong|span)>', '', text)
                    if '<' in without_simple_tags:
                        continue
                if text:
                    prefix = content[:match.start()]
                    line_num = prefix.count('\n') + 1
                    # rfind returns -1 on the first line, which still yields
                    # a correct 1-based column.
                    col_num = match.start() - prefix.rfind('\n')
                    strings.setdefault(text, []).append({
                        'file': str(html_file),
                        'line': line_num,
                        'col': col_num
                    })
        except Exception as e:
            print(f"Error reading {html_file}: {e}")
    return strings
def extract_l_function_strings(js_files):
    """Extract strings passed to the l() translation function in JS files.

    JavaScript comments (// and /* ... */) are ignored. Each stripped
    comment is replaced by same-length whitespace (newlines preserved) so
    that match offsets still map to the original file's line and column
    numbers; deleting multi-line comments outright shifted the reported
    line numbers of every match that followed them.

    Args:
        js_files: iterable of paths to JavaScript files.

    Returns:
        dict: extracted string -> list of {'file', 'line', 'col'} dicts.
    """
    strings = {}
    # l("..."), l('...'), l(`...`), optionally prefixed by "this.".
    # \b ensures 'l' is not the tail of a longer identifier (e.g. .html).
    pattern = r'(?:this\.)?\bl\s*\(\s*["\'`]([^"\'`]+)["\'`]\s*\)'
    for js_file in js_files:
        try:
            with open(js_file, 'r', encoding='utf-8') as f:
                content = f.read()
            # Blank single-line comments. NOTE(review): this also blanks
            # '//' inside string literals (e.g. URLs); assumed rare in this
            # codebase — confirm if strings go missing unexpectedly.
            content = re.sub(r'//[^\n]*', lambda m: ' ' * len(m.group(0)), content)
            # Blank /* ... */ comments, preserving embedded newlines.
            content = re.sub(
                r'/\*.*?\*/',
                lambda m: re.sub(r'[^\n]', ' ', m.group(0)),
                content,
                flags=re.DOTALL,
            )
            for match in re.finditer(pattern, content):
                text = match.group(1)
                if text:
                    prefix = content[:match.start()]
                    line_num = prefix.count('\n') + 1
                    col_num = match.start() - prefix.rfind('\n')
                    strings.setdefault(text, []).append({
                        'file': str(js_file),
                        'line': line_num,
                        'col': col_num
                    })
        except Exception as e:
            print(f"Error reading {js_file}: {e}")
    return strings
def extract_html_strings_from_js(js_files):
    """Extract translatable strings from HTML embedded in JavaScript files.

    Looks for elements carrying the ds-i18n class inside JS string literals.
    Comments are blanked out with same-length whitespace (newlines kept)
    before matching, so offsets into the scanned text map 1:1 to the
    original file. This fixes the previous implementation, which slid
    offsets from the comment-stripped text onto the unstripped text and
    therefore reported wrong line numbers whenever a comment preceded a
    match. Template-literal calls like ${l('...')} are now reported at
    their own position rather than at the enclosing element's position.

    Args:
        js_files: iterable of paths to JavaScript files.

    Returns:
        dict: extracted string -> list of {'file', 'line', 'col'} dicts.
    """
    strings = {}
    # ds-i18n element inside a JS string (any of the three quote styles).
    pattern = r'<(\w+)[^>]*class=["\'`][^"\'`]*ds-i18n[^"\'`]*["\'`][^>]*>(.*?)</\1>'
    # Template literal function call such as ${l('string')} or ${l("string")}.
    template_literal_pattern = r'\$\{l\s*\(\s*["\'`]([^"\'`]+)["\'`]\s*\)\}'

    def _record(text, js_file, content, offset):
        # Append a {'file','line','col'} location for *text* at *offset*.
        prefix = content[:offset]
        strings.setdefault(text, []).append({
            'file': str(js_file),
            'line': prefix.count('\n') + 1,
            'col': offset - prefix.rfind('\n')
        })

    for js_file in js_files:
        try:
            with open(js_file, 'r', encoding='utf-8') as f:
                content = f.read()
            # Blank out comments while preserving every character position.
            content = re.sub(r'//[^\n]*', lambda m: ' ' * len(m.group(0)), content)
            content = re.sub(
                r'/\*.*?\*/',
                lambda m: re.sub(r'[^\n]', ' ', m.group(0)),
                content,
                flags=re.DOTALL,
            )
            for match in re.finditer(pattern, content, re.DOTALL):
                text = match.group(2)
                # Skip complex nested HTML; allow only simple formatting
                # tags (<b>, <i>, <em>, <strong>, <span>).
                if '<' in text and '>' in text:
                    without_simple_tags = re.sub(r'</?(?:b|i|em|strong|span)>', '', text)
                    if '<' in without_simple_tags:
                        continue
                if not text:
                    continue
                # Record each ${l('...')} call at its own exact position.
                found_template = False
                for tmpl in re.finditer(template_literal_pattern, text):
                    found_template = True
                    if tmpl.group(1):
                        _record(tmpl.group(1), js_file, content,
                                match.start(2) + tmpl.start())
                # Plain element content (no template literal calls inside).
                if not found_template:
                    _record(text, js_file, content, match.start())
        except Exception as e:
            print(f"Error reading {js_file}: {e}")
    return strings
def load_translation_keys():
    """Load all translation keys from the JSON files in LANG_DIR.

    Returns:
        tuple: (all_keys, keys_by_language)
        - all_keys: set of all unique keys across all language files
        - keys_by_language: dict mapping language code (the file stem,
          e.g. "en_us") to the set of keys in that language
    """
    all_keys = set()
    keys_by_language = {}
    lang_files = list(LANG_DIR.glob("*.json"))
    if not lang_files:
        print(f"Warning: No language files found in {LANG_DIR}")
        return all_keys, keys_by_language
    for lang_file in lang_files:
        try:
            with open(lang_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Drop the empty-string key some files carry as metadata.
            keys = set(data) - {""}
            keys_by_language[lang_file.stem] = keys
            all_keys |= keys
        except Exception as e:
            print(f"Error reading {lang_file}: {e}")
    # Defensive: the union should already be free of the empty key.
    all_keys.discard("")
    return all_keys, keys_by_language
def main():
    """Scan sources, compare used strings against translation keys, report.

    Output format is controlled by the module-level flags JSON_OUTPUT,
    COMPACT and VERBOSE (parsed from sys.argv at import time).

    Returns:
        int: 0 when translations are in sync, 1 when there are missing or
        unused translations (used as the process exit code).
    """
    if not JSON_OUTPUT:
        print("=" * 80)
        print("Translation String Checker")
        print("=" * 80)
        print()
    # Find all source files
    if not JSON_OUTPUT:
        print("Scanning source files...")
    html_files = find_html_files()
    js_files = find_js_files()
    if not JSON_OUTPUT:
        print(f"Found {len(html_files)} HTML files")
        print(f"Found {len(js_files)} JavaScript files")
        print()
    # Extract strings from source files
    if not JSON_OUTPUT:
        print("Extracting translation strings from source files...")
    ds_i18n_strings = extract_ds_i18n_strings(html_files)
    l_function_strings = extract_l_function_strings(js_files)
    html_in_js_strings = extract_html_strings_from_js(js_files)
    if not JSON_OUTPUT:
        print(f"Found {len(ds_i18n_strings)} strings with ds-i18n class in HTML files")
        print(f"Found {len(l_function_strings)} strings in l() function calls")
        print(f"Found {len(html_in_js_strings)} strings with ds-i18n class in JavaScript files")
        print()
    # Combine all used strings and filter out excluded patterns
    # Merge the three dictionaries, combining location lists for duplicate strings
    all_used_strings_with_locations = {}
    for text, locations in ds_i18n_strings.items():
        all_used_strings_with_locations[text] = locations.copy()
    for text, locations in l_function_strings.items():
        if text in all_used_strings_with_locations:
            all_used_strings_with_locations[text].extend(locations)
        else:
            all_used_strings_with_locations[text] = locations.copy()
    for text, locations in html_in_js_strings.items():
        if text in all_used_strings_with_locations:
            all_used_strings_with_locations[text].extend(locations)
        else:
            all_used_strings_with_locations[text] = locations.copy()
    # Partition into translatable strings and technical (excluded) strings.
    excluded_strings = {s for s in all_used_strings_with_locations.keys() if should_exclude_string(s)}
    used_strings_with_locations = {k: v for k, v in all_used_strings_with_locations.items() if k not in excluded_strings}
    used_strings = set(used_strings_with_locations.keys())
    if not JSON_OUTPUT and excluded_strings:
        print(f"Excluded {len(excluded_strings)} non-translatable strings (CSS selectors, etc.)")
        if VERBOSE:
            for s in sorted(excluded_strings):
                print(f" - \"{s}\"")
        print()
    # Load translation keys
    if not JSON_OUTPUT:
        print("Loading translation keys from language files...")
    translation_keys, keys_by_language = load_translation_keys()
    if not JSON_OUTPUT:
        print(f"Found {len(translation_keys)} keys in translation files")
        print(f"Found {len(keys_by_language)} language files")
        print()
    # Remove special keys from comparison
    translation_keys_for_comparison = translation_keys - SPECIAL_KEYS
    # Remove special keys from each language's key set
    keys_by_language_filtered = {}
    for lang_code, keys in keys_by_language.items():
        keys_by_language_filtered[lang_code] = keys - SPECIAL_KEYS
    # Find missing translations (used in code but not in translation files)
    missing_translations = used_strings - translation_keys_for_comparison
    # For each missing translation, find which languages are missing it
    missing_by_language = {}
    for string in missing_translations:
        missing_langs = []
        for lang_code, keys in keys_by_language_filtered.items():
            if string not in keys:
                missing_langs.append(lang_code)
        missing_by_language[string] = sorted(missing_langs)
    # Find unused translations (in translation files but not used in code)
    # Exclude whitelisted strings from unused check
    unused_translations = (translation_keys_for_comparison - used_strings) - WHITELIST_UNUSED
    # Output results
    if JSON_OUTPUT:
        # Build missing translations with locations and missing languages
        missing_with_locations = []
        for string in sorted(missing_translations):
            entry = {
                "string": string,
                "missing_from_languages": missing_by_language.get(string, [])
            }
            if string in used_strings_with_locations:
                entry["locations"] = used_strings_with_locations[string]
            missing_with_locations.append(entry)
        result = {
            "summary": {
                "total_strings_used": len(used_strings),
                "total_translation_keys": len(translation_keys_for_comparison),
                "total_languages": len(keys_by_language),
                "missing_count": len(missing_translations),
                "unused_count": len(unused_translations),
                "excluded_count": len(excluded_strings),
                "whitelisted_count": len(WHITELIST_UNUSED)
            },
            "missing_translations": missing_with_locations,
            "unused_translations": sorted(unused_translations),
            "excluded_strings": sorted(excluded_strings),
            "whitelisted_strings": sorted(WHITELIST_UNUSED)
        }
        print(json.dumps(result, indent=2, ensure_ascii=False))
        # Non-zero exit signals CI that the translation files need work.
        return 1 if (missing_translations or unused_translations) else 0
    # Print results (text format)
    print("=" * 80)
    print("RESULTS")
    print("=" * 80)
    print()
    if missing_translations:
        print(f"⚠️ MISSING TRANSLATIONS ({len(missing_translations)} strings)")
        print("These strings are used in code but not found in translation files:")
        print("-" * 80)
        for string in sorted(missing_translations):
            print(f" - \"{string}\"")
            # Show first location where this string was found (skip in compact mode)
            if not COMPACT and string in used_strings_with_locations:
                locations = used_strings_with_locations[string]
                if locations:
                    loc = locations[0]
                    print(f"{loc['file']}:{loc['line']}:{loc['col']}")
                    if len(locations) > 1:
                        print(f" (and {len(locations) - 1} more location{'s' if len(locations) > 2 else ''})")
            # Show which languages are missing this translation (skip in compact mode)
            if not COMPACT and string in missing_by_language:
                missing_langs = missing_by_language[string]
                if len(missing_langs) == len(keys_by_language):
                    print(f" Missing from: ALL languages ({len(missing_langs)})")
                else:
                    # Show first few languages, then count
                    if len(missing_langs) <= 5:
                        print(f" Missing from: {', '.join(missing_langs)}")
                    else:
                        print(f" Missing from: {', '.join(missing_langs[:5])} (and {len(missing_langs) - 5} more)")
        print()
    else:
        print("✅ No missing translations found!")
        print()
    if unused_translations:
        print(f" UNUSED TRANSLATIONS ({len(unused_translations)} strings)")
        print("These strings are in translation files but not used in code:")
        print("-" * 80)
        for string in sorted(unused_translations):
            print(f" - \"{string}\"")
        print()
    else:
        print("✅ No unused translations found!")
        print()
    # Summary
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Total strings used in code: {len(used_strings)}")
    print(f"Total keys in translation files: {len(translation_keys_for_comparison)}")
    print(f"Missing translations: {len(missing_translations)}")
    print(f"Unused translations: {len(unused_translations)}")
    print(f"Whitelisted strings: {len(WHITELIST_UNUSED)}")
    print()
    if missing_translations or unused_translations:
        print("⚠️ Translation files need updates!")
        return 1
    else:
        print("✅ All translations are in sync!")
        return 0
if __name__ == "__main__":
    # sys.exit is the canonical way to set a script's exit code; the builtin
    # exit() is a site-module convenience not guaranteed to be available.
    sys.exit(main())