add subtitle language detection

This commit is contained in:
Luke Pulverenti
2017-06-17 18:59:17 -04:00
parent c9d7eb9b04
commit 0e7cbb0465
76 changed files with 2256 additions and 26 deletions

View File

@@ -8,6 +8,8 @@ using System.Threading.Tasks;
using MediaBrowser.Model.MediaInfo;
using MediaBrowser.Model.Logging;
using UniversalDetector;
using NLangDetect.Core;
using MediaBrowser.Model.Serialization;
namespace Emby.Common.Implementations.TextEncoding
{
@@ -15,11 +17,13 @@ namespace Emby.Common.Implementations.TextEncoding
{
private readonly IFileSystem _fileSystem;
private readonly ILogger _logger;
private IJsonSerializer _json;
public TextEncoding(IFileSystem fileSystem, ILogger logger)
/// <summary>
/// Creates the text-encoding service and stores the supplied collaborators
/// in the private fields <c>_fileSystem</c>, <c>_logger</c> and <c>_json</c>.
/// </summary>
/// <param name="fileSystem">File system abstraction kept in <c>_fileSystem</c>.</param>
/// <param name="logger">Logger kept in <c>_logger</c>.</param>
/// <param name="json">
/// JSON serializer kept in <c>_json</c>; it is later passed to
/// <c>LanguageDetector.Initialize</c> on first use.
/// </param>
public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
{
    // The assignments are independent of one another; nothing else happens at construction time.
    _json = json;
    _logger = logger;
    _fileSystem = fileSystem;
}
public Encoding GetASCIIEncoding()
@@ -63,6 +67,7 @@ namespace Emby.Common.Implementations.TextEncoding
}
}
private bool _langDetectInitialized;
public string GetDetectedEncodingName(byte[] bytes, string language)
{
var encoding = GetInitialEncoding(bytes);
@@ -72,6 +77,22 @@ namespace Emby.Common.Implementations.TextEncoding
return "utf-8";
}
if (!_langDetectInitialized)
{
_langDetectInitialized = true;
LanguageDetector.Initialize(_json);
}
if (string.IsNullOrWhiteSpace(language))
{
language = DetectLanguage(bytes);
if (!string.IsNullOrWhiteSpace(language))
{
_logger.Debug("Text language detected as {0}", language);
}
}
var charset = DetectCharset(bytes, language);
if (!string.IsNullOrWhiteSpace(charset))
@@ -95,6 +116,35 @@ namespace Emby.Common.Implementations.TextEncoding
return null;
}
/// <summary>
/// Attempts to identify the language of the supplied raw text bytes.
/// </summary>
/// <remarks>
/// The bytes are decoded as UTF-8, then ASCII, then UTF-16 (<see cref="Encoding.Unicode"/>),
/// and each decoded string is handed to <c>LanguageDetector.DetectLanguage</c>.
/// The result of the first attempt that does not throw <c>NLangDetectException</c>
/// is returned as-is.
/// </remarks>
/// <param name="bytes">Raw text content to inspect.</param>
/// <returns>
/// The value returned by the detector for the first attempt that succeeds,
/// or <c>null</c> when every attempt throws <c>NLangDetectException</c>.
/// </returns>
private string DetectLanguage(byte[] bytes)
{
    // Same candidates, in the same order, as the previous three copy-pasted try/catch blocks.
    Encoding[] candidates = { Encoding.UTF8, Encoding.ASCII, Encoding.Unicode };

    foreach (var candidate in candidates)
    {
        try
        {
            return LanguageDetector.DetectLanguage(candidate.GetString(bytes));
        }
        catch (NLangDetectException ex)
        {
            // Detection is best-effort: record the failure and fall through to the
            // next candidate encoding instead of propagating the exception.
            _logger.Debug("Text language detection attempt failed: {0}", ex.Message);
        }
    }

    // No decoding produced text the detector could classify.
    return null;
}
public Encoding GetEncodingFromCharset(string charset)
{
if (string.IsNullOrWhiteSpace(charset))
@@ -136,22 +186,29 @@ namespace Emby.Common.Implementations.TextEncoding
case "cze":
case "ces":
case "slo":
case "slk":
case "slv":
case "srp":
case "hrv":
case "rum":
case "ron":
case "rup":
return "windows-1250";
// albanian
case "alb":
case "sqi":
return "windows-1250";
// slovak (slk), slovenian (slv)
case "slk":
case "slv":
return "windows-1250";
case "ara":
return "windows-1256";
case "heb":
return "windows-1255";
case "grc":
return "windows-1253";
// greek
case "gre":
case "ell":
return "windows-1253";
case "crh":
case "ota":