Prefer Language Detector over FastText (#6526)

This commit is contained in:
zubiden 2025-12-22 22:53:25 +01:00 committed by Alexander Zinchuk
parent 12b241c2cf
commit 6a37bfbf43
8 changed files with 125 additions and 74 deletions

8
package-lock.json generated
View File

@ -47,6 +47,7 @@
"@tauri-apps/cli": "^2.9.4",
"@testing-library/jest-dom": "^6.9.1",
"@twbs/fantasticon": "^3.1.0",
"@types/dom-chromium-ai": "^0.0.11",
"@types/dom-view-transitions": "^1.0.6",
"@types/hast": "^3.0.4",
"@types/jest": "^30.0.0",
@ -5616,6 +5617,13 @@
"@types/node": "*"
}
},
"node_modules/@types/dom-chromium-ai": {
"version": "0.0.11",
"resolved": "https://registry.npmjs.org/@types/dom-chromium-ai/-/dom-chromium-ai-0.0.11.tgz",
"integrity": "sha512-Li04Mac9ic1vbX/te9re8v1010fh5YB/30dMcJLpIuIyDoT7xE/dIdg9r9UrFZLs5Ztmonb3nP7+LhPpFuHBGw==",
"dev": true,
"license": "MIT"
},
"node_modules/@types/dom-view-transitions": {
"version": "1.0.6",
"resolved": "https://registry.npmjs.org/@types/dom-view-transitions/-/dom-view-transitions-1.0.6.tgz",

View File

@ -59,6 +59,7 @@
"@tauri-apps/cli": "^2.9.4",
"@testing-library/jest-dom": "^6.9.1",
"@twbs/fantasticon": "^3.1.0",
"@types/dom-chromium-ai": "^0.0.11",
"@types/dom-view-transitions": "^1.0.6",
"@types/hast": "^3.0.4",
"@types/jest": "^30.0.0",

View File

@ -1,4 +1,3 @@
import type { FC } from '../../../lib/teact/teact';
import {
memo, useMemo, useState,
} from '../../../lib/teact/teact';
@ -8,43 +7,15 @@ import type { AccountSettings } from '../../../types';
import { SUPPORTED_TRANSLATION_LANGUAGES } from '../../../config';
import buildClassName from '../../../util/buildClassName';
import { partition } from '../../../util/iteratees';
import useEffectWithPrevDeps from '../../../hooks/useEffectWithPrevDeps';
import useHistoryBack from '../../../hooks/useHistoryBack';
import useLang from '../../../hooks/useLang';
import useLastCallback from '../../../hooks/useLastCallback';
import useOldLang from '../../../hooks/useOldLang';
import ItemPicker, { type ItemPickerOption } from '../../common/pickers/ItemPicker';
import styles from './SettingsDoNotTranslate.module.scss';
// https://fasttext.cc/docs/en/language-identification.html
const LOCAL_SUPPORTED_DETECTION_LANGUAGES = [
'af', 'als', 'am', 'an', 'ar', 'arz', 'as', 'ast', 'av', 'az',
'azb', 'ba', 'bar', 'bcl', 'be', 'bg', 'bh', 'bn', 'bo', 'bpy',
'br', 'bs', 'bxr', 'ca', 'cbk', 'ce', 'ceb', 'ckb', 'co', 'cs',
'cv', 'cy', 'da', 'de', 'diq', 'dsb', 'dty', 'dv', 'el', 'eml',
'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'frr', 'fy',
'ga', 'gd', 'gl', 'gn', 'gom', 'gu', 'gv', 'he', 'hi', 'hif',
'hr', 'hsb', 'ht', 'hu', 'hy', 'ia', 'id', 'ie', 'ilo', 'io',
'is', 'it', 'ja', 'jbo', 'jv', 'ka', 'kk', 'km', 'kn', 'ko',
'krc', 'ku', 'kv', 'kw', 'ky', 'la', 'lb', 'lez', 'li', 'lmo',
'lo', 'lrc', 'lt', 'lv', 'mai', 'mg', 'mhr', 'min', 'mk', 'ml',
'mn', 'mr', 'mrj', 'ms', 'mt', 'mwl', 'my', 'myv', 'mzn', 'nah',
'nap', 'nds', 'ne', 'new', 'nl', 'nn', 'no', 'oc', 'or', 'os',
'pa', 'pam', 'pfl', 'pl', 'pms', 'pnb', 'ps', 'pt', 'qu', 'rm',
'ro', 'ru', 'rue', 'sa', 'sah', 'sc', 'scn', 'sco', 'sd', 'sh',
'si', 'sk', 'sl', 'so', 'sq', 'sr', 'su', 'sv', 'sw', 'ta', 'te',
'tg', 'th', 'tk', 'tl', 'tr', 'tt', 'tyv', 'ug', 'uk', 'ur', 'uz',
'vec', 'vep', 'vi', 'vls', 'vo', 'wa', 'war', 'wuu', 'xal', 'xmf',
'yi', 'yo', 'yue', 'zh',
];
const SUPPORTED_LANGUAGES = SUPPORTED_TRANSLATION_LANGUAGES.filter((lang: string) => (
LOCAL_SUPPORTED_DETECTION_LANGUAGES.includes(lang)
));
type OwnProps = {
isActive?: boolean;
onReset: () => void;
@ -52,36 +23,35 @@ type OwnProps = {
type StateProps = Pick<AccountSettings, 'doNotTranslate'>;
const SettingsDoNotTranslate: FC<OwnProps & StateProps> = ({
const SettingsDoNotTranslate = ({
isActive,
doNotTranslate,
onReset,
}) => {
}: OwnProps & StateProps) => {
const { setSettingOption } = getActions();
const lang = useOldLang();
const language = lang.code || 'en';
const [displayedOptions, setDisplayedOptions] = useState<string[]>([]);
const lang = useLang();
const language = lang.code;
const [searchQuery, setSearchQuery] = useState<string>('');
const displayedOptionList: ItemPickerOption[] = useMemo(() => {
const options = SUPPORTED_LANGUAGES.map((langCode: string) => {
const translatedNames = new Intl.DisplayNames([language], { type: 'language' });
const translatedName = translatedNames.of(langCode)!;
const translatedNames = new Intl.DisplayNames([language], { type: 'language' });
const options = SUPPORTED_TRANSLATION_LANGUAGES.map((langCode: string) => {
const translatedName = translatedNames.of(langCode);
const originalNames = new Intl.DisplayNames([langCode], { type: 'language' });
const originalName = originalNames.of(langCode)!;
const originalName = new Intl.DisplayNames([langCode], { type: 'language' })
.of(langCode);
if (!translatedName || !originalName) {
return undefined;
}
return {
langCode,
translatedName,
originalName,
value: langCode,
label: translatedName,
subLabel: originalName,
};
}).filter(Boolean).map(({ langCode, translatedName, originalName }) => ({
label: translatedName,
subLabel: originalName,
value: langCode,
}));
}).filter(Boolean);
if (!searchQuery.trim()) {
const currentLanguageOption = options.find((option) => option.value === language);
@ -89,17 +59,14 @@ const SettingsDoNotTranslate: FC<OwnProps & StateProps> = ({
return currentLanguageOption ? [currentLanguageOption, ...otherOptionList] : options;
}
return options?.filter((option) => option.label.toLowerCase().includes(searchQuery.toLowerCase()));
return options?.filter((option) => (
option.label.toLowerCase().includes(searchQuery.toLowerCase())
|| option.subLabel?.toLowerCase().includes(searchQuery.toLowerCase())
|| option.value.toLowerCase().includes(searchQuery.toLowerCase())
));
}, [language, searchQuery]);
useEffectWithPrevDeps(([prevIsActive, prevLanguage]) => {
if (prevIsActive === isActive && prevLanguage?.find((option) => option === language)) return;
const [selected] = partition(displayedOptionList, (option) => doNotTranslate.includes(option.value));
setDisplayedOptions([...selected.map((option) => option.value)]);
}, [isActive, doNotTranslate, displayedOptions.length, language, displayedOptionList]);
const handleChange = useLastCallback((newSelectedIds: string[]) => {
setDisplayedOptions(newSelectedIds);
setSettingOption({
doNotTranslate: newSelectedIds,
});
@ -116,7 +83,7 @@ const SettingsDoNotTranslate: FC<OwnProps & StateProps> = ({
<ItemPicker
className={styles.picker}
items={displayedOptionList}
selectedValues={displayedOptions}
selectedValues={doNotTranslate}
onSelectedValuesChange={handleChange}
filterValue={searchQuery}
onFilterChange={setSearchQuery}

View File

@ -782,7 +782,7 @@ const Message = ({
const detectedLanguage = useTextLanguage(
text?.text,
!(areTranslationsEnabled && shouldDetectChatLanguage),
!(areTranslationsEnabled && shouldDetectChatLanguage) || isTypingDraft,
getIsMessageListReady,
);
useDetectChatLanguage(message, detectedLanguage, !shouldDetectChatLanguage, getIsMessageListReady);

View File

@ -1,20 +1,41 @@
import { useEffect, useState } from '../lib/teact/teact';
import { useEffect, useRef, useState } from '../lib/teact/teact';
import type { Signal } from '../util/signals';
import { detectLanguage } from '../util/languageDetection';
export default function useTextLanguage(text?: string, isDisabled?: boolean, getIsReady?: Signal<boolean>) {
const [language, setLanguage] = useState<string | undefined>();
const [language, setLanguage] = useState<string>();
const lastTextRef = useRef<string>();
useEffect(() => {
if (isDisabled || (getIsReady && !getIsReady())) return;
if (isDisabled || (getIsReady && !getIsReady()) || lastTextRef.current === text) return;
if (text) {
detectLanguage(text).then(setLanguage);
} else {
let isCancelled = false;
if (!text) {
setLanguage(undefined);
lastTextRef.current = undefined;
return;
}
detectLanguage(text).then((lang) => {
if (isCancelled) {
return;
}
setLanguage(lang);
}).finally(() => {
if (isCancelled) {
return;
}
lastTextRef.current = text;
});
return () => {
isCancelled = true;
};
}, [isDisabled, text, getIsReady]);
return language;

View File

@ -39,8 +39,8 @@ function parseLabelsWithProbabilities(labels: string) {
.map((labelWithProb: string) => {
const [label, prob] = labelWithProb.split(' ');
return {
label: parseLabel(label),
prob: parseFloat(prob),
detectedLanguage: parseLabel(label),
confidence: parseFloat(prob),
};
});
}

View File

@ -111,6 +111,7 @@ export const IS_BACKDROP_BLUR_SUPPORTED = CSS.supports('backdrop-filter: blur()'
// True when `window` exposes the `onbeforeinstallprompt` event handler property.
export const IS_INSTALL_PROMPT_SUPPORTED = 'onbeforeinstallprompt' in window;
// Reported as unsupported only when both `IS_PWA` and `IS_MOBILE` are set.
export const IS_OPEN_IN_NEW_TAB_SUPPORTED = !(IS_PWA && IS_MOBILE);
// Translation features are switched off whenever `IS_TEST` is set.
export const IS_TRANSLATION_SUPPORTED = !IS_TEST;
// True when `window` exposes a `LanguageDetector` global. This only checks that the name exists.
export const IS_TRANSLATION_DETECTOR_SUPPORTED = 'LanguageDetector' in window;
// Requires CSS support for `view-transition-class` and is always disabled when `IS_FIREFOX` is set.
export const IS_VIEW_TRANSITION_SUPPORTED = CSS.supports('view-transition-class: test')
&& !IS_FIREFOX; // Fix flashing elements before removing

View File

@ -1,24 +1,40 @@
import type { FastTextApi } from '../lib/fasttextweb/fasttext.worker';
import type { Connector } from './PostMessageConnector';
import { IS_TRANSLATION_SUPPORTED } from './browser/windowEnvironment';
import { DEBUG } from '../config';
import { IS_TRANSLATION_DETECTOR_SUPPORTED, IS_TRANSLATION_SUPPORTED } from './browser/windowEnvironment';
import Deferred from './Deferred';
import { createConnector } from './PostMessageConnector';
const WORKER_INIT_DELAY = 4000;
const DETECTOR_INIT_DELAY = 4000;
const DEFAULT_THRESHOLD = 0.2;
const DEFAULT_LABELS_COUNT = 5;
const UNDEFINED_LANGUAGE = 'und';
let worker: Connector<FastTextApi> | undefined;
let languageDetector: LanguageDetector | undefined;
const initializationDeferred = new Deferred();
if (IS_TRANSLATION_SUPPORTED) {
setTimeout(initWorker, WORKER_INIT_DELAY);
setTimeout(initLanguageDetection, DETECTOR_INIT_DELAY);
}
function initWorker() {
async function initLanguageDetection() {
if (isInitialized()) return;
if (IS_TRANSLATION_DETECTOR_SUPPORTED) {
try {
languageDetector = await LanguageDetector.create();
initializationDeferred.resolve();
return;
} catch (error) {
// eslint-disable-next-line no-console
if (DEBUG) console.error('Failed to initialize language detector: ', error);
}
}
if (!worker) {
worker = createConnector<FastTextApi>(
new Worker(new URL('../lib/fasttextweb/fasttext.worker.ts', import.meta.url)),
@ -27,16 +43,53 @@ function initWorker() {
}
}
export async function detectLanguage(text: string, threshold = DEFAULT_THRESHOLD) {
if (!worker) await initializationDeferred.promise;
/**
 * Tells whether a detection backend has already been set up.
 *
 * @returns `true` once either the module-level `languageDetector` or the
 * module-level `worker` holds a value, `false` while both are still unset.
 */
function isInitialized() {
  return !!languageDetector || !!worker;
}
/**
 * Detects the language of `text`.
 *
 * Waits for `initializationDeferred` when no backend is ready yet. If a
 * `languageDetector` instance is available it is used, and only its first
 * result is considered. Otherwise the request is forwarded to `worker`.
 *
 * @param text Text to analyse.
 * @param threshold Minimum confidence a result must reach to be accepted.
 * @returns The detected language code, or `undefined` when the detector path
 * yields no result, an `'und'` result, a missing/zero confidence, a confidence
 * below `threshold`, or throws. On the worker path the worker's answer is
 * returned as is.
 */
export async function detectLanguage(text: string, threshold = DEFAULT_THRESHOLD): Promise<string | undefined> {
  if (!isInitialized()) {
    await initializationDeferred.promise;
  }

  // No detector instance: delegate to the worker, passing the threshold along.
  if (!languageDetector) {
    const workerAnswer = await worker!.request({ name: 'detectLanguage', args: [text, threshold] });
    return workerAnswer;
  }

  try {
    const [topResult] = await languageDetector.detect(text);
    if (!topResult) return undefined;

    const { detectedLanguage, confidence } = topResult;

    // The detector reports `und` when it cannot determine the language.
    if (detectedLanguage === UNDEFINED_LANGUAGE) return undefined;

    // A missing or zero confidence is rejected the same way as one below the threshold.
    if (!confidence || confidence < threshold) return undefined;

    return detectedLanguage;
  } catch (error) {
    // Detection is best-effort: log in debug builds and report "unknown".
    // eslint-disable-next-line no-console
    if (DEBUG) console.error('Failed to detect language: ', error);
    return undefined;
  }
}
export async function detectLanguageProbability(
text: string, labelsCount = DEFAULT_LABELS_COUNT, threshold = DEFAULT_THRESHOLD,
) {
if (!worker) await initializationDeferred.promise;
): Promise<LanguageDetectionResult[] | undefined> {
if (!isInitialized()) await initializationDeferred.promise;
if (languageDetector) {
try {
const results = await languageDetector.detect(text);
return results.filter((result) => result.detectedLanguage !== UNDEFINED_LANGUAGE
&& (result.confidence && result.confidence >= threshold))
.slice(0, labelsCount);
} catch (error) {
// eslint-disable-next-line no-console
if (DEBUG) console.error('Failed to detect language probability: ', error);
return undefined;
}
}
const result = await worker!.request({ name: 'detectLanguageProbability', args: [text, labelsCount, threshold] });
return result;
}