Apply lang= hint to suppress other languages in close sets
git-svn-id: https://cld2.googlecode.com/svn/trunk@14 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
@@ -45,6 +45,9 @@ using namespace std;
|
|||||||
// cld2_generated_distinctocta*.cc
|
// cld2_generated_distinctocta*.cc
|
||||||
// cld_generated_score_quad_octa_1024_256.cc
|
// cld_generated_score_quad_octa_1024_256.cc
|
||||||
|
|
||||||
|
extern const int kLanguageToPLangSize;
|
||||||
|
extern const int kCloseSetSize;
|
||||||
|
|
||||||
extern const UTF8PropObj cld_generated_CjkUni_obj;
|
extern const UTF8PropObj cld_generated_CjkUni_obj;
|
||||||
extern const CLD2TableSummary kCjkCompat_obj;
|
extern const CLD2TableSummary kCjkCompat_obj;
|
||||||
extern const CLD2TableSummary kCjkDeltaBi_obj;
|
extern const CLD2TableSummary kCjkDeltaBi_obj;
|
||||||
@@ -1400,21 +1403,70 @@ void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void AddLangPriorBoost(uint32 langprob, ScoringContext* scoringcontext) {
|
void AddLangPriorBoost(Language lang, uint32 langprob,
|
||||||
// this is called 0..n times with language hints
|
ScoringContext* scoringcontext) {
|
||||||
// but we don't know the script, so put in both
|
// This is called 0..n times with language hints
|
||||||
// TODO: only put in Latn if lang can be in Latn, only in Othr similarly
|
// but we don't know the script -- so boost either or both Latn, Othr.
|
||||||
// lang == Language FromPerScriptNumber(ulscript, perscript_number);
|
|
||||||
|
|
||||||
LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
|
if (IsLatnLanguage(lang)) {
|
||||||
int n = langprior_boost->n;
|
LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
|
||||||
langprior_boost->langprob[n] = langprob;
|
int n = langprior_boost->n;
|
||||||
langprior_boost->n = langprior_boost->wrap(n + 1);
|
langprior_boost->langprob[n] = langprob;
|
||||||
|
langprior_boost->n = langprior_boost->wrap(n + 1);
|
||||||
|
}
|
||||||
|
|
||||||
langprior_boost = &scoringcontext->langprior_boost.othr;
|
if (IsOthrLanguage(lang)) {
|
||||||
n = langprior_boost->n;
|
LangBoosts* langprior_boost = &scoringcontext->langprior_boost.othr;
|
||||||
langprior_boost->langprob[n] = langprob;
|
int n = langprior_boost->n;
|
||||||
langprior_boost->n = langprior_boost->wrap(n + 1);
|
langprior_boost->langprob[n] = langprob;
|
||||||
|
langprior_boost->n = langprior_boost->wrap(n + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void AddOneWhack(Language whacker_lang, Language whackee_lang,
|
||||||
|
ScoringContext* scoringcontext) {
|
||||||
|
uint32 langprob = MakeLangProb(whackee_lang, 1);
|
||||||
|
// This logic avoids hr-Latn whacking sr-Cyrl, but still whacks sr-Latn
|
||||||
|
if (IsLatnLanguage(whacker_lang) && IsLatnLanguage(whackee_lang)) {
|
||||||
|
LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
|
||||||
|
int n = langprior_whack->n;
|
||||||
|
langprior_whack->langprob[n] = langprob;
|
||||||
|
langprior_whack->n = langprior_whack->wrap(n + 1);
|
||||||
|
}
|
||||||
|
if (IsOthrLanguage(whacker_lang) && IsOthrLanguage(whackee_lang)) {
|
||||||
|
LangBoosts* langprior_whack = &scoringcontext->langprior_whack.othr;
|
||||||
|
int n = langprior_whack->n;
|
||||||
|
langprior_whack->langprob[n] = langprob;
|
||||||
|
langprior_whack->n = langprior_whack->wrap(n + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void AddCloseLangWhack(Language lang, ScoringContext* scoringcontext) {
|
||||||
|
// We do not in general want zh-Hans and zh-Hant to be close pairs,
|
||||||
|
// but we do here.
|
||||||
|
if (lang == CLD2::CHINESE) {
|
||||||
|
AddOneWhack(lang, CLD2::CHINESE_T, scoringcontext);
|
||||||
|
AddOneWhack(lang, CLD2::JAPANESE, scoringcontext);
|
||||||
|
AddOneWhack(lang, CLD2::KOREAN, scoringcontext);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (lang == CLD2::CHINESE_T) {
|
||||||
|
AddOneWhack(lang, CLD2::CHINESE, scoringcontext);
|
||||||
|
AddOneWhack(lang, CLD2::JAPANESE, scoringcontext);
|
||||||
|
AddOneWhack(lang, CLD2::KOREAN, scoringcontext);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int base_lang_set = LanguageCloseSet(lang);
|
||||||
|
if (base_lang_set == 0) {return;}
|
||||||
|
// TODO: add an explicit list of each set to avoid this 512-times loop
|
||||||
|
for (int i = 0; i < kLanguageToPLangSize; ++i) {
|
||||||
|
Language lang2 = static_cast<Language>(i);
|
||||||
|
if ((base_lang_set == LanguageCloseSet(lang2)) && (lang != lang2)) {
|
||||||
|
AddOneWhack(lang, lang2, scoringcontext);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -1477,15 +1529,51 @@ void ApplyHints(const char* buffer,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Put into ScoringContext
|
// Put boosts into ScoringContext
|
||||||
for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
|
for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
|
||||||
Language lang = GetCLDPriorLang(lang_priors.prior[i]);
|
Language lang = GetCLDPriorLang(lang_priors.prior[i]);
|
||||||
int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
|
int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
|
||||||
if (qprob > 0) {
|
if (qprob > 0) {
|
||||||
uint32 langprob = MakeLangProb(lang, qprob);
|
uint32 langprob = MakeLangProb(lang, qprob);
|
||||||
AddLangPriorBoost(langprob, scoringcontext);
|
AddLangPriorBoost(lang, langprob, scoringcontext);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Put whacks into scoring context
|
||||||
|
// We do not in general want zh-Hans and zh-Hant to be close pairs,
|
||||||
|
// but we do here. Use close_set_count[kCloseSetSize] to count zh, zh-Hant
|
||||||
|
int close_set_count[kCloseSetSize + 1];
|
||||||
|
memset(close_set_count, 0, sizeof(close_set_count));
|
||||||
|
|
||||||
|
for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
|
||||||
|
Language lang = GetCLDPriorLang(lang_priors.prior[i]);
|
||||||
|
++close_set_count[LanguageCloseSet(lang)];
|
||||||
|
if (lang == CLD2::CHINESE) {++close_set_count[kCloseSetSize];}
|
||||||
|
if (lang == CLD2::CHINESE_T) {++close_set_count[kCloseSetSize];}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If a boost language is in a close set, force suppressing the others in
|
||||||
|
// that set, if exactly one of the set is present
|
||||||
|
for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
|
||||||
|
Language lang = GetCLDPriorLang(lang_priors.prior[i]);
|
||||||
|
int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
|
||||||
|
if (qprob > 0) {
|
||||||
|
int close_set = LanguageCloseSet(lang);
|
||||||
|
if ((close_set > 0) && (close_set_count[close_set] == 1)) {
|
||||||
|
AddCloseLangWhack(lang, scoringcontext);
|
||||||
|
}
|
||||||
|
if (((lang == CLD2::CHINESE) || (lang == CLD2::CHINESE_T)) &&
|
||||||
|
(close_set_count[kCloseSetSize] == 1)) {
|
||||||
|
AddCloseLangWhack(lang, scoringcontext);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user