Apply lang= hint to suppress other languages in close sets

git-svn-id: https://cld2.googlecode.com/svn/trunk@14 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
dsites@google.com
2013-07-15 23:26:30 +00:00
parent ef2db3729f
commit 703233b60b

View File

@@ -45,6 +45,9 @@ using namespace std;
// cld2_generated_distinctocta*.cc
// cld_generated_score_quad_octa_1024_256.cc
extern const int kLanguageToPLangSize;
extern const int kCloseSetSize;
extern const UTF8PropObj cld_generated_CjkUni_obj;
extern const CLD2TableSummary kCjkCompat_obj;
extern const CLD2TableSummary kCjkDeltaBi_obj;
@@ -1400,21 +1403,70 @@ void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes,
}
}
// AddLangPriorBoost: records one language-hint probability (langprob) in the
// prior-boost lists held by scoringcontext (langprior_boost.latn and
// langprior_boost.othr).
//
// NOTE(review): this span appears to interleave two revisions of the same
// function with the diff markers lost: one signature without `lang` that
// appends to both lists unconditionally, and one signature with `lang` that
// appends to each list only when IsLatnLanguage(lang) / IsOthrLanguage(lang)
// holds. Confirm against the repository which lines are live before editing.
void AddLangPriorBoost(uint32 langprob, ScoringContext* scoringcontext) {
// this is called 0..n times with language hints
// but we don't know the script, so put in both
// TODO: only put in Latn if lang can be in Latn, only in Othr similarly
// lang == Language FromPerScriptNumber(ulscript, perscript_number);
void AddLangPriorBoost(Language lang, uint32 langprob,
ScoringContext* scoringcontext) {
// This is called 0..n times with language hints
// but we don't know the script -- so boost either or both Latn, Othr.
// Unconditional append to the Latn list: store langprob at index n, then
// advance n through wrap() (presumably keeps n inside langprob[] -- confirm
// against the LangBoosts definition).
LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
int n = langprior_boost->n;
langprior_boost->langprob[n] = langprob;
langprior_boost->n = langprior_boost->wrap(n + 1);
// Guarded append to the Latn list: only when IsLatnLanguage(lang) is true.
if (IsLatnLanguage(lang)) {
LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
int n = langprior_boost->n;
langprior_boost->langprob[n] = langprob;
langprior_boost->n = langprior_boost->wrap(n + 1);
}
// Unconditional append to the Othr list, reusing the langprior_boost and n
// variables declared for the unconditional Latn append.
langprior_boost = &scoringcontext->langprior_boost.othr;
n = langprior_boost->n;
langprior_boost->langprob[n] = langprob;
langprior_boost->n = langprior_boost->wrap(n + 1);
// Guarded append to the Othr list: only when IsOthrLanguage(lang) is true.
if (IsOthrLanguage(lang)) {
LangBoosts* langprior_boost = &scoringcontext->langprior_boost.othr;
int n = langprior_boost->n;
langprior_boost->langprob[n] = langprob;
langprior_boost->n = langprior_boost->wrap(n + 1);
}
}
void AddOneWhack(Language whacker_lang, Language whackee_lang,
ScoringContext* scoringcontext) {
uint32 langprob = MakeLangProb(whackee_lang, 1);
// This logic avoids hr-Latn whacking sr-Cyrl, but still whacks sr-Latn
if (IsLatnLanguage(whacker_lang) && IsLatnLanguage(whackee_lang)) {
LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
int n = langprior_whack->n;
langprior_whack->langprob[n] = langprob;
langprior_whack->n = langprior_whack->wrap(n + 1);
}
if (IsOthrLanguage(whacker_lang) && IsOthrLanguage(whackee_lang)) {
LangBoosts* langprior_whack = &scoringcontext->langprior_whack.othr;
int n = langprior_whack->n;
langprior_whack->langprob[n] = langprob;
langprior_whack->n = langprior_whack->wrap(n + 1);
}
}
// Adds whack entries (via AddOneWhack) for the languages that are "close" to
// the hinted language lang.
//
//   lang            the hinted language
//   scoringcontext  passed through to AddOneWhack, which records the entries
//
// CHINESE and CHINESE_T are special-cased: zh-Hans and zh-Hant are not close
// pairs in general, but here each whacks the other, plus JAPANESE and KOREAN.
// Every other language whacks the remaining members of its own close set;
// a close set of 0 means there is nothing to whack.
void AddCloseLangWhack(Language lang, ScoringContext* scoringcontext) {
  const bool is_simplified = (lang == CLD2::CHINESE);
  const bool is_traditional = (lang == CLD2::CHINESE_T);
  if (is_simplified || is_traditional) {
    const Language other_chinese =
        is_simplified ? CLD2::CHINESE_T : CLD2::CHINESE;
    AddOneWhack(lang, other_chinese, scoringcontext);
    AddOneWhack(lang, CLD2::JAPANESE, scoringcontext);
    AddOneWhack(lang, CLD2::KOREAN, scoringcontext);
    return;
  }

  const int own_close_set = LanguageCloseSet(lang);
  if (own_close_set == 0) {
    return;
  }

  // TODO: add an explicit list of each set to avoid scanning every one of
  // the kLanguageToPLangSize language codes.
  for (int code = 0; code < kLanguageToPLangSize; ++code) {
    const Language candidate = static_cast<Language>(code);
    if (LanguageCloseSet(candidate) != own_close_set) {
      continue;   // Not in the same close set
    }
    if (candidate == lang) {
      continue;   // Never whack the hinted language itself
    }
    AddOneWhack(lang, candidate, scoringcontext);
  }
}
@@ -1477,15 +1529,51 @@ void ApplyHints(const char* buffer,
}
}
// Put into ScoringContext
// Put boosts into ScoringContext
for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
Language lang = GetCLDPriorLang(lang_priors.prior[i]);
int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
if (qprob > 0) {
uint32 langprob = MakeLangProb(lang, qprob);
AddLangPriorBoost(langprob, scoringcontext);
AddLangPriorBoost(lang, langprob, scoringcontext);
}
}
// Put whacks into scoring context
// We do not in general want zh-Hans and zh-Hant to be close pairs,
// but we do here. Use close_set_count[kCloseSetSize] to count zh, zh-Hant
int close_set_count[kCloseSetSize + 1];
memset(close_set_count, 0, sizeof(close_set_count));
for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
Language lang = GetCLDPriorLang(lang_priors.prior[i]);
++close_set_count[LanguageCloseSet(lang)];
if (lang == CLD2::CHINESE) {++close_set_count[kCloseSetSize];}
if (lang == CLD2::CHINESE_T) {++close_set_count[kCloseSetSize];}
}
// If a boost language is in a close set, force suppressing the others in
// that set, if exactly one of the set is present
for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
Language lang = GetCLDPriorLang(lang_priors.prior[i]);
int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
if (qprob > 0) {
int close_set = LanguageCloseSet(lang);
if ((close_set > 0) && (close_set_count[close_set] == 1)) {
AddCloseLangWhack(lang, scoringcontext);
}
if (((lang == CLD2::CHINESE) || (lang == CLD2::CHINESE_T)) &&
(close_set_count[kCloseSetSize] == 1)) {
AddCloseLangWhack(lang, scoringcontext);
}
}
}
}