Infrastructure for lang= hint to suppress other languages in close sets
git-svn-id: https://cld2.googlecode.com/svn/trunk@13 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
@@ -319,9 +319,11 @@ void CLD2_Debug(const char* text,
|
|||||||
// Score boosts for langprior and distinct tokens
|
// Score boosts for langprior and distinct tokens
|
||||||
// Get boosts for current script
|
// Get boosts for current script
|
||||||
const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
|
const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
|
||||||
|
const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
|
||||||
const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
|
const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
|
||||||
if (scoringcontext->ulscript != ULScript_Latin) {
|
if (scoringcontext->ulscript != ULScript_Latin) {
|
||||||
langprior_boost = &scoringcontext->langprior_boost.othr;
|
langprior_boost = &scoringcontext->langprior_boost.othr;
|
||||||
|
langprior_whack = &scoringcontext->langprior_whack.othr;
|
||||||
distinct_boost = &scoringcontext->distinct_boost.othr;
|
distinct_boost = &scoringcontext->distinct_boost.othr;
|
||||||
}
|
}
|
||||||
fprintf(df, "LangPrior_boost: ");
|
fprintf(df, "LangPrior_boost: ");
|
||||||
@@ -332,6 +334,14 @@ void CLD2_Debug(const char* text,
|
|||||||
GetLangProbTxt(scoringcontext, langprob).c_str());
|
GetLangProbTxt(scoringcontext, langprob).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
fprintf(df, "LangPrior_whack: ");
|
||||||
|
for (int k = 0; k < kMaxBoosts; ++k) {
|
||||||
|
uint32 langprob = langprior_whack->langprob[k];
|
||||||
|
if (langprob > 0) {
|
||||||
|
fprintf(df, "%s ",
|
||||||
|
GetLangProbTxt(scoringcontext, langprob).c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
fprintf(df, "Distinct_boost: ");
|
fprintf(df, "Distinct_boost: ");
|
||||||
for (int k = 0; k < kMaxBoosts; ++k) {
|
for (int k = 0; k < kMaxBoosts; ++k) {
|
||||||
uint32 langprob = distinct_boost->langprob[k];
|
uint32 langprob = distinct_boost->langprob[k];
|
||||||
|
@@ -34,6 +34,11 @@ void AddLangProb(uint32 langprob, Tote* chunk_tote) {
|
|||||||
ProcessProbV2Tote(langprob, chunk_tote);
|
ProcessProbV2Tote(langprob, chunk_tote);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Zeroes a single entry in chunk_tote: the byte held in bits 8..15 of
// langprob is used as the key handed to SetScore, with a score of 0.
// NOTE(review): that byte is presumably the first packed per-script language
// of a "whack" langprob -- confirm against the langprob packing.
void ZeroPSLang(uint32 langprob, Tote* chunk_tote) {
  const uint8 whacked_key = (langprob >> 8) & 0xff;
  chunk_tote->SetScore(whacked_key, 0);
}
|
||||||
|
|
||||||
bool SameCloseSet(uint16 lang1, uint16 lang2) {
|
bool SameCloseSet(uint16 lang1, uint16 lang2) {
|
||||||
int lang1_close_set = LanguageCloseSet(static_cast<Language>(lang1));
|
int lang1_close_set = LanguageCloseSet(static_cast<Language>(lang1));
|
||||||
if (lang1_close_set == 0) {return false;}
|
if (lang1_close_set == 0) {return false;}
|
||||||
@@ -118,9 +123,11 @@ void AddDistinctBoost2(uint32 langprob, ScoringContext* scoringcontext) {
|
|||||||
void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) {
|
void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) {
|
||||||
// Get boosts for current script
|
// Get boosts for current script
|
||||||
const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
|
const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
|
||||||
|
const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
|
||||||
const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
|
const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
|
||||||
if (scoringcontext->ulscript != ULScript_Latin) {
|
if (scoringcontext->ulscript != ULScript_Latin) {
|
||||||
langprior_boost = &scoringcontext->langprior_boost.othr;
|
langprior_boost = &scoringcontext->langprior_boost.othr;
|
||||||
|
langprior_whack = &scoringcontext->langprior_whack.othr;
|
||||||
distinct_boost = &scoringcontext->distinct_boost.othr;
|
distinct_boost = &scoringcontext->distinct_boost.othr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -132,6 +139,14 @@ void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) {
|
|||||||
uint32 langprob = distinct_boost->langprob[k];
|
uint32 langprob = distinct_boost->langprob[k];
|
||||||
if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
|
if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
|
||||||
}
|
}
|
||||||
|
// boost has a packed set of per-script langs and probabilities
|
||||||
|
// whack has a packed set of per-script langs to be suppressed (zeroed)
|
||||||
|
// When a language in a close set is given as an explicit hint, others in
|
||||||
|
// that set will be whacked here.
|
||||||
|
for (int k = 0; k < kMaxBoosts; ++k) {
|
||||||
|
uint32 langprob = langprior_whack->langprob[k];
|
||||||
|
if (langprob > 0) {ZeroPSLang(langprob, chunk_tote);}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -136,7 +136,12 @@ typedef struct {
|
|||||||
bool flags_cld2_verbose;
|
bool flags_cld2_verbose;
|
||||||
ULScript ulscript; // langprobs below are with respect to this script
|
ULScript ulscript; // langprobs below are with respect to this script
|
||||||
Language prior_chunk_lang; // Mostly for debug output
|
Language prior_chunk_lang; // Mostly for debug output
|
||||||
|
// boost has a packed set of per-script langs and probabilities
|
||||||
|
// whack has a per-script lang to be suppressed from ever scoring (zeroed)
|
||||||
|
// When a language in a close set is given as an explicit hint, others in
|
||||||
|
// that set will be whacked.
|
||||||
PerScriptLangBoosts langprior_boost; // From http content-lang or meta lang=
|
PerScriptLangBoosts langprior_boost; // From http content-lang or meta lang=
|
||||||
|
PerScriptLangBoosts langprior_whack; // From http content-lang or meta lang=
|
||||||
PerScriptLangBoosts distinct_boost; // From distinctive letter groups
|
PerScriptLangBoosts distinct_boost; // From distinctive letter groups
|
||||||
int oldest_distinct_boost; // Subscript in hitbuffer of oldest
|
int oldest_distinct_boost; // Subscript in hitbuffer of oldest
|
||||||
// distinct score to use
|
// distinct score to use
|
||||||
@@ -146,6 +151,7 @@ typedef struct {
|
|||||||
// Inits boosts
|
// Inits boosts
|
||||||
void init() {
|
void init() {
|
||||||
memset(&langprior_boost, 0, sizeof(langprior_boost));
|
memset(&langprior_boost, 0, sizeof(langprior_boost));
|
||||||
|
memset(&langprior_whack, 0, sizeof(langprior_whack));
|
||||||
memset(&distinct_boost, 0, sizeof(distinct_boost));
|
memset(&distinct_boost, 0, sizeof(distinct_boost));
|
||||||
};
|
};
|
||||||
} ScoringContext;
|
} ScoringContext;
|
||||||
|
Reference in New Issue
Block a user