diff --git a/internal/debug.cc b/internal/debug.cc
index b36e5a8..1a56981 100644
--- a/internal/debug.cc
+++ b/internal/debug.cc
@@ -319,9 +319,11 @@ void CLD2_Debug(const char* text,
     // Score boosts for langprior and distinct tokens
     // Get boosts for current script
     const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
+    const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
     const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
     if (scoringcontext->ulscript != ULScript_Latin) {
       langprior_boost = &scoringcontext->langprior_boost.othr;
+      langprior_whack = &scoringcontext->langprior_whack.othr;
       distinct_boost = &scoringcontext->distinct_boost.othr;
     }
     fprintf(df, "LangPrior_boost: ");
@@ -332,6 +334,14 @@ void CLD2_Debug(const char* text,
                 GetLangProbTxt(scoringcontext, langprob).c_str());
       }
     }
+    fprintf(df, "LangPrior_whack: ");
+    for (int k = 0; k < kMaxBoosts; ++k) {
+      uint32 langprob = langprior_whack->langprob[k];
+      if (langprob > 0) {
+        fprintf(df, "%s&nbsp;&nbsp; ",
+                GetLangProbTxt(scoringcontext, langprob).c_str());
+      }
+    }
     fprintf(df, "Distinct_boost: ");
     for (int k = 0; k < kMaxBoosts; ++k) {
       uint32 langprob = distinct_boost->langprob[k];
diff --git a/internal/scoreonescriptspan.cc b/internal/scoreonescriptspan.cc
index 0d30059..a581885 100644
--- a/internal/scoreonescriptspan.cc
+++ b/internal/scoreonescriptspan.cc
@@ -34,6 +34,11 @@ void AddLangProb(uint32 langprob, Tote* chunk_tote) {
   ProcessProbV2Tote(langprob, chunk_tote);
 }
 
+void ZeroPSLang(uint32 langprob, Tote* chunk_tote) {
+  uint8 top1 = (langprob >> 8) & 0xff;
+  chunk_tote->SetScore(top1, 0);
+}
+
 bool SameCloseSet(uint16 lang1, uint16 lang2) {
   int lang1_close_set = LanguageCloseSet(static_cast<Language>(lang1));
   if (lang1_close_set == 0) {return false;}
@@ -118,9 +123,11 @@ void AddDistinctBoost2(uint32 langprob, ScoringContext* scoringcontext) {
 void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) {
   // Get boosts for current script
   const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
+  const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
   const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
   if (scoringcontext->ulscript != ULScript_Latin) {
     langprior_boost = &scoringcontext->langprior_boost.othr;
+    langprior_whack = &scoringcontext->langprior_whack.othr;
     distinct_boost = &scoringcontext->distinct_boost.othr;
   }
 
@@ -132,6 +139,14 @@ void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) {
     uint32 langprob = distinct_boost->langprob[k];
     if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
   }
+  // boost has a packed set of per-script langs and probabilities
+  // whack has a packed set of per-script langs to be suppressed (zeroed)
+  // When a language in a close set is given as an explicit hint, others in
+  // that set will be whacked here.
+  for (int k = 0; k < kMaxBoosts; ++k) {
+    uint32 langprob = langprior_whack->langprob[k];
+    if (langprob > 0) {ZeroPSLang(langprob, chunk_tote);}
+  }
 }
 
 
diff --git a/internal/scoreonescriptspan.h b/internal/scoreonescriptspan.h
index a6c2768..b6f9b52 100644
--- a/internal/scoreonescriptspan.h
+++ b/internal/scoreonescriptspan.h
@@ -136,7 +136,12 @@ typedef struct {
   bool flags_cld2_verbose;
   ULScript ulscript;  // langprobs below are with respect to this script
   Language prior_chunk_lang;  // Mostly for debug output
+  // boost has a packed set of per-script langs and probabilities
+  // whack has a per-script lang to be suppressed from ever scoring (zeroed)
+  // When a language in a close set is given as an explicit hint, others in
+  // that set will be whacked.
   PerScriptLangBoosts langprior_boost;  // From http content-lang or meta lang=
+  PerScriptLangBoosts langprior_whack;  // From http content-lang or meta lang=
   PerScriptLangBoosts distinct_boost;  // From distinctive letter groups
   int oldest_distinct_boost;  // Subscript in hitbuffer of oldest
                               // distinct score to use
@@ -146,6 +151,7 @@ typedef struct {
   // Inits boosts
   void init() {
     memset(&langprior_boost, 0, sizeof(langprior_boost));
+    memset(&langprior_whack, 0, sizeof(langprior_whack));
     memset(&distinct_boost, 0, sizeof(distinct_boost));
   };
 } ScoringContext;