// Copyright 2013 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Author: dsites@google.com (Dick Sites) // #include "cldutil_offline.h" #include "tote.h" #include //------------------------------------------------------------------------------ // Offline: used by mapreduce or table construction //------------------------------------------------------------------------------ namespace CLD2 { // BIGRAM, QUADGRAM, OCTAGRAM score one => tote // Input: 4-byte entry of 3 language numbers and one probability subscript, plus // an accumulator tote. (language 0 means unused entry) // Output: running sums in tote updated void ProcessProbV2Tote(uint32 probs, Tote* tote) { uint8 prob123 = (probs >> 0) & 0xff; const uint8* prob123_entry = LgProb2TblEntry(prob123); uint8 top1 = (probs >> 8) & 0xff; if (top1 > 0) {tote->Add(top1, LgProb3(prob123_entry, 0));} uint8 top2 = (probs >> 16) & 0xff; if (top2 > 0) {tote->Add(top2, LgProb3(prob123_entry, 1));} uint8 top3 = (probs >> 24) & 0xff; if (top3 > 0) {tote->Add(top3, LgProb3(prob123_entry, 2));} } // Advances src, decrements len uint32 GetNextLangprob(ULScriptRType rtype, const CLD2TableSummary* wrt_unigram_obj, const CLD2TableSummary* wrt_quadgram_obj, const char** isrc, int* isrclen) { // fprintf(stderr, "GetNextLangprob '%s' %d
\n", *isrc, *isrclen); if (*isrclen <= 0) {return 0;} // Find one quadgram const char* src = *isrc; const char* srclimit = src + *isrclen; if (*src == ' ') {++src;} const char* src_end = src; src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; const char* src_mid = src_end; src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; int len = src_end - src; // Hash the quadgram uint32 quadhash = QuadHashV2(src, len); uint32 probs = QuadHashV3Lookup4(wrt_quadgram_obj, quadhash); int indirect_subscr = probs & ~wrt_quadgram_obj->kCLDTableKeyMask; uint32 langprob; if (indirect_subscr < static_cast(wrt_quadgram_obj->kCLDTableSizeOne)) { // Up to three languages at indirect langprob = wrt_quadgram_obj->kCLDTableInd[indirect_subscr]; } else { // Up to six languages at start + 2 * (indirect - start) indirect_subscr += (indirect_subscr - wrt_quadgram_obj->kCLDTableSizeOne); langprob = wrt_quadgram_obj->kCLDTableInd[indirect_subscr]; } // Advance: all the way past word if at end-of-word, else 2 chars if (src_end[0] == ' ') { src = src_end; } else { src = src_mid; } if (src < srclimit) { src += kAdvanceOneCharSpaceVowel[(uint8)src[0]]; } else { // Advancing by 4/8/16 can overshoot, but we are about to exit anyway src = srclimit; } int quadadvance = src - *isrc; *isrc = src; *isrclen -= quadadvance; return langprob; } // Find top two langs and scores for one word; underpins delta tables void DoWordScore(const char* isrc, int srclen, ULScript ulscript, const CLD2TableSummary* wrt_unigram_obj, const CLD2TableSummary* wrt_quadgram_obj, Language* lang1, int* score1, Language* lang2, int* score2) { ULScriptRType rtype = ULScriptRecognitionType(ulscript); Tote word_tote; const char* src = isrc; int len = srclen; uint32 langprob; // Advances src, decrements len langprob = GetNextLangprob(rtype, wrt_unigram_obj, wrt_quadgram_obj, &src, &len); ProcessProbV2Tote(langprob, &word_tote); // Advances src, decrements len langprob = GetNextLangprob(rtype, wrt_unigram_obj, wrt_quadgram_obj, &src, &len); ProcessProbV2Tote(langprob, &word_tote); int key3[3]; word_tote.CurrentTopThreeKeys(key3); *lang1 = FromPerScriptNumber(ulscript, key3[0]); *lang2 = FromPerScriptNumber(ulscript, key3[1]); *score1 = word_tote.GetScore(key3[0]); *score2 = word_tote.GetScore(key3[1]); } // Routines to store 3 or 5 log probabilities in a single byte. // Resolution/range = 2**1 to 2**12 //------------------------------------------------------------------------------ // For constructing tables // Given a vector of 3 probabilities 1..12, find subscript of best table match. // Minimizes RMS error // Brute-force version uint8 FindBestProb3Match(const uint8* prob3) { int minsubscr = 0; int minrmserr = 9999; for (int i = 0; i < kLgProbV2TblSize; ++i) { int rmserr = 0; for (int j = 0; j < 3; ++j) { // If target prob is zero, item is unused, so no errterm if (prob3[j] > 0) { int errterm = prob3[j] - LgProb3(LgProb2TblEntry(i), j); rmserr += (errterm * errterm); } } if (minrmserr > rmserr) { minrmserr = rmserr; minsubscr = i; } } return static_cast(minsubscr); }; // Not sure who calls this... // Return the probability for given language, or 0 int GetProb(Language lang, uint32 probs) { int prob123 = (probs >> 0) & 0xff; const uint8* prob123_entry = LgProb2TblEntry(prob123); int ilang = PerScriptNumber(ULScript_Latin, lang); int top1 = (probs >> 8) & 0xff; if (ilang == top1) {return LgProb3(prob123_entry, 0);} int top2 = (probs >> 16) & 0xff; if (ilang == top2) {return LgProb3(prob123_entry, 1);} int top3 = (probs >> 16) & 0xff; if (ilang == top3) {return LgProb3(prob123_entry, 2);} return 0; } // Converts a unigram prob/lang byte into an approximate prob/lang triple // Just keeps the largest value. // Now unused. uint32 ApproxProb3(int propval) { return 0; } // Take three packed languages and three probabilities 1..12 and put into uint32 // For offline construction of tables uint32 ProbPackV2(uint8* plang3, uint8* prob3) { uint32 retval; // If < 3 entries, pack as top, 0, second, else pack as top, second, third // This allows FindBestProb3Match to always find a perfect match for < 3 if (plang3[2] == 0) { // Swap [2] and [3] uint8 temp = plang3[2]; plang3[2] = plang3[1]; plang3[1] = temp; temp = prob3[2]; prob3[2] = prob3[1]; prob3[1] = temp; } retval = (plang3[2] << 24) | (plang3[1] << 16) | (plang3[0] << 8) | (FindBestProb3Match(prob3)); return retval; } // Take uint32 and unpack into three packed languages and three probabilities // For runtime use of tables void ProbUnpackV2(uint32 prob, uint8* plang3, uint8* prob3) { plang3[0] = (prob >> 8) & 0xff; plang3[1] = (prob >> 16) & 0xff; plang3[2] = (prob >> 24) & 0xff; int prob123 = (prob >> 0) & 0xff; const uint8* prob123_entry = LgProb2TblEntry(prob123); prob3[0] = LgProb3(prob123_entry, 0); prob3[1] = LgProb3(prob123_entry, 1); prob3[2] = LgProb3(prob123_entry, 2); } } // End namespace CLD2