git-svn-id: https://cld2.googlecode.com/svn/trunk@13 b252ecd4-b096-bf77-eb8e-91563289f87e
479 lines
15 KiB
C++
479 lines
15 KiB
C++
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// Author: dsites@google.com (Dick Sites)
//
|
|
#include "debug.h"
|
|
#include <stdio.h>
|
|
#include <string>
|
|
|
|
#include "cldutil.h"
|
|
#include "getonescriptspan.h"
|
|
#include "lang_script.h"
|
|
|
|
using namespace std;
|
|
|
|
namespace CLD2 {
|
|
|
|
// Debug output string of one unigram
|
|
string GetUniAt(const char* text) {
|
|
string retval;
|
|
retval.clear();
|
|
int uni_len = UniLen(text);
|
|
retval.append(text, uni_len);
|
|
return retval;
|
|
}
|
|
|
|
// Debug output string of one bigram
|
|
string GetBiAt(const char* text) {
|
|
string retval;
|
|
retval.clear();
|
|
int bi_len = BiLen(text);
|
|
retval.append(text, bi_len);
|
|
return retval;
|
|
}
|
|
|
|
// Debug output string of one quadgram, including underscores
|
|
string GetQuadAt(const char* text) {
|
|
string retval;
|
|
retval.clear();
|
|
if (text[-1] == ' ') {retval.append("_");}
|
|
int quad_len = QuadLen(text);
|
|
retval.append(text, quad_len);
|
|
if (text[quad_len] == ' ') {retval.append("_");}
|
|
return retval;
|
|
}
|
|
|
|
// Debug output string of one octagram, including underscores
|
|
string GetOctaAt(const char* text) {
|
|
string retval;
|
|
retval.clear();
|
|
if (text[-1] == ' ') {retval.append("_");}
|
|
int octa_len = OctaLen(text);
|
|
retval.append(text, octa_len);
|
|
if (text[octa_len] == ' ') {retval.append("_");}
|
|
return retval;
|
|
}
|
|
|
|
// Debug output string of two octagrams, including underscores
|
|
string GetOcta2At(const char* text) {
|
|
string retval;
|
|
retval.clear();
|
|
if (text[-1] == ' ') {retval.append("_");}
|
|
int octa_len = OctaLen(text);
|
|
retval.append(text, octa_len);
|
|
if (text[octa_len] == ' ') {retval.append("_");}
|
|
text += (octa_len + 1);
|
|
int octa2_len = OctaLen(text);
|
|
retval.append(text, octa2_len);
|
|
if (text[octa2_len] == ' ') {retval.append("_");}
|
|
return retval;
|
|
}
|
|
|
|
// Debug output string of one formatted pslang,qprob pair
|
|
string FmtLP(ULScript ulscript, uint8 pslang, uint8 qprob) {
|
|
string retval;
|
|
retval.clear();
|
|
Language lang = FromPerScriptNumber(ulscript, pslang);
|
|
char temp[16];
|
|
sprintf(temp, "%s.%d", LanguageCode(lang), qprob);
|
|
retval.append(temp);
|
|
return retval;
|
|
}
|
|
|
|
// Debug output string of one formatted langprob
|
|
// Returns "en.24 fr.10 es.4"
|
|
string GetLangProbTxt(const ScoringContext* scoringcontext, uint32 langprob) {
|
|
/*const uint16* pslangtolang = scoringcontext->pslangtolang;*/
|
|
string retval;
|
|
retval.clear();
|
|
uint8 prob123 = (langprob >> 0) & 0xff;
|
|
const uint8* prob123_entry = LgProb2TblEntry(prob123);
|
|
uint8 top1 = (langprob >> 8) & 0xff;
|
|
if (top1 > 0) {
|
|
retval.append(FmtLP(scoringcontext->ulscript,
|
|
top1, LgProb3(prob123_entry, 0)));
|
|
}
|
|
uint8 top2 = (langprob >> 16) & 0xff;
|
|
if (top2 > 0) {
|
|
if (!retval.empty()) {retval.append("~");}
|
|
retval.append(FmtLP(scoringcontext->ulscript,
|
|
top2, LgProb3(prob123_entry, 1)));
|
|
}
|
|
uint8 top3 = (langprob >> 24) & 0xff;
|
|
if (top3 > 0) {
|
|
if (!retval.empty()) {retval.append("~");}
|
|
retval.append(FmtLP(scoringcontext->ulscript,
|
|
top3, LgProb3(prob123_entry, 2)));
|
|
}
|
|
return retval;
|
|
}
|
|
|
|
|
|
// Debug output string of one or two formatted quadgram langprobs
|
|
string GetScoreTxt(const ScoringContext* scoringcontext,
|
|
const CLD2TableSummary* base_obj, int indirect) {
|
|
string retval;
|
|
retval.clear();
|
|
if (indirect < base_obj->kCLDTableSizeOne) {
|
|
// Up to three languages at indirect
|
|
uint32 langprob = base_obj->kCLDTableInd[indirect];
|
|
retval.append(GetLangProbTxt(scoringcontext, langprob));
|
|
} else {
|
|
// Up to six languages at start + 2 * (indirect - start)
|
|
indirect += (indirect - base_obj->kCLDTableSizeOne);
|
|
uint32 langprob = base_obj->kCLDTableInd[indirect];
|
|
uint32 langprob2 = base_obj->kCLDTableInd[indirect + 1];
|
|
retval.append(GetLangProbTxt(scoringcontext, langprob));
|
|
if (!retval.empty()) {retval.append("~");}
|
|
retval.append(GetLangProbTxt(scoringcontext, langprob2));
|
|
}
|
|
return retval;
|
|
}
|
|
|
|
|
|
// 16 pastel background colors (0xRRGGBB). GetBackColor() picks one using the
// low 4 bits of the language number.
static const int kLangBackground[16] = {
  0xffd8d8, 0xf8ffd8, 0xd8ffe7, 0xd8f3ff,
  0xefd8ff, 0xffd8eb, 0xfff7d8, 0xe3ffd8,
  0xd8ffff, 0xe3d8ff, 0xffd8f7, 0xffebd8,
  0xefffd8, 0xd8fff3, 0xd8e7ff, 0xf8d8ff,
};
|
|
|
|
// 16 dark text colors (0xRRGGBB, each channel in 00..7f). GetTextColor()
// picks one using bits 4..7 of the language number, so the first 16
// languages all get entry 0 (black text).
static const int kLangColor[16] = {
  0x000000, 0x7f2f00, 0x7f5f00, 0x6f7f00,
  0x3f7f00, 0x0f7f00, 0x007f1f, 0x007f4f,
  0x007f7f, 0x004f7f, 0x001f7f, 0x0f007f,
  0x3f007f, 0x6f007f, 0x7f005f, 0x7f002f,
};
|
|
|
|
// Fixed 0xRRGGBB colors for the special cases handled by GetTextColor() and
// GetBackColor().
static const int kUnscoredText = 0xb0b0b0;        // medium-light gray
static const int kUnscoredBackground = 0xffffff;  // white
static const int kIgnoremeText = 0x8090a0;        // medium-light green-gray
static const int kIgnoremeBackground = 0xffeecc;  // light orange
static const int kEnglishBackground = 0xfffff4;   // very light yellow
|
|
|
|
// Returns the 0xRRGGBB background color used to display text in lang.
//   ENGLISH               -> kEnglishBackground
//   TG_UNKNOWN_LANGUAGE   -> kIgnoremeBackground
//   UNKNOWN_LANGUAGE, < 0 -> kUnscoredBackground
//   anything else         -> kLangBackground[low 4 bits of lang]
// When lighten is true the color is moved halfway toward white.
static int GetBackColor(Language lang, bool lighten) {
  int color = kUnscoredBackground;
  if (lang == ENGLISH) {
    color = kEnglishBackground;
  } else if (lang == TG_UNKNOWN_LANGUAGE) {
    color = kIgnoremeBackground;
  } else if ((lang != UNKNOWN_LANGUAGE) && (lang >= 0)) {
    color = kLangBackground[lang & 0x0f];
  }

  // Halve every channel and force its top bit on: each 8-bit channel c
  // becomes 0x80 | (c >> 1), i.e. half as far away from 0xff.
  return lighten ? ((color >> 1) | 0x808080) : color;
}
|
|
|
|
// Returns the 0xRRGGBB text color used to display text in lang.
//   TG_UNKNOWN_LANGUAGE   -> kIgnoremeText
//   UNKNOWN_LANGUAGE, < 0 -> kUnscoredText
//   anything else         -> kLangColor[bits 4..7 of lang]
// When lighten is true the color is moved halfway toward white.
static int GetTextColor(Language lang, bool lighten) {
  int color = kUnscoredText;
  if (lang == TG_UNKNOWN_LANGUAGE) {
    color = kIgnoremeText;
  } else if ((lang != UNKNOWN_LANGUAGE) && (lang >= 0)) {
    color = kLangColor[(lang >> 4) & 0x0f];
  }

  // Halve every channel and force its top bit on: each 8-bit channel c
  // becomes 0x80 | (c >> 1), i.e. half as far away from 0xff.
  return lighten ? ((color >> 1) | 0x808080) : color;
}
|
|
|
|
// Returns a copy of txt in which every newline ('\n') and carriage return
// ('\r') is replaced by a single space, so the result prints on one line.
// All other bytes are copied unchanged.
std::string GetPlainEscapedText(const std::string& txt) {
  std::string retval;
  retval.reserve(txt.size());  // Output has exactly as many bytes as input
  // Index with the string's own unsigned size type; an int index would be
  // compared against an unsigned size().
  for (std::string::size_type i = 0; i < txt.size(); ++i) {
    const char c = txt[i];
    retval.push_back(((c == '\n') || (c == '\r')) ? ' ' : c);
  }
  return retval;
}
|
|
|
|
// Returns a copy of txt that is safe to embed in HTML text or in a quoted
// attribute value:
//   <  ->  &lt;      >  ->  &gt;      &  ->  &amp;
//   '  ->  &#39;     "  ->  &quot;
// Newlines and carriage returns become a single space; every other byte is
// copied unchanged.
std::string GetHtmlEscapedText(const std::string& txt) {
  std::string retval;
  retval.reserve(txt.size());  // Lower bound; escapes make the output longer
  for (std::string::size_type i = 0; i < txt.size(); ++i) {
    const char c = txt[i];
    switch (c) {
      case '<':
        retval.append("&lt;");
        break;
      case '>':
        retval.append("&gt;");
        break;
      case '&':
        retval.append("&amp;");
        break;
      case '\'':
        // Numeric reference: understood by every HTML version, unlike the
        // named &apos; entity.
        retval.append("&#39;");
        break;
      case '"':
        retval.append("&quot;");
        break;
      case '\n':
      case '\r':
        retval.append(" ");
        break;
      default:
        retval.append(1, c);
        break;
    }
  }
  return retval;
}
|
|
|
|
string GetColorHtmlEscapedText(Language lang, const string& txt) {
|
|
char temp[64];
|
|
sprintf(temp, " <span style=\"background:#%06X;color:#%06X;\">\n",
|
|
GetBackColor(lang, false),
|
|
GetTextColor(lang, false));
|
|
string esc_txt = string(temp);
|
|
esc_txt.append(GetHtmlEscapedText(txt));
|
|
esc_txt.append("</span>");
|
|
return esc_txt;
|
|
}
|
|
|
|
string GetLangColorHtmlEscapedText(Language lang, const string& txt) {
|
|
char temp[64];
|
|
sprintf(temp, "[%s]", LanguageCode(lang));
|
|
string esc_txt = string(temp);
|
|
esc_txt.append(GetColorHtmlEscapedText(lang, txt));
|
|
return esc_txt;
|
|
}
|
|
|
|
|
|
// For showing one chunk
|
|
// Print debug output for one scored chunk
|
|
// Optionally print out per-chunk scoring information
|
|
// In degenerate cases, hitbuffer and cspan can be NULL
|
|
void CLD2_Debug(const char* text,
|
|
int lo_offset,
|
|
int hi_offset,
|
|
bool more_to_come, bool score_cjk,
|
|
const ScoringHitBuffer* hitbuffer,
|
|
const ScoringContext* scoringcontext,
|
|
const ChunkSpan* cspan,
|
|
const ChunkSummary* chunksummary) {
|
|
FILE* df = scoringcontext->debug_file;
|
|
if (df == NULL) {return;}
|
|
|
|
if (scoringcontext->flags_cld2_verbose &&
|
|
(hitbuffer != NULL) &&
|
|
(cspan != NULL) && (hitbuffer->next_linear > 0)) {
|
|
int base_limit = cspan->chunk_base + cspan->base_len;
|
|
for (int i = cspan->chunk_base; i < base_limit; ++i) {
|
|
int ngram_start = hitbuffer->linear[i].offset;
|
|
uint32 langprob = hitbuffer->linear[i].langprob;
|
|
string ngram_text;
|
|
switch (hitbuffer->linear[i].type) {
|
|
case UNIHIT:
|
|
ngram_text = GetUniAt(&text[ngram_start]);
|
|
break;
|
|
case QUADHIT:
|
|
ngram_text = GetQuadAt(&text[ngram_start]);
|
|
break;
|
|
case DELTAHIT:
|
|
case DISTINCTHIT:
|
|
if (score_cjk) {
|
|
ngram_text = GetBiAt(&text[ngram_start]);
|
|
} else {
|
|
// TODO: figure out how to display optional two words
|
|
ngram_text = GetOctaAt(&text[ngram_start]);
|
|
}
|
|
break;
|
|
}
|
|
string score_text = GetLangProbTxt(scoringcontext, langprob);
|
|
fprintf(df, "%c:%s=%s ",
|
|
"UQLD"[hitbuffer->linear[i].type],
|
|
ngram_text.c_str(),
|
|
score_text.c_str());
|
|
}
|
|
fprintf(df, "<br>\n");
|
|
|
|
// Score boosts for langprior and distinct tokens
|
|
// Get boosts for current script
|
|
const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
|
|
const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
|
|
const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
|
|
if (scoringcontext->ulscript != ULScript_Latin) {
|
|
langprior_boost = &scoringcontext->langprior_boost.othr;
|
|
langprior_whack = &scoringcontext->langprior_whack.othr;
|
|
distinct_boost = &scoringcontext->distinct_boost.othr;
|
|
}
|
|
fprintf(df, "LangPrior_boost: ");
|
|
for (int k = 0; k < kMaxBoosts; ++k) {
|
|
uint32 langprob = langprior_boost->langprob[k];
|
|
if (langprob > 0) {
|
|
fprintf(df, "%s ",
|
|
GetLangProbTxt(scoringcontext, langprob).c_str());
|
|
}
|
|
}
|
|
fprintf(df, "LangPrior_whack: ");
|
|
for (int k = 0; k < kMaxBoosts; ++k) {
|
|
uint32 langprob = langprior_whack->langprob[k];
|
|
if (langprob > 0) {
|
|
fprintf(df, "%s ",
|
|
GetLangProbTxt(scoringcontext, langprob).c_str());
|
|
}
|
|
}
|
|
fprintf(df, "Distinct_boost: ");
|
|
for (int k = 0; k < kMaxBoosts; ++k) {
|
|
uint32 langprob = distinct_boost->langprob[k];
|
|
if (langprob > 0) {
|
|
fprintf(df, "%s ",
|
|
GetLangProbTxt(scoringcontext, langprob).c_str());
|
|
}
|
|
}
|
|
fprintf(df, "<br>\n");
|
|
|
|
// Print chunksummary
|
|
fprintf(df, "%s.%d %s.%d %dB %d# %s %dRd %dRs<br>\n",
|
|
LanguageCode(static_cast<Language>(chunksummary->lang1)),
|
|
chunksummary->score1,
|
|
LanguageCode(static_cast<Language>(chunksummary->lang2)),
|
|
chunksummary->score2,
|
|
chunksummary->bytes,
|
|
chunksummary->grams,
|
|
ULScriptCode(static_cast<ULScript>(chunksummary->ulscript)),
|
|
chunksummary->reliability_delta,
|
|
chunksummary->reliability_score);
|
|
} // End flags_cld2_verbose linear
|
|
|
|
|
|
// Print annotated colored text of this chunk
|
|
bool is_reliable = true;
|
|
bool match_prior = false;
|
|
int reliable = CLD2::minint(chunksummary->reliability_delta,
|
|
chunksummary->reliability_score);
|
|
is_reliable = (reliable >= 75);
|
|
match_prior = (chunksummary->lang1 == scoringcontext->prior_chunk_lang);
|
|
if (!is_reliable) {match_prior = false;}
|
|
|
|
if (match_prior) {
|
|
fprintf(df, "[]");
|
|
} else if (is_reliable) {
|
|
fprintf(df, "[%s]",
|
|
LanguageCode(static_cast<Language>(chunksummary->lang1)));
|
|
} else {
|
|
fprintf(df, "[%s*.%d/%s.%d]",
|
|
LanguageCode(static_cast<Language>(chunksummary->lang1)),
|
|
chunksummary->score1,
|
|
LanguageCode(static_cast<Language>(chunksummary->lang2)),
|
|
chunksummary->score2);
|
|
}
|
|
|
|
int chunktext_len = hi_offset - lo_offset;
|
|
if (chunktext_len < 0) {
|
|
chunktext_len = 0;
|
|
fprintf(df, " LEN_ERR hi %d lo %d<br>\n", hi_offset, lo_offset);
|
|
}
|
|
string chunk_text(&text[lo_offset], chunktext_len);
|
|
|
|
Language lang = static_cast<Language>(chunksummary->lang1);
|
|
fprintf(df, " <span style=\"background:#%06X;color:#%06X;\">\n",
|
|
GetBackColor(lang, false),
|
|
GetTextColor(lang, false));
|
|
fprintf(df, "%s", chunk_text.c_str());
|
|
if (scoringcontext->flags_cld2_cr) {
|
|
fprintf(df, "</span><br>\n");
|
|
} else {
|
|
fprintf(df, "</span> \n");
|
|
}
|
|
}
|
|
|
|
// For showing all chunks
|
|
void CLD2_Debug2(const char* text,
|
|
bool more_to_come, bool score_cjk,
|
|
const ScoringHitBuffer* hitbuffer,
|
|
const ScoringContext* scoringcontext,
|
|
const SummaryBuffer* summarybuffer) {
|
|
FILE* df = scoringcontext->debug_file;
|
|
if (df == NULL) {return;}
|
|
uint16 prior_chunk_lang = static_cast<uint16>(UNKNOWN_LANGUAGE);
|
|
|
|
for (int i = 0; i < summarybuffer->n; ++i) {
|
|
fprintf(df, "Debug2[%d] ", i);
|
|
const ChunkSummary* chunksummary = &summarybuffer->chunksummary[i];
|
|
// Print annotated colored text of this chunk
|
|
bool is_reliable = true;
|
|
bool match_prior = false;
|
|
int reliable = CLD2::minint(chunksummary->reliability_delta,
|
|
chunksummary->reliability_score);
|
|
is_reliable = (reliable >= 75);
|
|
match_prior = (chunksummary->lang1 == prior_chunk_lang);
|
|
if (!is_reliable) {match_prior = false;}
|
|
|
|
if (match_prior) {
|
|
fprintf(df, "[]");
|
|
} else if (is_reliable) {
|
|
fprintf(df, "[%s]",
|
|
LanguageCode(static_cast<Language>(chunksummary->lang1)));
|
|
} else {
|
|
fprintf(df, "[%s*.%d/%s.%d]",
|
|
LanguageCode(static_cast<Language>(chunksummary->lang1)),
|
|
chunksummary->score1,
|
|
LanguageCode(static_cast<Language>(chunksummary->lang2)),
|
|
chunksummary->score2);
|
|
}
|
|
|
|
int lo_offset = chunksummary->offset;
|
|
int chunktext_len = chunksummary->bytes;
|
|
string chunk_text(&text[lo_offset], chunktext_len);
|
|
|
|
Language lang = static_cast<Language>(chunksummary->lang1);
|
|
fprintf(df, " <span style=\"background:#%06X;color:#%06X;\">\n",
|
|
GetBackColor(lang, false),
|
|
GetTextColor(lang, false));
|
|
fprintf(df, "%s", chunk_text.c_str());
|
|
if (scoringcontext->flags_cld2_cr) {
|
|
fprintf(df, "</span><br>\n");
|
|
} else {
|
|
fprintf(df, "</span> \n");
|
|
}
|
|
prior_chunk_lang = chunksummary->lang1;
|
|
}
|
|
}
|
|
|
|
void DumpResultChunkVector(FILE* f, const char* src,
|
|
ResultChunkVector* resultchunkvector) {
|
|
fprintf(f, "DumpResultChunkVector[%ld]<br>\n", resultchunkvector->size());
|
|
for (int i = 0; i < resultchunkvector->size(); ++i) {
|
|
ResultChunk* rc = &(*resultchunkvector)[i];
|
|
Language lang1 = static_cast<Language>(rc->lang1);
|
|
string this_chunk = string(src, rc->offset, rc->bytes);
|
|
fprintf(f, "[%d]{%d %d %s} ", i, rc->offset, rc->bytes, LanguageCode(lang1));
|
|
fprintf(f, "%s<br>\n", GetColorHtmlEscapedText(lang1, this_chunk).c_str());
|
|
}
|
|
fprintf(f, "<br>\n");
|
|
}
|
|
|
|
} // End namespace CLD2
|
|
|
|
|