cld2/internal/debug.cc

// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// Author: dsites@google.com (Dick Sites)
//

#include "debug.h"
#include <stdio.h>
#include <string>

#include "cldutil.h"
#include "getonescriptspan.h"
#include "lang_script.h"

using namespace std;

namespace CLD2 {

// Debug output string of one unigram
string GetUniAt(const char* text) {
  string retval;
  retval.clear();
  int uni_len = UniLen(text);
  retval.append(text, uni_len);
  return retval;
}

// Debug output string of one bigram
string GetBiAt(const char* text) {
  string retval;
  retval.clear();
  int bi_len = BiLen(text);
  retval.append(text, bi_len);
  return retval;
}

// Debug output string of one quadgram, including underscores
string GetQuadAt(const char* text) {
  string retval;
  retval.clear();
  if (text[-1] == ' ') {retval.append("_");}
  int quad_len = QuadLen(text);
  retval.append(text, quad_len);
  if (text[quad_len] == ' ') {retval.append("_");}
  return retval;
}

// Debug output string of one octagram, including underscores
string GetOctaAt(const char* text) {
  string retval;
  retval.clear();
  if (text[-1] == ' ') {retval.append("_");}
  int octa_len = OctaLen(text);
  retval.append(text, octa_len);
  if (text[octa_len] == ' ') {retval.append("_");}
  return retval;
}

// Debug output string of two octagrams, including underscores
string GetOcta2At(const char* text) {
  string retval;
  retval.clear();
  if (text[-1] == ' ') {retval.append("_");}
  int octa_len = OctaLen(text);
  retval.append(text, octa_len);
  if (text[octa_len] == ' ') {retval.append("_");}
  text += (octa_len + 1);
  int octa2_len = OctaLen(text);
  retval.append(text, octa2_len);
  if (text[octa2_len] == ' ') {retval.append("_");}
  return retval;
}

// Debug output string of one formatted pslang,qprob pair
string FmtLP(ULScript ulscript, uint8 pslang, uint8 qprob) {
  string retval;
  retval.clear();
  Language lang = FromPerScriptNumber(ulscript, pslang);
  char temp[16];
  sprintf(temp, "%s.%d", LanguageCode(lang), qprob);
  retval.append(temp);
  return retval;
}

// Debug output string of one formatted langprob
// Returns "en.24&nbsp;fr.10&nbsp;es.4"
string GetLangProbTxt(const ScoringContext* scoringcontext, uint32 langprob) {
  /*const uint16* pslangtolang = scoringcontext->pslangtolang;*/
  string retval;
  retval.clear();
  uint8 prob123 = (langprob >> 0) & 0xff;
  const uint8* prob123_entry = LgProb2TblEntry(prob123);
  uint8 top1 = (langprob >> 8) & 0xff;
  if (top1 > 0) {
    retval.append(FmtLP(scoringcontext->ulscript,
                        top1, LgProb3(prob123_entry, 0)));
  }
  uint8 top2 = (langprob >> 16) & 0xff;
  if (top2 > 0) {
    if (!retval.empty()) {retval.append("~");}
    retval.append(FmtLP(scoringcontext->ulscript,
                        top2, LgProb3(prob123_entry, 1)));
  }
  uint8 top3 = (langprob >> 24) & 0xff;
  if (top3 > 0) {
    if (!retval.empty()) {retval.append("~");}
    retval.append(FmtLP(scoringcontext->ulscript,
                        top3, LgProb3(prob123_entry, 2)));
  }
  return retval;
}


// Debug output string of one or two formatted quadgram langprobs
string GetScoreTxt(const ScoringContext* scoringcontext,
                   const CLD2TableSummary* base_obj, int indirect) {
  string retval;
  retval.clear();
  if (indirect < base_obj->kCLDTableSizeOne) {
    // Up to three languages at indirect
    uint32 langprob = base_obj->kCLDTableInd[indirect];
    retval.append(GetLangProbTxt(scoringcontext, langprob));
  } else {
    // Up to six languages at start + 2 * (indirect - start)
    indirect += (indirect - base_obj->kCLDTableSizeOne);
    uint32 langprob = base_obj->kCLDTableInd[indirect];
    uint32 langprob2 = base_obj->kCLDTableInd[indirect + 1];
    retval.append(GetLangProbTxt(scoringcontext, langprob));
    if (!retval.empty()) {retval.append("~");}
    retval.append(GetLangProbTxt(scoringcontext, langprob2));
  }
  return retval;
}


// 16 background colors, perhaps from the low 4 bits of the language number
static const int kLangBackground[16] = {
  0xffd8d8, 0xf8ffd8, 0xd8ffe7, 0xd8f3ff,
  0xefd8ff, 0xffd8eb, 0xfff7d8, 0xe3ffd8,
  0xd8ffff, 0xe3d8ff, 0xffd8f7, 0xffebd8,
  0xefffd8, 0xd8fff3, 0xd8e7ff, 0xf8d8ff,
};

// 16 text colors, perhaps from the high 4 bits of the language number
// 00..7f
static const int kLangColor[16] = {
  0x000000, 0x7f2f00, 0x7f5f00, 0x6f7f00,        // first 16 lang: black text
  0x3f7f00, 0x0f7f00, 0x007f1f, 0x007f4f,
  0x007f7f, 0x004f7f, 0x001f7f, 0x0f007f,
  0x3f007f, 0x6f007f, 0x7f005f, 0x7f002f,
};

static const int kUnscoredText = 0xb0b0b0;        // medium-light gray
static const int kUnscoredBackground = 0xffffff;  // white
static const int kIgnoremeText = 0x8090a0;        // medium-light green-gray
static const int kIgnoremeBackground = 0xffeecc;  // light orange
static const int kEnglishBackground = 0xfffff4;   // very light yellow

static int GetBackColor(Language lang, bool lighten) {
  int retval;
  if (lang == ENGLISH) {
    retval = kEnglishBackground;
  } else if (lang == UNKNOWN_LANGUAGE) {
    retval = kUnscoredBackground;
  } else if (lang == TG_UNKNOWN_LANGUAGE) {
    retval = kIgnoremeBackground;
  } else if (lang < 0) {
    retval = kUnscoredBackground;
  } else {
    retval = kLangBackground[lang & 0x0f];
  }
  if (lighten) {
    // Make 1/2 as far away from white
    retval = (retval >> 1) | 0x808080;
  }
  return retval;
}

static int GetTextColor(Language lang, bool lighten) {
  int retval;
  if (lang == UNKNOWN_LANGUAGE) {
    retval = kUnscoredText;
  } else if (lang == TG_UNKNOWN_LANGUAGE) {
    retval = kIgnoremeText;
  } else if (lang < 0) {
    retval = kUnscoredText;
  } else {
    retval = kLangColor[(lang >> 4) & 0x0f];
  }
  if (lighten) {
    // Make 1/2 as far away from white
    retval = (retval >> 1) | 0x808080;
  }
  return retval;
}

string GetPlainEscapedText(const string& txt) {
  string retval;
  retval.clear();
  for (int i = 0; i < txt.size(); ++i) {
    char c = txt[i];
    if (c == '\n') {
      retval.append(" ");
    } else if (c == '\r') {
      retval.append(" ");
    } else {
      retval.append(1, c);
    }
  }
  return retval;
}

string GetHtmlEscapedText(const string& txt) {
  string retval;
  retval.clear();
  for (int i = 0; i < txt.size(); ++i) {
    char c = txt[i];
    if (c == '<') {
      retval.append("&lt;");
    } else if (c == '>') {
      retval.append("&gt;");
    } else if (c == '&') {
      retval.append("&amp;");
    } else if (c == '\'') {
      retval.append("&apos;");
    } else if (c == '"') {
      retval.append("&quot;");
    } else if (c == '\n') {
      retval.append(" ");
    } else if (c == '\r') {
      retval.append(" ");
    } else {
      retval.append(1, c);
    }
  }
  return retval;
}

string GetColorHtmlEscapedText(Language lang, const string& txt) {
  char temp[64];
  sprintf(temp, " <span style=\"background:#%06X;color:#%06X;\">\n",
          GetBackColor(lang, false),
          GetTextColor(lang, false));
  string esc_txt = string(temp);
  esc_txt.append(GetHtmlEscapedText(txt));
  esc_txt.append("</span>");
  return esc_txt;
}

string GetLangColorHtmlEscapedText(Language lang, const string& txt) {
  char temp[64];
  sprintf(temp, "[%s]", LanguageCode(lang));
  string esc_txt = string(temp);
  esc_txt.append(GetColorHtmlEscapedText(lang, txt));
  return esc_txt;
}


// For showing one chunk
// Print debug output for one scored chunk
// Optionally print out per-chunk scoring information
// In degenerate cases, hitbuffer and cspan can be NULL
void CLD2_Debug(const char* text,
                int lo_offset,
                int hi_offset,
                bool more_to_come, bool score_cjk,
                const ScoringHitBuffer* hitbuffer,
                const ScoringContext* scoringcontext,
                const ChunkSpan* cspan,
                const ChunkSummary* chunksummary) {
  FILE* df = scoringcontext->debug_file;
  if (df == NULL) {return;}

  if (scoringcontext->flags_cld2_verbose &&
      (hitbuffer != NULL) &&
      (cspan != NULL) && (hitbuffer->next_linear > 0)) {
    int base_limit = cspan->chunk_base + cspan->base_len;
    for (int i = cspan->chunk_base; i < base_limit; ++i) {
      int ngram_start = hitbuffer->linear[i].offset;
      uint32 langprob = hitbuffer->linear[i].langprob;
      string ngram_text;
      switch (hitbuffer->linear[i].type) {
      case UNIHIT:
        ngram_text = GetUniAt(&text[ngram_start]);
        break;
      case QUADHIT:
        ngram_text = GetQuadAt(&text[ngram_start]);
        break;
      case DELTAHIT:
      case DISTINCTHIT:
        if (score_cjk) {
          ngram_text = GetBiAt(&text[ngram_start]);
        } else {
          // TODO: figure out how to display optional two words
          ngram_text = GetOctaAt(&text[ngram_start]);
        }
        break;
      }
      string score_text = GetLangProbTxt(scoringcontext, langprob);
      fprintf(df, "%c:%s=%s&nbsp;&nbsp; ",
              "UQLD"[hitbuffer->linear[i].type],
              ngram_text.c_str(),
              score_text.c_str());
    }
    fprintf(df, "<br>\n");

    // Score boosts for langprior and distinct tokens
    // Get boosts for current script
    const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
    const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
    const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
    if (scoringcontext->ulscript != ULScript_Latin) {
      langprior_boost = &scoringcontext->langprior_boost.othr;
      langprior_whack = &scoringcontext->langprior_whack.othr;
      distinct_boost = &scoringcontext->distinct_boost.othr;
    }
    fprintf(df, "LangPrior_boost: ");
    for (int k = 0; k < kMaxBoosts; ++k) {
      uint32 langprob = langprior_boost->langprob[k];
      if (langprob > 0) {
        fprintf(df, "%s&nbsp;&nbsp; ",
                GetLangProbTxt(scoringcontext, langprob).c_str());
      }
    }
    fprintf(df, "LangPrior_whack: ");
    for (int k = 0; k < kMaxBoosts; ++k) {
      uint32 langprob = langprior_whack->langprob[k];
      if (langprob > 0) {
        fprintf(df, "%s&nbsp;&nbsp; ",
                GetLangProbTxt(scoringcontext, langprob).c_str());
      }
    }
    fprintf(df, "Distinct_boost: ");
    for (int k = 0; k < kMaxBoosts; ++k) {
      uint32 langprob = distinct_boost->langprob[k];
      if (langprob > 0) {
        fprintf(df, "%s&nbsp;&nbsp; ",
                GetLangProbTxt(scoringcontext, langprob).c_str());
      }
    }
    fprintf(df, "<br>\n");

    // Print chunksummary
    fprintf(df, "%s.%d %s.%d %dB %d# %s %dRd %dRs<br>\n",
            LanguageCode(static_cast<Language>(chunksummary->lang1)),
            chunksummary->score1,
            LanguageCode(static_cast<Language>(chunksummary->lang2)),
            chunksummary->score2,
            chunksummary->bytes,
            chunksummary->grams,
            ULScriptCode(static_cast<ULScript>(chunksummary->ulscript)),
            chunksummary->reliability_delta,
            chunksummary->reliability_score);
  }   // End flags_cld2_verbose linear


  // Print annotated colored text of this chunk
  bool is_reliable = true;
  bool match_prior = false;
  int reliable = CLD2::minint(chunksummary->reliability_delta,
                             chunksummary->reliability_score);
  is_reliable = (reliable >= 75);
  match_prior = (chunksummary->lang1 == scoringcontext->prior_chunk_lang);
  if (!is_reliable) {match_prior = false;}

  if (match_prior) {
    fprintf(df, "[]");
  } else if (is_reliable) {
    fprintf(df, "[%s]",
            LanguageCode(static_cast<Language>(chunksummary->lang1)));
  } else {
    fprintf(df, "[%s*.%d/%s.%d]",
            LanguageCode(static_cast<Language>(chunksummary->lang1)),
            chunksummary->score1,
            LanguageCode(static_cast<Language>(chunksummary->lang2)),
            chunksummary->score2);
  }

  int chunktext_len = hi_offset - lo_offset;
  if (chunktext_len < 0) {
    chunktext_len = 0;
    fprintf(df, " LEN_ERR hi %d lo %d<br>\n", hi_offset, lo_offset);
  }
  string chunk_text(&text[lo_offset], chunktext_len);

  Language lang = static_cast<Language>(chunksummary->lang1);
  fprintf(df, " <span style=\"background:#%06X;color:#%06X;\">\n",
          GetBackColor(lang, false),
          GetTextColor(lang, false));
  fprintf(df, "%s", chunk_text.c_str());
  if (scoringcontext->flags_cld2_cr) {
    fprintf(df, "</span><br>\n");
  } else {
    fprintf(df, "</span> \n");
  }
}

// For showing all chunks
void CLD2_Debug2(const char* text,
                 bool more_to_come, bool score_cjk,
                 const ScoringHitBuffer* hitbuffer,
                 const ScoringContext* scoringcontext,
                 const SummaryBuffer* summarybuffer) {
  FILE* df = scoringcontext->debug_file;
  if (df == NULL) {return;}
  uint16 prior_chunk_lang = static_cast<uint16>(UNKNOWN_LANGUAGE);

  for (int i = 0; i < summarybuffer->n; ++i) {
    fprintf(df, "Debug2[%d] ", i);
    const ChunkSummary* chunksummary = &summarybuffer->chunksummary[i];
     // Print annotated colored text of this chunk
    bool is_reliable = true;
    bool match_prior = false;
    int reliable = CLD2::minint(chunksummary->reliability_delta,
                                chunksummary->reliability_score);
    is_reliable = (reliable >= 75);
    match_prior = (chunksummary->lang1 == prior_chunk_lang);
    if (!is_reliable) {match_prior = false;}

    if (match_prior) {
      fprintf(df, "[]");
    } else if (is_reliable) {
      fprintf(df, "[%s]",
              LanguageCode(static_cast<Language>(chunksummary->lang1)));
    } else {
      fprintf(df, "[%s*.%d/%s.%d]",
              LanguageCode(static_cast<Language>(chunksummary->lang1)),
              chunksummary->score1,
              LanguageCode(static_cast<Language>(chunksummary->lang2)),
              chunksummary->score2);
    }

    int lo_offset = chunksummary->offset;
    int chunktext_len = chunksummary->bytes;
    string chunk_text(&text[lo_offset], chunktext_len);

    Language lang = static_cast<Language>(chunksummary->lang1);
    fprintf(df, " <span style=\"background:#%06X;color:#%06X;\">\n",
            GetBackColor(lang, false),
            GetTextColor(lang, false));
    fprintf(df, "%s", chunk_text.c_str());
    if (scoringcontext->flags_cld2_cr) {
      fprintf(df, "</span><br>\n");
    } else {
      fprintf(df, "</span> \n");
    }
    prior_chunk_lang = chunksummary->lang1;
  }
}

void DumpResultChunkVector(FILE* f, const char* src,
                           ResultChunkVector* resultchunkvector) {
  fprintf(f, "DumpResultChunkVector[%ld]<br>\n", resultchunkvector->size());
  for (int i = 0; i < resultchunkvector->size(); ++i) {
    ResultChunk* rc = &(*resultchunkvector)[i];
    Language lang1 = static_cast<Language>(rc->lang1);
    string this_chunk = string(src, rc->offset, rc->bytes);
    fprintf(f, "[%d]{%d %d %s} ", i, rc->offset, rc->bytes, LanguageCode(lang1));
    fprintf(f, "%s<br>\n", GetColorHtmlEscapedText(lang1, this_chunk).c_str());
  }
  fprintf(f, "<br>\n");
}

}       // End namespace CLD2