cld2/internal/cld2_do_score.cc

// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Little program to read lines of sample text, calculate score per 1024 bytes
// per language-script4 combination
// Possible input file /export/hda3/cld/pre2010/b0_samp_prune_20100722.utf8

#include <stdio.h>
#include <string.h>
#include <string>

#include "compact_lang_det_impl.h"
#include "lang_script.h"

using namespace std;
using namespace CLD2;

double bytes[NUM_LANGUAGES][4];
double scores[NUM_LANGUAGES][4];


// Return score per 1024 bytes for top language
Language ScoreOneLine(const char* buffer, int buffer_length,
                    int* bytes, double* score_per_1kb) {
  bool is_plain_text = true;
  const CLDHints* cld_hints = NULL;
  bool allow_extended_lang = true;
  int flags = 0;
  Language plus_one = UNKNOWN_LANGUAGE;
  Language language3[3];
  int percent3[3];
  double normalized_score3[3];
  ResultChunkVector* resultchunkvector = NULL;
  int text_bytes;
  bool is_reliable;
  Language summary_lang;

  summary_lang = DetectLanguageSummaryV2(
                        buffer,
                        buffer_length,
                        is_plain_text,
                        cld_hints,
                        allow_extended_lang,
                        flags,
                        plus_one,
                        language3,
                        percent3,
                        normalized_score3,
                        resultchunkvector,
                        &text_bytes,
                        &is_reliable);
  *bytes = text_bytes;
  *score_per_1kb = normalized_score3[0];
  return language3[0];
}

#define LF 0x0a
#define CR 0x0d
const int kMaxBuffer = 5 * 1024;

bool ReadLine(FILE* infile, char* buffer, size_t maxlen) {
  char* p = fgets(buffer, maxlen, infile);
  if (p == NULL) {
    return false;
  }
  int len = strlen(buffer);

  // trim CR LF
  if (buffer[len-1] == LF) {buffer[--len] = '\0';}
  if (buffer[len-1] == CR) {buffer[--len] = '\0';}
  return true;
}

bool IsComment(const char* buffer) {
  int len = strlen(buffer);
  if (len == 0) {return true;}
  if (buffer[0] == '#') {return true;}
  if (buffer[0] == ' ') {return true;}    // Any leading space is comment
  return false;
}

// Skips over xxxxx_ where _ is one or more spaces/tabs
// Returns string::npos if no more fields
int SkipOneField(const string& src, int pos) {
  if (pos == string::npos) {return pos;}

  int lpos = pos;
  lpos = src.find_first_of(" \t", lpos);
  if (lpos == string::npos) {return lpos;}
  lpos = src.find_first_not_of(" \t", lpos);
  if (lpos == string::npos) {return lpos;}
  return lpos;
}

// Return language and script from parsed line or defaults
void GetLangScript(const string& src,
                   Language default_lang, ULScript default_lscript,
                   Language* target_lang, ULScript* target_lscript,
                   string* tld) {
  *target_lang = default_lang;
  *target_lscript = default_lscript;
  *tld = "";
  int pos = 0;
  int pos2 = 0;
  if (src.substr(0,7) == "SAMPLE ") {
    // SAMPLE ll-Ssss
    pos = SkipOneField(src, pos);
  } else if (src.substr(0,5) == "SAMP ") {
    // SAMP ll-Ssss /tld2.tld/
    pos = SkipOneField(src, pos);
    pos2 = SkipOneField(src, pos);
  } else if (src.substr(0,5) == "Samp ") {
    // Samp ll-Ssss /tld2.tld/
    pos = SkipOneField(src, pos);
    pos2 = SkipOneField(src, pos);
  }
  if (pos == 0) {return;}
  if (pos == string::npos) {return;}

  // Pos is at the first letter of language-script combination
  int end = src.find_first_of(" \t", pos);    // find end of lang-script
  if (end == string::npos) {return;}
  *target_lang = GetLanguageFromName(src.substr(pos, end - pos).c_str());
  *target_lscript = GetULScriptFromName(src.substr(pos, end - pos).c_str());

  // Pos2 is 0 or at the first letter of the tld string
  if (pos2 == 0) {return;}
  if (pos2 == string::npos) {return;}
  end = src.find_first_of(" \t", pos2);
  if (end == string::npos) {return;}
  *tld = src.substr(pos2, end - pos2);
}

// Return position of start of text
int GetTextBeginPos(const string& src) {
  int pos = 0;
  if (src.size() < 8) {return pos;}

  if (src.substr(0,7) == "SAMPLE ") {
    // Skip SAMPLE ll-Ssss
    pos = SkipOneField(src, pos);
    pos = SkipOneField(src, pos);
  } else if (src.substr(0,5) == "SAMP ") {
    // Skip SAMP ll-Ssss /tld2.tld/
    pos = SkipOneField(src, pos);
    pos = SkipOneField(src, pos);
    pos = SkipOneField(src, pos);
  } else if (src.substr(0,5) == "Samp ") {
    // Skip Samp ll-Ssss /tld2.tld/
    pos = SkipOneField(src, pos);
    pos = SkipOneField(src, pos);
    pos = SkipOneField(src, pos);
  }
  return pos;
}

// Avoid zdiv
inline double Divisor(double x) {
   return (x > 0.0 ? x : 1.0);
}

void Flush(Language cur_lang, ULScript ulscript,
           double total_score_cur_lang,
           double total_bytes_cur_lang, double total_bad_bytes_cur_lang) {
  if (cur_lang == UNKNOWN_LANGUAGE) {return;}

  bytes[cur_lang][LScript4(ulscript)] += total_bytes_cur_lang;
  scores[cur_lang][LScript4(ulscript)] += total_score_cur_lang;

  double score = total_score_cur_lang * 1024.0 / Divisor(total_bytes_cur_lang);
  double percent_bad = 100.0 * total_bad_bytes_cur_lang /
     Divisor(total_bytes_cur_lang + total_bad_bytes_cur_lang);
  fprintf(stdout, "%s-%s    %7.0f %6.1f, %2.0f%% bad SUMMARY\n\n",
          LanguageCode(cur_lang),
          ULScriptCode(ulscript),
          total_bytes_cur_lang,
          score,
          percent_bad);
}

int BytesPer1KB(int i, int j) {
   int bytes_per_1kb = ((scores[i][j] * 1024.0) / Divisor(bytes[i][j])) + 0.5;
   return bytes_per_1kb;
}

int main(int argc, char *argv[]) {
  Language cur_lang = UNKNOWN_LANGUAGE;
  ULScript cur_ulscript = ULScript_Common;
  double total_score_cur_lang = 0.0;
  double total_bytes_cur_lang = 0.0;
  double total_bad_bytes_cur_lang = 0.0;
  memset(bytes, 0, sizeof(bytes));
  memset(scores, 0, sizeof(bytes));

  char buffer[kMaxBuffer];
  int buffer_length;
  const char* filename = NULL;
  FILE* infile = stdin;
  for (int i = 1; i < argc; ++i) {
     if (argv[i][0] != '-') {
        filename = argv[i];
     }
  }

  if (filename != NULL) {
     infile = fopen(filename, "r");
     if (infile == NULL) {
        fprintf(stderr, "%s did not open\n", filename);
        return 0;
     }
  }

  while (ReadLine(infile, buffer, kMaxBuffer)) {
    if (IsComment(buffer)) {continue;}

    buffer_length = strlen(buffer);
    int bytes;
    double score_per_1kb;
    Language toplang;
    Language target_lang;
    ULScript target_ulscript;

    string src(buffer, buffer_length);
    string tld("");
    int pos = GetTextBeginPos(src);
    GetLangScript(src, UNKNOWN_LANGUAGE, ULScript_Common,
                   &target_lang, &target_ulscript, &tld);
    if ((cur_lang != target_lang) || (cur_ulscript != target_ulscript)) {
      Flush(cur_lang, cur_ulscript, total_score_cur_lang,
            total_bytes_cur_lang, total_bad_bytes_cur_lang);
      cur_lang = target_lang;
      cur_ulscript = target_ulscript;
      total_score_cur_lang = 0.0;
      total_bytes_cur_lang = 0.0;
      total_bad_bytes_cur_lang = 0.0;
    }

    toplang = ScoreOneLine(&src[pos], src.size() - pos, &bytes, &score_per_1kb);

    fprintf(stdout, "%s%c %d %4.1f %s\n",
            LanguageCode(toplang),
            (toplang == target_lang) ? ' ' : '*',
            bytes, score_per_1kb, buffer);
    // Only count when detected lang matches the claimed target lang
    if (toplang == target_lang) {
      total_bytes_cur_lang += bytes;
      total_score_cur_lang += (score_per_1kb * bytes) / 1024.0;
    } else {
      total_bad_bytes_cur_lang += bytes;
    }
  }
  Flush(cur_lang, cur_ulscript, total_score_cur_lang,
        total_bytes_cur_lang, total_bad_bytes_cur_lang);

  for (int i = 0; i < NUM_LANGUAGES; ++i) {
     Language ilang = static_cast<Language>(i);
     fprintf(stdout, "  {%4d, %4d, %4d, %4d},  // %d %s %s\n",
             BytesPer1KB(i, 0), BytesPer1KB(i, 1),
             BytesPer1KB(i, 2), BytesPer1KB(i, 3),
             i, LanguageName(ilang), LanguageCode(ilang));
  }

  if (infile != stdin) {
     fclose(infile);
  }
}