Initial CLD2 source upload.
git-svn-id: https://cld2.googlecode.com/svn/trunk@3 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
276
internal/cld2_do_score.cc
Normal file
276
internal/cld2_do_score.cc
Normal file
@@ -0,0 +1,276 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Little program to read lines of sample text, calculate score per 1024 bytes
|
||||
// per language-script4 combination
|
||||
// Possible input file /export/hda3/cld/pre2010/b0_samp_prune_20100722.utf8
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
|
||||
#include "compact_lang_det_impl.h"
|
||||
#include "lang_script.h"
|
||||
|
||||
using namespace CLD2;
|
||||
|
||||
double bytes[NUM_LANGUAGES][4];
|
||||
double scores[NUM_LANGUAGES][4];
|
||||
|
||||
|
||||
// Return score per 1024 bytes for top language
|
||||
Language ScoreOneLine(const char* buffer, int buffer_length,
|
||||
int* bytes, double* score_per_1kb) {
|
||||
bool is_plain_text = true;
|
||||
const CLDHints* cld_hints = NULL;
|
||||
bool allow_extended_lang = true;
|
||||
int flags = 0;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
Language language3[3];
|
||||
int percent3[3];
|
||||
double normalized_score3[3];
|
||||
ResultChunkVector* resultchunkvector = NULL;
|
||||
int text_bytes;
|
||||
bool is_reliable;
|
||||
Language summary_lang;
|
||||
|
||||
summary_lang = DetectLanguageSummaryV2(
|
||||
buffer,
|
||||
buffer_length,
|
||||
is_plain_text,
|
||||
cld_hints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
resultchunkvector,
|
||||
&text_bytes,
|
||||
&is_reliable);
|
||||
*bytes = text_bytes;
|
||||
*score_per_1kb = normalized_score3[0];
|
||||
return language3[0];
|
||||
}
|
||||
|
||||
#define LF 0x0a
|
||||
#define CR 0x0d
|
||||
const int kMaxBuffer = 5 * 1024;
|
||||
|
||||
bool ReadLine(FILE* infile, char* buffer, size_t maxlen) {
|
||||
char* p = fgets(buffer, maxlen, infile);
|
||||
if (p == NULL) {
|
||||
return false;
|
||||
}
|
||||
int len = strlen(buffer);
|
||||
|
||||
// trim CR LF
|
||||
if (buffer[len-1] == LF) {buffer[--len] = '\0';}
|
||||
if (buffer[len-1] == CR) {buffer[--len] = '\0';}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool IsComment(const char* buffer) {
|
||||
int len = strlen(buffer);
|
||||
if (len == 0) {return true;}
|
||||
if (buffer[0] == '#') {return true;}
|
||||
if (buffer[0] == ' ') {return true;} // Any leading space is comment
|
||||
return false;
|
||||
}
|
||||
|
||||
// Skips over xxxxx_ where _ is one or more spaces/tabs
|
||||
// Returns string::npos if no more fields
|
||||
int SkipOneField(const string& src, int pos) {
|
||||
if (pos == string::npos) {return pos;}
|
||||
|
||||
int lpos = pos;
|
||||
lpos = src.find_first_of(" \t", lpos);
|
||||
if (lpos == string::npos) {return lpos;}
|
||||
lpos = src.find_first_not_of(" \t", lpos);
|
||||
if (lpos == string::npos) {return lpos;}
|
||||
return lpos;
|
||||
}
|
||||
|
||||
// Return language and script from parsed line or defaults
|
||||
void GetLangScript(const string& src,
|
||||
Language default_lang, ULScript default_lscript,
|
||||
Language* target_lang, ULScript* target_lscript,
|
||||
string* tld) {
|
||||
*target_lang = default_lang;
|
||||
*target_lscript = default_lscript;
|
||||
*tld = "";
|
||||
int pos = 0;
|
||||
int pos2 = 0;
|
||||
if (src.substr(0,7) == "SAMPLE ") {
|
||||
// SAMPLE ll-Ssss
|
||||
pos = SkipOneField(src, pos);
|
||||
} else if (src.substr(0,5) == "SAMP ") {
|
||||
// SAMP ll-Ssss /tld2.tld/
|
||||
pos = SkipOneField(src, pos);
|
||||
pos2 = SkipOneField(src, pos);
|
||||
} else if (src.substr(0,5) == "Samp ") {
|
||||
// Samp ll-Ssss /tld2.tld/
|
||||
pos = SkipOneField(src, pos);
|
||||
pos2 = SkipOneField(src, pos);
|
||||
}
|
||||
if (pos == 0) {return;}
|
||||
if (pos == string::npos) {return;}
|
||||
|
||||
// Pos is at the first letter of language-script combination
|
||||
int end = src.find_first_of(" \t", pos); // find end of lang-script
|
||||
if (end == string::npos) {return;}
|
||||
*target_lang = GetLanguageFromName(src.substr(pos, end - pos).c_str());
|
||||
*target_lscript = GetULScriptFromName(src.substr(pos, end - pos).c_str());
|
||||
|
||||
// Pos2 is 0 or at the first letter of the tld string
|
||||
if (pos2 == 0) {return;}
|
||||
if (pos2 == string::npos) {return;}
|
||||
end = src.find_first_of(" \t", pos2);
|
||||
if (end == string::npos) {return;}
|
||||
*tld = src.substr(pos2, end - pos2);
|
||||
}
|
||||
|
||||
// Return position of start of text
|
||||
int GetTextBeginPos(const string& src) {
|
||||
int pos = 0;
|
||||
if (src.size() < 8) {return pos;}
|
||||
|
||||
if (src.substr(0,7) == "SAMPLE ") {
|
||||
// Skip SAMPLE ll-Ssss
|
||||
pos = SkipOneField(src, pos);
|
||||
pos = SkipOneField(src, pos);
|
||||
} else if (src.substr(0,5) == "SAMP ") {
|
||||
// Skip SAMP ll-Ssss /tld2.tld/
|
||||
pos = SkipOneField(src, pos);
|
||||
pos = SkipOneField(src, pos);
|
||||
pos = SkipOneField(src, pos);
|
||||
} else if (src.substr(0,5) == "Samp ") {
|
||||
// Skip Samp ll-Ssss /tld2.tld/
|
||||
pos = SkipOneField(src, pos);
|
||||
pos = SkipOneField(src, pos);
|
||||
pos = SkipOneField(src, pos);
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
// Avoid zdiv
|
||||
inline double Divisor(double x) {
|
||||
return (x > 0.0 ? x : 1.0);
|
||||
}
|
||||
|
||||
void Flush(Language cur_lang, ULScript ulscript,
|
||||
double total_score_cur_lang,
|
||||
double total_bytes_cur_lang, double total_bad_bytes_cur_lang) {
|
||||
if (cur_lang == UNKNOWN_LANGUAGE) {return;}
|
||||
|
||||
bytes[cur_lang][LScript4(ulscript)] += total_bytes_cur_lang;
|
||||
scores[cur_lang][LScript4(ulscript)] += total_score_cur_lang;
|
||||
|
||||
double score = total_score_cur_lang * 1024.0 / Divisor(total_bytes_cur_lang);
|
||||
double percent_bad = 100.0 * total_bad_bytes_cur_lang /
|
||||
Divisor(total_bytes_cur_lang + total_bad_bytes_cur_lang);
|
||||
fprintf(stdout, "%s-%s %7.0f %6.1f, %2.0f%% bad SUMMARY\n\n",
|
||||
LanguageCode(cur_lang),
|
||||
ULScriptCode(ulscript),
|
||||
total_bytes_cur_lang,
|
||||
score,
|
||||
percent_bad);
|
||||
}
|
||||
|
||||
int BytesPer1KB(int i, int j) {
|
||||
int bytes_per_1kb = ((scores[i][j] * 1024.0) / Divisor(bytes[i][j])) + 0.5;
|
||||
return bytes_per_1kb;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
Language cur_lang = UNKNOWN_LANGUAGE;
|
||||
ULScript cur_ulscript = ULScript_Common;
|
||||
double total_score_cur_lang = 0.0;
|
||||
double total_bytes_cur_lang = 0.0;
|
||||
double total_bad_bytes_cur_lang = 0.0;
|
||||
memset(bytes, 0, sizeof(bytes));
|
||||
memset(scores, 0, sizeof(bytes));
|
||||
|
||||
char buffer[kMaxBuffer];
|
||||
int buffer_length;
|
||||
const char* filename = NULL;
|
||||
FILE* infile = stdin;
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (argv[i][0] != '-') {
|
||||
filename = argv[i];
|
||||
}
|
||||
}
|
||||
|
||||
if (filename != NULL) {
|
||||
infile = fopen(filename, "r");
|
||||
if (infile == NULL) {
|
||||
fprintf(stderr, "%s did not open\n", filename);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
while (ReadLine(infile, buffer, kMaxBuffer)) {
|
||||
if (IsComment(buffer)) {continue;}
|
||||
|
||||
buffer_length = strlen(buffer);
|
||||
int bytes;
|
||||
double score_per_1kb;
|
||||
Language toplang;
|
||||
Language target_lang;
|
||||
ULScript target_ulscript;
|
||||
|
||||
string src(buffer, buffer_length);
|
||||
string tld("");
|
||||
int pos = GetTextBeginPos(src);
|
||||
GetLangScript(src, UNKNOWN_LANGUAGE, ULScript_Common,
|
||||
&target_lang, &target_ulscript, &tld);
|
||||
if ((cur_lang != target_lang) || (cur_ulscript != target_ulscript)) {
|
||||
Flush(cur_lang, cur_ulscript, total_score_cur_lang,
|
||||
total_bytes_cur_lang, total_bad_bytes_cur_lang);
|
||||
cur_lang = target_lang;
|
||||
cur_ulscript = target_ulscript;
|
||||
total_score_cur_lang = 0.0;
|
||||
total_bytes_cur_lang = 0.0;
|
||||
total_bad_bytes_cur_lang = 0.0;
|
||||
}
|
||||
|
||||
toplang = ScoreOneLine(&src[pos], src.size() - pos, &bytes, &score_per_1kb);
|
||||
|
||||
fprintf(stdout, "%s%c %d %4.1f %s\n",
|
||||
LanguageCode(toplang),
|
||||
(toplang == target_lang) ? ' ' : '*',
|
||||
bytes, score_per_1kb, buffer);
|
||||
// Only count when detected lang matches the claimed target lang
|
||||
if (toplang == target_lang) {
|
||||
total_bytes_cur_lang += bytes;
|
||||
total_score_cur_lang += (score_per_1kb * bytes) / 1024.0;
|
||||
} else {
|
||||
total_bad_bytes_cur_lang += bytes;
|
||||
}
|
||||
}
|
||||
Flush(cur_lang, cur_ulscript, total_score_cur_lang,
|
||||
total_bytes_cur_lang, total_bad_bytes_cur_lang);
|
||||
|
||||
for (int i = 0; i < NUM_LANGUAGES; ++i) {
|
||||
Language ilang = static_cast<Language>(i);
|
||||
fprintf(stdout, " {%4d, %4d, %4d, %4d}, // %d %s %s\n",
|
||||
BytesPer1KB(i, 0), BytesPer1KB(i, 1),
|
||||
BytesPer1KB(i, 2), BytesPer1KB(i, 3),
|
||||
i, LanguageName(ilang), LanguageCode(ilang));
|
||||
}
|
||||
|
||||
if (infile != stdin) {
|
||||
fclose(infile);
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user