// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Little program to read lines of sample text and calculate the score per
// 1024 bytes for each language-script4 combination.
// Possible input file /export/hda3/cld/pre2010/b0_samp_prune_20100722.utf8
|
|
|
|
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <string>

#include "compact_lang_det_impl.h"
#include "lang_script.h"

using namespace std;
using namespace CLD2;
|
|
|
|
// Running totals indexed by [language][LScript4(script)]. Flush() adds each
// finished language group to them and main() prints one table row per
// language from them at the end of the run.
double bytes[NUM_LANGUAGES][4];   // bytes of text whose top language matched
double scores[NUM_LANGUAGES][4];  // score accumulated over those bytes
|
|
|
|
|
|
// Return score per 1024 bytes for top language
|
|
Language ScoreOneLine(const char* buffer, int buffer_length,
|
|
int* bytes, double* score_per_1kb) {
|
|
bool is_plain_text = true;
|
|
const CLDHints* cld_hints = NULL;
|
|
bool allow_extended_lang = true;
|
|
int flags = 0;
|
|
Language plus_one = UNKNOWN_LANGUAGE;
|
|
Language language3[3];
|
|
int percent3[3];
|
|
double normalized_score3[3];
|
|
ResultChunkVector* resultchunkvector = NULL;
|
|
int text_bytes;
|
|
bool is_reliable;
|
|
Language summary_lang;
|
|
|
|
summary_lang = DetectLanguageSummaryV2(
|
|
buffer,
|
|
buffer_length,
|
|
is_plain_text,
|
|
cld_hints,
|
|
allow_extended_lang,
|
|
flags,
|
|
plus_one,
|
|
language3,
|
|
percent3,
|
|
normalized_score3,
|
|
resultchunkvector,
|
|
&text_bytes,
|
|
&is_reliable);
|
|
*bytes = text_bytes;
|
|
*score_per_1kb = normalized_score3[0];
|
|
return language3[0];
|
|
}
|
|
|
|
// Line-terminator byte values: ASCII line feed and carriage return.
// Typed constants instead of macros, so the names obey normal scoping rules
// and cannot silently rewrite unrelated identifiers.
const int LF = 0x0a;
const int CR = 0x0d;

// Size in bytes of the line buffer that main() hands to ReadLine(); the
// terminating NUL counts against this size.
const int kMaxBuffer = 5 * 1024;
|
|
|
|
// Reads the next line from infile into buffer, whose capacity is maxlen bytes
// including the terminating NUL, then strips one trailing LF and after that
// one trailing CR if present.
// Returns false at end of file, on a read error, or when infile/buffer is
// NULL or maxlen is 0; returns true otherwise.
// A line longer than maxlen - 1 bytes is delivered in pieces by successive
// calls, because fgets stops when the buffer is full.
bool ReadLine(FILE* infile, char* buffer, size_t maxlen) {
  if (infile == NULL || buffer == NULL || maxlen == 0) {
    return false;
  }

  // fgets takes an int count; clamp so a huge maxlen cannot wrap negative.
  const int capacity = (maxlen > static_cast<size_t>(INT_MAX))
                           ? INT_MAX
                           : static_cast<int>(maxlen);
  if (fgets(buffer, capacity, infile) == NULL) {
    return false;
  }

  // Trim CR LF. Each test checks len first: a blank line is empty once its LF
  // has been removed, and buffer[len - 1] would then index before the buffer.
  size_t len = strlen(buffer);
  if (len > 0 && buffer[len - 1] == '\n') {buffer[--len] = '\0';}
  if (len > 0 && buffer[len - 1] == '\r') {buffer[--len] = '\0';}
  return true;
}
|
|
|
|
// Returns true for lines that carry no sample to score: a missing (NULL) or
// empty line, a line starting with '#', or a line starting with a space.
bool IsComment(const char* buffer) {
  if (buffer == NULL) {return true;}  // nothing there to process

  // Only the first byte decides, so the line never needs to be measured.
  switch (buffer[0]) {
    case '\0':  // empty line
    case '#':
    case ' ':   // Any leading space is comment
      return true;
    default:
      return false;
  }
}
|
|
|
|
// Skips over xxxxx_ where _ is one or more spaces/tabs: starting at pos, finds
// the end of the current field and then the first character of the next one.
// Returns the index of that character, or string::npos converted to int when
// pos is already "no position" (negative) or there are no more fields.
// All searching is done in the string's own unsigned index type; conversions
// to and from the int interface are explicit.
int SkipOneField(const std::string& src, int pos) {
  typedef std::string::size_type size_type;
  const int kNoField = static_cast<int>(std::string::npos);

  if (pos < 0) {return kNoField;}

  // End of the current field: first blank at or after pos.
  size_type cursor = src.find_first_of(" \t", static_cast<size_type>(pos));
  if (cursor == std::string::npos) {return kNoField;}

  // Start of the next field: first non-blank after that run of blanks.
  cursor = src.find_first_not_of(" \t", cursor);
  if (cursor == std::string::npos) {return kNoField;}

  return static_cast<int>(cursor);
}
|
|
|
|
// Return language and script from parsed line or defaults
|
|
void GetLangScript(const string& src,
|
|
Language default_lang, ULScript default_lscript,
|
|
Language* target_lang, ULScript* target_lscript,
|
|
string* tld) {
|
|
*target_lang = default_lang;
|
|
*target_lscript = default_lscript;
|
|
*tld = "";
|
|
int pos = 0;
|
|
int pos2 = 0;
|
|
if (src.substr(0,7) == "SAMPLE ") {
|
|
// SAMPLE ll-Ssss
|
|
pos = SkipOneField(src, pos);
|
|
} else if (src.substr(0,5) == "SAMP ") {
|
|
// SAMP ll-Ssss /tld2.tld/
|
|
pos = SkipOneField(src, pos);
|
|
pos2 = SkipOneField(src, pos);
|
|
} else if (src.substr(0,5) == "Samp ") {
|
|
// Samp ll-Ssss /tld2.tld/
|
|
pos = SkipOneField(src, pos);
|
|
pos2 = SkipOneField(src, pos);
|
|
}
|
|
if (pos == 0) {return;}
|
|
if (pos == string::npos) {return;}
|
|
|
|
// Pos is at the first letter of language-script combination
|
|
int end = src.find_first_of(" \t", pos); // find end of lang-script
|
|
if (end == string::npos) {return;}
|
|
*target_lang = GetLanguageFromName(src.substr(pos, end - pos).c_str());
|
|
*target_lscript = GetULScriptFromName(src.substr(pos, end - pos).c_str());
|
|
|
|
// Pos2 is 0 or at the first letter of the tld string
|
|
if (pos2 == 0) {return;}
|
|
if (pos2 == string::npos) {return;}
|
|
end = src.find_first_of(" \t", pos2);
|
|
if (end == string::npos) {return;}
|
|
*tld = src.substr(pos2, end - pos2);
|
|
}
|
|
|
|
// Return position of start of text
|
|
int GetTextBeginPos(const string& src) {
|
|
int pos = 0;
|
|
if (src.size() < 8) {return pos;}
|
|
|
|
if (src.substr(0,7) == "SAMPLE ") {
|
|
// Skip SAMPLE ll-Ssss
|
|
pos = SkipOneField(src, pos);
|
|
pos = SkipOneField(src, pos);
|
|
} else if (src.substr(0,5) == "SAMP ") {
|
|
// Skip SAMP ll-Ssss /tld2.tld/
|
|
pos = SkipOneField(src, pos);
|
|
pos = SkipOneField(src, pos);
|
|
pos = SkipOneField(src, pos);
|
|
} else if (src.substr(0,5) == "Samp ") {
|
|
// Skip Samp ll-Ssss /tld2.tld/
|
|
pos = SkipOneField(src, pos);
|
|
pos = SkipOneField(src, pos);
|
|
pos = SkipOneField(src, pos);
|
|
}
|
|
return pos;
|
|
}
|
|
|
|
// Avoid zdiv: a positive x is passed through unchanged; anything else
// (zero, a negative value, NaN) becomes 1.0, so the result can always be
// used as a denominator.
inline double Divisor(double x) {
  if (x > 0.0) {
    return x;
  }
  return 1.0;
}
|
|
|
|
// Closes out one language-script group: adds its byte and score totals to the
// global bytes/scores tables and prints a SUMMARY line to stdout with the
// total bytes, the score per 1024 bytes, and the percentage of bytes that
// were counted as bad. Does nothing when cur_lang is UNKNOWN_LANGUAGE.
void Flush(Language cur_lang, ULScript ulscript,
           double total_score_cur_lang,
           double total_bytes_cur_lang, double total_bad_bytes_cur_lang) {
  if (cur_lang == UNKNOWN_LANGUAGE) {return;}

  // Column of the tables that this script maps to.
  const int script4 = LScript4(ulscript);
  bytes[cur_lang][script4] += total_bytes_cur_lang;
  scores[cur_lang][script4] += total_score_cur_lang;

  // Divisor() keeps both ratios defined when a group has no bytes at all.
  const double all_bytes = total_bytes_cur_lang + total_bad_bytes_cur_lang;
  const double percent_bad =
      100.0 * total_bad_bytes_cur_lang / Divisor(all_bytes);
  const double score =
      total_score_cur_lang * 1024.0 / Divisor(total_bytes_cur_lang);

  fprintf(stdout, "%s-%s %7.0f %6.1f, %2.0f%% bad SUMMARY\n\n",
          LanguageCode(cur_lang),
          ULScriptCode(ulscript),
          total_bytes_cur_lang,
          score,
          percent_bad);
}
|
|
|
|
int BytesPer1KB(int i, int j) {
|
|
int bytes_per_1kb = ((scores[i][j] * 1024.0) / Divisor(bytes[i][j])) + 0.5;
|
|
return bytes_per_1kb;
|
|
}
|
|
|
|
int main(int argc, char *argv[]) {
|
|
Language cur_lang = UNKNOWN_LANGUAGE;
|
|
ULScript cur_ulscript = ULScript_Common;
|
|
double total_score_cur_lang = 0.0;
|
|
double total_bytes_cur_lang = 0.0;
|
|
double total_bad_bytes_cur_lang = 0.0;
|
|
memset(bytes, 0, sizeof(bytes));
|
|
memset(scores, 0, sizeof(bytes));
|
|
|
|
char buffer[kMaxBuffer];
|
|
int buffer_length;
|
|
const char* filename = NULL;
|
|
FILE* infile = stdin;
|
|
for (int i = 1; i < argc; ++i) {
|
|
if (argv[i][0] != '-') {
|
|
filename = argv[i];
|
|
}
|
|
}
|
|
|
|
if (filename != NULL) {
|
|
infile = fopen(filename, "r");
|
|
if (infile == NULL) {
|
|
fprintf(stderr, "%s did not open\n", filename);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
while (ReadLine(infile, buffer, kMaxBuffer)) {
|
|
if (IsComment(buffer)) {continue;}
|
|
|
|
buffer_length = strlen(buffer);
|
|
int bytes;
|
|
double score_per_1kb;
|
|
Language toplang;
|
|
Language target_lang;
|
|
ULScript target_ulscript;
|
|
|
|
string src(buffer, buffer_length);
|
|
string tld("");
|
|
int pos = GetTextBeginPos(src);
|
|
GetLangScript(src, UNKNOWN_LANGUAGE, ULScript_Common,
|
|
&target_lang, &target_ulscript, &tld);
|
|
if ((cur_lang != target_lang) || (cur_ulscript != target_ulscript)) {
|
|
Flush(cur_lang, cur_ulscript, total_score_cur_lang,
|
|
total_bytes_cur_lang, total_bad_bytes_cur_lang);
|
|
cur_lang = target_lang;
|
|
cur_ulscript = target_ulscript;
|
|
total_score_cur_lang = 0.0;
|
|
total_bytes_cur_lang = 0.0;
|
|
total_bad_bytes_cur_lang = 0.0;
|
|
}
|
|
|
|
toplang = ScoreOneLine(&src[pos], src.size() - pos, &bytes, &score_per_1kb);
|
|
|
|
fprintf(stdout, "%s%c %d %4.1f %s\n",
|
|
LanguageCode(toplang),
|
|
(toplang == target_lang) ? ' ' : '*',
|
|
bytes, score_per_1kb, buffer);
|
|
// Only count when detected lang matches the claimed target lang
|
|
if (toplang == target_lang) {
|
|
total_bytes_cur_lang += bytes;
|
|
total_score_cur_lang += (score_per_1kb * bytes) / 1024.0;
|
|
} else {
|
|
total_bad_bytes_cur_lang += bytes;
|
|
}
|
|
}
|
|
Flush(cur_lang, cur_ulscript, total_score_cur_lang,
|
|
total_bytes_cur_lang, total_bad_bytes_cur_lang);
|
|
|
|
for (int i = 0; i < NUM_LANGUAGES; ++i) {
|
|
Language ilang = static_cast<Language>(i);
|
|
fprintf(stdout, " {%4d, %4d, %4d, %4d}, // %d %s %s\n",
|
|
BytesPer1KB(i, 0), BytesPer1KB(i, 1),
|
|
BytesPer1KB(i, 2), BytesPer1KB(i, 3),
|
|
i, LanguageName(ilang), LanguageCode(ilang));
|
|
}
|
|
|
|
if (infile != stdin) {
|
|
fclose(infile);
|
|
}
|
|
}
|