// git-svn-id: https://cld2.googlecode.com/svn/trunk@78 b252ecd4-b096-bf77-eb8e-91563289f87e
// 548 lines, 17 KiB, C++
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// Little program to read sample UTF-8 text and score it
// Giving precision, recall, F, and matrix
//
// Author: dsites@google.com (Dick Sites)
//
|
|
#include <math.h> // for sqrt
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <string>
|
|
|
|
#include "debug.h" // for uint8 etc
|
|
#include "integral_types.h" // for uint8 etc
|
|
#include "compact_lang_det_impl.h"
|
|
#include "lang_script.h"
|
|
|
|
using namespace std;
|
|
|
|
namespace CLD2 {
|
|
|
|
|
|
// Scaffolding
|
|
typedef int32 Encoding;
|
|
static const Encoding UNKNOWN_ENCODING = 0;
|
|
|
|
static const bool FLAGS_cld2_html = true;
|
|
static const bool FLAGS_noext = false;
|
|
static const bool FLAGS_echo_mismatch = true;
|
|
static const int32 FLAGS_minsize = 0;
|
|
|
|
|
|
/***
 accepts one or more input files
 loop:
   reads source line
   does cld on each source line
   records source lang-script, CLD lang-script, count+=1

 at end, print row headers = CLD lang, script, per_M
   print column headers = in lang, script, per_M
   print matrix, recall, precision, F
   and overall RMS F.

 sort by script, and within script, by per_M and near diagonal
***/
|
|
|
|
|
|
// Line terminator bytes stripped by ReadLine. Typed constants rather than
// two-letter macros, so the names stay inside the enclosing scope.
static const int LF = 0x0a;
static const int CR = 0x0d;

// Size in bytes of the line buffer handed to ReadLine
const int kMaxBuffer = 5 * 1024;

// Reads one line from infile into buffer, which has room for maxlen bytes
// including the terminating NUL, then removes one trailing LF and after that
// one trailing CR.
// Returns false when fgets returns NULL (end of file or read error),
// true otherwise. A line longer than maxlen - 1 bytes comes back in pieces
// over successive calls.
bool ReadLine(FILE* infile, char* buffer, size_t maxlen) {
  if (fgets(buffer, static_cast<int>(maxlen), infile) == NULL) {
    return false;
  }
  size_t len = strlen(buffer);

  // trim CR LF
  // Each test checks len first: a bare "\n" line is empty once the LF is
  // gone, and buffer[len - 1] would then index before the buffer.
  if ((len > 0) && (buffer[len - 1] == LF)) {buffer[--len] = '\0';}
  if ((len > 0) && (buffer[len - 1] == CR)) {buffer[--len] = '\0';}
  return true;
}
|
|
|
|
// Returns true when the line is to be ignored: it is empty, begins with '#'
// or with a space, or begins with the literal marker "BOGUS".
bool IsComment(char* buffer) {
  switch (buffer[0]) {
    case '\0':    // Empty line
    case '#':     // Explicit comment marker
    case ' ':     // Any leading space is comment
      return true;
    default:
      break;
  }
  // strncmp stops at the terminating NUL, so lines shorter than the marker
  // never match
  return strncmp(buffer, "BOGUS", 5) == 0;
}
|
|
|
|
|
|
// Skips over xxxxx_ where _ is one or more spaces/tabs, starting at pos.
// Returns the offset of the first character of the next field, or
// string::npos converted to int if there are no more fields. A pos that is
// already string::npos (as int) is returned unchanged.
int SkipOneField(const string& src, int pos) {
  const int no_more_fields = static_cast<int>(string::npos);
  if (pos == no_more_fields) {return pos;}

  // End of the current field: first blank at or after pos
  const string::size_type blank =
      src.find_first_of(" \t", static_cast<string::size_type>(pos));
  if (blank == string::npos) {return no_more_fields;}

  // Start of the next field: first non-blank after that run of blanks;
  // this is npos when the line ends in blanks
  return static_cast<int>(src.find_first_not_of(" \t", blank));
}
|
|
|
|
// Return language and script from parsed line
|
|
void GetStatedLangScript(const string& src, string* lang_script, string* tld) {
|
|
*lang_script = "";
|
|
*tld = "";
|
|
int pos = 0;
|
|
int pos2 = 0;
|
|
if (src.substr(0,7) == "SAMPLE ") {
|
|
// SAMPLE ll-Ssss
|
|
pos = SkipOneField(src, pos);
|
|
} else if (src.substr(0,5) == "SAMP ") {
|
|
// SAMP ll-Ssss /tld2.tld/
|
|
pos = SkipOneField(src, pos);
|
|
pos2 = SkipOneField(src, pos);
|
|
} else if (src.substr(0,5) == "Samp ") {
|
|
// Samp ll-Ssss /tld2.tld/
|
|
pos = SkipOneField(src, pos);
|
|
pos2 = SkipOneField(src, pos);
|
|
}
|
|
if (pos == 0) {return;}
|
|
if (pos == string::npos) {return;}
|
|
|
|
// Pos is at the first letter of language-script combination
|
|
int end = src.find_first_of(" \t", pos); // find end of lang-script
|
|
if (end == string::npos) {return;}
|
|
*lang_script = src.substr(pos, end - pos);
|
|
|
|
// Pos2 is 0 or at the first letter of the tld string
|
|
if (pos2 == 0) {return;}
|
|
if (pos2 == string::npos) {return;}
|
|
end = src.find_first_of(" \t", pos2);
|
|
if (end == string::npos) {return;}
|
|
*tld = src.substr(pos2, end - pos2);
|
|
}
|
|
|
|
// Return position of start of text
|
|
int GetTextBeginPos(const string& src) {
|
|
int pos = 0;
|
|
if (src.size() < 8) {return pos;}
|
|
|
|
if (src.substr(0,7) == "SAMPLE ") {
|
|
// Skip SAMPLE ll-Ssss
|
|
pos = SkipOneField(src, pos);
|
|
pos = SkipOneField(src, pos);
|
|
} else if (src.substr(0,5) == "SAMP ") {
|
|
// Skip SAMP ll-Ssss /tld2.tld/
|
|
pos = SkipOneField(src, pos);
|
|
pos = SkipOneField(src, pos);
|
|
pos = SkipOneField(src, pos);
|
|
} else if (src.substr(0,5) == "Samp ") {
|
|
// Skip Samp ll-Ssss /tld2.tld/
|
|
pos = SkipOneField(src, pos);
|
|
pos = SkipOneField(src, pos);
|
|
pos = SkipOneField(src, pos);
|
|
}
|
|
return pos;
|
|
}
|
|
|
|
|
|
bool CarefulMatch(const char* in_langscript,
|
|
Language in_lang, ULScript in_lscript,
|
|
Language cld_lang, ULScript cld_lscript) {
|
|
bool easy_match = ((in_lang == cld_lang) & (in_lscript == cld_lscript));
|
|
if (easy_match) {return true;}
|
|
|
|
// Unrecognized list, matching un-Xxxx
|
|
if ((cld_lang == UNKNOWN_LANGUAGE) && (in_lscript == cld_lscript)) {
|
|
if (strcmp(in_langscript, "az-Arab") == 0) {return true;}
|
|
if (strcmp(in_langscript, "az-Cyrl") == 0) {return true;}
|
|
if (strcmp(in_langscript, "kk-Latn") == 0) {return true;}
|
|
if (strcmp(in_langscript, "ku-Latn") == 0) {return true;}
|
|
if (strcmp(in_langscript, "my-Latn") == 0) {return true;}
|
|
if (strcmp(in_langscript, "ru-Latn") == 0) {return true;}
|
|
if (strcmp(in_langscript, "tg-Arab") == 0) {return true;}
|
|
if (strcmp(in_langscript, "ug-Latn") == 0) {return true;}
|
|
if (strcmp(in_langscript, "za-Hani") == 0) {return true;}
|
|
}
|
|
|
|
// bs/me => sr/hr
|
|
if ((cld_lang == CROATIAN) && (cld_lscript == ULScript_Latin)) {
|
|
if (strcmp(in_langscript, "bs-Latn") == 0) {return true;}
|
|
if (strcmp(in_langscript, "sr-ME-Latn") == 0) {return true;}
|
|
}
|
|
if ((cld_lang == SERBIAN) && (cld_lscript == ULScript_Cyrillic)) {
|
|
if (strcmp(in_langscript, "bs-Cyrl") == 0) {return true;}
|
|
if (strcmp(in_langscript, "sr-ME-Cyrl") == 0) {return true;}
|
|
}
|
|
|
|
// Twi => Akan
|
|
if ((cld_lang == AKAN) && (cld_lscript == ULScript_Latin)) {
|
|
if (strcmp(in_langscript, "tw-Latn") == 0) {return true;}
|
|
}
|
|
|
|
// za-Hani
|
|
if ((cld_lang == CHINESE) && (cld_lscript == ULScript_Hani)) {
|
|
if (strcmp(in_langscript, "za-Hani") == 0) {return true;}
|
|
}
|
|
|
|
// zzb, zze, zzh fake languages
|
|
if (strcmp(in_langscript, "zzb-Latn") == 0) {return true;}
|
|
if (strcmp(in_langscript, "zze-Latn") == 0) {return true;}
|
|
if (strcmp(in_langscript, "zzh-Latn") == 0) {return true;}
|
|
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
#if 0
// Disabled bookkeeping for the precision/recall tables.
typedef hash_map<string, int32> StringIntMap;

// Next unused small int for each map
static int32 next_in_map;
static int32 next_cld_map;
static int32 next_in_cld_map;

static StringIntMap in_map;             // xx-Fooo to small int
static StringIntMap cld_map;            // xx-Fooo to small int
static StringIntMap in_cld_map;         // xx-Fooo_xx-Barr to small int

static vector<int32> in_count;          // counts by in_map subscript
static vector<int32> cld_count;         // counts by cld_map subscript
static vector<int32> in_cld_count;      // counts by in_cld_map subscript

// Returns the small int assigned to s in *smap. A string not yet present is
// assigned *next_smap, which is then incremented.
int32 MapToSmallInt(const string& s, StringIntMap* smap, int* next_smap) {
  StringIntMap::iterator it = smap->find(s);
  if (it != smap->end()) {return it->second;}

  // New
  const int32 assigned = *next_smap;
  (*smap)[s] = assigned;
  *next_smap = assigned + 1;
  return assigned;
}
#endif
|
|
|
|
|
|
// Resets the result bookkeeping before any file is scored.
// The whole body is currently compiled out, so this is a no-op.
void InitResult() {
#if 0
  // Name-to-subscript maps and their next free subscripts
  in_map.clear();
  next_in_map = 0;
  cld_map.clear();
  next_cld_map = 0;
  in_cld_map.clear();
  next_in_cld_map = 0;

  // Counts by subscript
  in_count.clear();
  cld_count.clear();
  in_cld_count.clear();
#endif
}
|
|
|
|
void RecordCLDResult(const char* buffer, const char* in_langscript,
|
|
Language in_lang, ULScript in_lscript,
|
|
Language cld_lang, ULScript cld_lscript) {
|
|
|
|
bool match = CarefulMatch(in_langscript,
|
|
in_lang, in_lscript, cld_lang, cld_lscript);
|
|
if (FLAGS_echo_mismatch && !match) {
|
|
fprintf(stderr,
|
|
" =Mismatch: "
|
|
"expected %s, actual %s<br>\n",
|
|
LanguageCode(in_lang), LanguageCode(cld_lang));
|
|
}
|
|
#if 0
|
|
printf("%s %s-%s %s\n", in_langscript,
|
|
ExtLanguageCode(cld_lang), UnicodeLScriptCode(cld_lscript),
|
|
match ? "" : "!=");
|
|
|
|
string cld_langscript = ExtLanguageCode(cld_lang);
|
|
cld_langscript.append("-");
|
|
cld_langscript.append( UnicodeLScriptCode(cld_lscript));
|
|
|
|
string in_cld_langscript = in_langscript;
|
|
in_cld_langscript.append("_");
|
|
in_cld_langscript.append(cld_langscript);
|
|
|
|
// Extend vectors if needed
|
|
int32 in_int = MapToSmallInt(in_langscript, &in_map, &next_in_map);
|
|
while (in_count.size() <= in_int) {in_count.push_back(0);}
|
|
in_count[in_int] += 1;
|
|
|
|
int32 cld_int = MapToSmallInt(cld_langscript, &cld_map, &next_cld_map);
|
|
while (cld_count.size() <= cld_int) {cld_count.push_back(0);}
|
|
cld_count[cld_int] += 1;
|
|
|
|
int32 in_cld_int = MapToSmallInt(in_cld_langscript,
|
|
&in_cld_map, &next_in_cld_map);
|
|
while (in_cld_count.size() <= in_cld_int) {in_cld_count.push_back(0);}
|
|
in_cld_count[in_cld_int] += 1;
|
|
#endif
|
|
}
|
|
|
|
// Prints the per-lang-script precision and recall tables after all files
// have been scored. The whole body is currently compiled out, so this is a
// no-op.
void FinishResult() {
#if 0
  const int32 in_n = in_map.size();
  const int32 cld_n = cld_map.size();

  // Totals, match counts and names per stated lang-script (in_map subscript)
  int32* in_total = new int32[in_n];
  int32* in_matches = new int32[in_n];
  string* in_str = new string[in_n];
  memset(in_total, 0, in_n * sizeof(int32));
  memset(in_matches, 0, in_n * sizeof(int32));
  for (StringIntMap::iterator it = in_map.begin(); it != in_map.end(); ++it) {
    in_str[it->second] = it->first;
  }

  // Totals, match counts and names per CLD lang-script (cld_map subscript)
  int32* cld_total = new int32[cld_n];
  int32* cld_matches = new int32[cld_n];
  string* cld_str = new string[cld_n];
  memset(cld_total, 0, cld_n * sizeof(int32));
  memset(cld_matches, 0, cld_n * sizeof(int32));
  for (StringIntMap::iterator it = cld_map.begin(); it != cld_map.end(); ++it) {
    cld_str[it->second] = it->first;
  }

  for (StringIntMap::iterator it = in_cld_map.begin();
       it != in_cld_map.end(); ++it) {
    const string in_cld = it->first;
    const int32 in_cld_int = it->second;
    const int32 pair_count = in_cld_count[in_cld_int];
    //VERYTEMP
    //printf("%s[%d] = %d\n", in_cld.c_str(), in_cld_int, in_cld_count[in_cld_int]);

    // Decompose it all
    const int under_pos = in_cld.find("_");
    const string in_langscript = in_cld.substr(0, under_pos);
    const string cld_langscript = in_cld.substr(under_pos + 1);
    const int32 in_int = MapToSmallInt(in_langscript, &in_map, &next_in_map);
    const int32 cld_int =
        MapToSmallInt(cld_langscript, &cld_map, &next_cld_map);

    Language in_lang = GetLanguageFromNumberOrName(in_langscript.c_str());
    ULScript in_lscript = GetLScriptFromNumberOrName(in_langscript.c_str());
    Language cld_lang = GetLanguageFromNumberOrName(cld_langscript.c_str());
    ULScript cld_lscript = GetLScriptFromNumberOrName(cld_langscript.c_str());

    const bool match = CarefulMatch(in_langscript.c_str(),
                                    in_lang, in_lscript, cld_lang, cld_lscript);

    //VERYTEMP
    //printf("%s-%s %s-%s #=%d %d %d %s\n",
    //       ExtLanguageCode(in_lang), UnicodeLScriptCode(in_lscript),
    //       ExtLanguageCode(cld_lang), UnicodeLScriptCode(cld_lscript),
    //       in_cld_count[in_cld_int], in_int, cld_int, match ? "match" : "!=");

    in_total[in_int] += pair_count;
    cld_total[cld_int] += pair_count;
    if (match) {
      in_matches[in_int] += pair_count;
      cld_matches[cld_int] += pair_count;
    }
  }

  // Precision: matches over everything CLD labelled with that lang-script
  int32 total = 0;
  int32 match_total = 0;
  for (int i = 0; i < cld_n; ++i) {
    printf("Precision: %s %d/%d = %6.4f\n",
           cld_str[i].c_str(), cld_matches[i], cld_total[i],
           cld_total[i] == 0 ? 0.0 : (cld_matches[i] * 1.0) / cld_total[i]);
    total += cld_total[i];
    match_total += cld_matches[i];
  }
  printf("Precision: %s %d/%d = %6.4f\n",
         "TOTAL", match_total, total,
         total == 0 ? 0.0 : (match_total * 1.0) / total);

  // Recall: matches over everything stated as that lang-script
  total = 0;
  match_total = 0;
  for (int i = 0; i < in_n; ++i) {
    printf("Recall: %s %d/%d = %6.4f\n",
           in_str[i].c_str(), in_matches[i], in_total[i],
           in_total[i] == 0 ? 0.0 : (in_matches[i] * 1.0) / in_total[i]);
    total += in_total[i];
    match_total += in_matches[i];
  }
  printf("Recall: %s %d/%d = %6.4f\n",
         "TOTAL", match_total, total,
         total == 0 ? 0.0 : (match_total * 1.0) / total);

#endif
}
|
|
|
|
// Returns true for any byte whose unsigned value is at or below '9' (0x39).
// That covers the ASCII digits and the space, and also NUL, control
// characters and the punctuation that sorts before ':'.
bool SkipMe(char c) {
  return static_cast<unsigned char>(c) <= '9';
}

// Remove any trailing digits/spaces (possible mapreduce counts); more
// precisely, any trailing bytes for which SkipMe is true.
// The buffer is shortened in place. Return length of what remains, which is
// 0 for an empty buffer or one that holds only such bytes.
int Trim(char* buffer) {
  int buffer_len = strlen(buffer);
  // The length test keeps the scan inside the buffer: without it an empty
  // or all-digit line makes the loop read, and then write, before buffer[0].
  while ((buffer_len > 0) && SkipMe(buffer[buffer_len - 1])) {--buffer_len;}
  buffer[buffer_len] = '\0';
  return buffer_len;
}
|
|
|
|
void LangDetLinesOfFile(int flags, bool get_vector, const char* fname) {
|
|
FILE* fin = fopen(fname, "rb");
|
|
if (fin == NULL) {
|
|
fprintf(stderr, "Did not open %s\n", fname);
|
|
return;
|
|
}
|
|
|
|
// Expecting
|
|
// Samp af-Latn /afr/ word tot skuldig bevind volgens die wet, in...
|
|
char buffer[kMaxBuffer];
|
|
while (ReadLine(fin, buffer, kMaxBuffer)) {
|
|
if (IsComment(buffer)) {continue;}
|
|
|
|
int buffer_len = Trim(buffer);
|
|
|
|
string buffer_str(buffer, buffer_len);
|
|
string lang_script;
|
|
string tld;
|
|
|
|
// Get lang-script
|
|
GetStatedLangScript(buffer_str, &lang_script, &tld);
|
|
Language in_lang = GetLanguageFromName(lang_script.c_str());
|
|
ULScript in_lscript = GetULScriptFromName(lang_script.c_str());
|
|
|
|
// Get Text; skip over any prefix fields
|
|
int pos = GetTextBeginPos(buffer_str);
|
|
if (pos == string::npos) {continue;}
|
|
|
|
const char* src = buffer_str.data() + pos;
|
|
int src_len = buffer_str.size() - pos;
|
|
|
|
if (src_len < FLAGS_minsize) {continue;} // Skip if too short
|
|
|
|
// Detect language in one line of UTF-8
|
|
bool is_plain_text = false;
|
|
const char* tldhint = "";
|
|
Encoding enchint = UNKNOWN_ENCODING;
|
|
Language langhint = UNKNOWN_LANGUAGE;
|
|
// Full-blown flag-bit and hints interface
|
|
bool allow_extended_lang = true;
|
|
// Caller initializes flags
|
|
Language plus_one = UNKNOWN_LANGUAGE;
|
|
|
|
Language language3[3];
|
|
int percent3[3];
|
|
double normalized_score3[3];
|
|
ResultChunkVector resultchunkvector;
|
|
int text_bytes;
|
|
bool is_reliable;
|
|
|
|
// Detected language biased summary (biased against English)
|
|
Language summary_lang = UNKNOWN_LANGUAGE;
|
|
|
|
// Identify the expected value
|
|
fprintf(stderr, "Samp %s ", lang_script.c_str());
|
|
flags |= kCLDFlagQuiet;
|
|
|
|
CLDHints cldhints = {NULL, tldhint, enchint, langhint};
|
|
|
|
summary_lang = DetectLanguageSummaryV2(
|
|
src, src_len,
|
|
is_plain_text,
|
|
&cldhints,
|
|
allow_extended_lang,
|
|
flags,
|
|
plus_one,
|
|
language3,
|
|
percent3,
|
|
normalized_score3,
|
|
get_vector ? &resultchunkvector : NULL,
|
|
&text_bytes,
|
|
&is_reliable);
|
|
|
|
#if 0
|
|
if (FLAGS_noext) {
|
|
summary_lang = DetectLanguageSummary(
|
|
src, src_len,
|
|
is_plain_text,
|
|
language3,
|
|
percent3,
|
|
&text_bytes,
|
|
&is_reliable);
|
|
} else {
|
|
summary_lang = ExtDetectLanguageSummary(
|
|
src, src_len,
|
|
is_plain_text,
|
|
language3,
|
|
percent3,
|
|
&text_bytes,
|
|
&is_reliable);
|
|
}
|
|
#endif
|
|
if (get_vector) {
|
|
DumpResultChunkVector(stderr, src, &resultchunkvector);
|
|
}
|
|
|
|
if (!is_reliable) {summary_lang = UNKNOWN_LANGUAGE;}
|
|
|
|
RecordCLDResult(buffer, lang_script.c_str(),
|
|
in_lang, in_lscript,
|
|
summary_lang, in_lscript);
|
|
}
|
|
|
|
fclose(fin);
|
|
}
|
|
|
|
|
|
|
|
|
|
int main (int argc, char *argv[])
|
|
{
|
|
int flags = 0;
|
|
bool get_vector = false;
|
|
|
|
for (int i = 1; i < argc; ++i) {
|
|
if (strcmp(argv[i], "--scoreasquads") == 0) {flags |= kCLDFlagScoreAsQuads;}
|
|
if (strcmp(argv[i], "--html") == 0) {flags |= kCLDFlagHtml;}
|
|
if (strcmp(argv[i], "--cr") == 0) {flags |= kCLDFlagCr;}
|
|
if (strcmp(argv[i], "--verbose") == 0) {flags |= kCLDFlagVerbose;}
|
|
if (strcmp(argv[i], "--vector") == 0) {get_vector = true;}
|
|
}
|
|
|
|
if (FLAGS_cld2_html) {
|
|
// Begin HTML file
|
|
fprintf(stderr, "<html><meta charset=\"UTF-8\"><body>\n");
|
|
fprintf(stderr, "<style media=\"print\" type=\"text/css\"> "
|
|
":root { -webkit-print-color-adjust: exact; } </style>\n");
|
|
fprintf(stderr, "<span style=\"font-size: 7pt\">\n");
|
|
}
|
|
|
|
|
|
InitResult();
|
|
for (int i = 1; i < argc; ++i) {
|
|
if (argv[i][0] != '-') {
|
|
const char* fname = argv[i];
|
|
fprintf(stderr, "file = %s<br><br>\n", fname ? fname : "stdin");
|
|
LangDetLinesOfFile(flags, get_vector, fname);
|
|
}
|
|
}
|
|
FinishResult();
|
|
|
|
if (FLAGS_cld2_html) {
|
|
fprintf(stderr, "\n</span></body></html><br>");
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
} // End namespace CLD2
|
|
|
|
|
|
int main(int argc, char *argv[]) {
|
|
return CLD2::main(argc, argv);
|
|
}
|
|
|