new ku-Latn text, bad UTF-8, besteffort flag, comments

git-svn-id: https://cld2.googlecode.com/svn/trunk@174 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
dsites@google.com
2014-10-28 20:32:54 +00:00
parent 92a3e24c4e
commit b312a68c4a
3 changed files with 82 additions and 40 deletions

View File

@@ -42,18 +42,16 @@ typedef int32 Encoding;
// Encoding value 0 is the "unknown" sentinel.
static const Encoding UNKNOWN_ENCODING = 0;
#ifndef CLD2_DYNAMIC_MODE
// The linker supplies the right tables; see ScoringTables in
// compact_lang_det_impl.cc.
// They are declared here JUST so that main() can print table version
// information (size / build date) when FLAGS_cld_version is set.
extern const UTF8PropObj cld_generated_CjkUni_obj;
extern const CLD2TableSummary kCjkDeltaBi_obj;
extern const CLD2TableSummary kDistinctBiTable_obj;
extern const CLD2TableSummary kQuad_obj;
extern const CLD2TableSummary kDeltaOcta_obj;
extern const CLD2TableSummary kDistinctOcta_obj;
extern const CLD2TableSummary kOcta2_obj;
extern const short kAvgDeltaOctaScore[];
#endif
// NOTE(review): the declarations below repeat the guarded run above, this time
// without the CLD2_DYNAMIC_MODE guard. Presumably one run is the pre-change
// form and the other the post-change form of the same lines -- confirm which
// one the file is meant to keep.
// The linker supplies the right tables; see ScoringTables in
// compact_lang_det_impl.cc.
// They are declared here JUST for printing versions.
extern const UTF8PropObj cld_generated_CjkUni_obj;
extern const CLD2TableSummary kCjkDeltaBi_obj;
extern const CLD2TableSummary kDistinctBiTable_obj;
extern const CLD2TableSummary kQuad_obj;
extern const CLD2TableSummary kDeltaOcta_obj;
extern const CLD2TableSummary kDistinctOcta_obj;
extern const CLD2TableSummary kOcta2_obj;
extern const short kAvgDeltaOctaScore[];
// When true, main() prints table version information and exits immediately.
bool FLAGS_cld_version = false;
// Defaults to true. NOTE(review): presumably selects HTML (vs. plain-text)
// input handling -- confirm against where the flag is read.
bool FLAGS_cld_html = true;
@@ -203,7 +201,6 @@ void DumpLanguages(Language summary_lang,
int main(int argc, char** argv) {
if (FLAGS_cld_version) {
#ifndef CLD2_DYNAMIC_MODE
printf("%s %4dKB uni build date, bytes\n",
"........",
cld_generated_CjkUni_obj.total_size >> 10);
@@ -219,14 +216,12 @@ int main(int argc, char** argv) {
kDeltaOcta_obj.kCLDTableBuildDate,
(kDeltaOcta_obj.kCLDTableSize *
sizeof(IndirectProbBucket4)) >> 10);
#else
printf("FLAGS_cld_version doesn't work with dynamic data mode\n");
#endif
exit(0);
} // End FLAGS_cld_version
int flags = 0;
bool get_vector = false;
const char* data_file = NULL;
bool do_line = false;
const char* fname = NULL;
for (int i = 1; i < argc; ++i) {
if (argv[i][0] != '-') {fname = argv[i];}
@@ -236,24 +231,18 @@ int main(int argc, char** argv) {
if (strcmp(argv[i], "--verbose") == 0) {flags |= kCLDFlagVerbose;}
if (strcmp(argv[i], "--echo") == 0) {flags |= kCLDFlagEcho;}
if (strcmp(argv[i], "--vector") == 0) {get_vector = true;}
if (strcmp(argv[i], "--data-file") == 0) { data_file = argv[++i];}
if (strcmp(argv[i], "--line") == 0) {do_line = true;}
}
#ifdef CLD2_DYNAMIC_MODE
if (data_file == NULL) {
fprintf(stderr, "When running in dynamic mode, you must specify --data-file [FILE]\n");
return -1;
}
fprintf(stdout, "Loading data from: %s\n", data_file);
CLD2::loadDataFromFile(data_file);
fprintf(stdout, "Data loaded, test commencing\n");
#endif
FILE* fin;
if (fname == NULL) {
fin = stdin;
} else {
fin = fopen(fname, "rb");
if (do_line) {
fin = fopen(fname, "r");
} else {
fin = fopen(fname, "rb");
}
if (fin == NULL) {
fprintf(stderr, "%s did not open\n", fname);
exit(0);
@@ -272,6 +261,51 @@ int main(int argc, char** argv) {
char* buffer = new char[10000000]; // Max 10MB of input for this test program
struct timeval news, newe;
// Full-blown flag-bit and hints interface
bool allow_extended_lang = true;
Language plus_one = UNKNOWN_LANGUAGE;
bool ignore_7bit = false;
if (do_line) {
while (Readline(fin, buffer)) {
if (IsComment(buffer)) {continue;}
// Detect language one line at a time
Language summary_lang = UNKNOWN_LANGUAGE;
Language language3[3];
int percent3[3];
double normalized_score3[3];
ResultChunkVector resultchunkvector;
bool is_plain_text = FLAGS_plain;
int text_bytes;
CLDHints cldhints = {NULL, tldhint, enchint, langhint};
summary_lang = CLD2::DetectLanguageSummaryV2(
buffer,
strlen(buffer),
is_plain_text,
&cldhints,
allow_extended_lang,
flags,
plus_one,
language3,
percent3,
normalized_score3,
get_vector ? &resultchunkvector : NULL,
&text_bytes,
&is_reliable);
printf("%s%s %d%% %s\n",
LanguageName(language3[0]),
is_reliable ? "" : "*",
percent3[0],
buffer);
}
fclose(fin);
delete[] buffer;
return 0;
}
if ((flags & kCLDFlagHtml) != 0) {
// Begin HTML file
@@ -287,16 +321,11 @@ int main(int argc, char** argv) {
fprintf(stderr, "file = %s<br>\n", fname ? fname : "stdin");
}
// Full-blown flag-bit and hints interface
bool allow_extended_lang = true;
Language plus_one = UNKNOWN_LANGUAGE;
// Read entire file
int n = fread(buffer, 1, 10000000, fin);
bool ignore_7bit = false;
// Detect language
// Detect languages in entire file
Language summary_lang = UNKNOWN_LANGUAGE;
Language language3[3];