new ku-Latn text, bad UTF-8, besteffort flag, comments
git-svn-id: https://cld2.googlecode.com/svn/trunk@174 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
@@ -42,18 +42,16 @@ typedef int32 Encoding;
|
||||
static const Encoding UNKNOWN_ENCODING = 0;
|
||||
|
||||
|
||||
#ifndef CLD2_DYNAMIC_MODE
|
||||
// Linker supplies the right tables; see ScoringTables compact_lang_det_impl.cc
|
||||
// These are here JUST for printing versions
|
||||
extern const UTF8PropObj cld_generated_CjkUni_obj;
|
||||
extern const CLD2TableSummary kCjkDeltaBi_obj;
|
||||
extern const CLD2TableSummary kDistinctBiTable_obj;
|
||||
extern const CLD2TableSummary kQuad_obj;
|
||||
extern const CLD2TableSummary kDeltaOcta_obj;
|
||||
extern const CLD2TableSummary kDistinctOcta_obj;
|
||||
extern const CLD2TableSummary kOcta2_obj;
|
||||
extern const short kAvgDeltaOctaScore[];
|
||||
#endif
|
||||
// Linker supplies the right tables; see ScoringTables compact_lang_det_impl.cc
|
||||
// These are here JUST for printing versions
|
||||
extern const UTF8PropObj cld_generated_CjkUni_obj;
|
||||
extern const CLD2TableSummary kCjkDeltaBi_obj;
|
||||
extern const CLD2TableSummary kDistinctBiTable_obj;
|
||||
extern const CLD2TableSummary kQuad_obj;
|
||||
extern const CLD2TableSummary kDeltaOcta_obj;
|
||||
extern const CLD2TableSummary kDistinctOcta_obj;
|
||||
extern const CLD2TableSummary kOcta2_obj;
|
||||
extern const short kAvgDeltaOctaScore[];
|
||||
|
||||
bool FLAGS_cld_version = false;
|
||||
bool FLAGS_cld_html = true;
|
||||
@@ -203,7 +201,6 @@ void DumpLanguages(Language summary_lang,
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
if (FLAGS_cld_version) {
|
||||
#ifndef CLD2_DYNAMIC_MODE
|
||||
printf("%s %4dKB uni build date, bytes\n",
|
||||
"........",
|
||||
cld_generated_CjkUni_obj.total_size >> 10);
|
||||
@@ -219,14 +216,12 @@ int main(int argc, char** argv) {
|
||||
kDeltaOcta_obj.kCLDTableBuildDate,
|
||||
(kDeltaOcta_obj.kCLDTableSize *
|
||||
sizeof(IndirectProbBucket4)) >> 10);
|
||||
#else
|
||||
printf("FLAGS_cld_version doesn't work with dynamic data mode\n");
|
||||
#endif
|
||||
exit(0);
|
||||
} // End FLAGS_cld_version
|
||||
|
||||
int flags = 0;
|
||||
bool get_vector = false;
|
||||
const char* data_file = NULL;
|
||||
bool do_line = false;
|
||||
const char* fname = NULL;
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (argv[i][0] != '-') {fname = argv[i];}
|
||||
@@ -236,24 +231,18 @@ int main(int argc, char** argv) {
|
||||
if (strcmp(argv[i], "--verbose") == 0) {flags |= kCLDFlagVerbose;}
|
||||
if (strcmp(argv[i], "--echo") == 0) {flags |= kCLDFlagEcho;}
|
||||
if (strcmp(argv[i], "--vector") == 0) {get_vector = true;}
|
||||
if (strcmp(argv[i], "--data-file") == 0) { data_file = argv[++i];}
|
||||
if (strcmp(argv[i], "--line") == 0) {do_line = true;}
|
||||
}
|
||||
|
||||
#ifdef CLD2_DYNAMIC_MODE
|
||||
if (data_file == NULL) {
|
||||
fprintf(stderr, "When running in dynamic mode, you must specify --data-file [FILE]\n");
|
||||
return -1;
|
||||
}
|
||||
fprintf(stdout, "Loading data from: %s\n", data_file);
|
||||
CLD2::loadDataFromFile(data_file);
|
||||
fprintf(stdout, "Data loaded, test commencing\n");
|
||||
#endif
|
||||
|
||||
FILE* fin;
|
||||
if (fname == NULL) {
|
||||
fin = stdin;
|
||||
} else {
|
||||
fin = fopen(fname, "rb");
|
||||
if (do_line) {
|
||||
fin = fopen(fname, "r");
|
||||
} else {
|
||||
fin = fopen(fname, "rb");
|
||||
}
|
||||
if (fin == NULL) {
|
||||
fprintf(stderr, "%s did not open\n", fname);
|
||||
exit(0);
|
||||
@@ -272,6 +261,51 @@ int main(int argc, char** argv) {
|
||||
char* buffer = new char[10000000]; // Max 10MB of input for this test program
|
||||
struct timeval news, newe;
|
||||
|
||||
// Full-blown flag-bit and hints interface
|
||||
bool allow_extended_lang = true;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
bool ignore_7bit = false;
|
||||
|
||||
if (do_line) {
|
||||
while (Readline(fin, buffer)) {
|
||||
if (IsComment(buffer)) {continue;}
|
||||
|
||||
// Detect language one line at a time
|
||||
Language summary_lang = UNKNOWN_LANGUAGE;
|
||||
|
||||
Language language3[3];
|
||||
int percent3[3];
|
||||
double normalized_score3[3];
|
||||
ResultChunkVector resultchunkvector;
|
||||
bool is_plain_text = FLAGS_plain;
|
||||
int text_bytes;
|
||||
|
||||
CLDHints cldhints = {NULL, tldhint, enchint, langhint};
|
||||
|
||||
summary_lang = CLD2::DetectLanguageSummaryV2(
|
||||
buffer,
|
||||
strlen(buffer),
|
||||
is_plain_text,
|
||||
&cldhints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
get_vector ? &resultchunkvector : NULL,
|
||||
&text_bytes,
|
||||
&is_reliable);
|
||||
printf("%s%s %d%% %s\n",
|
||||
LanguageName(language3[0]),
|
||||
is_reliable ? "" : "*",
|
||||
percent3[0],
|
||||
buffer);
|
||||
}
|
||||
fclose(fin);
|
||||
delete[] buffer;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if ((flags & kCLDFlagHtml) != 0) {
|
||||
// Begin HTML file
|
||||
@@ -287,16 +321,11 @@ int main(int argc, char** argv) {
|
||||
fprintf(stderr, "file = %s<br>\n", fname ? fname : "stdin");
|
||||
}
|
||||
|
||||
// Full-blown flag-bit and hints interface
|
||||
bool allow_extended_lang = true;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
|
||||
// Read entire file
|
||||
int n = fread(buffer, 1, 10000000, fin);
|
||||
|
||||
bool ignore_7bit = false;
|
||||
|
||||
|
||||
// Detect language
|
||||
|
||||
// Detect languages in entire file
|
||||
Language summary_lang = UNKNOWN_LANGUAGE;
|
||||
|
||||
Language language3[3];
|
||||
|
Reference in New Issue
Block a user