diff --git a/internal/compact_lang_det.cc b/internal/compact_lang_det.cc index ccc6588..045e0b2 100644 --- a/internal/compact_lang_det.cc +++ b/internal/compact_lang_det.cc @@ -28,11 +28,32 @@ namespace CLD2 { // String is "code_version - data_scrape_date" -//static const char* kDetectLanguageVersion = "V2.0 - 20130715"; - +// static const char* kDetectLanguageVersion = "V2.0 - 20141015"; // Large-table version for all ~160 languages -// Small-table version for all ~60 languages +// Small-table version for all ~80 languages + + +// Scan interchange-valid UTF-8 bytes and detect most likely language +// If the input is in fact not valid UTF-8, this returns immediately with +// the result value UNKNOWN_LANGUAGE and is_reliable set false. +// +// In all cases, valid_prefix_bytes will be set to the number of leading +// bytes that are valid UTF-8. If this is < buffer_length, there is invalid +// input starting at the following byte. +Language DetectLanguageCheckUTF8( + const char* buffer, + int buffer_length, + bool is_plain_text, + bool* is_reliable, + int* valid_prefix_bytes) { + *valid_prefix_bytes = SpanInterchangeValid(buffer, buffer_length); + if (*valid_prefix_bytes < buffer_length) { + *is_reliable = false; + return UNKNOWN_LANGUAGE; + } + return DetectLanguage(buffer, buffer_length, is_plain_text, is_reliable); +} // Scan interchange-valid UTF-8 bytes and detect most likely language Language DetectLanguage( @@ -272,7 +293,70 @@ Language ExtDetectLanguageSummary( return lang; } + // Use this one. +// +// Hints are collected into a struct. +// Flags are passed in (normally zero). +// +// Also returns 3 internal language scores as a ratio to +// normal score for real text in that language. 
Scores close to 1.0 indicate +// normal text, while scores far away from 1.0 indicate badly-skewed text or +// gibberish +// +// Returns a vector of chunks in different languages, so that caller may +// spell-check, translate, or otherwise process different parts of the input +// buffer in language-dependent ways. +// +// If the input is in fact not valid UTF-8, this returns immediately with +// the result value UNKNOWN_LANGUAGE and is_reliable set false. +// +// In all cases, valid_prefix_bytes will be set to the number of leading +// bytes that are valid UTF-8. If this is < buffer_length, there is invalid +// input starting at the following byte. +Language ExtDetectLanguageSummaryCheckUTF8( + const char* buffer, + int buffer_length, + bool is_plain_text, + const CLDHints* cld_hints, + int flags, + Language* language3, + int* percent3, + double* normalized_score3, + ResultChunkVector* resultchunkvector, + int* text_bytes, + bool* is_reliable, + int* valid_prefix_bytes) { + *valid_prefix_bytes = SpanInterchangeValid(buffer, buffer_length); + if (*valid_prefix_bytes < buffer_length) { + *is_reliable = false; + return UNKNOWN_LANGUAGE; + } + + bool allow_extended_lang = true; + Language plus_one = UNKNOWN_LANGUAGE; + + Language lang = DetectLanguageSummaryV2( + buffer, + buffer_length, + is_plain_text, + cld_hints, + allow_extended_lang, + flags, + plus_one, + language3, + percent3, + normalized_score3, + resultchunkvector, + text_bytes, + is_reliable); + // Do not default to English + return lang; +} + +// Use this one ONLY if you can prove the input text is valid UTF-8 by +// design because it went through a known-good conversion program. +// // Hints are collected into a struct. // Flags are passed in (normally zero). 
// @@ -318,5 +402,7 @@ Language ExtDetectLanguageSummary( return lang; } + + } // End namespace CLD2 diff --git a/internal/compact_lang_det_impl.cc b/internal/compact_lang_det_impl.cc index 3277b18..eafbdfc 100644 --- a/internal/compact_lang_det_impl.cc +++ b/internal/compact_lang_det_impl.cc @@ -27,6 +27,7 @@ #include "debug.h" #include "integral_types.h" #include "lang_script.h" +#include "utf8acceptinterchange.h" #include "utf8statetable.h" #ifdef CLD2_DYNAMIC_MODE @@ -68,6 +69,16 @@ extern const CLD2TableSummary kDeltaOcta_obj; extern const CLD2TableSummary kDistinctOcta_obj; extern const short kAvgDeltaOctaScore[]; +// Returns the length in bytes of the prefix of src that is all +// interchange valid UTF-8 +int SpanInterchangeValid(const char* src, int byte_length) { + int bytes_consumed; + const UTF8ReplaceObj* st = &utf8acceptinterchange_obj; + StringPiece str(src, byte_length); + UTF8GenericScan(st, str, &bytes_consumed); + return bytes_consumed; +} + #ifdef CLD2_DYNAMIC_MODE // CLD2_DYNAMIC_MODE is defined: // Data will be read from an mmap opened at runtime. @@ -426,6 +437,9 @@ inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;} inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;} inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;} inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;} +inline bool FlagBestEffort(int flags) { + return (flags & kCLDFlagBestEffort) != 0; +} // Defines Top40 packed languages @@ -679,7 +693,7 @@ int CheapRepWordsInplace(char* isrc, int src_len, int* hash, int* tbl) { // This alternate form overwrites redundant words, thus avoiding corrupting the -// backmap for generate a vector of original-text ranges. +// backmap for generating a vector of original-text ranges. 
int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) { const uint8* src = reinterpret_cast(isrc); const uint8* srclimit = src + src_len; @@ -851,7 +865,7 @@ int CheapSqueezeInplace(char* isrc, } // This alternate form overwrites redundant words, thus avoiding corrupting the -// backmap for generate a vector of original-text ranges. +// backmap for generating a vector of original-text ranges. int CheapSqueezeInplaceOverwrite(char* isrc, int src_len, int ichunksize) { @@ -1402,7 +1416,8 @@ void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes, const Language* language3, const int* percent3, Language* summary_lang, bool* is_reliable, - bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { + bool FLAGS_cld2_html, bool FLAGS_cld2_quiet, + int flags) { // Vector of active languages; changes if we delete some int slot_count = 3; int active_slot[3] = {0, 1, 2}; @@ -1417,7 +1432,7 @@ void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes, for (int i = 0; i < 3; ++i) { if (language3[i] == TG_UNKNOWN_LANGUAGE) { ignore_percent += percent3[i]; - // Move the rest up, levaing input vectors unchanged + // Move the rest up, leaving input vectors unchanged for (int j=i+1; j < 3; ++j) { active_slot[j - 1] = active_slot[j]; } @@ -1475,7 +1490,7 @@ void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes, } // If return percent is too small (too many languages), return UNKNOWN - if ((return_percent < kGoodFirstMinPercent)) { + if ((return_percent < kGoodFirstMinPercent) && !FlagBestEffort(flags)) { if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { fprintf(stderr, "{Unreli %s %d%% percent too small} ", LanguageCode(*summary_lang), return_percent); @@ -1666,15 +1681,27 @@ void ApplyHints(const char* buffer, } } } - - - - - - } +// Extend results to fully cover the [lo..hi) range +void FinishResultVector(int lo, int hi, ResultChunkVector* vec) { + if (vec == NULL) {return;} + if (vec->size() == 0) {return;} + ResultChunk* rc = &(*vec)[0]; + if (rc->offset > 
lo) { + int diff = rc->offset - lo; + rc->offset -= diff; + rc->bytes += diff; + } + ResultChunk* rc2 = &(*vec)[vec->size() - 1]; + int rc2hi = rc2->offset + rc2->bytes; + if (rc2hi < hi) { + int diff = hi - rc2hi; + rc2->bytes += diff; + } +} + // Results language3/percent3/text_bytes must be exactly three items Language DetectLanguageSummaryV2( @@ -1968,7 +1995,9 @@ Language DetectLanguageSummaryV2( // This is the real, non-recursive return // Move bytes for unreliable langs to another lang or UNKNOWN - RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet); + if (!FlagBestEffort(flags)) { + RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet); + } // Redo the result extraction after the removal above doc_tote.Sort(3); @@ -1976,13 +2005,11 @@ Language DetectLanguageSummaryV2( reliable_percent3, language3, percent3, normalized_score3, text_bytes, is_reliable); - - Language summary_lang; CalcSummaryLang(&doc_tote, total_text_bytes, reliable_percent3, language3, percent3, &summary_lang, is_reliable, - FLAGS_cld2_html, FLAGS_cld2_quiet); + FLAGS_cld2_html, FLAGS_cld2_quiet, flags); if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { for (int i = 0; i < 3; ++i) { @@ -2015,6 +2042,9 @@ Language DetectLanguageSummaryV2( fprintf(stderr, "
\n"); } + // Extend results to fully cover the input buffer + FinishResultVector(0, buffer_length, resultchunkvector); + return summary_lang; } diff --git a/internal/compact_lang_det_impl.h b/internal/compact_lang_det_impl.h index 6cadb3d..0cde880 100644 --- a/internal/compact_lang_det_impl.h +++ b/internal/compact_lang_det_impl.h @@ -126,6 +126,10 @@ At the end of the first pass -- const uint32* kQuadValueTable; } LangDetObj; + // Returns the length in bytes of the prefix of src that is all + // interchange valid UTF-8 + int SpanInterchangeValid(const char* src, int byte_length); + // For HTML documents, tags are skipped, along with // and sequences, and entities are expanded. // diff --git a/public/compact_lang_det.h b/public/compact_lang_det.h index 359457c..ecd75e9 100644 --- a/public/compact_lang_det.h +++ b/public/compact_lang_det.h @@ -42,7 +42,7 @@ // HAUSA (Latin, Arabic) // KASHMIRI (Arabic, Devanagari) // KAZAKH (Latin, Cyrillic, Arabic) -// KURDISH (Latin*, Arabic) +// KURDISH (Latin, Arabic) // KYRGYZ (Cyrillic, Arabic) // LIMBU (Devanagari, Limbu) // MONGOLIAN (Cyrillic, Mongolian) @@ -56,8 +56,7 @@ // UZBEK (Latin, Cyrillic, Arabic) // // * Due to a shortage of training text, AZERBAIJANI is not currently detected -// in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in -// Arabic script. +// in Arabic or Cyrillic scripts, nor TAJIK in Arabic script. // #ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_ @@ -65,10 +64,19 @@ #include #include -#include "../internal/lang_script.h" // For Language +#include "../internal/integral_types.h" // For uint8 etc. +#include "../internal/lang_script.h" // For Language namespace CLD2 { +// NOTE: If you cannot prove the input text is valid UTF-8 by design because +// it went through a known-good conversion program, call one of the *CheckUTF8 +// routines. For example, never trust raw user-supplied bytes. 
It is especially +// important to do a UTF8-to-UTF8 conversion on raw bytes that claim to be +// UTF-8, using a converter that guarantees to produce valid UTF-8, turning +// other byte sequences into the Unicode replacement character U+FFFD (deleting +// or turning into space or question-mark can create security holes). + // Scan interchange-valid UTF-8 bytes and detect most likely language, // or set of languages. // @@ -129,19 +137,42 @@ namespace CLD2 { Language language_hint; // ITALIAN boosts it } CLDHints; - static const int kMaxResultChunkBytes = 65535; + static const int32 kMaxResultChunkBytes = 0x7fffffff; + // Note: this was initially over-optimized to fit into 8 bytes, + // causing too much work to deal with greater than 16-bit byte lengths. // For returning a vector of per-language pieces of the input buffer // Unreliable and too-short are mapped to UNKNOWN_LANGUAGE typedef struct { int offset; // Starting byte offset in original buffer - uint16 bytes; // Number of bytes in chunk + int32 bytes; // Number of bytes in chunk uint16 lang1; // Top lang, as full Language. Apply - // static_cast<Language>() to this short value. + // static_cast<Language>() to this short value. + uint16 pad; // Make multiple of 4 bytes } ResultChunk; typedef std::vector ResultChunkVector; + // These initial simple versions all cascade through the full-blown last + // version which it would be better for you to use directly because you will + // get better results passing in any available hints. + + // Scan interchange-valid UTF-8 bytes and detect most likely language + // If the input is in fact not valid UTF-8, this returns immediately with + // the result value UNKNOWN_LANGUAGE and is_reliable set false. + // + // In all cases, valid_prefix_bytes will be set to the number of leading + // bytes that are valid UTF-8. If this is < buffer_length, there is invalid + // input starting at the following byte. 
+ Language DetectLanguageCheckUTF8( + const char* buffer, + int buffer_length, + bool is_plain_text, + bool* is_reliable, + int* valid_prefix_bytes); + + // Use this one ONLY if you can prove the input text is valid UTF-8 by + // design because it went through a known-good conversion program. // Scan interchange-valid UTF-8 bytes and detect most likely language Language DetectLanguage( const char* buffer, @@ -149,6 +180,8 @@ namespace CLD2 { bool is_plain_text, bool* is_reliable); + // Use this one ONLY if you can prove the input text is valid UTF-8 by + // design because it went through a known-good conversion program. // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. // language3[0] is usually also the return value Language DetectLanguageSummary( @@ -160,6 +193,8 @@ namespace CLD2 { int* text_bytes, bool* is_reliable); + // Use this one ONLY if you can prove the input text is valid UTF-8 by + // design because it went through a known-good conversion program. // Same as above, with hints supplied // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. // language3[0] is usually also the return value @@ -175,6 +210,8 @@ namespace CLD2 { int* text_bytes, bool* is_reliable); + // Use this one ONLY if you can prove the input text is valid UTF-8 by + // design because it went through a known-good conversion program. // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended // languages. // @@ -191,6 +228,8 @@ namespace CLD2 { int* text_bytes, bool* is_reliable); + // Use this one ONLY if you can prove the input text is valid UTF-8 by + // design because it went through a known-good conversion program. // Same as above, with hints supplied // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended // languages. 
@@ -211,6 +250,8 @@ namespace CLD2 { int* text_bytes, bool* is_reliable); + // Use this one ONLY if you can prove the input text is valid UTF-8 by + // design because it went through a known-good conversion program. // Same as above, and also returns 3 internal language scores as a ratio to // normal score for real text in that language. Scores close to 1.0 indicate // normal text, while scores far away from 1.0 indicate badly-skewed text or @@ -231,6 +272,42 @@ namespace CLD2 { // Use this one. + // + // Hints are collected into a struct. + // Flags are passed in (normally zero). + // + // Also returns 3 internal language scores as a ratio to + // normal score for real text in that language. Scores close to 1.0 indicate + // normal text, while scores far away from 1.0 indicate badly-skewed text or + // gibberish + // + // Returns a vector of chunks in different languages, so that caller may + // spell-check, translate, or otherwise process different parts of the input + // buffer in language-dependent ways. + // + // If the input is in fact not valid UTF-8, this returns immediately with + // the result value UNKNOWN_LANGUAGE and is_reliable set false. + // + // In all cases, valid_prefix_bytes will be set to the number of leading + // bytes that are valid UTF-8. If this is < buffer_length, there is invalid + // input starting at the following byte. + Language ExtDetectLanguageSummaryCheckUTF8( + const char* buffer, + int buffer_length, + bool is_plain_text, + const CLDHints* cld_hints, + int flags, + Language* language3, + int* percent3, + double* normalized_score3, + ResultChunkVector* resultchunkvector, + int* text_bytes, + bool* is_reliable, + int* valid_prefix_bytes); + + // Use this one ONLY if you can prove the input text is valid UTF-8 by + // design because it went through a known-good conversion program. + // // Hints are collected into a struct. // Flags are passed in (normally zero). 
// @@ -268,6 +345,8 @@ namespace CLD2 { static const int kCLDFlagVerbose = 0x0800; // More debug HTML => stderr static const int kCLDFlagQuiet = 0x1000; // Less debug HTML => stderr static const int kCLDFlagEcho = 0x2000; // Echo input => stderr + static const int kCLDFlagBestEffort = 0x4000; // Give best-effort answer, + // even on short text /*** @@ -290,6 +369,10 @@ Flag meanings: In that HTML file, suppress most of the output detail. kCLDFlagEcho Echo every input buffer to stderr. + kCLDFlagBestEffort + Give best-effort answer, instead of UNKNOWN_LANGUAGE. May be useful for + short text if the caller prefers an approximate answer over none. + ***/ // Debug output: Print the resultchunkvector to file f