ResultVector fix, BestEffort flag, add CheckUTF8 routines

git-svn-id: https://cld2.googlecode.com/svn/trunk@172 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
dsites@google.com
2014-10-28 20:18:58 +00:00
parent 54479ccc2c
commit f1f0dcb00a
4 changed files with 228 additions and 25 deletions

View File

@@ -28,11 +28,32 @@
namespace CLD2 { namespace CLD2 {
// String is "code_version - data_scrape_date" // String is "code_version - data_scrape_date"
//static const char* kDetectLanguageVersion = "V2.0 - 20130715"; // static const char* kDetectLanguageVersion = "V2.0 - 20141015";
// Large-table version for all ~160 languages // Large-table version for all ~160 languages
// Small-table version for all ~60 languages // Small-table version for all ~80 languages
// Validating front end for DetectLanguage().
//
// Measures the leading run of interchange-valid UTF-8 in buffer and stores
// that length in *valid_prefix_bytes on every path. If the run stops short of
// buffer_length (the byte at that offset is the first invalid one), detection
// is not attempted: *is_reliable is set false and UNKNOWN_LANGUAGE is
// returned. Otherwise the whole buffer is handed to DetectLanguage() and its
// answer is returned unchanged.
Language DetectLanguageCheckUTF8(
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    bool* is_reliable,
    int* valid_prefix_bytes) {
  const int valid_len = SpanInterchangeValid(buffer, buffer_length);
  *valid_prefix_bytes = valid_len;

  const bool all_valid = !(valid_len < buffer_length);
  if (all_valid) {
    return DetectLanguage(buffer, buffer_length, is_plain_text, is_reliable);
  }

  // Invalid byte found: refuse to guess
  *is_reliable = false;
  return UNKNOWN_LANGUAGE;
}
// Scan interchange-valid UTF-8 bytes and detect most likely language // Scan interchange-valid UTF-8 bytes and detect most likely language
Language DetectLanguage( Language DetectLanguage(
@@ -272,7 +293,70 @@ Language ExtDetectLanguageSummary(
return lang; return lang;
} }
// Use this one. // Use this one.
//
// Hints are collected into a struct.
// Flags are passed in (normally zero).
//
// Also returns 3 internal language scores as a ratio to
// normal score for real text in that language. Scores close to 1.0 indicate
// normal text, while scores far away from 1.0 indicate badly-skewed text or
// gibberish
//
// Returns a vector of chunks in different languages, so that caller may
// spell-check, translate, or otherwise process different parts of the input
// buffer in language-dependent ways.
//
// If the input is in fact not valid UTF-8, this returns without running
// detection: the result value is UNKNOWN_LANGUAGE, is_reliable is set false,
// and every other output is put into a defined empty state (language3[] all
// UNKNOWN_LANGUAGE, percent3[] and normalized_score3[] all zero, text_bytes
// zero, resultchunkvector cleared) so that callers never read stale or
// uninitialized values.
//
// In all cases, valid_prefix_bytes will be set to the number of leading
// bytes that are valid UTF-8. If this is < buffer_length, there is invalid
// input starting at the following byte.
Language ExtDetectLanguageSummaryCheckUTF8(
    const char* buffer,
    int buffer_length,
    bool is_plain_text,
    const CLDHints* cld_hints,
    int flags,
    Language* language3,
    int* percent3,
    double* normalized_score3,
    ResultChunkVector* resultchunkvector,
    int* text_bytes,
    bool* is_reliable,
    int* valid_prefix_bytes) {
  *valid_prefix_bytes = SpanInterchangeValid(buffer, buffer_length);
  if (*valid_prefix_bytes < buffer_length) {
    // Invalid input: the detector is never called on this path, so nothing
    // else would fill in the outputs. Give each one a defined empty value.
    for (int i = 0; i < 3; ++i) {
      if (language3) {language3[i] = UNKNOWN_LANGUAGE;}
      if (percent3) {percent3[i] = 0;}
      if (normalized_score3) {normalized_score3[i] = 0.0;}
    }
    if (text_bytes) {*text_bytes = 0;}
    if (resultchunkvector) {resultchunkvector->clear();}
    *is_reliable = false;
    return UNKNOWN_LANGUAGE;
  }

  bool allow_extended_lang = true;
  Language plus_one = UNKNOWN_LANGUAGE;

  Language lang = DetectLanguageSummaryV2(
      buffer,
      buffer_length,
      is_plain_text,
      cld_hints,
      allow_extended_lang,
      flags,
      plus_one,
      language3,
      percent3,
      normalized_score3,
      resultchunkvector,
      text_bytes,
      is_reliable);
  // Do not default to English
  return lang;
}
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
// design because it went through a known-good conversion program.
//
// Hints are collected into a struct. // Hints are collected into a struct.
// Flags are passed in (normally zero). // Flags are passed in (normally zero).
// //
@@ -318,5 +402,7 @@ Language ExtDetectLanguageSummary(
return lang; return lang;
} }
} // End namespace CLD2 } // End namespace CLD2

View File

@@ -27,6 +27,7 @@
#include "debug.h" #include "debug.h"
#include "integral_types.h" #include "integral_types.h"
#include "lang_script.h" #include "lang_script.h"
#include "utf8acceptinterchange.h"
#include "utf8statetable.h" #include "utf8statetable.h"
#ifdef CLD2_DYNAMIC_MODE #ifdef CLD2_DYNAMIC_MODE
@@ -68,6 +69,16 @@ extern const CLD2TableSummary kDeltaOcta_obj;
extern const CLD2TableSummary kDistinctOcta_obj; extern const CLD2TableSummary kDistinctOcta_obj;
extern const short kAvgDeltaOctaScore[]; extern const short kAvgDeltaOctaScore[];
// Returns the length in bytes of the prefix of src that is all
// interchange valid UTF-8.
//
// A null src or a non-positive byte_length yields 0 without scanning, so a
// caller validating untrusted input never hands a bad pointer/length pair to
// the scanner. A return value smaller than byte_length is the offset of the
// first byte that was not accepted.
int SpanInterchangeValid(const char* src, int byte_length) {
  if (!src || byte_length <= 0) {return 0;}

  // Start from 0 so the result is defined even if the scanner writes nothing
  int bytes_consumed = 0;
  const UTF8ReplaceObj* st = &utf8acceptinterchange_obj;
  StringPiece str(src, byte_length);
  UTF8GenericScan(st, str, &bytes_consumed);
  return bytes_consumed;
}
#ifdef CLD2_DYNAMIC_MODE #ifdef CLD2_DYNAMIC_MODE
// CLD2_DYNAMIC_MODE is defined: // CLD2_DYNAMIC_MODE is defined:
// Data will be read from an mmap opened at runtime. // Data will be read from an mmap opened at runtime.
@@ -426,6 +437,9 @@ inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}
inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;} inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;} inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;} inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
// True when the caller set kCLDFlagBestEffort in flags
inline bool FlagBestEffort(int flags) {
  const int best_effort_bit = flags & kCLDFlagBestEffort;
  return best_effort_bit != 0;
}
// Defines Top40 packed languages // Defines Top40 packed languages
@@ -679,7 +693,7 @@ int CheapRepWordsInplace(char* isrc, int src_len, int* hash, int* tbl) {
// This alternate form overwrites redundant words, thus avoiding corrupting the // This alternate form overwrites redundant words, thus avoiding corrupting the
// backmap for generate a vector of original-text ranges. // backmap for generating a vector of original-text ranges.
int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) { int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) {
const uint8* src = reinterpret_cast<const uint8*>(isrc); const uint8* src = reinterpret_cast<const uint8*>(isrc);
const uint8* srclimit = src + src_len; const uint8* srclimit = src + src_len;
@@ -851,7 +865,7 @@ int CheapSqueezeInplace(char* isrc,
} }
// This alternate form overwrites redundant words, thus avoiding corrupting the // This alternate form overwrites redundant words, thus avoiding corrupting the
// backmap for generate a vector of original-text ranges. // backmap for generating a vector of original-text ranges.
int CheapSqueezeInplaceOverwrite(char* isrc, int CheapSqueezeInplaceOverwrite(char* isrc,
int src_len, int src_len,
int ichunksize) { int ichunksize) {
@@ -1402,7 +1416,8 @@ void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes,
const Language* language3, const Language* language3,
const int* percent3, const int* percent3,
Language* summary_lang, bool* is_reliable, Language* summary_lang, bool* is_reliable,
bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { bool FLAGS_cld2_html, bool FLAGS_cld2_quiet,
int flags) {
// Vector of active languages; changes if we delete some // Vector of active languages; changes if we delete some
int slot_count = 3; int slot_count = 3;
int active_slot[3] = {0, 1, 2}; int active_slot[3] = {0, 1, 2};
@@ -1417,7 +1432,7 @@ void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes,
for (int i = 0; i < 3; ++i) { for (int i = 0; i < 3; ++i) {
if (language3[i] == TG_UNKNOWN_LANGUAGE) { if (language3[i] == TG_UNKNOWN_LANGUAGE) {
ignore_percent += percent3[i]; ignore_percent += percent3[i];
// Move the rest up, levaing input vectors unchanged // Move the rest up, leaving input vectors unchanged
for (int j=i+1; j < 3; ++j) { for (int j=i+1; j < 3; ++j) {
active_slot[j - 1] = active_slot[j]; active_slot[j - 1] = active_slot[j];
} }
@@ -1475,7 +1490,7 @@ void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes,
} }
// If return percent is too small (too many languages), return UNKNOWN // If return percent is too small (too many languages), return UNKNOWN
if ((return_percent < kGoodFirstMinPercent)) { if ((return_percent < kGoodFirstMinPercent) && !FlagBestEffort(flags)) {
if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
fprintf(stderr, "{Unreli %s %d%% percent too small} ", fprintf(stderr, "{Unreli %s %d%% percent too small} ",
LanguageCode(*summary_lang), return_percent); LanguageCode(*summary_lang), return_percent);
@@ -1666,15 +1681,27 @@ void ApplyHints(const char* buffer,
} }
} }
} }
} }
// Extend results to fully cover the [lo..hi) range
void FinishResultVector(int lo, int hi, ResultChunkVector* vec) {
if (vec == NULL) {return;}
if (vec->size() == 0) {return;}
ResultChunk* rc = &(*vec)[0];
if (rc->offset > lo) {
int diff = rc->offset - lo;
rc->offset -= diff;
rc->bytes += diff;
}
ResultChunk* rc2 = &(*vec)[vec->size() - 1];
int rc2hi = rc2->offset + rc2->bytes;
if (rc2hi < hi) {
int diff = hi - rc2hi;
rc2->bytes += diff;
}
}
// Results language3/percent3/text_bytes must be exactly three items // Results language3/percent3/text_bytes must be exactly three items
Language DetectLanguageSummaryV2( Language DetectLanguageSummaryV2(
@@ -1968,7 +1995,9 @@ Language DetectLanguageSummaryV2(
// This is the real, non-recursive return // This is the real, non-recursive return
// Move bytes for unreliable langs to another lang or UNKNOWN // Move bytes for unreliable langs to another lang or UNKNOWN
if (!FlagBestEffort(flags)) {
RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet); RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet);
}
// Redo the result extraction after the removal above // Redo the result extraction after the removal above
doc_tote.Sort(3); doc_tote.Sort(3);
@@ -1976,13 +2005,11 @@ Language DetectLanguageSummaryV2(
reliable_percent3, language3, percent3, normalized_score3, reliable_percent3, language3, percent3, normalized_score3,
text_bytes, is_reliable); text_bytes, is_reliable);
Language summary_lang; Language summary_lang;
CalcSummaryLang(&doc_tote, total_text_bytes, CalcSummaryLang(&doc_tote, total_text_bytes,
reliable_percent3, language3, percent3, reliable_percent3, language3, percent3,
&summary_lang, is_reliable, &summary_lang, is_reliable,
FLAGS_cld2_html, FLAGS_cld2_quiet); FLAGS_cld2_html, FLAGS_cld2_quiet, flags);
if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
for (int i = 0; i < 3; ++i) { for (int i = 0; i < 3; ++i) {
@@ -2015,6 +2042,9 @@ Language DetectLanguageSummaryV2(
fprintf(stderr, "<br>\n"); fprintf(stderr, "<br>\n");
} }
// Extend results to fully cover the input buffer
FinishResultVector(0, buffer_length, resultchunkvector);
return summary_lang; return summary_lang;
} }

View File

@@ -126,6 +126,10 @@ At the end of the first pass --
const uint32* kQuadValueTable; const uint32* kQuadValueTable;
} LangDetObj; } LangDetObj;
// Returns the length in bytes of the prefix of src that is all
// interchange valid UTF-8.
//   src          start of the bytes to scan
//   byte_length  number of bytes available at src
// A result equal to byte_length means the entire input was accepted; a
// smaller result is the offset of the first byte that was not.
int SpanInterchangeValid(const char* src, int byte_length);
// For HTML documents, tags are skipped, along with <script> ... </script> // For HTML documents, tags are skipped, along with <script> ... </script>
// and <style> ... </style> sequences, and entities are expanded. // and <style> ... </style> sequences, and entities are expanded.
// //

View File

@@ -42,7 +42,7 @@
// HAUSA (Latin, Arabic) // HAUSA (Latin, Arabic)
// KASHMIRI (Arabic, Devanagari) // KASHMIRI (Arabic, Devanagari)
// KAZAKH (Latin, Cyrillic, Arabic) // KAZAKH (Latin, Cyrillic, Arabic)
// KURDISH (Latin*, Arabic) // KURDISH (Latin, Arabic)
// KYRGYZ (Cyrillic, Arabic) // KYRGYZ (Cyrillic, Arabic)
// LIMBU (Devanagari, Limbu) // LIMBU (Devanagari, Limbu)
// MONGOLIAN (Cyrillic, Mongolian) // MONGOLIAN (Cyrillic, Mongolian)
@@ -56,8 +56,7 @@
// UZBEK (Latin, Cyrillic, Arabic) // UZBEK (Latin, Cyrillic, Arabic)
// //
// * Due to a shortage of training text, AZERBAIJANI is not currently detected // * Due to a shortage of training text, AZERBAIJANI is not currently detected
// in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in // in Arabic or Cyrillic scripts, nor TAJIK in Arabic script.
// Arabic script.
// //
#ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_ #ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
@@ -65,10 +64,19 @@
#include <stdint.h> #include <stdint.h>
#include <vector> #include <vector>
#include "../internal/integral_types.h" // For uint8 etc.
#include "../internal/lang_script.h" // For Language #include "../internal/lang_script.h" // For Language
namespace CLD2 { namespace CLD2 {
// NOTE: If you cannot prove that the input text is valid UTF-8 by design because
// it went through a known-good conversion program, call one of the *CheckUTF8
// routines. For example, never trust raw user-supplied bytes. It is especially
// important to do a UTF8-to-UTF8 conversion on raw bytes that claim to be
// UTF-8, using a converter that guarantees to produce valid UTF-8, turning
// other byte sequences into the Unicode replacement character U+FFFD (deleting
// or turning into space or question-mark can create security holes).
// Scan interchange-valid UTF-8 bytes and detect most likely language, // Scan interchange-valid UTF-8 bytes and detect most likely language,
// or set of languages. // or set of languages.
// //
@@ -129,19 +137,42 @@ namespace CLD2 {
Language language_hint; // ITALIAN boosts it Language language_hint; // ITALIAN boosts it
} CLDHints; } CLDHints;
static const int kMaxResultChunkBytes = 65535; static const int32 kMaxResultChunkBytes = 0x7fffffff;
// Note: this was initially over-optimized to fit into 8 bytes,
// causing too much work to deal with greater than 16-bit byte lengths.
// For returning a vector of per-language pieces of the input buffer // For returning a vector of per-language pieces of the input buffer
// Unreliable and too-short are mapped to UNKNOWN_LANGUAGE // Unreliable and too-short are mapped to UNKNOWN_LANGUAGE
typedef struct { typedef struct {
int offset; // Starting byte offset in original buffer int offset; // Starting byte offset in original buffer
uint16 bytes; // Number of bytes in chunk int32 bytes; // Number of bytes in chunk
uint16 lang1; // Top lang, as full Language. Apply uint16 lang1; // Top lang, as full Language. Apply
// static_cast<Language>() to this short value. // static_cast<Language>() to this short value.
uint16 pad; // Make multiple of 4 bytes
} ResultChunk; } ResultChunk;
typedef std::vector<ResultChunk> ResultChunkVector; typedef std::vector<ResultChunk> ResultChunkVector;
// These initial simple versions all cascade through the full-blown last
// version which it would be better for you to use directly because you will
// get better results passing in any available hints.
// Scan interchange-valid UTF-8 bytes and detect most likely language.
// Use this form when the input has not been proven to be valid UTF-8.
//
// If the input is in fact not valid UTF-8, this returns immediately with
// the result value UNKNOWN_LANGUAGE and is_reliable set false.
//
// In all cases, valid_prefix_bytes will be set to the number of leading
// bytes that are valid UTF-8. If this is < buffer_length, there is invalid
// input starting at the following byte.
//
//   buffer, buffer_length  the input bytes and their count
//   is_plain_text          passed through to DetectLanguage()
//                          (presumably false means the input is HTML --
//                          TODO confirm against the DetectLanguage notes)
//   is_reliable            output: set false on invalid input, otherwise
//                          as set by DetectLanguage()
//   valid_prefix_bytes     output: always written, as described above
Language DetectLanguageCheckUTF8(
const char* buffer,
int buffer_length,
bool is_plain_text,
bool* is_reliable,
int* valid_prefix_bytes);
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
// design because it went through a known-good conversion program.
// Scan interchange-valid UTF-8 bytes and detect most likely language // Scan interchange-valid UTF-8 bytes and detect most likely language
Language DetectLanguage( Language DetectLanguage(
const char* buffer, const char* buffer,
@@ -149,6 +180,8 @@ namespace CLD2 {
bool is_plain_text, bool is_plain_text,
bool* is_reliable); bool* is_reliable);
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
// design because it went through a known-good conversion program.
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
// language3[0] is usually also the return value // language3[0] is usually also the return value
Language DetectLanguageSummary( Language DetectLanguageSummary(
@@ -160,6 +193,8 @@ namespace CLD2 {
int* text_bytes, int* text_bytes,
bool* is_reliable); bool* is_reliable);
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
// design because it went through a known-good conversion program.
// Same as above, with hints supplied // Same as above, with hints supplied
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
// language3[0] is usually also the return value // language3[0] is usually also the return value
@@ -175,6 +210,8 @@ namespace CLD2 {
int* text_bytes, int* text_bytes,
bool* is_reliable); bool* is_reliable);
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
// design because it went through a known-good conversion program.
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
// languages. // languages.
// //
@@ -191,6 +228,8 @@ namespace CLD2 {
int* text_bytes, int* text_bytes,
bool* is_reliable); bool* is_reliable);
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
// design because it went through a known-good conversion program.
// Same as above, with hints supplied // Same as above, with hints supplied
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
// languages. // languages.
@@ -211,6 +250,8 @@ namespace CLD2 {
int* text_bytes, int* text_bytes,
bool* is_reliable); bool* is_reliable);
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
// design because it went through a known-good conversion program.
// Same as above, and also returns 3 internal language scores as a ratio to // Same as above, and also returns 3 internal language scores as a ratio to
// normal score for real text in that language. Scores close to 1.0 indicate // normal score for real text in that language. Scores close to 1.0 indicate
// normal text, while scores far away from 1.0 indicate badly-skewed text or // normal text, while scores far away from 1.0 indicate badly-skewed text or
@@ -231,6 +272,42 @@ namespace CLD2 {
// Use this one. // Use this one.
//
// Hints are collected into a struct.
// Flags are passed in (normally zero).
//
// Also returns 3 internal language scores as a ratio to
// normal score for real text in that language. Scores close to 1.0 indicate
// normal text, while scores far away from 1.0 indicate badly-skewed text or
// gibberish
//
// Returns a vector of chunks in different languages, so that caller may
// spell-check, translate, or otherwise process different parts of the input
// buffer in language-dependent ways.
//
// If the input is in fact not valid UTF-8, this returns immediately with
// the result value UNKNOWN_LANGUAGE and is_reliable set false.
//
// In all cases, valid_prefix_bytes will be set to the number of leading
// bytes that are valid UTF-8. If this is < buffer_length, there is invalid
// input starting at the following byte.
//
//   buffer, buffer_length  the input bytes and their count
//   cld_hints, flags       passed through unchanged to the detector
//   language3, percent3,
//   normalized_score3      outputs: three-item arrays supplied by the caller
//   resultchunkvector      output: per-language chunks of the input
//   text_bytes             output from the detector
//   is_reliable            output: set false on invalid input
//   valid_prefix_bytes     output: always written, as described above
Language ExtDetectLanguageSummaryCheckUTF8(
const char* buffer,
int buffer_length,
bool is_plain_text,
const CLDHints* cld_hints,
int flags,
Language* language3,
int* percent3,
double* normalized_score3,
ResultChunkVector* resultchunkvector,
int* text_bytes,
bool* is_reliable,
int* valid_prefix_bytes);
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
// design because it went through a known-good conversion program.
//
// Hints are collected into a struct. // Hints are collected into a struct.
// Flags are passed in (normally zero). // Flags are passed in (normally zero).
// //
@@ -268,6 +345,8 @@ namespace CLD2 {
static const int kCLDFlagVerbose = 0x0800; // More debug HTML => stderr static const int kCLDFlagVerbose = 0x0800; // More debug HTML => stderr
static const int kCLDFlagQuiet = 0x1000; // Less debug HTML => stderr static const int kCLDFlagQuiet = 0x1000; // Less debug HTML => stderr
static const int kCLDFlagEcho = 0x2000; // Echo input => stderr static const int kCLDFlagEcho = 0x2000; // Echo input => stderr
static const int kCLDFlagBestEffort = 0x4000; // Give best-effort answer,
// even on short text
/*** /***
@@ -290,6 +369,10 @@ Flag meanings:
In that HTML file, suppress most of the output detail. In that HTML file, suppress most of the output detail.
kCLDFlagEcho kCLDFlagEcho
Echo every input buffer to stderr. Echo every input buffer to stderr.
kCLDFlagBestEffort
Give best-effort answer, instead of UNKNOWN_LANGUAGE. May be useful for
short text if the caller prefers an approximate answer over none.
***/ ***/
// Debug output: Print the resultchunkvector to file f // Debug output: Print the resultchunkvector to file f