ResultVector fix, BestEffort flag, add CheckUTF8 routines
git-svn-id: https://cld2.googlecode.com/svn/trunk@172 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
@@ -28,11 +28,32 @@
|
|||||||
namespace CLD2 {
|
namespace CLD2 {
|
||||||
|
|
||||||
// String is "code_version - data_scrape_date"
|
// String is "code_version - data_scrape_date"
|
||||||
//static const char* kDetectLanguageVersion = "V2.0 - 20130715";
|
// static const char* kDetectLanguageVersion = "V2.0 - 20141015";
|
||||||
|
|
||||||
|
|
||||||
// Large-table version for all ~160 languages
|
// Large-table version for all ~160 languages
|
||||||
// Small-table version for all ~60 languages
|
// Small-table version for all ~80 languages
|
||||||
|
|
||||||
|
|
||||||
|
// Scan interchange-valid UTF-8 bytes and detect most likely language
|
||||||
|
// If the input is in fact not valid UTF-8, this returns immediately with
|
||||||
|
// the result value UNKNOWN_LANGUAGE and is_reliable set false.
|
||||||
|
//
|
||||||
|
// In all cases, valid_prefix_bytes will be set to the number of leading
|
||||||
|
// bytes that are valid UTF-8. If this is < buffer_length, there is invalid
|
||||||
|
// input starting at the following byte.
|
||||||
|
Language DetectLanguageCheckUTF8(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          bool* is_reliable,
                          int* valid_prefix_bytes) {
  // Measure the interchange-valid prefix first; the caller gets this count
  // back on every path.
  const int valid_len = SpanInterchangeValid(buffer, buffer_length);
  *valid_prefix_bytes = valid_len;

  // The whole buffer passed validation: hand off to the unchecked detector.
  if (valid_len >= buffer_length) {
    return DetectLanguage(buffer, buffer_length, is_plain_text, is_reliable);
  }

  // An invalid byte sits before the end of the buffer: refuse to detect.
  *is_reliable = false;
  return UNKNOWN_LANGUAGE;
}
|
||||||
|
|
||||||
// Scan interchange-valid UTF-8 bytes and detect most likely language
|
// Scan interchange-valid UTF-8 bytes and detect most likely language
|
||||||
Language DetectLanguage(
|
Language DetectLanguage(
|
||||||
@@ -272,7 +293,70 @@ Language ExtDetectLanguageSummary(
|
|||||||
return lang;
|
return lang;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Use this one.
|
// Use this one.
|
||||||
|
//
|
||||||
|
// Hints are collected into a struct.
|
||||||
|
// Flags are passed in (normally zero).
|
||||||
|
//
|
||||||
|
// Also returns 3 internal language scores as a ratio to
|
||||||
|
// normal score for real text in that language. Scores close to 1.0 indicate
|
||||||
|
// normal text, while scores far away from 1.0 indicate badly-skewed text or
|
||||||
|
// gibberish
|
||||||
|
//
|
||||||
|
// Returns a vector of chunks in different languages, so that caller may
|
||||||
|
// spell-check, translate, or otherwise process different parts of the input
|
||||||
|
// buffer in language-dependent ways.
|
||||||
|
//
|
||||||
|
// If the input is in fact not valid UTF-8, this returns immediately with
|
||||||
|
// the result value UNKNOWN_LANGUAGE and is_reliable set false.
|
||||||
|
//
|
||||||
|
// In all cases, valid_prefix_bytes will be set to the number of leading
|
||||||
|
// bytes that are valid UTF-8. If this is < buffer_length, there is invalid
|
||||||
|
// input starting at the following byte.
|
||||||
|
Language ExtDetectLanguageSummaryCheckUTF8(
                        const char* buffer,
                        int buffer_length,
                        bool is_plain_text,
                        const CLDHints* cld_hints,
                        int flags,
                        Language* language3,
                        int* percent3,
                        double* normalized_score3,
                        ResultChunkVector* resultchunkvector,
                        int* text_bytes,
                        bool* is_reliable,
                        int* valid_prefix_bytes) {
  // Always report how many leading bytes are interchange-valid UTF-8.
  *valid_prefix_bytes = SpanInterchangeValid(buffer, buffer_length);
  if (*valid_prefix_bytes < buffer_length) {
    // Invalid input: detection is skipped entirely. Put every output into a
    // defined "nothing detected" state so that a caller inspecting the
    // three-entry result arrays, the byte count, or the chunk vector after
    // this early return never reads indeterminate or stale values.
    for (int i = 0; i < 3; ++i) {
      if (language3) {language3[i] = UNKNOWN_LANGUAGE;}
      if (percent3) {percent3[i] = 0;}
      if (normalized_score3) {normalized_score3[i] = 0.0;}
    }
    if (text_bytes) {*text_bytes = 0;}
    if (resultchunkvector) {resultchunkvector->clear();}
    *is_reliable = false;
    return UNKNOWN_LANGUAGE;
  }

  // Fixed arguments for the full detector: extended languages are allowed
  // and no "plus one" language is supplied.
  bool allow_extended_lang = true;
  Language plus_one = UNKNOWN_LANGUAGE;

  Language lang = DetectLanguageSummaryV2(
                          buffer,
                          buffer_length,
                          is_plain_text,
                          cld_hints,
                          allow_extended_lang,
                          flags,
                          plus_one,
                          language3,
                          percent3,
                          normalized_score3,
                          resultchunkvector,
                          text_bytes,
                          is_reliable);
  // Do not default to English
  return lang;
}
|
||||||
|
|
||||||
|
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
|
||||||
|
// design because it went through a known-good conversion program.
|
||||||
|
//
|
||||||
// Hints are collected into a struct.
|
// Hints are collected into a struct.
|
||||||
// Flags are passed in (normally zero).
|
// Flags are passed in (normally zero).
|
||||||
//
|
//
|
||||||
@@ -318,5 +402,7 @@ Language ExtDetectLanguageSummary(
|
|||||||
return lang;
|
return lang;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
} // End namespace CLD2
|
} // End namespace CLD2
|
||||||
|
|
||||||
|
@@ -27,6 +27,7 @@
|
|||||||
#include "debug.h"
|
#include "debug.h"
|
||||||
#include "integral_types.h"
|
#include "integral_types.h"
|
||||||
#include "lang_script.h"
|
#include "lang_script.h"
|
||||||
|
#include "utf8acceptinterchange.h"
|
||||||
#include "utf8statetable.h"
|
#include "utf8statetable.h"
|
||||||
|
|
||||||
#ifdef CLD2_DYNAMIC_MODE
|
#ifdef CLD2_DYNAMIC_MODE
|
||||||
@@ -68,6 +69,16 @@ extern const CLD2TableSummary kDeltaOcta_obj;
|
|||||||
extern const CLD2TableSummary kDistinctOcta_obj;
|
extern const CLD2TableSummary kDistinctOcta_obj;
|
||||||
extern const short kAvgDeltaOctaScore[];
|
extern const short kAvgDeltaOctaScore[];
|
||||||
|
|
||||||
|
// Returns the length in bytes of the prefix of src that is all
|
||||||
|
// interchange valid UTF-8
|
||||||
|
int SpanInterchangeValid(const char* src, int byte_length) {
|
||||||
|
int bytes_consumed;
|
||||||
|
const UTF8ReplaceObj* st = &utf8acceptinterchange_obj;
|
||||||
|
StringPiece str(src, byte_length);
|
||||||
|
UTF8GenericScan(st, str, &bytes_consumed);
|
||||||
|
return bytes_consumed;
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef CLD2_DYNAMIC_MODE
|
#ifdef CLD2_DYNAMIC_MODE
|
||||||
// CLD2_DYNAMIC_MODE is defined:
|
// CLD2_DYNAMIC_MODE is defined:
|
||||||
// Data will be read from an mmap opened at runtime.
|
// Data will be read from an mmap opened at runtime.
|
||||||
@@ -426,6 +437,9 @@ inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}
|
|||||||
inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
|
inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
|
||||||
inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
|
inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
|
||||||
inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
|
inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
|
||||||
|
inline bool FlagBestEffort(int flags) {
|
||||||
|
return (flags & kCLDFlagBestEffort) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// Defines Top40 packed languages
|
// Defines Top40 packed languages
|
||||||
@@ -679,7 +693,7 @@ int CheapRepWordsInplace(char* isrc, int src_len, int* hash, int* tbl) {
|
|||||||
|
|
||||||
|
|
||||||
// This alternate form overwrites redundant words, thus avoiding corrupting the
|
// This alternate form overwrites redundant words, thus avoiding corrupting the
|
||||||
// backmap for generate a vector of original-text ranges.
|
// backmap for generating a vector of original-text ranges.
|
||||||
int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) {
|
int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) {
|
||||||
const uint8* src = reinterpret_cast<const uint8*>(isrc);
|
const uint8* src = reinterpret_cast<const uint8*>(isrc);
|
||||||
const uint8* srclimit = src + src_len;
|
const uint8* srclimit = src + src_len;
|
||||||
@@ -851,7 +865,7 @@ int CheapSqueezeInplace(char* isrc,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// This alternate form overwrites redundant words, thus avoiding corrupting the
|
// This alternate form overwrites redundant words, thus avoiding corrupting the
|
||||||
// backmap for generate a vector of original-text ranges.
|
// backmap for generating a vector of original-text ranges.
|
||||||
int CheapSqueezeInplaceOverwrite(char* isrc,
|
int CheapSqueezeInplaceOverwrite(char* isrc,
|
||||||
int src_len,
|
int src_len,
|
||||||
int ichunksize) {
|
int ichunksize) {
|
||||||
@@ -1402,7 +1416,8 @@ void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes,
|
|||||||
const Language* language3,
|
const Language* language3,
|
||||||
const int* percent3,
|
const int* percent3,
|
||||||
Language* summary_lang, bool* is_reliable,
|
Language* summary_lang, bool* is_reliable,
|
||||||
bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
|
bool FLAGS_cld2_html, bool FLAGS_cld2_quiet,
|
||||||
|
int flags) {
|
||||||
// Vector of active languages; changes if we delete some
|
// Vector of active languages; changes if we delete some
|
||||||
int slot_count = 3;
|
int slot_count = 3;
|
||||||
int active_slot[3] = {0, 1, 2};
|
int active_slot[3] = {0, 1, 2};
|
||||||
@@ -1417,7 +1432,7 @@ void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes,
|
|||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
if (language3[i] == TG_UNKNOWN_LANGUAGE) {
|
if (language3[i] == TG_UNKNOWN_LANGUAGE) {
|
||||||
ignore_percent += percent3[i];
|
ignore_percent += percent3[i];
|
||||||
// Move the rest up, levaing input vectors unchanged
|
// Move the rest up, leaving input vectors unchanged
|
||||||
for (int j=i+1; j < 3; ++j) {
|
for (int j=i+1; j < 3; ++j) {
|
||||||
active_slot[j - 1] = active_slot[j];
|
active_slot[j - 1] = active_slot[j];
|
||||||
}
|
}
|
||||||
@@ -1475,7 +1490,7 @@ void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// If return percent is too small (too many languages), return UNKNOWN
|
// If return percent is too small (too many languages), return UNKNOWN
|
||||||
if ((return_percent < kGoodFirstMinPercent)) {
|
if ((return_percent < kGoodFirstMinPercent) && !FlagBestEffort(flags)) {
|
||||||
if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
|
if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
|
||||||
fprintf(stderr, "{Unreli %s %d%% percent too small} ",
|
fprintf(stderr, "{Unreli %s %d%% percent too small} ",
|
||||||
LanguageCode(*summary_lang), return_percent);
|
LanguageCode(*summary_lang), return_percent);
|
||||||
@@ -1666,15 +1681,27 @@ void ApplyHints(const char* buffer,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Extend results to fully cover the [lo..hi) range
|
||||||
|
void FinishResultVector(int lo, int hi, ResultChunkVector* vec) {
|
||||||
|
if (vec == NULL) {return;}
|
||||||
|
if (vec->size() == 0) {return;}
|
||||||
|
ResultChunk* rc = &(*vec)[0];
|
||||||
|
if (rc->offset > lo) {
|
||||||
|
int diff = rc->offset - lo;
|
||||||
|
rc->offset -= diff;
|
||||||
|
rc->bytes += diff;
|
||||||
|
}
|
||||||
|
ResultChunk* rc2 = &(*vec)[vec->size() - 1];
|
||||||
|
int rc2hi = rc2->offset + rc2->bytes;
|
||||||
|
if (rc2hi < hi) {
|
||||||
|
int diff = hi - rc2hi;
|
||||||
|
rc2->bytes += diff;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// Results language3/percent3/text_bytes must be exactly three items
|
// Results language3/percent3/text_bytes must be exactly three items
|
||||||
Language DetectLanguageSummaryV2(
|
Language DetectLanguageSummaryV2(
|
||||||
@@ -1968,7 +1995,9 @@ Language DetectLanguageSummaryV2(
|
|||||||
// This is the real, non-recursive return
|
// This is the real, non-recursive return
|
||||||
|
|
||||||
// Move bytes for unreliable langs to another lang or UNKNOWN
|
// Move bytes for unreliable langs to another lang or UNKNOWN
|
||||||
RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet);
|
if (!FlagBestEffort(flags)) {
|
||||||
|
RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet);
|
||||||
|
}
|
||||||
|
|
||||||
// Redo the result extraction after the removal above
|
// Redo the result extraction after the removal above
|
||||||
doc_tote.Sort(3);
|
doc_tote.Sort(3);
|
||||||
@@ -1976,13 +2005,11 @@ Language DetectLanguageSummaryV2(
|
|||||||
reliable_percent3, language3, percent3, normalized_score3,
|
reliable_percent3, language3, percent3, normalized_score3,
|
||||||
text_bytes, is_reliable);
|
text_bytes, is_reliable);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Language summary_lang;
|
Language summary_lang;
|
||||||
CalcSummaryLang(&doc_tote, total_text_bytes,
|
CalcSummaryLang(&doc_tote, total_text_bytes,
|
||||||
reliable_percent3, language3, percent3,
|
reliable_percent3, language3, percent3,
|
||||||
&summary_lang, is_reliable,
|
&summary_lang, is_reliable,
|
||||||
FLAGS_cld2_html, FLAGS_cld2_quiet);
|
FLAGS_cld2_html, FLAGS_cld2_quiet, flags);
|
||||||
|
|
||||||
if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
|
if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
@@ -2015,6 +2042,9 @@ Language DetectLanguageSummaryV2(
|
|||||||
fprintf(stderr, "<br>\n");
|
fprintf(stderr, "<br>\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Extend results to fully cover the input buffer
|
||||||
|
FinishResultVector(0, buffer_length, resultchunkvector);
|
||||||
|
|
||||||
return summary_lang;
|
return summary_lang;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -126,6 +126,10 @@ At the end of the first pass --
|
|||||||
const uint32* kQuadValueTable;
|
const uint32* kQuadValueTable;
|
||||||
} LangDetObj;
|
} LangDetObj;
|
||||||
|
|
||||||
|
// Returns the length in bytes of the prefix of src that is all
|
||||||
|
// interchange valid UTF-8
|
||||||
|
int SpanInterchangeValid(const char* src, int byte_length);
|
||||||
|
|
||||||
// For HTML documents, tags are skipped, along with <script> ... </script>
|
// For HTML documents, tags are skipped, along with <script> ... </script>
|
||||||
// and <style> ... </style> sequences, and entities are expanded.
|
// and <style> ... </style> sequences, and entities are expanded.
|
||||||
//
|
//
|
||||||
|
@@ -42,7 +42,7 @@
|
|||||||
// HAUSA (Latin, Arabic)
|
// HAUSA (Latin, Arabic)
|
||||||
// KASHMIRI (Arabic, Devanagari)
|
// KASHMIRI (Arabic, Devanagari)
|
||||||
// KAZAKH (Latin, Cyrillic, Arabic)
|
// KAZAKH (Latin, Cyrillic, Arabic)
|
||||||
// KURDISH (Latin*, Arabic)
|
// KURDISH (Latin, Arabic)
|
||||||
// KYRGYZ (Cyrillic, Arabic)
|
// KYRGYZ (Cyrillic, Arabic)
|
||||||
// LIMBU (Devanagari, Limbu)
|
// LIMBU (Devanagari, Limbu)
|
||||||
// MONGOLIAN (Cyrillic, Mongolian)
|
// MONGOLIAN (Cyrillic, Mongolian)
|
||||||
@@ -56,8 +56,7 @@
|
|||||||
// UZBEK (Latin, Cyrillic, Arabic)
|
// UZBEK (Latin, Cyrillic, Arabic)
|
||||||
//
|
//
|
||||||
// * Due to a shortage of training text, AZERBAIJANI is not currently detected
|
// * Due to a shortage of training text, AZERBAIJANI is not currently detected
|
||||||
// in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in
|
// in Arabic or Cyrillic scripts, nor TAJIK in Arabic script.
|
||||||
// Arabic script.
|
|
||||||
//
|
//
|
||||||
|
|
||||||
#ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
|
#ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
|
||||||
@@ -65,10 +64,19 @@
|
|||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include "../internal/lang_script.h" // For Language
|
#include "../internal/integral_types.h" // For uint8 etc.
|
||||||
|
#include "../internal/lang_script.h" // For Language
|
||||||
|
|
||||||
namespace CLD2 {
|
namespace CLD2 {
|
||||||
|
|
||||||
|
// NOTE: If you cannot prove that the input text is valid UTF-8 by design because
|
||||||
|
// it went through a known-good conversion program, call one of the *CheckUTF8
|
||||||
|
// routines. For example, never trust raw user-supplied bytes. It is especially
|
||||||
|
// important to do a UTF8-to-UTF8 conversion on raw bytes that claim to be
|
||||||
|
// UTF-8, using a converter that guarantees to produce valid UTF-8, turning
|
||||||
|
// other byte sequences into the Unicode replacement character U+FFFD (deleting
|
||||||
|
// or turning into space or question-mark can create security holes).
|
||||||
|
|
||||||
// Scan interchange-valid UTF-8 bytes and detect most likely language,
|
// Scan interchange-valid UTF-8 bytes and detect most likely language,
|
||||||
// or set of languages.
|
// or set of languages.
|
||||||
//
|
//
|
||||||
@@ -129,19 +137,42 @@ namespace CLD2 {
|
|||||||
Language language_hint; // ITALIAN boosts it
|
Language language_hint; // ITALIAN boosts it
|
||||||
} CLDHints;
|
} CLDHints;
|
||||||
|
|
||||||
static const int kMaxResultChunkBytes = 65535;
|
static const int32 kMaxResultChunkBytes = 0x7fffffff;
|
||||||
|
|
||||||
|
// Note: this was initially over-optimized to fit into 8 bytes,
|
||||||
|
// causing too much work to deal with greater than 16-bit byte lengths.
|
||||||
// For returning a vector of per-language pieces of the input buffer
|
// For returning a vector of per-language pieces of the input buffer
|
||||||
// Unreliable and too-short are mapped to UNKNOWN_LANGUAGE
|
// Unreliable and too-short are mapped to UNKNOWN_LANGUAGE
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int offset; // Starting byte offset in original buffer
|
int offset; // Starting byte offset in original buffer
|
||||||
uint16 bytes; // Number of bytes in chunk
|
int32 bytes; // Number of bytes in chunk
|
||||||
uint16 lang1; // Top lang, as full Language. Apply
|
uint16 lang1; // Top lang, as full Language. Apply
|
||||||
// static_cast<Language>() to this short value.
|
// static_cast<Language>() to this short value.
|
||||||
|
uint16 pad; // Make multiple of 4 bytes
|
||||||
} ResultChunk;
|
} ResultChunk;
|
||||||
typedef std::vector<ResultChunk> ResultChunkVector;
|
typedef std::vector<ResultChunk> ResultChunkVector;
|
||||||
|
|
||||||
|
|
||||||
|
// These initial simple versions all cascade through the full-blown last
|
||||||
|
// version which it would be better for you to use directly because you will
|
||||||
|
// get better results passing in any available hints.
|
||||||
|
|
||||||
|
// Scan interchange-valid UTF-8 bytes and detect most likely language
|
||||||
|
// If the input is in fact not valid UTF-8, this returns immediately with
|
||||||
|
// the result value UNKNOWN_LANGUAGE and is_reliable set false.
|
||||||
|
//
|
||||||
|
// In all cases, valid_prefix_bytes will be set to the number of leading
|
||||||
|
// bytes that are valid UTF-8. If this is < buffer_length, there is invalid
|
||||||
|
// input starting at the following byte.
|
||||||
|
Language DetectLanguageCheckUTF8(
|
||||||
|
const char* buffer,
|
||||||
|
int buffer_length,
|
||||||
|
bool is_plain_text,
|
||||||
|
bool* is_reliable,
|
||||||
|
int* valid_prefix_bytes);
|
||||||
|
|
||||||
|
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
|
||||||
|
// design because it went through a known-good conversion program.
|
||||||
// Scan interchange-valid UTF-8 bytes and detect most likely language
|
// Scan interchange-valid UTF-8 bytes and detect most likely language
|
||||||
Language DetectLanguage(
|
Language DetectLanguage(
|
||||||
const char* buffer,
|
const char* buffer,
|
||||||
@@ -149,6 +180,8 @@ namespace CLD2 {
|
|||||||
bool is_plain_text,
|
bool is_plain_text,
|
||||||
bool* is_reliable);
|
bool* is_reliable);
|
||||||
|
|
||||||
|
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
|
||||||
|
// design because it went through a known-good conversion program.
|
||||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
|
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
|
||||||
// language3[0] is usually also the return value
|
// language3[0] is usually also the return value
|
||||||
Language DetectLanguageSummary(
|
Language DetectLanguageSummary(
|
||||||
@@ -160,6 +193,8 @@ namespace CLD2 {
|
|||||||
int* text_bytes,
|
int* text_bytes,
|
||||||
bool* is_reliable);
|
bool* is_reliable);
|
||||||
|
|
||||||
|
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
|
||||||
|
// design because it went through a known-good conversion program.
|
||||||
// Same as above, with hints supplied
|
// Same as above, with hints supplied
|
||||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
|
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
|
||||||
// language3[0] is usually also the return value
|
// language3[0] is usually also the return value
|
||||||
@@ -175,6 +210,8 @@ namespace CLD2 {
|
|||||||
int* text_bytes,
|
int* text_bytes,
|
||||||
bool* is_reliable);
|
bool* is_reliable);
|
||||||
|
|
||||||
|
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
|
||||||
|
// design because it went through a known-good conversion program.
|
||||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
|
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
|
||||||
// languages.
|
// languages.
|
||||||
//
|
//
|
||||||
@@ -191,6 +228,8 @@ namespace CLD2 {
|
|||||||
int* text_bytes,
|
int* text_bytes,
|
||||||
bool* is_reliable);
|
bool* is_reliable);
|
||||||
|
|
||||||
|
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
|
||||||
|
// design because it went through a known-good conversion program.
|
||||||
// Same as above, with hints supplied
|
// Same as above, with hints supplied
|
||||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
|
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
|
||||||
// languages.
|
// languages.
|
||||||
@@ -211,6 +250,8 @@ namespace CLD2 {
|
|||||||
int* text_bytes,
|
int* text_bytes,
|
||||||
bool* is_reliable);
|
bool* is_reliable);
|
||||||
|
|
||||||
|
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
|
||||||
|
// design because it went through a known-good conversion program.
|
||||||
// Same as above, and also returns 3 internal language scores as a ratio to
|
// Same as above, and also returns 3 internal language scores as a ratio to
|
||||||
// normal score for real text in that language. Scores close to 1.0 indicate
|
// normal score for real text in that language. Scores close to 1.0 indicate
|
||||||
// normal text, while scores far away from 1.0 indicate badly-skewed text or
|
// normal text, while scores far away from 1.0 indicate badly-skewed text or
|
||||||
@@ -231,6 +272,42 @@ namespace CLD2 {
|
|||||||
|
|
||||||
|
|
||||||
// Use this one.
|
// Use this one.
|
||||||
|
//
|
||||||
|
// Hints are collected into a struct.
|
||||||
|
// Flags are passed in (normally zero).
|
||||||
|
//
|
||||||
|
// Also returns 3 internal language scores as a ratio to
|
||||||
|
// normal score for real text in that language. Scores close to 1.0 indicate
|
||||||
|
// normal text, while scores far away from 1.0 indicate badly-skewed text or
|
||||||
|
// gibberish
|
||||||
|
//
|
||||||
|
// Returns a vector of chunks in different languages, so that caller may
|
||||||
|
// spell-check, translate, or otherwise process different parts of the input
|
||||||
|
// buffer in language-dependent ways.
|
||||||
|
//
|
||||||
|
// If the input is in fact not valid UTF-8, this returns immediately with
|
||||||
|
// the result value UNKNOWN_LANGUAGE and is_reliable set false.
|
||||||
|
//
|
||||||
|
// In all cases, valid_prefix_bytes will be set to the number of leading
|
||||||
|
// bytes that are valid UTF-8. If this is < buffer_length, there is invalid
|
||||||
|
// input starting at the following byte.
|
||||||
|
Language ExtDetectLanguageSummaryCheckUTF8(
|
||||||
|
const char* buffer,
|
||||||
|
int buffer_length,
|
||||||
|
bool is_plain_text,
|
||||||
|
const CLDHints* cld_hints,
|
||||||
|
int flags,
|
||||||
|
Language* language3,
|
||||||
|
int* percent3,
|
||||||
|
double* normalized_score3,
|
||||||
|
ResultChunkVector* resultchunkvector,
|
||||||
|
int* text_bytes,
|
||||||
|
bool* is_reliable,
|
||||||
|
int* valid_prefix_bytes);
|
||||||
|
|
||||||
|
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
|
||||||
|
// design because it went through a known-good conversion program.
|
||||||
|
//
|
||||||
// Hints are collected into a struct.
|
// Hints are collected into a struct.
|
||||||
// Flags are passed in (normally zero).
|
// Flags are passed in (normally zero).
|
||||||
//
|
//
|
||||||
@@ -268,6 +345,8 @@ namespace CLD2 {
|
|||||||
static const int kCLDFlagVerbose = 0x0800; // More debug HTML => stderr
|
static const int kCLDFlagVerbose = 0x0800; // More debug HTML => stderr
|
||||||
static const int kCLDFlagQuiet = 0x1000; // Less debug HTML => stderr
|
static const int kCLDFlagQuiet = 0x1000; // Less debug HTML => stderr
|
||||||
static const int kCLDFlagEcho = 0x2000; // Echo input => stderr
|
static const int kCLDFlagEcho = 0x2000; // Echo input => stderr
|
||||||
|
static const int kCLDFlagBestEffort = 0x4000; // Give best-effort answer,
|
||||||
|
// even on short text
|
||||||
|
|
||||||
|
|
||||||
/***
|
/***
|
||||||
@@ -290,6 +369,10 @@ Flag meanings:
|
|||||||
In that HTML file, suppress most of the output detail.
|
In that HTML file, suppress most of the output detail.
|
||||||
kCLDFlagEcho
|
kCLDFlagEcho
|
||||||
Echo every input buffer to stderr.
|
Echo every input buffer to stderr.
|
||||||
|
kCLDFlagBestEffort
|
||||||
|
Give best-effort answer, instead of UNKNOWN_LANGUAGE. May be useful for
|
||||||
|
short text if the caller prefers an approximate answer over none.
|
||||||
|
|
||||||
***/
|
***/
|
||||||
|
|
||||||
// Debug output: Print the resultchunkvector to file f
|
// Debug output: Print the resultchunkvector to file f
|
||||||
|
Reference in New Issue
Block a user