Initial CLD2 source upload.
git-svn-id: https://cld2.googlecode.com/svn/trunk@3 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
300
public/compact_lang_det.h
Normal file
300
public/compact_lang_det.h
Normal file
@@ -0,0 +1,300 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
// NOTE:
|
||||
// Baybayin (ancient script of the Philippines) is detected as TAGALOG.
|
||||
// Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
|
||||
// HAITIAN_CREOLE is detected as such.
|
||||
// NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
|
||||
// PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
|
||||
// ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN.
|
||||
// BOSNIAN is not detected as such, but likely scores as Croatian or Serbian.
|
||||
// MONTENEGRIN is not detected as such, but likely scores as Serbian.
|
||||
// CROATIAN is detected in the Latin script
|
||||
// SERBIAN is detected in the Cyrillic and Latin scripts
|
||||
// Zhuang is detected in the Latin script only.
|
||||
//
|
||||
// The languages X_PIG_LATIN and X_KLINGON are detected in the
|
||||
// extended calls ExtDetectLanguageSummary().
|
||||
//
|
||||
// UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
|
||||
// is high enough. This happens with non-text input such as the bytes of a
|
||||
// JPEG, and also with text in languages outside the training set.
|
||||
//
|
||||
// The following languages are to be detected in multiple scripts:
|
||||
// AZERBAIJANI (Latin, Cyrillic*, Arabic*)
|
||||
// BURMESE (Latin, Myanmar)
|
||||
// HAUSA (Latin, Arabic)
|
||||
// KASHMIRI (Arabic, Devanagari)
|
||||
// KAZAKH (Latin, Cyrillic, Arabic)
|
||||
// KURDISH (Latin*, Arabic)
|
||||
// KYRGYZ (Cyrillic, Arabic)
|
||||
// LIMBU (Devanagari, Limbu)
|
||||
// MONGOLIAN (Cyrillic, Mongolian)
|
||||
// SANSKRIT (Latin, Devanagari)
|
||||
// SINDHI (Arabic, Devanagari)
|
||||
// TAGALOG (Latin, Tagalog)
|
||||
// TAJIK (Cyrillic, Arabic*)
|
||||
// TATAR (Latin, Cyrillic, Arabic)
|
||||
// TURKMEN (Latin, Cyrillic, Arabic)
|
||||
// UIGHUR (Latin, Cyrillic, Arabic)
|
||||
// UZBEK (Latin, Cyrillic, Arabic)
|
||||
//
|
||||
// * Due to a shortage of training text, AZERBAIJANI is not currently detected
|
||||
// in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in
|
||||
// Arabic script.
|
||||
//
|
||||
|
||||
#ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
|
||||
#define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
|
||||
|
||||
#include <vector>
|
||||
#include "../internal/lang_script.h" // For Language
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Scan interchange-valid UTF-8 bytes and detect most likely language,
|
||||
// or set of languages.
|
||||
//
|
||||
// Design goals:
|
||||
// Skip over big stretches of HTML tags
|
||||
// Able to return ranges of different languages
|
||||
// Relatively small tables and relatively fast processing
|
||||
// Thread safe
|
||||
//
|
||||
// For HTML documents, tags are skipped, along with <script> ... </script>
|
||||
// and <style> ... </style> sequences, and entities are expanded.
|
||||
//
|
||||
// We distinguish between bytes of the raw input buffer and bytes of non-tag
|
||||
// text letters. Since tags can be over 50% of the bytes of an HTML Page,
|
||||
// and are nearly all seven-bit ASCII English, we prefer to distinguish
|
||||
// language mixture fractions based on just the non-tag text.
|
||||
//
|
||||
// Inputs: text and text_length
|
||||
// Code skips HTML tags and expands HTML entities, unless
|
||||
// is_plain_text is true
|
||||
// Outputs:
|
||||
// language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
|
||||
// percent3 is an array of the text percentages 0..100 of the top 3 languages
|
||||
// text_bytes is the amount of non-tag/letters-only text found
|
||||
// is_reliable set true if the returned Language is some amount more
|
||||
// probable than the second-best Language. Calculation is a complex function
|
||||
// of the length of the text and the different-script runs of text.
|
||||
// Return value: the most likely Language for the majority of the input text
|
||||
// Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
|
||||
// defaults to ENGLISH.
|
||||
//
|
||||
// The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for
|
||||
// backwards compatibility with a different detector.
|
||||
//
|
||||
// The third version may return UNKNOWN_LANGUAGE, and also returns extended
|
||||
// language codes from lang_script.h
|
||||
//
|
||||
|
||||
|
||||
// Instead of individual arguments, pass in hints as an initialized struct
|
||||
// Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known.
|
||||
//
|
||||
// Pass in hints whenever possible; doing so improves detection accuracy. The
|
||||
// set of passed-in hints are all information that is external to the text
|
||||
// itself.
|
||||
//
|
||||
// The content_language_hint is intended to come from an HTTP header
|
||||
// Content-Language: field, the tld_hint from the hostname of a URL, the
|
||||
// encoding_hint from an encoding detector applied to the input
|
||||
// document, and the language hint from any other context you might have.
|
||||
// The lang= tags inside an HTML document will be picked up as hints
|
||||
// by code within the compact language detector.
|
||||
|
||||
// Bundle of optional hints that are external to the text being scanned.
// Fields with no known value are initialized to
// {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} respectively.
typedef struct {
|
||||
const char* content_language_hint; // From an HTTP Content-Language: field; e.g. "mi,en" boosts Maori and English
|
||||
const char* tld_hint; // From the hostname of a URL; e.g. "id" boosts Indonesian
|
||||
int encoding_hint; // From an encoding detector; e.g. SJS boosts Japanese. UNKNOWN_ENCODING if not known (presumably a CLD2::Encoding value stored as int -- confirm)
|
||||
Language language_hint; // From any other context; e.g. ITALIAN boosts Italian. UNKNOWN_LANGUAGE if not known
|
||||
} CLDHints;
|
||||
|
||||
static const int kMaxResultChunkBytes = 65535;
|
||||
|
||||
// For returning a vector of per-language pieces of the input buffer
|
||||
// Unreliable and too-short are mapped to UNKNOWN_LANGUAGE
|
||||
// One contiguous piece of the input buffer together with the top language
// detected for it.
typedef struct {
|
||||
int offset; // Starting byte offset in original buffer
|
||||
uint16 bytes; // Number of bytes in chunk. 16-bit field, so at most 65535
|
||||
// (presumably why kMaxResultChunkBytes is 65535 -- confirm)
uint16 lang1; // Top lang, as full Language, stored as a 16-bit value. Apply
|
||||
// static_cast<Language>() to this short value.
|
||||
} ResultChunk;
|
||||
typedef std::vector<ResultChunk> ResultChunkVector;
|
||||
|
||||
|
||||
// Scan interchange-valid UTF-8 bytes and detect most likely language
|
||||
Language DetectLanguage(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
bool* is_reliable);
|
||||
|
||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
|
||||
// language3[0] is usually also the return value
|
||||
Language DetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable);
|
||||
|
||||
// Same as above, with hints supplied
|
||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
|
||||
// language3[0] is usually also the return value
|
||||
Language DetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
const char* tld_hint, // "id" boosts Indonesian
|
||||
int encoding_hint, // SJS boosts Japanese
|
||||
Language language_hint, // ITALIAN boosts it
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable);
|
||||
|
||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
|
||||
// languages.
|
||||
//
|
||||
// Extended languages are additional interface languages and Unicode
|
||||
// single-language scripts, from lang_script.h
|
||||
//
|
||||
// language3[0] is usually also the return value
|
||||
Language ExtDetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable);
|
||||
|
||||
// Same as above, with hints supplied
|
||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
|
||||
// languages.
|
||||
//
|
||||
// Extended languages are additional Google interface languages and Unicode
|
||||
// single-language scripts, from lang_script.h
|
||||
//
|
||||
// language3[0] is usually also the return value
|
||||
Language ExtDetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
const char* tld_hint, // "id" boosts Indonesian
|
||||
int encoding_hint, // SJS boosts Japanese
|
||||
Language language_hint, // ITALIAN boosts it
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable);
|
||||
|
||||
// Same as above, and also returns 3 internal language scores as a ratio to
|
||||
// normal score for real text in that language. Scores close to 1.0 indicate
|
||||
// normal text, while scores far away from 1.0 indicate badly-skewed text or
|
||||
// gibberish
|
||||
//
|
||||
Language ExtDetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
const char* tld_hint, // "id" boosts Indonesian
|
||||
int encoding_hint, // SJS boosts Japanese
|
||||
Language language_hint, // ITALIAN boosts it
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
double* normalized_score3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable);
|
||||
|
||||
|
||||
// Use this one.
|
||||
// Hints are collected into a struct.
|
||||
// Flags are passed in (normally zero).
|
||||
//
|
||||
// Also returns 3 internal language scores as a ratio to
|
||||
// normal score for real text in that language. Scores close to 1.0 indicate
|
||||
// normal text, while scores far away from 1.0 indicate badly-skewed text or
|
||||
// gibberish
|
||||
//
|
||||
// Returns a vector of chunks in different languages, so that caller may
|
||||
// spell-check, translate, or otherwise process different parts of the input
|
||||
// buffer in language-dependent ways.
|
||||
//
|
||||
Language ExtDetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
const CLDHints* cld_hints,
|
||||
int flags,
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
double* normalized_score3,
|
||||
ResultChunkVector* resultchunkvector,
|
||||
int* text_bytes,
|
||||
bool* is_reliable);
|
||||
|
||||
// Return version text string
|
||||
// String is "code_version - data_build_date"
|
||||
const char* DetectLanguageVersion();
|
||||
|
||||
|
||||
// Public use flags, debug output controls
|
||||
static const int kCLDFlagScoreAsQuads = 0x0100; // Force Greek, etc. => quads
|
||||
static const int kCLDFlagHtml = 0x0200; // Debug HTML => stderr
|
||||
static const int kCLDFlagCr = 0x0400; // <cr> per chunk if HTML
|
||||
static const int kCLDFlagVerbose = 0x0800; // More debug HTML => stderr
|
||||
static const int kCLDFlagQuiet = 0x1000; // Less debug HTML => stderr
|
||||
static const int kCLDFlagEcho = 0x2000; // Echo input => stderr
|
||||
|
||||
|
||||
/***
|
||||
|
||||
Flag meanings:
|
||||
kCLDFlagScoreAsQuads
|
||||
Normally, several languages are detected solely by their Unicode script.
|
||||
Combined with appropriate lookup tables, this flag forces them instead
|
||||
to be detected via quadgrams. This can be a useful refinement when looking
|
||||
for meaningful text in these languages, instead of just character sets.
|
||||
The default tables do not support this use.
|
||||
kCLDFlagHtml
|
||||
For each detection call, write an HTML file to stderr, showing the text
|
||||
chunks and their detected languages.
|
||||
kCLDFlagCr
|
||||
In that HTML file, force a new line for each chunk.
|
||||
kCLDFlagVerbose
|
||||
In that HTML file, show every lookup entry.
|
||||
kCLDFlagQuiet
|
||||
In that HTML file, suppress most of the output detail.
|
||||
kCLDFlagEcho
|
||||
Echo every input buffer to stderr.
|
||||
***/
|
||||
|
||||
// Debug output: Print the resultchunkvector to file f
|
||||
// Debug helper: prints the contents of resultchunkvector to file f.
//   f                  Output stream to print to.
//   src                Presumably the original input buffer that the chunk
//                      offsets refer to -- confirm against the implementation.
//   resultchunkvector  Chunk list to print.
// NOTE(review): FILE is used here, but this header only includes <vector> and
// lang_script.h directly; confirm that <stdio.h>/<cstdio> is pulled in
// transitively, otherwise the header is not self-contained.
void DumpResultChunkVector(FILE* f, const char* src,
|
||||
ResultChunkVector* resultchunkvector);
|
||||
|
||||
}; // End namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
|
169
public/encodings.h
Normal file
169
public/encodings.h
Normal file
@@ -0,0 +1,169 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
#ifndef I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__
|
||||
#define I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
enum Encoding {
|
||||
ISO_8859_1 = 0, // ASCII
|
||||
ISO_8859_2 = 1, // Latin2
|
||||
ISO_8859_3 = 2, //
|
||||
ISO_8859_4 = 3, // Latin4
|
||||
ISO_8859_5 = 4, // ISO-8859-5
|
||||
ISO_8859_6 = 5, // Arabic
|
||||
ISO_8859_7 = 6, // Greek
|
||||
ISO_8859_8 = 7, // Hebrew
|
||||
ISO_8859_9 = 8, //
|
||||
ISO_8859_10 = 9, //
|
||||
JAPANESE_EUC_JP = 10, // EUC_JP
|
||||
JAPANESE_SHIFT_JIS = 11, // SJS
|
||||
JAPANESE_JIS = 12, // JIS
|
||||
CHINESE_BIG5 = 13, // BIG5
|
||||
CHINESE_GB = 14, // GB
|
||||
CHINESE_EUC_CN = 15, // Misnamed. Should be EUC_TW. Was Basis Tech
|
||||
// CNS11643EUC, before that EUC-CN(!)
|
||||
KOREAN_EUC_KR = 16, // KSC
|
||||
UNICODE = 17, // Unicode
|
||||
CHINESE_EUC_DEC = 18, // Misnamed. Should be EUC_TW. Was
|
||||
// CNS11643EUC, before that EUC.
|
||||
CHINESE_CNS = 19, // Misnamed. Should be EUC_TW. Was
|
||||
// CNS11643EUC, before that CNS.
|
||||
CHINESE_BIG5_CP950 = 20, // BIG5_CP950
|
||||
JAPANESE_CP932 = 21, // CP932
|
||||
UTF8 = 22,
|
||||
UNKNOWN_ENCODING = 23,
|
||||
ASCII_7BIT = 24, // ISO_8859_1 with all characters <= 127.
|
||||
RUSSIAN_KOI8_R = 25, // KOI8R
|
||||
RUSSIAN_CP1251 = 26, // CP1251
|
||||
|
||||
//----------------------------------------------------------
|
||||
MSFT_CP1252 = 27, // 27: CP1252 aka MSFT euro ascii
|
||||
RUSSIAN_KOI8_RU = 28, // CP21866 aka KOI8-U, used for Ukrainian.
|
||||
// Misnamed, this is _not_ KOI8-RU but KOI8-U.
|
||||
// KOI8-U is used much more often than KOI8-RU.
|
||||
MSFT_CP1250 = 29, // CP1250 aka MSFT eastern european
|
||||
ISO_8859_15 = 30, // aka ISO_8859_0 aka ISO_8859_1 euroized
|
||||
//----------------------------------------------------------
|
||||
|
||||
//----------------------------------------------------------
|
||||
MSFT_CP1254 = 31, // used for Turkish
|
||||
MSFT_CP1257 = 32, // used in Baltic countries
|
||||
//----------------------------------------------------------
|
||||
|
||||
//----------------------------------------------------------
|
||||
//----------------------------------------------------------
|
||||
ISO_8859_11 = 33, // aka TIS-620, used for Thai
|
||||
MSFT_CP874 = 34, // used for Thai
|
||||
MSFT_CP1256 = 35, // used for Arabic
|
||||
|
||||
//----------------------------------------------------------
|
||||
MSFT_CP1255 = 36, // Logical Hebrew Microsoft
|
||||
ISO_8859_8_I = 37, // Iso Hebrew Logical
|
||||
HEBREW_VISUAL = 38, // Iso Hebrew Visual
|
||||
//----------------------------------------------------------
|
||||
|
||||
//----------------------------------------------------------
|
||||
CZECH_CP852 = 39,
|
||||
CZECH_CSN_369103 = 40, // aka ISO_IR_139 aka KOI8_CS
|
||||
MSFT_CP1253 = 41, // used for Greek
|
||||
RUSSIAN_CP866 = 42,
|
||||
//----------------------------------------------------------
|
||||
|
||||
//----------------------------------------------------------
|
||||
// Handled by iconv in glibc
|
||||
ISO_8859_13 = 43,
|
||||
ISO_2022_KR = 44,
|
||||
GBK = 45,
|
||||
GB18030 = 46,
|
||||
BIG5_HKSCS = 47,
|
||||
ISO_2022_CN = 48,
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Following 4 encodings are deprecated (font encodings)
|
||||
TSCII = 49,
|
||||
TAMIL_MONO = 50,
|
||||
TAMIL_BI = 51,
|
||||
JAGRAN = 52,
|
||||
|
||||
|
||||
MACINTOSH_ROMAN = 53,
|
||||
UTF7 = 54,
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Following 2 encodings are deprecated (font encodings)
|
||||
BHASKAR = 55, // Indic encoding - Devanagari
|
||||
HTCHANAKYA = 56, // 56 Indic encoding - Devanagari
|
||||
|
||||
//-----------------------------------------------------------
|
||||
UTF16BE = 57, // big-endian UTF-16
|
||||
UTF16LE = 58, // little-endian UTF-16
|
||||
UTF32BE = 59, // big-endian UTF-32
|
||||
UTF32LE = 60, // little-endian UTF-32
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// An encoding that means "This is not text, but it may have some
|
||||
// simple ASCII text embedded". Intended input conversion
|
||||
// is to keep strings of >=4 seven-bit ASCII characters
|
||||
BINARYENC = 61,
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Some Web pages allow a mixture of HZ-GB and GB-2312 by using
|
||||
// ~{ ... ~} for 2-byte pairs, and the browsers support this.
|
||||
HZ_GB_2312 = 62,
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Some external vendors make the common input error of
|
||||
// converting MSFT_CP1252 to UTF8 *twice*.
|
||||
UTF8UTF8 = 63,
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Following 6 encodings are deprecated (font encodings)
|
||||
TAM_ELANGO = 64, // Elango - Tamil
|
||||
TAM_LTTMBARANI = 65, // Barani - Tamil
|
||||
TAM_SHREE = 66, // Shree - Tamil
|
||||
TAM_TBOOMIS = 67, // TBoomis - Tamil
|
||||
TAM_TMNEWS = 68, // TMNews - Tamil
|
||||
TAM_WEBTAMIL = 69, // Webtamil - Tamil
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Shift_JIS variants used by Japanese cell phone carriers.
|
||||
KDDI_SHIFT_JIS = 70,
|
||||
DOCOMO_SHIFT_JIS = 71,
|
||||
SOFTBANK_SHIFT_JIS = 72,
|
||||
// ISO-2022-JP variants used by KDDI and SoftBank.
|
||||
KDDI_ISO_2022_JP = 73,
|
||||
SOFTBANK_ISO_2022_JP = 74,
|
||||
//-----------------------------------------------------------
|
||||
|
||||
NUM_ENCODINGS = 75, // Always keep this at the end. It is not a
|
||||
// valid Encoding enum, it is only used to
|
||||
// indicate the total number of Encodings.
|
||||
};
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__
|
||||
|
||||
|
Reference in New Issue
Block a user