Initial CLD2 source upload.

git-svn-id: https://cld2.googlecode.com/svn/trunk@3 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
dsites@google.com
2013-06-24 23:52:45 +00:00
parent 84f0cff75d
commit af2750714b
65 changed files with 136486 additions and 0 deletions

300
public/compact_lang_det.h Normal file
View File

@@ -0,0 +1,300 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
// NOTE:
// Baybayin (ancient script of the Philippines) is detected as TAGALOG.
// Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
// HAITIAN_CREOLE is detected as such.
// NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
// PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
// ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN.
// BOSNIAN is not detected as such, but likely scores as Croatian or Serbian.
// MONTENEGRIN is not detected as such, but likely scores as Serbian.
// CROATIAN is detected in the Latin script
// SERBIAN is detected in the Cyrillic and Latin scripts.
// Zhuang is detected in the Latin script only.
//
// The languages X_PIG_LATIN and X_KLINGON are detected in the
// extended calls ExtDetectLanguageSummary().
//
// UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
// is high enough. This happens with non-text input such as the bytes of a
// JPEG, and also with text in languages outside the training set.
//
// The following languages are to be detected in multiple scripts:
// AZERBAIJANI (Latin, Cyrillic*, Arabic*)
// BURMESE (Latin, Myanmar)
// HAUSA (Latin, Arabic)
// KASHMIRI (Arabic, Devanagari)
// KAZAKH (Latin, Cyrillic, Arabic)
// KURDISH (Latin*, Arabic)
// KYRGYZ (Cyrillic, Arabic)
// LIMBU (Devanagari, Limbu)
// MONGOLIAN (Cyrillic, Mongolian)
// SANSKRIT (Latin, Devanagari)
// SINDHI (Arabic, Devanagari)
// TAGALOG (Latin, Tagalog)
// TAJIK (Cyrillic, Arabic*)
// TATAR (Latin, Cyrillic, Arabic)
// TURKMEN (Latin, Cyrillic, Arabic)
// UIGHUR (Latin, Cyrillic, Arabic)
// UZBEK (Latin, Cyrillic, Arabic)
//
// * Due to a shortage of training text, AZERBAIJANI is not currently detected
// in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in
// Arabic script.
//
#ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
#define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
#include <vector>
#include "../internal/lang_script.h" // For Language
namespace CLD2 {
// Scan interchange-valid UTF-8 bytes and detect most likely language,
// or set of languages.
//
// Design goals:
// Skip over big stretches of HTML tags
// Able to return ranges of different languages
// Relatively small tables and relatively fast processing
// Thread safe
//
// For HTML documents, tags are skipped, along with <script> ... </script>
// and <style> ... </style> sequences, and entities are expanded.
//
// We distinguish between bytes of the raw input buffer and bytes of non-tag
// text letters. Since tags can be over 50% of the bytes of an HTML Page,
// and are nearly all seven-bit ASCII English, we prefer to distinguish
// language mixture fractions based on just the non-tag text.
//
// Inputs: text and text_length
// Code skips HTML tags and expands HTML entities, unless
// is_plain_text is true
// Outputs:
// language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
// percent3 is an array of the text percentages 0..100 of the top 3 languages
// text_bytes is the amount of non-tag/letters-only text found
// is_reliable set true if the returned Language is some amount more
// probable than the second-best Language. Calculation is a complex function
// of the length of the text and the different-script runs of text.
// Return value: the most likely Language for the majority of the input text
// Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
// defaults to ENGLISH.
//
// The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for
// backwards compatibility with a different detector.
//
// The third version may return UNKNOWN_LANGUAGE, and also returns extended
// language codes from lang_script.h
//
// Instead of individual arguments, pass in hints as an initialized struct
// Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known.
//
// Pass in hints whenever possible; doing so improves detection accuracy. The
// set of passed-in hints are all information that is external to the text
// itself.
//
// The content_language_hint is intended to come from an HTTP header
// Content-Language: field, the tld_hint from the hostname of a URL, the
// encoding_hint from an encoding detector applied to the input
// document, and the language_hint from any other context you might have.
// The lang= tags inside an HTML document will be picked up as hints
// by code within the compact language detector.
// Hints that come from outside the text being scanned. When nothing is
// known, initialize to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE}.
typedef struct {
  // Content-Language style value; e.g. "mi,en" boosts Maori and English.
  const char* content_language_hint;

  // Top-level domain taken from a URL hostname; e.g. "id" boosts Indonesian.
  const char* tld_hint;

  // Encoding of the input document; e.g. SJS boosts Japanese.
  int encoding_hint;

  // Language known from other context; e.g. ITALIAN boosts it.
  Language language_hint;
} CLDHints;
// Maximum number of bytes in a single result chunk, judging by the name.
// 0xFFFF (65535) is also the largest value an unsigned 16-bit field can hold.
static const int kMaxResultChunkBytes = 0xFFFF;
// One per-language piece of the input buffer; pieces are returned to the
// caller in a vector. Pieces that are unreliable or too short are mapped to
// UNKNOWN_LANGUAGE.
typedef struct {
  // Byte offset in the original buffer at which this chunk starts.
  int offset;

  // Number of bytes in the chunk.
  uint16 bytes;

  // Top language for the chunk: a full Language stored as a short value.
  // Apply static_cast<Language>() to recover the Language.
  uint16 lang1;
} ResultChunk;
// Vector of ResultChunk entries, i.e. the per-language pieces of one input
// buffer.
typedef std::vector<ResultChunk> ResultChunkVector;
// Scans interchange-valid UTF-8 bytes and returns the most likely language.
//   buffer, buffer_length: the text to scan and its length
//   is_plain_text: when true, HTML tags are not skipped and HTML entities
//     are not expanded
//   is_reliable: set true if the returned Language is some amount more
//     probable than the second-best Language
Language DetectLanguage(const char* buffer, int buffer_length,
                        bool is_plain_text, bool* is_reliable);
// Scans interchange-valid UTF-8 bytes and reports the top 3 languages.
//   buffer, buffer_length: the text to scan and its length
//   is_plain_text: when true, HTML tags are not skipped and HTML entities
//     are not expanded
//   language3: array of the top 3 languages, or UNKNOWN_LANGUAGE
//   percent3: array of the text percentages 0..100 of the top 3 languages
//   text_bytes: the amount of non-tag/letters-only text found
//   is_reliable: set true if the returned Language is some amount more
//     probable than the second-best Language
// language3[0] is usually also the return value.
Language DetectLanguageSummary(const char* buffer, int buffer_length,
                               bool is_plain_text,
                               Language* language3, int* percent3,
                               int* text_bytes, bool* is_reliable);
// DetectLanguageSummary() with hints supplied: scans interchange-valid UTF-8
// bytes and reports the top 3 languages.
//   tld_hint:      e.g. "id" boosts Indonesian
//   encoding_hint: e.g. SJS boosts Japanese
//   language_hint: e.g. ITALIAN boosts it
//   language3: array of the top 3 languages, or UNKNOWN_LANGUAGE
//   percent3: array of the text percentages 0..100 of the top 3 languages
//   text_bytes: the amount of non-tag/letters-only text found
//   is_reliable: set true if the returned Language is some amount more
//     probable than the second-best Language
// language3[0] is usually also the return value.
Language DetectLanguageSummary(const char* buffer, int buffer_length,
                               bool is_plain_text,
                               const char* tld_hint,
                               int encoding_hint,
                               Language language_hint,
                               Language* language3, int* percent3,
                               int* text_bytes, bool* is_reliable);
// Scans interchange-valid UTF-8 bytes and reports the top 3 extended
// languages.
//
// Extended languages are additional interface languages and Unicode
// single-language scripts, from lang_script.h.
//
//   language3: array of the top 3 languages, or UNKNOWN_LANGUAGE
//   percent3: array of the text percentages 0..100 of the top 3 languages
//   text_bytes: the amount of non-tag/letters-only text found
//   is_reliable: set true if the returned Language is some amount more
//     probable than the second-best Language
// language3[0] is usually also the return value.
Language ExtDetectLanguageSummary(const char* buffer, int buffer_length,
                                  bool is_plain_text,
                                  Language* language3, int* percent3,
                                  int* text_bytes, bool* is_reliable);
// ExtDetectLanguageSummary() with hints supplied: scans interchange-valid
// UTF-8 bytes and reports the top 3 extended languages.
//
// Extended languages are additional Google interface languages and Unicode
// single-language scripts, from lang_script.h.
//
//   tld_hint:      e.g. "id" boosts Indonesian
//   encoding_hint: e.g. SJS boosts Japanese
//   language_hint: e.g. ITALIAN boosts it
//   language3: array of the top 3 languages, or UNKNOWN_LANGUAGE
//   percent3: array of the text percentages 0..100 of the top 3 languages
//   text_bytes: the amount of non-tag/letters-only text found
//   is_reliable: set true if the returned Language is some amount more
//     probable than the second-best Language
// language3[0] is usually also the return value.
Language ExtDetectLanguageSummary(const char* buffer, int buffer_length,
                                  bool is_plain_text,
                                  const char* tld_hint,
                                  int encoding_hint,
                                  Language language_hint,
                                  Language* language3, int* percent3,
                                  int* text_bytes, bool* is_reliable);
// ExtDetectLanguageSummary() with hints supplied, additionally returning 3
// internal language scores in normalized_score3. Each score is a ratio to
// the normal score for real text in that language: scores close to 1.0
// indicate normal text, while scores far away from 1.0 indicate badly-skewed
// text or gibberish.
//
//   tld_hint:      e.g. "id" boosts Indonesian
//   encoding_hint: e.g. SJS boosts Japanese
//   language_hint: e.g. ITALIAN boosts it
Language ExtDetectLanguageSummary(const char* buffer, int buffer_length,
                                  bool is_plain_text,
                                  const char* tld_hint,
                                  int encoding_hint,
                                  Language language_hint,
                                  Language* language3, int* percent3,
                                  double* normalized_score3,
                                  int* text_bytes, bool* is_reliable);
// Use this one.
// Hints are collected into a struct (cld_hints).
// Flags are passed in (normally zero).
//
// Returns 3 internal language scores in normalized_score3, each as a ratio
// to the normal score for real text in that language. Scores close to 1.0
// indicate normal text, while scores far away from 1.0 indicate badly-skewed
// text or gibberish.
//
// Also returns a vector of chunks in different languages
// (resultchunkvector), so that the caller may spell-check, translate, or
// otherwise process different parts of the input buffer in
// language-dependent ways.
Language ExtDetectLanguageSummary(const char* buffer, int buffer_length,
                                  bool is_plain_text,
                                  const CLDHints* cld_hints,
                                  int flags,
                                  Language* language3, int* percent3,
                                  double* normalized_score3,
                                  ResultChunkVector* resultchunkvector,
                                  int* text_bytes, bool* is_reliable);
// Returns the version text string.
// The string has the form "code_version - data_build_date".
const char* DetectLanguageVersion();
// Public use flags, mostly debug output controls. Each flag is a distinct
// single bit (bits 8 through 13).
static const int kCLDFlagScoreAsQuads = 1 << 8;   // 0x0100 Force Greek, etc. => quads
static const int kCLDFlagHtml         = 1 << 9;   // 0x0200 Debug HTML => stderr
static const int kCLDFlagCr           = 1 << 10;  // 0x0400 <cr> per chunk if HTML
static const int kCLDFlagVerbose      = 1 << 11;  // 0x0800 More debug HTML => stderr
static const int kCLDFlagQuiet        = 1 << 12;  // 0x1000 Less debug HTML => stderr
static const int kCLDFlagEcho         = 1 << 13;  // 0x2000 Echo input => stderr
/***
Flag meanings:
kCLDFlagScoreAsQuads
Normally, several languages are detected solely by their Unicode script.
Combined with appropriate lookup tables, this flag forces them instead
to be detected via quadgrams. This can be a useful refinement when looking
for meaningful text in these languages, instead of just character sets.
The default tables do not support this use.
kCLDFlagHtml
For each detection call, write an HTML file to stderr, showing the text
chunks and their detected languages.
kCLDFlagCr
In that HTML file, force a new line for each chunk.
kCLDFlagVerbose
In that HTML file, show every lookup entry.
kCLDFlagQuiet
In that HTML file, suppress most of the output detail.
kCLDFlagEcho
Echo every input buffer to stderr.
***/
// Debug output: prints the resultchunkvector to file f.
// NOTE(review): src is presumably the buffer that the chunk offsets refer
// to -- confirm against the implementation.
void DumpResultChunkVector(FILE* f,
                           const char* src,
                           ResultChunkVector* resultchunkvector);
}; // End namespace CLD2
#endif // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_

169
public/encodings.h Normal file
View File

@@ -0,0 +1,169 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
#ifndef I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__
#define I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__
namespace CLD2 {
// Identifiers for text encodings. Every enumerator is given an explicit
// value, so a name keeps its number regardless of its position in the list.
// NUM_ENCODINGS must remain the last entry.
enum Encoding {
  ISO_8859_1           = 0,   // ASCII
  ISO_8859_2           = 1,   // Latin2
  ISO_8859_3           = 2,
  ISO_8859_4           = 3,   // Latin4
  ISO_8859_5           = 4,   // ISO-8859-5
  ISO_8859_6           = 5,   // Arabic
  ISO_8859_7           = 6,   // Greek
  ISO_8859_8           = 7,   // Hebrew
  ISO_8859_9           = 8,
  ISO_8859_10          = 9,
  JAPANESE_EUC_JP      = 10,  // EUC_JP
  JAPANESE_SHIFT_JIS   = 11,  // SJS
  JAPANESE_JIS         = 12,  // JIS
  CHINESE_BIG5         = 13,  // BIG5
  CHINESE_GB           = 14,  // GB
  CHINESE_EUC_CN       = 15,  // Misnamed. Should be EUC_TW. Was Basis Tech
                              // CNS11643EUC, before that EUC-CN(!)
  KOREAN_EUC_KR        = 16,  // KSC
  UNICODE              = 17,  // Unicode
  CHINESE_EUC_DEC      = 18,  // Misnamed. Should be EUC_TW. Was
                              // CNS11643EUC, before that EUC.
  CHINESE_CNS          = 19,  // Misnamed. Should be EUC_TW. Was
                              // CNS11643EUC, before that CNS.
  CHINESE_BIG5_CP950   = 20,  // BIG5_CP950
  JAPANESE_CP932       = 21,  // CP932
  UTF8                 = 22,
  UNKNOWN_ENCODING     = 23,
  ASCII_7BIT           = 24,  // ISO_8859_1 with all characters <= 127.
  RUSSIAN_KOI8_R       = 25,  // KOI8R
  RUSSIAN_CP1251       = 26,  // CP1251

  MSFT_CP1252          = 27,  // CP1252 aka MSFT euro ascii
  RUSSIAN_KOI8_RU      = 28,  // CP21866 aka KOI8-U, used for Ukrainian.
                              // Misnamed, this is _not_ KOI8-RU but KOI8-U.
                              // KOI8-U is used much more often than KOI8-RU.
  MSFT_CP1250          = 29,  // CP1250 aka MSFT eastern european
  ISO_8859_15          = 30,  // aka ISO_8859_0 aka ISO_8859_1 euroized

  MSFT_CP1254          = 31,  // used for Turkish
  MSFT_CP1257          = 32,  // used in Baltic countries

  ISO_8859_11          = 33,  // aka TIS-620, used for Thai
  MSFT_CP874           = 34,  // used for Thai
  MSFT_CP1256          = 35,  // used for Arabic

  MSFT_CP1255          = 36,  // Logical Hebrew Microsoft
  ISO_8859_8_I         = 37,  // Iso Hebrew Logical
  HEBREW_VISUAL        = 38,  // Iso Hebrew Visual

  CZECH_CP852          = 39,
  CZECH_CSN_369103     = 40,  // aka ISO_IR_139 aka KOI8_CS
  MSFT_CP1253          = 41,  // used for Greek
  RUSSIAN_CP866        = 42,

  // Handled by iconv in glibc
  ISO_8859_13          = 43,
  ISO_2022_KR          = 44,
  GBK                  = 45,
  GB18030              = 46,
  BIG5_HKSCS           = 47,
  ISO_2022_CN          = 48,

  // Following 4 encodings are deprecated (font encodings)
  TSCII                = 49,
  TAMIL_MONO           = 50,
  TAMIL_BI             = 51,
  JAGRAN               = 52,

  MACINTOSH_ROMAN      = 53,
  UTF7                 = 54,

  // Following 2 encodings are deprecated (font encodings)
  BHASKAR              = 55,  // Indic encoding - Devanagari
  HTCHANAKYA           = 56,  // Indic encoding - Devanagari

  UTF16BE              = 57,  // big-endian UTF-16
  UTF16LE              = 58,  // little-endian UTF-16
  UTF32BE              = 59,  // big-endian UTF-32
  UTF32LE              = 60,  // little-endian UTF-32

  // An encoding that means "This is not text, but it may have some
  // simple ASCII text embedded". Intended input conversion
  // is to keep strings of >=4 seven-bit ASCII characters
  BINARYENC            = 61,

  // Some Web pages allow a mixture of HZ-GB and GB-2312 by using
  // ~{ ... ~} for 2-byte pairs, and the browsers support this.
  HZ_GB_2312           = 62,

  // Some external vendors make the common input error of
  // converting MSFT_CP1252 to UTF8 *twice*.
  UTF8UTF8             = 63,

  // Following 6 encodings are deprecated (font encodings)
  TAM_ELANGO           = 64,  // Elango - Tamil
  TAM_LTTMBARANI       = 65,  // Barani - Tamil
  TAM_SHREE            = 66,  // Shree - Tamil
  TAM_TBOOMIS          = 67,  // TBoomis - Tamil
  TAM_TMNEWS           = 68,  // TMNews - Tamil
  TAM_WEBTAMIL         = 69,  // Webtamil - Tamil

  // Shift_JIS variants used by Japanese cell phone carriers.
  KDDI_SHIFT_JIS       = 70,
  DOCOMO_SHIFT_JIS     = 71,
  SOFTBANK_SHIFT_JIS   = 72,
  // ISO-2022-JP variants used by KDDI and SoftBank.
  KDDI_ISO_2022_JP     = 73,
  SOFTBANK_ISO_2022_JP = 74,

  // Always keep this at the end. It is not a valid Encoding enum, it is
  // only used to indicate the total number of Encodings. No comma follows
  // it: a trailing comma in an enumerator list is not accepted by
  // pre-C++11 compilers in pedantic mode.
  NUM_ENCODINGS        = 75
};
} // End namespace CLD2
#endif // I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__