Initial CLD2 source upload.
git-svn-id: https://cld2.googlecode.com/svn/trunk@3 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
328
internal/compact_lang_det.cc
Normal file
328
internal/compact_lang_det.cc
Normal file
@@ -0,0 +1,328 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "../public/compact_lang_det.h"
|
||||
#include "../public/encodings.h"
|
||||
#include "compact_lang_det_impl.h"
|
||||
#include "integral_types.h"
|
||||
#include "lang_script.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Version text handed out by DetectLanguageVersion().
// String is "code_version - data_scrape_date"
// Both the characters and the pointer itself are const, so the version
// cannot be re-pointed at another string at run time.
static const char* const kDetectLanguageVersion = "V2.0 - 20130614";
|
||||
|
||||
// Large-table version for all ~160 languages
|
||||
// Small-table version for all ~60 languages
|
||||
|
||||
// Scan interchange-valid UTF-8 bytes and detect most likely language
|
||||
Language DetectLanguage(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
bool* is_reliable) {
|
||||
bool allow_extended_lang = false;
|
||||
Language language3[3];
|
||||
int percent3[3];
|
||||
double normalized_score3[3];
|
||||
int text_bytes;
|
||||
int flags = 0;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
const char* tld_hint = "";
|
||||
int encoding_hint = UNKNOWN_ENCODING;
|
||||
Language language_hint = UNKNOWN_LANGUAGE;
|
||||
CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
|
||||
|
||||
Language lang = DetectLanguageSummaryV2(
|
||||
buffer,
|
||||
buffer_length,
|
||||
is_plain_text,
|
||||
&cldhints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
NULL,
|
||||
&text_bytes,
|
||||
is_reliable);
|
||||
// Default to English
|
||||
if (lang == UNKNOWN_LANGUAGE) {
|
||||
lang = ENGLISH;
|
||||
}
|
||||
return lang;
|
||||
}
|
||||
|
||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
|
||||
Language DetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable) {
|
||||
double normalized_score3[3];
|
||||
bool allow_extended_lang = false;
|
||||
int flags = 0;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
const char* tld_hint = "";
|
||||
int encoding_hint = UNKNOWN_ENCODING;
|
||||
Language language_hint = UNKNOWN_LANGUAGE;
|
||||
CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
|
||||
|
||||
Language lang = DetectLanguageSummaryV2(
|
||||
buffer,
|
||||
buffer_length,
|
||||
is_plain_text,
|
||||
&cldhints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
NULL,
|
||||
text_bytes,
|
||||
is_reliable);
|
||||
// Default to English
|
||||
if (lang == UNKNOWN_LANGUAGE) {
|
||||
lang = ENGLISH;
|
||||
}
|
||||
return lang;
|
||||
}
|
||||
|
||||
// Same as above, with hints supplied
|
||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
|
||||
Language DetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
const char* tld_hint, // "id" boosts Indonesian
|
||||
int encoding_hint, // SJS boosts Japanese
|
||||
Language language_hint, // ITALIAN boosts it
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable) {
|
||||
double normalized_score3[3];
|
||||
bool allow_extended_lang = false;
|
||||
int flags = 0;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
|
||||
|
||||
Language lang = DetectLanguageSummaryV2(
|
||||
buffer,
|
||||
buffer_length,
|
||||
is_plain_text,
|
||||
&cldhints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
NULL,
|
||||
text_bytes,
|
||||
is_reliable);
|
||||
// Default to English
|
||||
if (lang == UNKNOWN_LANGUAGE) {
|
||||
lang = ENGLISH;
|
||||
}
|
||||
return lang;
|
||||
}
|
||||
|
||||
|
||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
|
||||
// languages.
|
||||
// Extended languages are additional Google interface languages and Unicode
|
||||
// single-language scripts, from ext_lang_enc.h
|
||||
Language ExtDetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable) {
|
||||
double normalized_score3[3];
|
||||
bool allow_extended_lang = true;
|
||||
int flags = 0;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
const char* tld_hint = "";
|
||||
int encoding_hint = UNKNOWN_ENCODING;
|
||||
Language language_hint = UNKNOWN_LANGUAGE;
|
||||
CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
|
||||
|
||||
Language lang = DetectLanguageSummaryV2(
|
||||
buffer,
|
||||
buffer_length,
|
||||
is_plain_text,
|
||||
&cldhints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
NULL,
|
||||
text_bytes,
|
||||
is_reliable);
|
||||
// Do not default to English
|
||||
return lang;
|
||||
}
|
||||
|
||||
// Same as above, with hints supplied
|
||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
|
||||
// languages.
|
||||
// Extended languages are additional Google interface languages and Unicode
|
||||
// single-language scripts, from ext_lang_enc.h
|
||||
Language ExtDetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
const char* tld_hint, // "id" boosts Indonesian
|
||||
int encoding_hint, // SJS boosts Japanese
|
||||
Language language_hint, // ITALIAN boosts it
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable) {
|
||||
double normalized_score3[3];
|
||||
bool allow_extended_lang = true;
|
||||
int flags = 0;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
|
||||
|
||||
Language lang = DetectLanguageSummaryV2(
|
||||
buffer,
|
||||
buffer_length,
|
||||
is_plain_text,
|
||||
&cldhints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
NULL,
|
||||
text_bytes,
|
||||
is_reliable);
|
||||
// Do not default to English
|
||||
return lang;
|
||||
}
|
||||
|
||||
// Same as above, and also returns internal language scores as a ratio to
|
||||
// normal score for real text in that language. Scores close to 1.0 indicate
|
||||
// normal text, while scores far away from 1.0 indicate badly-skewed text or
|
||||
// gibberish
|
||||
//
|
||||
Language ExtDetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
const char* tld_hint, // "id" boosts Indonesian
|
||||
int encoding_hint, // SJS boosts Japanese
|
||||
Language language_hint, // ITALIAN boosts it
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
double* normalized_score3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable) {
|
||||
bool allow_extended_lang = true;
|
||||
int flags = 0;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
|
||||
|
||||
Language lang = DetectLanguageSummaryV2(
|
||||
buffer,
|
||||
buffer_length,
|
||||
is_plain_text,
|
||||
&cldhints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
NULL,
|
||||
text_bytes,
|
||||
is_reliable);
|
||||
// Do not default to English
|
||||
return lang;
|
||||
}
|
||||
|
||||
// Use this one.
|
||||
// Hints are collected into a struct.
|
||||
// Flags are passed in (normally zero).
|
||||
//
|
||||
// Also returns 3 internal language scores as a ratio to
|
||||
// normal score for real text in that language. Scores close to 1.0 indicate
|
||||
// normal text, while scores far away from 1.0 indicate badly-skewed text or
|
||||
// gibberish
|
||||
//
|
||||
// Returns a vector of chunks in different languages, so that caller may
|
||||
// spell-check, translate, or otherwaise process different parts of the input
|
||||
// buffer in language-dependant ways.
|
||||
//
|
||||
Language ExtDetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
const CLDHints* cld_hints,
|
||||
int flags,
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
double* normalized_score3,
|
||||
ResultChunkVector* resultchunkvector,
|
||||
int* text_bytes,
|
||||
bool* is_reliable) {
|
||||
bool allow_extended_lang = true;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
|
||||
Language lang = DetectLanguageSummaryV2(
|
||||
buffer,
|
||||
buffer_length,
|
||||
is_plain_text,
|
||||
cld_hints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
resultchunkvector,
|
||||
text_bytes,
|
||||
is_reliable);
|
||||
// Do not default to English
|
||||
return lang;
|
||||
}
|
||||
|
||||
|
||||
// Return version text string
|
||||
// String is "code_version - data_build_date"
|
||||
const char* DetectLanguageVersion() {
|
||||
return kDetectLanguageVersion;
|
||||
}
|
||||
|
||||
} // End namespace CLD2
|
||||
|
Reference in New Issue
Block a user