Initial CLD2 source upload.
git-svn-id: https://cld2.googlecode.com/svn/trunk@3 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
546
internal/lang_script.cc
Normal file
546
internal/lang_script.cc
Normal file
@@ -0,0 +1,546 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// File: lang_script.cc
|
||||
// ================
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
// This file declares language and script numbers and names for CLD2
|
||||
//
|
||||
|
||||
#include "lang_script.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "generated_language.h"
|
||||
#include "generated_ulscript.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Language tables
|
||||
// Subscripted by enum Language
|
||||
extern const int kLanguageToNameSize;
|
||||
extern const char* const kLanguageToName[];
|
||||
extern const int kLanguageToCodeSize;
|
||||
extern const char* const kLanguageToCode[];
|
||||
extern const int kLanguageToCNameSize;
|
||||
extern const char* const kLanguageToCName[];
|
||||
extern const int kLanguageToScriptsSize;
|
||||
extern const FourScripts kLanguageToScripts[];
|
||||
|
||||
// Subscripted by Language
|
||||
extern const int kLanguageToPLangSize;
|
||||
extern const uint8 kLanguageToPLang[];
|
||||
// Subscripted by per-script language
|
||||
extern const uint16 kPLangToLanguageLatn[];
|
||||
extern const uint16 kPLangToLanguageOthr[];
|
||||
|
||||
// Alphabetical order for binary search
|
||||
extern const int kNameToLanguageSize;
|
||||
extern const CharIntPair kNameToLanguage[];
|
||||
extern const int kCodeToLanguageSize;
|
||||
extern const CharIntPair kCodeToLanguage[];
|
||||
|
||||
// ULScript tables
|
||||
// Subscripted by enum ULScript
|
||||
extern const int kULScriptToNameSize;
|
||||
extern const char* const kULScriptToName[];
|
||||
extern const int kULScriptToCodeSize;
|
||||
extern const char* const kULScriptToCode[];
|
||||
extern const int kULScriptToCNameSize;
|
||||
extern const char* const kULScriptToCName[];
|
||||
extern const int kULScriptToRtypeSize;
|
||||
extern const ULScriptRType kULScriptToRtype[];
|
||||
extern const int kULScriptToDefaultLangSize;
|
||||
extern const Language kULScriptToDefaultLang[];
|
||||
|
||||
// Alphabetical order for binary search
|
||||
extern const int kNameToULScriptSize;
|
||||
extern const CharIntPair kNameToULScript[];
|
||||
extern const int kCodeToULScriptSize;
|
||||
extern const CharIntPair kCodeToULScript[];
|
||||
|
||||
|
||||
//
|
||||
// File: lang_script.h
|
||||
// ================
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
// This file declares language and script numbers and names for CLD2
|
||||
//
|
||||
|
||||
|
||||
// NOTE: The script numbers and language numbers here are not guaranteed to be
|
||||
// stable. If you want to record a result for posterity, save the ISO codes
|
||||
// as character strings.
|
||||
//
|
||||
//
|
||||
// The Unicode scripts recognized by CLD2 are numbered almost arbitrarily,
|
||||
// specified in an enum. Each script has human-readable script name and a
|
||||
// 4-letter ISO 15924 script code. Each has a C name (largely for use by
|
||||
// programs that generate declarations in cld2_generated_scripts.h). Each
|
||||
// also has a recognition type
|
||||
// r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK
|
||||
//
|
||||
// The declarations for a particular version of Unicode are machine-generated in
|
||||
// cld2_generated_scripts.h
|
||||
//
|
||||
// This file includes that one and declares the access routines. The type
|
||||
// involved is called "ULScript" to signify Unicode Letters-Marks Scripts,
|
||||
// which are not quite Unicode Scripts. In particular, the CJK scripts are
|
||||
// merged into a single number because CLD2 recognizes the CJK languages from
|
||||
// four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and
|
||||
// Katakana.
|
||||
|
||||
// Each script has one of these four recognition types.
|
||||
// RTypeNone: There is no language associated with this script. In extended
|
||||
// language recognition calls, return a fake language number that maps to
|
||||
// xx-Cham, with literally "xx" for the language code,and with the script
|
||||
// code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE.
|
||||
// RTypeOne: The script maps 1:1 to a single language. No letters are examined
|
||||
// during recognition and no lookups done.
|
||||
// RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring
|
||||
// is done to determine the languages involved.
|
||||
// RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the
|
||||
// languages involved.
|
||||
//
|
||||
// Note that the choice of recognition type is a function of script, not
|
||||
// language. In particular, some languges are recognized in multiple scripts
|
||||
// and those have different recognition types (Mongolian mn-Latn vs. mn-Mong
|
||||
// for example).
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Functions of ULScript //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// If the input is out of range or otherwise unrecognized, it is treated
|
||||
// as UNKNOWN_ULSCRIPT (which never participates in language recognition)
|
||||
const char* ULScriptName(ULScript ulscript) {
|
||||
int i_ulscript = ulscript;
|
||||
if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;}
|
||||
if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;}
|
||||
return kULScriptToName[i_ulscript];
|
||||
}
|
||||
|
||||
const char* ULScriptCode(ULScript ulscript) {
|
||||
int i_ulscript = ulscript;
|
||||
if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;}
|
||||
if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;}
|
||||
return kULScriptToCode[i_ulscript];
|
||||
}
|
||||
|
||||
const char* ULScriptDeclaredName(ULScript ulscript) {
|
||||
int i_ulscript = ulscript;
|
||||
if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;}
|
||||
if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;}
|
||||
return kULScriptToCName[i_ulscript];
|
||||
}
|
||||
|
||||
ULScriptRType ULScriptRecognitionType(ULScript ulscript) {
|
||||
int i_ulscript = ulscript;
|
||||
if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;}
|
||||
if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;}
|
||||
return kULScriptToRtype[i_ulscript];
|
||||
}
|
||||
|
||||
|
||||
|
||||
// The languages recognized by CLD2 are numbered almost arbitrarily,
|
||||
// specified in an enum. Each language has human-readable language name and a
|
||||
// 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by
|
||||
// programs that generate declarations in cld2_generated_languagess.h).
|
||||
// Each has a list of up to four scripts in which it is currently recognized.
|
||||
//
|
||||
// The declarations for a particular set of recognized languages are
|
||||
// machine-generated in
|
||||
// cld2_generated_languages.h
|
||||
//
|
||||
// The Language enum is intended to match the internal Google Language enum
|
||||
// in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional
|
||||
// languages assigned above that. Over time, some languages may be renumbered
|
||||
// if they are moved into the Language enum.
|
||||
//
|
||||
// The Language enum includes the fake language numbers for RTypeNone above.
|
||||
//
|
||||
// In an open-source environment, the Google-specific Language enum is not
|
||||
// available. Language decouples the two environments while maintaining
|
||||
// internal compatibility.
|
||||
|
||||
|
||||
// If the input is out of range or otherwise unrecognized, it is treated
|
||||
// as UNKNOWN_LANGUAGE
|
||||
//
|
||||
// LanguageCode
|
||||
// ------------
|
||||
// Given the Language, return the language code, e.g. "ko"
|
||||
// This is determined by
|
||||
// the following (in order of preference):
|
||||
// - ISO-639-1 two-letter language code
|
||||
// (all except those mentioned below)
|
||||
// - ISO-639-2 three-letter bibliographic language code
|
||||
// (Tibetan, Dhivehi, Cherokee, Syriac)
|
||||
// - Google-specific language code
|
||||
// (ChineseT ("zh-TW"), Teragram Unknown, Unknown,
|
||||
// Portuguese-Portugal, Portuguese-Brazil, Limbu)
|
||||
// - Fake RTypeNone names.
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Functions of Language //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
const char* LanguageName(Language lang) {
|
||||
int i_lang = lang;
|
||||
if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;}
|
||||
if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;}
|
||||
return kLanguageToName[i_lang];
|
||||
}
|
||||
const char* LanguageCode(Language lang) {
|
||||
int i_lang = lang;
|
||||
if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;}
|
||||
if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;}
|
||||
return kLanguageToCode[i_lang];
|
||||
}
|
||||
|
||||
const char* LanguageDeclaredName(Language lang) {
|
||||
int i_lang = lang;
|
||||
if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;}
|
||||
if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;}
|
||||
return kLanguageToCName[i_lang];
|
||||
}
|
||||
|
||||
// n is in 0..3. Trailing entries are filled with
|
||||
// UNKNOWN_LANGUAGE (which never participates in language recognition)
|
||||
ULScript LanguageRecognizedScript(Language lang, int n) {
|
||||
int i_lang = lang;
|
||||
if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;}
|
||||
if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;}
|
||||
return static_cast<ULScript>(kLanguageToScripts[i_lang][n]);
|
||||
}
|
||||
|
||||
// Given the Language, returns its string name used as the output by
|
||||
// the lang/enc identifier, e.g. "Korean"
|
||||
// "invalid_language" if the input is invalid.
|
||||
// TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language,
|
||||
// used to subtract out HTML, link farms, DNA strings, and alittle English porn
|
||||
const char* ExtLanguageName(const Language lang) {
|
||||
return LanguageName(lang);
|
||||
}
|
||||
|
||||
// Given the Language, return the language code, e.g. "ko"
|
||||
const char* ExtLanguageCode(const Language lang) {
|
||||
return LanguageCode(lang);
|
||||
}
|
||||
|
||||
|
||||
// Given the Language, returns its Language enum spelling, for use by
|
||||
// programs that create C declarations, e.g. "KOREAN"
|
||||
// "UNKNOWN_LANGUAGE" if the input is invalid.
|
||||
const char* ExtLanguageDeclaredName(const Language lang) {
|
||||
return LanguageDeclaredName(lang);
|
||||
}
|
||||
|
||||
// Returns which set of statistically-close languages lang is in. 0 means none.
|
||||
int LanguageCloseSet(Language lang) {
|
||||
// Scaffolding
|
||||
// id ms # INDONESIAN MALAY coef=0.4698 Problematic w/o extra words
|
||||
// bo dz # TIBETAN DZONGKHA coef=0.4571
|
||||
// cs sk # CZECH SLOVAK coef=0.4273
|
||||
// zu xh # ZULU XHOSA coef=0.3716
|
||||
//
|
||||
// bs hr sr srm # BOSNIAN CROATIAN SERBIAN MONTENEGRIN
|
||||
// hi mr bh ne # HINDI MARATHI BIHARI NEPALI
|
||||
// no nn da # NORWEGIAN NORWEGIAN_N DANISH
|
||||
// gl es pt # GALICIAN SPANISH PORTUGUESE
|
||||
// rw rn # KINYARWANDA RUNDI
|
||||
|
||||
if (lang == INDONESIAN) {return 1;}
|
||||
if (lang == MALAY) {return 1;}
|
||||
|
||||
if (lang == TIBETAN) {return 2;}
|
||||
if (lang == DZONGKHA) {return 2;}
|
||||
|
||||
if (lang == CZECH) {return 3;}
|
||||
if (lang == SLOVAK) {return 3;}
|
||||
|
||||
if (lang == ZULU) {return 4;}
|
||||
if (lang == XHOSA) {return 4;}
|
||||
|
||||
if (lang == BOSNIAN) {return 5;}
|
||||
if (lang == CROATIAN) {return 5;}
|
||||
if (lang == SERBIAN) {return 5;}
|
||||
if (lang == MONTENEGRIN) {return 5;}
|
||||
|
||||
if (lang == HINDI) {return 6;}
|
||||
if (lang == MARATHI) {return 6;}
|
||||
if (lang == BIHARI) {return 6;}
|
||||
if (lang == NEPALI) {return 6;}
|
||||
|
||||
if (lang == NORWEGIAN) {return 7;}
|
||||
if (lang == NORWEGIAN_N) {return 7;}
|
||||
if (lang == DANISH) {return 7;}
|
||||
|
||||
if (lang == GALICIAN) {return 8;}
|
||||
if (lang == SPANISH) {return 8;}
|
||||
if (lang == PORTUGUESE) {return 8;}
|
||||
|
||||
if (lang == KINYARWANDA) {return 9;}
|
||||
if (lang == RUNDI) {return 9;}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Functions of ULScript and Language //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
Language DefaultLanguage(ULScript ulscript) {
|
||||
if (ulscript < 0) {return UNKNOWN_LANGUAGE;}
|
||||
if (ulscript >= NUM_ULSCRIPTS) {return UNKNOWN_LANGUAGE;}
|
||||
return kULScriptToDefaultLang[ulscript];
|
||||
}
|
||||
|
||||
uint8 PerScriptNumber(ULScript ulscript, Language lang) {
|
||||
if (ulscript < 0) {return 0;}
|
||||
if (ulscript >= NUM_ULSCRIPTS) {return 0;}
|
||||
if (kULScriptToRtype[ulscript] == RTypeNone) {return 1;}
|
||||
if (lang >= kLanguageToPLangSize) {return 0;}
|
||||
return kLanguageToPLang[lang];
|
||||
}
|
||||
|
||||
Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number) {
|
||||
if (ulscript < 0) {return UNKNOWN_LANGUAGE;}
|
||||
if (ulscript >= NUM_ULSCRIPTS) {return UNKNOWN_LANGUAGE;}
|
||||
if ((kULScriptToRtype[ulscript] == RTypeNone) ||
|
||||
(kULScriptToRtype[ulscript] == RTypeOne)) {
|
||||
return kULScriptToDefaultLang[ulscript];
|
||||
}
|
||||
|
||||
if (ulscript == ULScript_Latin) {
|
||||
return static_cast<Language>(kPLangToLanguageLatn[perscript_number]);
|
||||
} else {
|
||||
return static_cast<Language>(kPLangToLanguageOthr[perscript_number]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Other //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// Returns mid if key found in lo <= mid < hi, else -1
|
||||
int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair) {
|
||||
// binary search
|
||||
while (lo < hi) {
|
||||
int mid = (lo + hi) >> 1;
|
||||
if (strcmp(key, cipair[mid].s) < 0) {
|
||||
hi = mid;
|
||||
} else if (strcmp(key, cipair[mid].s) > 0) {
|
||||
lo = mid + 1;
|
||||
} else {
|
||||
return mid;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
Language MakeLang(int i) {return static_cast<Language>(i);}
|
||||
|
||||
// Name can be either full name or ISO code, or can be ISO code embedded in
|
||||
// a language-script combination such as "ABKHAZIAN", "en", "en-Latn-GB"
|
||||
Language GetLanguageFromName(const char* src) {
|
||||
const char* hyphen1 = strchr(src, '-');
|
||||
const char* hyphen2 = NULL;
|
||||
if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');}
|
||||
|
||||
int match = -1;
|
||||
if (hyphen1 == NULL) {
|
||||
// Bare name. Look at full name, then code
|
||||
match = BinarySearch(src, 0, kNameToLanguageSize, kNameToLanguage);
|
||||
if (match >= 0) {return MakeLang(kNameToLanguage[match].i);} // aa
|
||||
match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage);
|
||||
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa
|
||||
return UNKNOWN_LANGUAGE;
|
||||
}
|
||||
|
||||
if (hyphen2 == NULL) {
|
||||
// aa-bb. Not a full name; must be code-something. Try zh-TW then bare zh
|
||||
match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage);
|
||||
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb
|
||||
|
||||
int len = strlen(src);
|
||||
if (len >= 16) {return UNKNOWN_LANGUAGE;} // Real codes are shorter
|
||||
|
||||
char temp[16];
|
||||
int hyphen1_offset = hyphen1 - src;
|
||||
// Take off part after hyphen1
|
||||
memcpy(temp, src, len);
|
||||
temp[hyphen1_offset] = '\0';
|
||||
match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
|
||||
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa
|
||||
|
||||
return UNKNOWN_LANGUAGE;
|
||||
}
|
||||
|
||||
// aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en
|
||||
match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage);
|
||||
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb-cc
|
||||
|
||||
|
||||
int len = strlen(src);
|
||||
if (len >= 16) {return UNKNOWN_LANGUAGE;} // Real codes are shorter
|
||||
|
||||
char temp[16];
|
||||
int hyphen1_offset = hyphen1 - src;
|
||||
int hyphen2_offset = hyphen2 - src;
|
||||
// Take off part after hyphen2
|
||||
memcpy(temp, src, len);
|
||||
temp[hyphen2_offset] = '\0';
|
||||
match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
|
||||
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb
|
||||
|
||||
|
||||
// Take off part between hyphen1 and hyphen2
|
||||
int len2 = len - hyphen2_offset;
|
||||
memcpy(temp, src, len);
|
||||
memcpy(&temp[hyphen1_offset], hyphen2, len2);
|
||||
temp[hyphen1_offset + len2] = '\0';
|
||||
match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
|
||||
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-cc
|
||||
|
||||
|
||||
// Take off everything after hyphen1
|
||||
memcpy(temp, src, len);
|
||||
temp[hyphen1_offset] = '\0';
|
||||
match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
|
||||
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa
|
||||
|
||||
|
||||
return UNKNOWN_LANGUAGE;
|
||||
}
|
||||
|
||||
|
||||
// Name can be either full name or ISO code, or can be ISO code embedded in
|
||||
// a language-script combination such as "en-Latn-GB"
|
||||
// MORE WORK to do here. also kLanguageToScripts [4] is bogus
|
||||
// if bare language name, no script, want zh, ja, ko to Hani, pt to Latn, etc.
|
||||
// Something like map code to Language, then Language to kLanguageToScripts[x][0]
|
||||
// ADD BIAS: kLanguageToScripts lists default script first
|
||||
// If total mismatch, reutrn Latn
|
||||
// if (strcmp(src, "nd") == 0) {return NDEBELE;} // [nd was wrong]
|
||||
// if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;}
|
||||
|
||||
ULScript MakeULScr(int i) {return static_cast<ULScript>(i);}
|
||||
|
||||
ULScript GetULScriptFromName(const char* src) {
|
||||
const char* hyphen1 = strchr(src, '-');
|
||||
const char* hyphen2 = NULL;
|
||||
if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');}
|
||||
|
||||
int match = -1;
|
||||
if (hyphen1 == NULL) {
|
||||
// Bare name. Look at full name, then code, then try backmapping as Language
|
||||
match = BinarySearch(src, 0, kNameToULScriptSize, kNameToULScript);
|
||||
if (match >= 0) {return MakeULScr(kNameToULScript[match].i);} // aa
|
||||
match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript);
|
||||
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa
|
||||
|
||||
Language backmap_me = GetLanguageFromName(src);
|
||||
if (backmap_me != UNKNOWN_LANGUAGE) {
|
||||
return static_cast<ULScript>(kLanguageToScripts[backmap_me][0]);
|
||||
}
|
||||
return ULScript_Latin;
|
||||
}
|
||||
|
||||
if (hyphen2 == NULL) {
|
||||
// aa-bb. Not a full name; must be code-something. Try en-Latn, bare Latn
|
||||
if (strcmp(src, "zh-TW") == 0) {return ULScript_Hani;}
|
||||
if (strcmp(src, "zh-CN") == 0) {return ULScript_Hani;}
|
||||
if (strcmp(src, "sit-NP") == 0) {return ULScript_Limbu;}
|
||||
if (strcmp(src, "sit-Limb") == 0) {return ULScript_Limbu;}
|
||||
if (strcmp(src, "sr-ME") == 0) {return ULScript_Latin;}
|
||||
match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript);
|
||||
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa-bb
|
||||
|
||||
int len = strlen(src);
|
||||
if (len >= 16) {return ULScript_Latin;} // Real codes are shorter
|
||||
|
||||
char temp[16];
|
||||
int hyphen1_offset = hyphen1 - src;
|
||||
int len1 = len - hyphen1_offset - 1; // Exclude the hyphen
|
||||
// Take off part before hyphen1
|
||||
memcpy(temp, hyphen1 + 1, len1);
|
||||
temp[len1] = '\0';
|
||||
match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
|
||||
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // bb
|
||||
|
||||
// Take off part after hyphen1
|
||||
memcpy(temp, src, len);
|
||||
temp[hyphen1_offset] = '\0';
|
||||
match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
|
||||
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa
|
||||
|
||||
return ULScript_Latin;
|
||||
}
|
||||
|
||||
// aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en
|
||||
if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;}
|
||||
if (strcmp(src, "sr-ME-Latn") == 0) {return ULScript_Latin;}
|
||||
if (strcmp(src, "sr-ME-Cyrl") == 0) {return ULScript_Cyrillic;}
|
||||
match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript);
|
||||
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa-bb-cc
|
||||
|
||||
int len = strlen(src);
|
||||
if (len >= 16) {return ULScript_Latin;} // Real codes are shorter
|
||||
|
||||
char temp[16];
|
||||
int hyphen1_offset = hyphen1 - src;
|
||||
int hyphen2_offset = hyphen2 - src;
|
||||
int len2 = len - hyphen2_offset - 1; // Exclude the hyphen
|
||||
int lenmid = hyphen2_offset - hyphen1_offset - 1; // Exclude the hyphen
|
||||
// Keep part between hyphen1 and hyphen2
|
||||
memcpy(temp, hyphen1 + 1, lenmid);
|
||||
temp[lenmid] = '\0';
|
||||
match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
|
||||
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // bb
|
||||
|
||||
// Keep part after hyphen2
|
||||
memcpy(temp, hyphen2 + 1, len2);
|
||||
temp[len2] = '\0';
|
||||
match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
|
||||
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // cc
|
||||
|
||||
// Keep part before hyphen1
|
||||
memcpy(temp, src, len);
|
||||
temp[hyphen1_offset] = '\0';
|
||||
match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
|
||||
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa
|
||||
|
||||
return ULScript_Latin;
|
||||
}
|
||||
|
||||
// Map script into Latin, Cyrillic, Arabic, Other
|
||||
int LScript4(ULScript ulscript) {
|
||||
if (ulscript == ULScript_Latin) {return 0;}
|
||||
if (ulscript == ULScript_Cyrillic) {return 1;}
|
||||
if (ulscript == ULScript_Arabic) {return 2;}
|
||||
return 3;
|
||||
}
|
||||
|
||||
} // namespace CLD2
|
||||
|
Reference in New Issue
Block a user