Initial CLD2 source upload.
git-svn-id: https://cld2.googlecode.com/svn/trunk@3 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
BIN
docs/InterpretingCLD2UnitTestOutput.pdf
Normal file
BIN
docs/InterpretingCLD2UnitTestOutput.pdf
Normal file
Binary file not shown.
886
internal/CLD2UnitTestOutput.html
Normal file
886
internal/CLD2UnitTestOutput.html
Normal file
@@ -0,0 +1,886 @@
|
||||
<html><meta charset="UTF-8"><body>
|
||||
<style media="print" type="text/css"> :root { -webkit-print-color-adjust: exact; } </style>
|
||||
<span style="font-size: 7pt">
|
||||
file = cld2_unittest<br>
|
||||
[en] <span style="background:#FFFFF4;color:#000000;">
|
||||
confiscation of goods is assigned as the penalty part most of the courts consist of </span>
|
||||
[] <span style="background:#FFFFF4;color:#000000;">
|
||||
members and when it is necessary to bring public cases before a jury of members two courts combine for the purpose the most important cases of all are brought jurors or </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 0] en 253B 370p 25300R,
|
||||
2 chunks scored<br>
|
||||
en.100R(99%) 254 bytes = ENGLISH <br><br>
|
||||
[hy] <span style="background:#F8FFD8;color:#007F1F;">
|
||||
ա յ եվ նա հիացած աչքերով նայում է հինգհարկանի շենքի տարօրինակ փոքրիկ քառակուսի պատուհաններին դեռ մենք շատ ենք հետամնաց ասում է նա այսպես է </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 1] hy 255B 255p 25500R,
|
||||
1 chunks scored<br>
|
||||
hy.100R(100%) 255 bytes = ARMENIAN <br><br>
|
||||
[chr] <span style="background:#FFEBD8;color:#007F1F;">
|
||||
ᎠᎢᏍᎩ ᎠᏟᎶᏍᏗ ᏥᏄᏍᏛᎩ ᎦᎫᏍᏛᏅᎯ ᎾᎥᎢ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[11] chr 75B 75p 7500R,
|
||||
1 chunks scored<br>
|
||||
chr.100R(100%) 75 bytes = CHEROKEE <br><br>
|
||||
[dv] <span style="background:#FFD8F7;color:#007F1F;">
|
||||
ހިންދީ ބަހުން ވާހަކަ ދައްކާއިރު ދެވަނަ ބަހެއްގެ ގޮތުގައާއި އެނޫން ގޮތްގޮތުން ހިންދީ ބަހުން ވާހަކަ ދައްކާ މީހުންގެ އަދަދު މިލިއަނަށް </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[10] dv 249B 249p 24900R,
|
||||
1 chunks scored<br>
|
||||
dv.100R(100%) 249 bytes = DHIVEHI <br><br>
|
||||
[ka] <span style="background:#FFEBD8;color:#3F7F00;">
|
||||
ა ბირთვიდან მიღებული ელემენტი მენდელეევის პერიოდულ სიტემაში გადაინაცვლებს ორი უჯრით </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[11] ka 233B 233p 23300R,
|
||||
1 chunks scored<br>
|
||||
ka.100R(100%) 233 bytes = GEORGIAN <br><br>
|
||||
[el] <span style="background:#D8FFE7;color:#7F2F00;">
|
||||
ή αρνητική αναζήτηση λέξης κλειδιού καταστήστε τις μεμονωμένες λέξεις κλειδιά περισσότερο στοχοθετημένες με τη μετατροπή τους σε </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 2] el 242B 242p 24200R,
|
||||
1 chunks scored<br>
|
||||
el.100R(100%) 242 bytes = GREEK <br><br>
|
||||
[gu] <span style="background:#EFD8FF;color:#6F7F00;">
|
||||
આના પરિણામ પ્રમાણસર ફોન્ટ અવતરણ ચિન્હવાળા પાઠને છુપાવો બધા સમૂહો શોધાયા હાલનો જ સંદેશ વિષયની </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 4] gu 250B 250p 25000R,
|
||||
1 chunks scored<br>
|
||||
gu.100R(100%) 250 bytes = GUJARATI <br><br>
|
||||
[iu] <span style="background:#D8FFF3;color:#007F7F;">
|
||||
ᐃᑯᒪᒻᒪᑦ ᕿᓈᖏᓐᓇᓲᖑᒻᒪᑦ ᑎᑎᖅᑕᓕᒫᖅᓃᕕᑦ ᑎᑦᕆᐊᑐᓐᖏᑦᑕᑎᑦ ᑎᑎᖅᑕᑉᐱᑦ ᓯᕗᓂᖓᓂ ᑎᑎᖅᖃᖅ ᑎᑎᕆᐊᑐᓐᖏᑕᐃᑦ ᕿᓂᓲᖑᔪᒍᑦ ᑎᑎᖅᑕᓕᒫᖅᓃᕕᑦ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[13] iu 254B 254p 25400R,
|
||||
1 chunks scored<br>
|
||||
iu.100R(100%) 254 bytes = INUKTITUT <br><br>
|
||||
[kn] <span style="background:#FFEBD8;color:#6F7F00;">
|
||||
ಂಠಯ್ಯನವರು ತುಮಕೂರು ಜಿಲ್ಲೆಯ ಚಿಕ್ಕನಾಯಕನಹಳ್ಳಿ ತಾಲ್ಲೂಕಿನ ತೀರ್ಥಪುರ ವೆಂಬ ಸಾಧಾರಣ ಹಳ್ಳಿಯ ಶ್ಯಾನುಭೋಗರ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[11] kn 254B 254p 25400R,
|
||||
1 chunks scored<br>
|
||||
kn.100R(100%) 254 bytes = KANNADA <br><br>
|
||||
[km] <span style="background:#D8FFFF;color:#007F1F;">
|
||||
ក ខ គ ឃ ង ច ឆ ជ ឈ ញ ដ ឋ ឌ ឍ ណ ត ថ ទ ធ ន ប ផ ព ភ ម យ រ ល វ ស ហ ឡ អ ឥ ឦ ឧ ឪ ឫ ឬ ឯ ឱ ទាំងអស់ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 8] km 187B 187p 18700R,
|
||||
1 chunks scored<br>
|
||||
km.100R(100%) 187 bytes = KHMER <br><br>
|
||||
[lo] <span style="background:#D8FFE7;color:#007F1F;">
|
||||
ກຫາທົ່ວທັງເວັບ ແລະໃນເວັບໄຮ້ສາຍ ທຳອິດໃຫ້ທຳການຊອກຫາກ່ອນ ຈາກນັ້ນ ໃຫ້ກົດປຸ່ມເມນູ ໃນໜ້າຜົນໄດ້ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 2] lo 256B 256p 25600R,
|
||||
1 chunks scored<br>
|
||||
lo.100R(100%) 256 bytes = LAOTHIAN <br><br>
|
||||
[lif] <span style="background:#D8FFF3;color:#007F1F;">
|
||||
ᤁᤡᤖᤠᤳ ᤕᤠᤰᤌᤢᤱ ᤆᤢᤶᤗᤢᤱᤖᤧ ᤛᤥᤎᤢᤱᤃᤧᤴ ᤀᤡᤔᤠᤴᤛᤡᤱ ᤆᤧᤶᤈᤱᤗᤧ ᤁᤢᤔᤡᤱᤅᤥ ᤏᤠᤈᤡᤖᤡ ᤋᤱᤒᤣ ᤒᤠ ᤈᤏᤘᤖᤡ ᤗᤠᤏᤢᤀᤠᤱ ᤁ᤹ᤏᤠ ᤋᤱᤒᤣ ᤁᤠᤰ ᤏᤠ᤺ᤳᤋᤢ ᤕᤢᤖᤢᤒᤠ ᤀᤡᤔᤠᤴᤛᤡᤱ ᤋᤱᤃᤡᤵᤛᤡᤱ ᤌᤡᤶᤒᤣᤴ ᤂᤠᤃᤴ ᤛᤡᤛᤣ᤺ᤰᤗᤠ ᤂᤧᤴ ᤀᤡᤛᤡᤰ ᤈᤏᤘᤖᤡ ᤀᤥ ᤏᤠᤛᤢᤵ ᤆᤥ᤺ᤰᤔᤠ ᤌᤡᤶᤒᤣ ᤋᤱᤃᤠᤶᤛᤡᤱᤗ ᤐᤳᤐᤠ ᤀᤡᤱᤄᤱ ᤘᤠ᤹ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[13] lif 580B 580p 58000R,
|
||||
1 chunks scored<br>
|
||||
lif.100R(100%) 580 bytes = LIMBU <br><br>
|
||||
[ml] <span style="background:#E3D8FF;color:#7F5F00;">
|
||||
ം അങ്ങനെ ഞങ്ങള് അവരുടെ മുമ്പില് നിന്നു ഔടും ഉടനെ നിങ്ങള് പതിയിരിപ്പില് നിന്നു എഴുന്നേറ്റു </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 9] ml 247B 247p 24700R,
|
||||
1 chunks scored<br>
|
||||
ml.100R(100%) 247 bytes = MALAYALAM <br><br>
|
||||
[or] <span style="background:#D8E7FF;color:#007F1F;">
|
||||
ଅକ୍ଟୋବର ଡିସେମ୍ବର </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[14] or 48B 48p 4800R,
|
||||
1 chunks scored<br>
|
||||
or.100R(100%) 48 bytes = ORIYA <br><br>
|
||||
[pa] <span style="background:#EFFFD8;color:#6F7F00;">
|
||||
ਂ ਦਿਨਾਂ ਵਿਚ ਭਾਈ ਸਾਹਿਬ ਦੀ ਬੁੱਚੜ ਗੋਬਿੰਦ ਰਾਮ ਨਾਲ ਅੜਫਸ ਚੱਲ ਰਹੀ ਸੀ ਗੋਬਿੰਦ ਰਾਮ ਨੇ ਭਾਈ ਸਾਹਿਬ ਦੀਆਂ ਭੈਣਾ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[12] pa 247B 247p 24700R,
|
||||
1 chunks scored<br>
|
||||
pa.100R(100%) 247 bytes = PUNJABI <br><br>
|
||||
[si] <span style="background:#F8D8FF;color:#3F7F00;">
|
||||
අනුරාධ මිහිඳුකුල නමින් සකුරා ට ලිපියක් තැපෑලෙන් එවා තිබුණා කි ් රස්ටි ෂෙල්ටන් ප ් රනාන්දු ද </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[15] si 243B 243p 24300R,
|
||||
1 chunks scored<br>
|
||||
si.100R(100%) 243 bytes = SINHALESE <br><br>
|
||||
[syr] <span style="background:#EFFFD8;color:#007F1F;">
|
||||
ܐܕܪܝܣ ܓܛܘ ܫܘܪܝܐ ܡܢ ܦܪܢܣܐ ܡܢ ܐܣܦܢܝܐ ܚܐܪܘܬܐ ܒܐܕܪ ܒܢܝܣܢ ܫܛܝܚܘܬܐ ܟܠܢܝܐ ܡܝ̈ܐ ܒܥܠܡܐ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[12] syr 143B 143p 14300R,
|
||||
1 chunks scored<br>
|
||||
syr.100R(100%) 143 bytes = SYRIAC <br><br>
|
||||
[tl] <span style="background:#FFD8D8;color:#7F5F00;">
|
||||
ᜋᜇ᜔ ᜐᜓᜎᜆ᜔ ᜃ ᜈᜅ᜔ ᜊᜌ᜔ᜊᜌᜒᜈ᜔ ᜂᜉᜅ᜔᜔ ᜋᜐᜈᜌ᜔ ᜎᜅ᜔ ᜁᜐ ᜉᜅ᜔ ᜀᜃ᜔ᜎᜆ᜔ ᜆᜓᜅ᜔ᜃᜓᜎ᜔ ᜐ ᜊᜌ᜔ᜊᜌᜒᜈ᜔ ᜐ ᜆᜒᜅᜒᜈ᜔ ᜃᜓ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 0] tl 228B 228p 22800R,
|
||||
1 chunks scored<br>
|
||||
tl.100R(100%) 228 bytes = TAGALOG <br><br>
|
||||
[ta] <span style="background:#D8E7FF;color:#7F5F00;">
|
||||
அங்கு ராஜேந்திர சோழனால் கட்டப்பட்ட பிரம்மாண்டமான சிவன் கோவில் ஒன்றும் உள்ளது தொகு </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[14] ta 227B 227p 22700R,
|
||||
1 chunks scored<br>
|
||||
ta.100R(100%) 227 bytes = TAMIL <br><br>
|
||||
[te] <span style="background:#EFFFD8;color:#7F5F00;">
|
||||
ఁ దనర జయించిన తత్వ మరసి చూడఁ దాన యగును రాజయోగి యిట్లు తేజరిల్లుచు నుండు విశ్వదాభిరామ వినర వేమ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[12] te 253B 253p 25300R,
|
||||
1 chunks scored<br>
|
||||
te.100R(100%) 253 bytes = TELUGU <br><br>
|
||||
[th] <span style="background:#FFD8EB;color:#6F7F00;">
|
||||
กฏในการค้นหา หรือหน้าเนื้อหา หากท่านเลือกลงโฆษณา ท่านอาจจะปรับต้องเพิ่มงบประมาณรายวันตา </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 5] th 257B 257p 25700R,
|
||||
1 chunks scored<br>
|
||||
th.100R(100%) 257 bytes = THAI <br><br>
|
||||
[zh] <span style="background:#FFD8D8;color:#7F2F00;">
|
||||
产品的简报和公告 提交该申请后无法进行更改 请确认您的选择是正确的 对于要提交的图书 我</span>
|
||||
[] <span style="background:#FFD8D8;color:#7F2F00;">
|
||||
确认 我是版权所有者或已得到版权所有者的授权 要更改您的国家 地区 请在此表的最上端更改您的 </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 0] zh 255B 506p 25500R,
|
||||
2 chunks scored<br>
|
||||
zh.100R(99%) 256 bytes = Chinese <br><br>
|
||||
[zh-Hant] <span style="background:#FFD8EB;color:#3F7F00;">
|
||||
之前為 帳單交易作業區 已變更 廣告內容 之前為 銷售代表 之前為 張貼日期為 百分比之前為 合約 為 目標對象條件已刪除 結束日期之前為 </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 5] zh-Hant 184B 343p 18400R,
|
||||
1 chunks scored<br>
|
||||
zh-Hant.100R(99%) 185 bytes = ChineseT <br><br>
|
||||
[ja] <span style="background:#D8FFFF;color:#000000;">
|
||||
このペ ジでは アカウントに指定された予算の履歴を一覧にしています それぞれの項目に</span>
|
||||
[] <span style="background:#D8FFFF;color:#000000;">
|
||||
は 予算額と特定期間のステ タスが表示されます 現在または今後の予算を設定するには </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 8] ja 238B 766p 23800R,
|
||||
2 chunks scored<br>
|
||||
ja.100R(99%) 239 bytes = Japanese <br><br>
|
||||
[ko] <span style="background:#E3D8FF;color:#000000;">
|
||||
개별적으로 리포트 액세스 권한을 부여할 수 있습니다 액세스 권한 부여사용자에게 프로필 리포</span>
|
||||
[] <span style="background:#E3D8FF;color:#000000;">
|
||||
트에 액세스할 수 있는 권한을 부여하시려면 가용 프로필 상자에서 프로필 이름을 선택한 다음 </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 9] ko 255B 924p 25500R,
|
||||
2 chunks scored<br>
|
||||
ko.100R(99%) 256 bytes = Korean <br><br>
|
||||
[af] <span style="background:#FFD8EB;color:#007F1F;">
|
||||
aam skukuza die naam beteken hy wat skoonvee of hy wat alles onderstebo keer wysig bosveldkampe boskampe is </span>
|
||||
[] <span style="background:#FFD8EB;color:#007F1F;">
|
||||
kleiner afgeleë ruskampe wat oor min fasiliteite beskik daar is geen restaurante </span>
|
||||
[] <span style="background:#FFD8EB;color:#007F1F;">
|
||||
of winkels nie en slegs oornagbesoekers word toegelaat bateleur </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 5] af 254B 248p 25400R,
|
||||
3 chunks scored<br>
|
||||
af.100R(99%) 255 bytes = AFRIKAANS <br><br>
|
||||
[sq] <span style="background:#D8FFF3;color:#7F5F00;">
|
||||
a do të kërkoni nga beogradi që të njohë pavarësinë e kosovës </span>
|
||||
[] <span style="background:#D8FFF3;color:#7F5F00;">
|
||||
zoti thaçi prishtina është gati ta njoh pavarësinë e serbisë ndërsa natyri</span>
|
||||
[] <span style="background:#D8FFF3;color:#7F5F00;">
|
||||
sht se do të kërkohet një gjë e tillë që edhe beogradi ta njoh shtetin e pavarur dhe sovran të </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[13] sq 253B 429p 24291R,
|
||||
3 chunks scored<br>
|
||||
sq.96R(99%) 254 bytes = ALBANIAN <br><br>
|
||||
[ar*.32/fa.10] <span style="background:#FFF7D8;color:#6F7F00;">
|
||||
احتيالية بيع أي حساب </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 6] ar 38B 32p 2736R,
|
||||
1 chunks scored<br>
|
||||
ar.72R(97%) 39 bytes = ARABIC <br><br>
|
||||
[az] <span style="background:#FFD8F7;color:#3F7F00;">
|
||||
a az qalıb breyn rinq intellektual oyunu üzrə yarışın zona mərhələləri keçiri</span>
|
||||
[] <span style="background:#FFD8F7;color:#3F7F00;">
|
||||
lib miq un qalıqlarının dənizdən çıxarılması davam edir məhəmməd </span>
|
||||
[] <span style="background:#FFD8F7;color:#3F7F00;">
|
||||
peyğəmbərin karikaturalarını çap edən qəzetin baş redaktoru iş otağında ölüb </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[10] az 256B 370p 25600R,
|
||||
3 chunks scored<br>
|
||||
az.100R(99%) 257 bytes = AZERBAIJANI <br><br>
|
||||
[eu] <span style="background:#E3D8FF;color:#6F7F00;">
|
||||
a den eraso bat honen kontra hortaz eragiketa bakarrik behar dituen eraso batek aes </span>
|
||||
[] <span style="background:#E3D8FF;color:#6F7F00;">
|
||||
apurtuko luke nahiz eta oraingoz eraso bideraezina izan gaur egungo teknologiaren mu</span>
|
||||
[] <span style="background:#E3D8FF;color:#6F7F00;">
|
||||
gak direla eta oraingoz kezka hauek alde batera utzi daitezke orain arteko indar </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 9] eu 249B 368p 24900R,
|
||||
3 chunks scored<br>
|
||||
eu.100R(99%) 250 bytes = BASQUE <br><br>
|
||||
[be] <span style="background:#F8D8FF;color:#7F5F00;">
|
||||
а друкаваць іх не было тэхнічна магчыма бліжэй за вільню тым самым ча</span>
|
||||
[] <span style="background:#F8D8FF;color:#7F5F00;">
|
||||
сам нямецкае кіраўніцтва прапаноўвала апроч ўвядзення лацінкі яе </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[15] be 248B 257p 24800R,
|
||||
2 chunks scored<br>
|
||||
be.100R(99%) 249 bytes = BELARUSIAN <br><br>
|
||||
[bn] <span style="background:#FFD8EB;color:#7F5F00;">
|
||||
ংখ্যা নমুনায়ন বিন্যাস পরিসংখ্যানিক মডেল পরিসংখ্যানিক সিদ্ধান্ত ফাংশন পরিসংখ্যানিক </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 5] bn 231B 144p 23100R,
|
||||
1 chunks scored<br>
|
||||
bn.100R(99%) 232 bytes = BENGALI <br><br>
|
||||
[hi] <span style="background:#D8F3FF;color:#7F5F00;">
|
||||
विकिपीडिया इंटरनेट आधारित एक मुक्त ज्ञानकोष परियोजना ह ई विकि के रुप मेँ बा </span>
|
||||
[bh] <span style="background:#D8F3FF;color:#6F7F00;">
|
||||
यानी एगो अईसन जाल पृष्ठ जे सभन के संपादन करे के छूट देवेला विकिपीडिया शब्द विकि अउर इनसाइक्लोपीड</span>
|
||||
[] <span style="background:#D8F3FF;color:#6F7F00;">
|
||||
िया ज्ञानकोष शब्दन के मिला के बनल बा विकिपीडिया एक बहुभाषीय प्रकल्प ह अउर स्</span>
|
||||
[] <span style="background:#D8F3FF;color:#6F7F00;">
|
||||
वयंसेवकन के सहकार से निर्मित बा जेहु के भी इंटरनेट तक पहुँच बा ऊ विकिपी</span>
|
||||
[] <span style="background:#D8F3FF;color:#6F7F00;">
|
||||
डिया पर लिख सकत बा अउर लेखन के संपादन कर सकत बा विकिपीडिया </span>
|
||||
[] <span style="background:#D8F3FF;color:#6F7F00;">
|
||||
के मुख्य सर्वर टैम्पा फ्लोरीडा में बा अतिरिक्त सर्वर एम्सटर्डम अउर सियोल में बा </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 3] hi 200B 105p 20000R,
|
||||
[11] bh 1002B 534p 100200R,
|
||||
6 chunks scored<br>
|
||||
{CloseLangPair: hi.100R,200B => bh}<br>
|
||||
bh.100R(99%) 1203 bytes = BIHARI <br><br>
|
||||
[bg] <span style="background:#FFEBD8;color:#7F2F00;">
|
||||
а дума попада в състояние на изпитание ключовите думи с пр</span>
|
||||
[] <span style="background:#FFEBD8;color:#7F2F00;">
|
||||
едсказана малко под то изискване на страниците за търсене в </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[11] bg 216B 183p 21600R,
|
||||
2 chunks scored<br>
|
||||
bg.100R(99%) 217 bytes = BULGARIAN <br><br>
|
||||
[ca] <span style="background:#E3FFD8;color:#6F7F00;">
|
||||
al final en un únic lloc nhorabona l correu electrònic està concebut com a eina de productivitat aleshores per què perdre </span>
|
||||
[] <span style="background:#E3FFD8;color:#6F7F00;">
|
||||
el temps arxivant missatges per després intentar recordar on els veu desar i per què heu d eliminar missatges importants per l </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 7] ca 255B 254p 22404R,
|
||||
2 chunks scored<br>
|
||||
ca.87R(99%) 256 bytes = CATALAN <br><br>
|
||||
[ceb] <span style="background:#FFD8EB;color:#001F7F;">
|
||||
ang sugbo usa sa mga labing ugmad nga lalawigan sa nasod kini ang sentro </span>
|
||||
[] <span style="background:#FFD8EB;color:#001F7F;">
|
||||
sa komersyo edukasyon ug industriya sa sentral ug habagatang dapit sa kapupod an ang mipada</span>
|
||||
[] <span style="background:#FFD8EB;color:#001F7F;">
|
||||
yag sa sugbo isip ikapito nga labing nindot nga pulo sa ang nag inusa</span>
|
||||
[tl*.42/ceb.39] <span style="background:#FFD8D8;color:#7F5F00;">
|
||||
rang pulo sa pilipinas nga napasidunggan sa maong magasin sukad pa sa tuig </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 0] tl 75B 42p 2025R,
|
||||
[ 5] ceb 233B 284p 23300R,
|
||||
4 chunks scored<br>
|
||||
{Unreli tl.27R,75B} ceb.100R(75%) 309 bytes = CEBUANO* <br><br>
|
||||
[sr] <span style="background:#D8FFF3;color:#7F2F00;">
|
||||
fender precision bass prva je bas gitara ikada napravljena i promovirana na tržištu model se pojavio a svoj </span>
|
||||
[] <span style="background:#D8FFF3;color:#7F2F00;">
|
||||
prepoznatljiv dizajn koji i dan danas ima dobio je godine autor tog instrumenta je leo fender koji je imao </span>
|
||||
[hr] <span style="background:#EFFFD8;color:#7F2F00;">
|
||||
namjeru napraviti instrument koji bi bio manji od kontrabasa i kojega bi mogli glazbenici staviti u automobil bas </span>
|
||||
[] <span style="background:#EFFFD8;color:#7F2F00;">
|
||||
gitara je zauvijek promijenila glazbu i model precision se i danas pojavljuje u svim žanrovima rock heavy metal pop i slično </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[12] hr 241B 212p 24100R,
|
||||
[13] sr 217B 201p 20844R,
|
||||
4 chunks scored<br>
|
||||
{CloseLangPair: sr.96R,217B => hr}<br>
|
||||
hr.98R(99%) 459 bytes = CROATIAN <br><br>
|
||||
[cs] <span style="background:#F8FFD8;color:#7F2F00;">
|
||||
a akci opakujte film uložen vykreslit gmail tokio smazat obsah adresáře nelze načíst sy</span>
|
||||
[] <span style="background:#F8FFD8;color:#7F2F00;">
|
||||
stémový profil jednotky smoot okud používáte pro určení polokoule značky z západ </span>
|
||||
[] <span style="background:#F8FFD8;color:#7F2F00;">
|
||||
nebo v východ používejte nezáporné hodnoty zeměpisné délky nelze </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 1] cs 255B 292p 23476R,
|
||||
3 chunks scored<br>
|
||||
cs.92R(99%) 256 bytes = CZECH <br><br>
|
||||
[da] <span style="background:#F8FFD8;color:#000000;">
|
||||
a z tallene og punktummer der er tilladte log ud angiv den ønskede adgangskode igen november gem personlige </span>
|
||||
[] <span style="background:#F8FFD8;color:#000000;">
|
||||
oplysninger kontrolspørgsmål det sidste tegn i dit brugernavn skal være et bogstav a z eller tal skriv de tegn du kan se i billedet nedenfor </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 1] da 253B 296p 22564R,
|
||||
2 chunks scored<br>
|
||||
da.89R(99%) 254 bytes = DANISH <br><br>
|
||||
[nl] <span style="background:#D8FFE7;color:#000000;">
|
||||
a als volgt te werk om een configuratiebestand te maken sitemap gen py ebruik filters om de s op te ge</span>
|
||||
[] <span style="background:#D8FFE7;color:#000000;">
|
||||
ven die moeten worden toegevoegd of uitgesloten op basis van de opmaaktaal elke sitemap mag alleen de s bevatten voor een bepaalde opmaaktaal dit </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 2] nl 248B 253p 24800R,
|
||||
2 chunks scored<br>
|
||||
nl.100R(99%) 249 bytes = DUTCH <br><br>
|
||||
[en] <span style="background:#FFFFF4;color:#000000;">
|
||||
a backup credit card by visiting your billing preferences page or visit the adwords </span>
|
||||
[] <span style="background:#FFFFF4;color:#000000;">
|
||||
help centre for more details https adwords google com support bin answer py answer hl en </span>
|
||||
[] <span style="background:#FFFFF4;color:#000000;">
|
||||
we were unable to process the payment of for your outstanding google adwords </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 0] en 250B 275p 24555R,
|
||||
3 chunks scored<br>
|
||||
en.98R(99%) 251 bytes = ENGLISH <br><br>
|
||||
[et] <span style="background:#D8FFFF;color:#7F2F00;">
|
||||
a niipea kui sinu maksimaalne igakuine krediidi limiit on meie poolt heaks kiidetud on sinu kohustuseks see krediidilimiit </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 8] et 123B 155p 12300R,
|
||||
1 chunks scored<br>
|
||||
et.100R(99%) 124 bytes = ESTONIAN <br><br>
|
||||
[fi] <span style="background:#D8F3FF;color:#000000;">
|
||||
a joilla olet käynyt tämä kerro meille kuka ä olet ei tunnistettavia käyttötietoja kuten virhe</span>
|
||||
[] <span style="background:#D8F3FF;color:#000000;">
|
||||
raportteja käytetään google desktopin parantamiseen etsi näyttää mu</span>
|
||||
[] <span style="background:#D8F3FF;color:#000000;">
|
||||
kautettuja uutisia google desktop keskivaihto leikkaa voit kaksoisnapsauttaa </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 3] fi 250B 308p 25000R,
|
||||
3 chunks scored<br>
|
||||
fi.100R(99%) 251 bytes = FINNISH <br><br>
|
||||
[fr] <span style="background:#EFD8FF;color:#000000;">
|
||||
a accès aux collections et aux frontaux qui lui ont été attribués il peut consulter </span>
|
||||
[] <span style="background:#EFD8FF;color:#000000;">
|
||||
et modifier ses collections et exporter des configurations de collection toutefois </span>
|
||||
[] <span style="background:#EFD8FF;color:#000000;">
|
||||
il ne peut pas créer ni supprimer des collections enfin il a accès aux fonctions </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 4] fr 254B 347p 23564R,
|
||||
3 chunks scored<br>
|
||||
fr.92R(99%) 255 bytes = FRENCH <br><br>
|
||||
[gl] <span style="background:#F8D8FF;color:#7F2F00;">
|
||||
debe ser como mínimo taranto tendas de venda polo miúdo cociñas servizos bordado ca</span>
|
||||
[] <span style="background:#F8D8FF;color:#7F2F00;">
|
||||
nadá viaxes parques de vehículos de recreo hotel oriental habitación recibir unha postal no enderezo indicado anteriormente </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[15] gl 213B 149p 21300R,
|
||||
2 chunks scored<br>
|
||||
gl.100R(99%) 214 bytes = GALICIAN <br><br>
|
||||
[lg] <span style="background:#D8E7FF;color:#004F7F;">
|
||||
abaana ba bani lukaaga mu ana mu babiri abaana ba bebayi lukaaga mu abiri mu ba</span>
|
||||
[] <span style="background:#D8E7FF;color:#004F7F;">
|
||||
satu abaana ba azugaadi lukumi mu ebikumi bibiri mu abiri mu babiri abaana ba adoni</span>
|
||||
[] <span style="background:#D8E7FF;color:#004F7F;">
|
||||
kamu lukaaga mu nltaaga mu mukaaga abaana ba biguvaayi enkumi bbiri mu ataano mu mukaaga </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[14] lg 251B 340p 25100R,
|
||||
3 chunks scored<br>
|
||||
lg.100R(99%) 252 bytes = GANDA <br><br>
|
||||
[de] <span style="background:#FFD8EB;color:#000000;">
|
||||
abschnitt ordner aktivieren werden die ordnereinstellungen im fa</span>
|
||||
[] <span style="background:#FFD8EB;color:#000000;">
|
||||
rbabschnitt deaktiviert öchten sie wirklich fortfahren eldtypen angeben optional n die</span>
|
||||
[] <span style="background:#FFD8EB;color:#000000;">
|
||||
sem schritt geben sie für jedesfeld aus dem datenset den typ an ieser schritt ist optional eldtypen </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 5] de 252B 308p 25200R,
|
||||
3 chunks scored<br>
|
||||
de.100R(99%) 253 bytes = GERMAN <br><br>
|
||||
[ht] <span style="background:#FFEBD8;color:#007F7F;">
|
||||
ak pitit tout sosyete a chita se pou sa leta dwe pwoteje yo nimewo leta fèt pou </span>
|
||||
[] <span style="background:#FFEBD8;color:#007F7F;">
|
||||
li pwoteje tout paran ak pitit nan peyi a menm jan kit paran yo marye kit yo pa marye </span>
|
||||
[] <span style="background:#FFEBD8;color:#007F7F;">
|
||||
tout manman ki fè pitit leta fèt pou ba yo konkoul menm jan tou pou timoun piti ak pou </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[11] ht 256B 475p 21936R,
|
||||
3 chunks scored<br>
|
||||
ht.85R(99%) 257 bytes = HAITIAN_CREOLE <br><br>
|
||||
[iw] <span style="background:#FFF7D8;color:#000000;">
|
||||
או לערוך את העדפות ההפצה אנא עקוב אחרי השלבים הבאים כנס לחשבון האישי שלך ב </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 6] iw 135B 106p 13500R,
|
||||
1 chunks scored<br>
|
||||
iw.100R(99%) 136 bytes = HEBREW <br><br>
|
||||
[hi] <span style="background:#D8F3FF;color:#7F5F00;">
|
||||
ं ऐडवर्ड्स विज्ञापनों के अनुभव पर आधारित हैं और इनकी मदद से आपको अपने विज्ञापनों का अधिकतम लाभ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 3] hi 249B 176p 24900R,
|
||||
1 chunks scored<br>
|
||||
hi.100R(99%) 250 bytes = HINDI <br><br>
|
||||
[blu] <span style="background:#D8FFFF;color:#001F7F;">
|
||||
kuv hlub koj txawm lub ntuj yuav si ntshi nphaus los kuv tsis ua siab nkaug </span>
|
||||
[] <span style="background:#D8FFFF;color:#001F7F;">
|
||||
txawm ntiab teb yuav si ntshi nphaus los kuv tseem ua lon tsaug vim kuv hlub koj tag lub siab </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 8] blu 170B 384p 17000R,
|
||||
2 chunks scored<br>
|
||||
blu.100R(99%) 171 bytes = HMONG <br><br>
|
||||
[hu] <span style="background:#E3FFD8;color:#7F2F00;">
|
||||
a felhasználóim a google azonosító szöveget ikor látják a felhasználóim a google azono</span>
|
||||
[] <span style="background:#E3FFD8;color:#7F2F00;">
|
||||
sító szöveget felhasználók a google azonosító szöveget fogják látni </span>
|
||||
[] <span style="background:#E3FFD8;color:#7F2F00;">
|
||||
minden tranzakció után ha a vásárlását regisztrációját oldalunk </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 7] hu 246B 346p 24600R,
|
||||
3 chunks scored<br>
|
||||
hu.100R(99%) 247 bytes = HUNGARIAN <br><br>
|
||||
[is] <span style="background:#D8F3FF;color:#7F2F00;">
|
||||
a afköst leitarorða þinna leitarorð neikvæð leitarorð auglýsingahópa byggja upp aðallista yfir ný leita</span>
|
||||
[] <span style="background:#D8F3FF;color:#7F2F00;">
|
||||
rorð fyrir auglýsingahópana og skoða ítarleg gögn um árangur </span>
|
||||
[] <span style="background:#D8F3FF;color:#7F2F00;">
|
||||
leitarorða eins og samkeppni auglýsenda og leitarmagn er krafist notkun </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 3] is 256B 332p 25600R,
|
||||
3 chunks scored<br>
|
||||
is.100R(99%) 257 bytes = ICELANDIC <br><br>
|
||||
[ms] <span style="background:#D8FFFF;color:#7F5F00;">
|
||||
geng pengembaraan bermula adalah film animasi d cgi pertama yang diproduksi di mala</span>
|
||||
[id] <span style="background:#FFF7D8;color:#7F5F00;">
|
||||
ysia film ini dibuat oleh les copaque production lcp dan dirilis di bioskop bioskop seluruh ma</span>
|
||||
[ms] <span style="background:#D8FFFF;color:#7F5F00;">
|
||||
laysia pada februari film geng pertama kali diluncurkan dalam sebuah acara peluncuran pada </span>
|
||||
[id] <span style="background:#FFF7D8;color:#7F5F00;">
|
||||
september bersama dengan serial animasi pendek upin ipin yang berhubungan dengan film tersebut pembuatan film ini </span>
|
||||
[] <span style="background:#FFF7D8;color:#7F5F00;">
|
||||
didukung oleh berbagai pihak seperti kementerian sains teknologi dan inovasi malaysia mosti dengan memberi bantuan berupa dana sebesar rm juta </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 6] id 351B 427p 35100R,
|
||||
[ 8] ms 174B 207p 17400R,
|
||||
5 chunks scored<br>
|
||||
{CloseLangPair: ms.100R,174B => id}<br>
|
||||
id.100R(99%) 526 bytes = INDONESIAN <br><br>
|
||||
[ga] <span style="background:#D8E7FF;color:#7F2F00;">
|
||||
a bhfuil na focail go léir i do cheist le fáil orthu ní gá ach focail bre</span>
|
||||
[] <span style="background:#D8E7FF;color:#7F2F00;">
|
||||
ise a chur leis na cinn a cuardaíodh cheana chun an cuardach a bheachtú nó a chúngú má chu</span>
|
||||
[] <span style="background:#D8E7FF;color:#7F2F00;">
|
||||
irtear focal breise isteach aimseofar fo aicme ar leith de na torthaí a fuarthas </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[14] ga 255B 378p 25500R,
|
||||
3 chunks scored<br>
|
||||
ga.100R(99%) 256 bytes = IRISH <br><br>
|
||||
[it] <span style="background:#E3FFD8;color:#000000;">
|
||||
a causa di un intervento di manutenzione del sistema fino alle ore circa ora legale costa del pacifico del novembre le ca</span>
|
||||
[] <span style="background:#E3FFD8;color:#000000;">
|
||||
mpagne esistenti continueranno a essere pubblicate come di consueto anche durante questo breve periodo di inattività ci scusiamo per </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 7] it 255B 228p 23490R,
|
||||
2 chunks scored<br>
|
||||
it.92R(99%) 256 bytes = ITALIAN <br><br>
|
||||
[jw] <span style="background:#FFD8D8;color:#6F7F00;">
|
||||
account ten server niki kalian username meniko tanpo judul cacahe account nggonanmu wes pol pesen mu wes di</span>
|
||||
[] <span style="background:#FFD8D8;color:#6F7F00;">
|
||||
guwak pesenan mu wes di simpen sante wae pesenan mu wes ke kirim mbuh te</span>
|
||||
[jw*.49/id.42] <span style="background:#FFD8D8;color:#6F7F00;">
|
||||
kan ora pesenan e ke kethok pesenan mu wes ke kirim mbuh tekan ora pesenan </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 0] jw 254B 153p 23150R,
|
||||
3 chunks scored<br>
|
||||
jw.91R(99%) 255 bytes = JAVANESE <br><br>
|
||||
[rw] <span style="background:#F8D8FF;color:#007F7F;">
|
||||
dore ibyo ukeneye kumenya ukwo watubona ibibazo byinshi abandi babaza ububonero </span>
|
||||
[] <span style="background:#F8D8FF;color:#007F7F;">
|
||||
byibibina google onjela ho izina dyikyibina kyawe onjela ho yawe mulugo kulaho ibyandiko byawe sh</span>
|
||||
[] <span style="background:#F8D8FF;color:#007F7F;">
|
||||
yilaho tegula yawe tulubaka tukongeraho iyanya mishya buliko tulambula </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[15] rw 248B 294p 23680R,
|
||||
3 chunks scored<br>
|
||||
rw.95R(99%) 249 bytes = KINYARWANDA <br><br>
|
||||
[lv] <span style="background:#EFD8FF;color:#7F2F00;">
|
||||
a gadskārtējā izpārdošana slēpošana jāņi atlaide izmaiņas trafikā kas saistītas ar sezo</span>
|
||||
[] <span style="background:#EFD8FF;color:#7F2F00;">
|
||||
nas izpārdošanu speciālajām atlaidēm u c ir parastas un atslēgvārdi kas </span>
|
||||
[] <span style="background:#EFD8FF;color:#7F2F00;">
|
||||
ir populāri noteiktos laika posmos šajā laikā saņems lielāku klikšķu </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 4] lv 255B 304p 25500R,
|
||||
3 chunks scored<br>
|
||||
lv.100R(99%) 256 bytes = LATVIAN <br><br>
|
||||
[lt] <span style="background:#FFD8EB;color:#7F2F00;">
|
||||
a išsijungia mano idėja dėl geriausio laiko po pastarųjų savo santykių pasimo</span>
|
||||
[] <span style="background:#FFD8EB;color:#7F2F00;">
|
||||
kiau penki dalykai be kurių negaliu gyventi mano miegamajame tu surasi ide</span>
|
||||
[] <span style="background:#FFD8EB;color:#7F2F00;">
|
||||
ali pora išsilavinimas aukštoji mokykla koledžas universitetas pagrindinis laipsnis metai </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 5] lt 251B 311p 25100R,
|
||||
3 chunks scored<br>
|
||||
lt.100R(99%) 252 bytes = LITHUANIAN <br><br>
|
||||
[mk] <span style="background:#EFD8FF;color:#7F5F00;">
|
||||
гласовите коалицијата на вмро дпмне како партија со најмногу ос</span>
|
||||
[] <span style="background:#EFD8FF;color:#7F5F00;">
|
||||
воени гласови ќе добие евра а на сметката на коализијата за македонија </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 4] mk 247B 295p 24700R,
|
||||
2 chunks scored<br>
|
||||
mk.100R(99%) 248 bytes = MACEDONIAN <br><br>
|
||||
[ms] <span style="background:#D8FFFF;color:#7F5F00;">
|
||||
daripada dirinya hirako shinji seorang pemuda merujuk diri mereka seba</span>
|
||||
[] <span style="background:#D8FFFF;color:#7F5F00;">
|
||||
gai vizard shinji telah cuba untuk menyakinkan ichigo untuk menyertai ku</span>
|
||||
[] <span style="background:#D8FFFF;color:#7F5F00;">
|
||||
mpulan mereka mengatakan bahawa hanya dia sahaja yang mampu mengajar ichigo teknik untuk mengawal hollow </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 8] ms 247B 426p 24490R,
|
||||
3 chunks scored<br>
|
||||
ms.99R(99%) 248 bytes = MALAY <br><br>
|
||||
[mt] <span style="background:#F8FFD8;color:#3F7F00;">
|
||||
ata ikteb messaġġ lil indirizzi differenti billi tagħżilhom u tagħfas il buttuna ik</span>
|
||||
[] <span style="background:#F8FFD8;color:#3F7F00;">
|
||||
teb żid numri tfittxijja tal kotba mur print home kotba minn pagni ghal pagna minn </span>
|
||||
[] <span style="background:#F8FFD8;color:#3F7F00;">
|
||||
ghall ktieb ta aċċessa stieden habib iehor grazzi it tim tal gruppi google </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 1] mt 249B 319p 24900R,
|
||||
3 chunks scored<br>
|
||||
mt.100R(99%) 250 bytes = MALTESE <br><br>
|
||||
[mr] <span style="background:#FFD8D8;color:#3F7F00;">
|
||||
हैदराबाद उच्चार ऐका सहाय्य माहिती तेलुगू </span>
|
||||
[te] <span style="background:#EFFFD8;color:#7F5F00;">
|
||||
హైదరాబాదు </span>
|
||||
[hi] <span style="background:#D8F3FF;color:#7F5F00;">
|
||||
उर्दू </span>
|
||||
[ur*.14/fa.10] <span style="background:#D8FFE7;color:#6F7F00;">
|
||||
حیدر آباد </span>
|
||||
[mr] <span style="background:#FFD8D8;color:#3F7F00;">
|
||||
हे भारतातील आंध्र प्रदेश राज्याच्या राजधानीचे शहर आहे हैदराबादची लोकसंख</span>
|
||||
[] <span style="background:#FFD8D8;color:#3F7F00;">
|
||||
्या लाख हजार आहे मोत्यांचे शहर अशी एकेकाळी ओळख असलेल्या या शहराला ऐतिहासिक सांस्क</span>
|
||||
[] <span style="background:#FFD8D8;color:#3F7F00;">
|
||||
ृतिक आणि स्थापत्यशास्त्रीय वारसा लाभला आहे नंतर शिक्षण आणि माहिती तंत्रज्ञान त्याचप्र</span>
|
||||
[] <span style="background:#FFD8D8;color:#3F7F00;">
|
||||
माणे औषधनिर्मिती आणि जैवतंत्रज्ञान क्षेत्रातील उद्योगधंद्यांची वाढ शह</span>
|
||||
[] <span style="background:#FFD8D8;color:#3F7F00;">
|
||||
रात झाली दक्षिण मध्य भारतातील पर्यटन आणि तेलुगू चित्रपटनिर्मितीचे हैदराबाद हे केंद्र आहे </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 0] mr 1190B 736p 117890R,
|
||||
[ 2] ur 18B 14p 648R,
|
||||
[ 3] hi 16B 10p 1600R,
|
||||
[12] te 29B 29p 2900R,
|
||||
9 chunks scored<br>
|
||||
{CloseLangPair: hi.100R,16B => mr}<br>
|
||||
{Unreli ur.36R,18B} mr.99R(95%) te.100R(3%) 1257 bytes = MARATHI <br><br>
|
||||
[ne] <span style="background:#FFEBD8;color:#7F5F00;">
|
||||
अरू ठाऊँबाटपनि खुलेको छ यो खाता अर अरू ठाऊँबाटपनि खुलेको छ यो खाता अर ू </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[11] ne 186B 56p 17484R,
|
||||
1 chunks scored<br>
|
||||
ne.94R(99%) 187 bytes = NEPALI <br><br>
|
||||
[no] <span style="background:#FFD8F7;color:#000000;">
|
||||
a er obligatorisk tidsforskyvning plassering av katalogsøk planinformasjon loggfilbane gruppe</span>
|
||||
[] <span style="background:#FFD8F7;color:#000000;">
|
||||
navn kontoinformasjon passord domene gruppeinformasjon alle kampanjesporing alternativ bruker grupper oppgaveplanlegger oppgavehistorikk kontosammendrag antall </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[10] no 254B 250p 25400R,
|
||||
2 chunks scored<br>
|
||||
no.100R(99%) 255 bytes = NORWEGIAN <br><br>
|
||||
[fa] <span style="background:#D8FFF3;color:#3F7F00;">
|
||||
آب خوردن عجله می کردند به جای باز ی کتک کاری می کردند و همه چيز مثل قبل بود فقط من ماندم و يک دنيا حرف و انتظار تا عاقبت رسيد احضاريه ی ای با </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[13] fa 249B 231p 24900R,
|
||||
1 chunks scored<br>
|
||||
fa.100R(99%) 250 bytes = PERSIAN <br><br>
|
||||
[pl] <span style="background:#FFEBD8;color:#000000;">
|
||||
a australii będzie widział inne reklamy niż użytkownik z kanady kierowanie geograficzne spra</span>
|
||||
[] <span style="background:#FFEBD8;color:#000000;">
|
||||
wia że reklamy są lepiej dopasowane do użytkownika twojej strony oznacza </span>
|
||||
[] <span style="background:#FFEBD8;color:#000000;">
|
||||
to także że możesz nie zobaczyć wszystkich reklam które są wyświetlane na </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[11] pl 253B 444p 25300R,
|
||||
3 chunks scored<br>
|
||||
pl.100R(99%) 254 bytes = POLISH <br><br>
|
||||
[pt] <span style="background:#EFFFD8;color:#000000;">
|
||||
a abit prevê que a entrada desses produtos estrangeiros no mercado têxtil e vestuário do brasil possa redu</span>
|
||||
[] <span style="background:#EFFFD8;color:#000000;">
|
||||
zir os preços em cerca de a partir de má notícia para os empresários que terão que lutar para garantir suas margens de lucro mas boa notícia </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[12] pt 256B 234p 25600R,
|
||||
2 chunks scored<br>
|
||||
pt.100R(99%) 257 bytes = PORTUGUESE <br><br>
|
||||
[ro] <span style="background:#FFF7D8;color:#7F2F00;">
|
||||
a anunţurilor reţineţi nu plătiţi pentru clicuri sau impresii ci numai atunci când pe </span>
|
||||
[] <span style="background:#FFF7D8;color:#7F2F00;">
|
||||
site ul dvs survine o acţiune dorită site urile negative nu </span>
|
||||
[] <span style="background:#FFF7D8;color:#7F2F00;">
|
||||
pot avea uri de destinaţie daţi instrucţiuni societăţii dvs bancare sau constructoare să </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 6] ro 249B 259p 24716R,
|
||||
3 chunks scored<br>
|
||||
ro.99R(99%) 250 bytes = ROMANIAN <br><br>
|
||||
[ro] <span style="background:#FFF7D8;color:#7F2F00;">
|
||||
оперативэ а органелор ши институциилор екзекутиве ши а органелор жу</span>
|
||||
[] <span style="background:#FFF7D8;color:#7F2F00;">
|
||||
дичиаре але путерий де стат фиекэруй орган ал путерий де стат и се </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 6] ro 246B 224p 24600R,
|
||||
2 chunks scored<br>
|
||||
ro.100R(99%) 247 bytes = ROMANIAN <br><br>
|
||||
[ru] <span style="background:#D8FFF3;color:#000000;">
|
||||
а неправильный формат идентификатора дн назад </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[13] ru 86B 47p 8600R,
|
||||
1 chunks scored<br>
|
||||
ru.100R(98%) 87 bytes = RUSSIAN <br><br>
|
||||
[gd] <span style="background:#D8FFF3;color:#6F7F00;">
|
||||
air son is gum bi casg air a h uile briosgaid no gum faigh thu brath nuair a tha </span>
|
||||
[] <span style="background:#D8FFF3;color:#6F7F00;">
|
||||
briosgaid a tighinn gad rannsachadh ghoogle gu ceart mura bheil briosgaidean </span>
|
||||
[] <span style="background:#D8FFF3;color:#6F7F00;">
|
||||
ceadaichte cuiridh google briosgaid dha do neach cleachdaidh fa leth tha google a cleachdadh </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[13] gd 251B 370p 25100R,
|
||||
3 chunks scored<br>
|
||||
gd.100R(99%) 252 bytes = SCOTS_GAELIC <br><br>
|
||||
[sr] <span style="background:#D8FFF3;color:#7F2F00;">
|
||||
балчак балчак на мапи србије уреди демографија у насељу балчак живи пунолетна </span>
|
||||
[] <span style="background:#D8FFF3;color:#7F2F00;">
|
||||
становника а просечна старост становништва износи година </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[13] sr 251B 252p 24351R,
|
||||
2 chunks scored<br>
|
||||
sr.97R(99%) 252 bytes = SERBIAN <br><br>
|
||||
[sr] <span style="background:#D8FFF3;color:#7F2F00;">
|
||||
autonomnih pokrajina saveznim zakonom može se propisati poseban sastav organizacija i delokrug saveta za poslove narodne </span>
|
||||
[] <span style="background:#D8FFF3;color:#7F2F00;">
|
||||
odbrane članove saveta federacije bira na predlog predsedništva savezna skupština iz reda društveno političkih i drugih javnih </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[13] sr 254B 213p 25400R,
|
||||
2 chunks scored<br>
|
||||
sr.100R(99%) 255 bytes = SERBIAN <br><br>
|
||||
[sk] <span style="background:#EFD8FF;color:#3F7F00;">
|
||||
a aktivovať reklamnú kampaň ak chcete kampaň pred spustením ešte prispôsobiť uložte ju </span>
|
||||
[] <span style="background:#EFD8FF;color:#3F7F00;">
|
||||
ako šablónu a pokračujte v úprave vyberte si jednu z možností nižšie a kli</span>
|
||||
[] <span style="background:#EFD8FF;color:#3F7F00;">
|
||||
knite na tlačidlo uložiť kampaň nastavenia kampane môžete ľubovoľne </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 4] sk 254B 320p 25400R,
|
||||
3 chunks scored<br>
|
||||
sk.100R(99%) 255 bytes = SLOVAK <br><br>
|
||||
[sl] <span style="background:#F8D8FF;color:#6F7F00;">
|
||||
adsense stanje prijave za google adsense google adsense račun je bil začasno za</span>
|
||||
[] <span style="background:#F8D8FF;color:#6F7F00;">
|
||||
mrznjen pozdravljeni hvala za vaše zanimanje v google adsense po pregledu vaše prijavnice </span>
|
||||
[] <span style="background:#F8D8FF;color:#6F7F00;">
|
||||
so naši strokovnjaki ugotovili da spletna stran ki je trenutno povezana z vašim </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[15] sl 255B 235p 25500R,
|
||||
3 chunks scored<br>
|
||||
sl.100R(99%) 256 bytes = SLOVENIAN <br><br>
|
||||
[es] <span style="background:#D8E7FF;color:#000000;">
|
||||
a continuación haz clic en el botón obtener ruta también puedes desplazarte hasta el final de la página para </span>
|
||||
[] <span style="background:#D8E7FF;color:#000000;">
|
||||
cambiar tus opciones de búsqueda gráfico y detalles ésta es una lista de los vídeos que te recomendamos nuestras recomendaciones se basan </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[14] es 255B 297p 22675R,
|
||||
2 chunks scored<br>
|
||||
es.88R(99%) 256 bytes = SPANISH <br><br>
|
||||
[sw] <span style="background:#D8E7FF;color:#6F7F00;">
|
||||
a ujumbe mpya jumla unda tafuta na angalia vikundi vya kujadiliana na kushiriki </span>
|
||||
[] <span style="background:#D8E7FF;color:#6F7F00;">
|
||||
mawazo iliyopangwa kwa tarehe watumiaji wapya futa orodha hizi lugha </span>
|
||||
[] <span style="background:#D8E7FF;color:#6F7F00;">
|
||||
hoja vishikanisho vilivyo dhaminiwa ujumbe sanaa na tamasha toka udhibitisho wa neno kwa haraka fikia </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[14] sw 251B 405p 21897R,
|
||||
3 chunks scored<br>
|
||||
sw.87R(99%) 252 bytes = SWAHILI <br><br>
|
||||
[sv] <span style="background:#F8D8FF;color:#000000;">
|
||||
a bort objekt från google desktop post äldst meny öretag dress etaljer alternativ för vad är inne yaste google skrivbord plugin </span>
|
||||
[] <span style="background:#F8D8FF;color:#000000;">
|
||||
program för nyheter google visa nyheter som är anpassade efter de artiklar som du läser om du till exempel läser </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[15] sv 250B 196p 25000R,
|
||||
2 chunks scored<br>
|
||||
sv.100R(99%) 251 bytes = SWEDISH <br><br>
|
||||
[tl] <span style="background:#FFD8D8;color:#7F5F00;">
|
||||
a na ugma sa google ay nakaka bantog sa gitna nang kliks na nangyayari sa </span>
|
||||
[] <span style="background:#FFD8D8;color:#7F5F00;">
|
||||
pamamagitan nang ordinaryong paggagamit at sa kliks na likha nang pandaraya o </span>
|
||||
[] <span style="background:#FFD8D8;color:#7F5F00;">
|
||||
hindi tunay na paggamit bunga nito nasasala namin ang mga kliks na hindi kailangan o hindi gusto nang </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 0] tl 254B 282p 25400R,
|
||||
3 chunks scored<br>
|
||||
tl.100R(99%) 255 bytes = TAGALOG <br><br>
|
||||
[tr] <span style="background:#F8FFD8;color:#7F5F00;">
|
||||
a ayarlarınızı görmeniz ve yönetmeniz içindir eğer kampanyanız için günlük bütçe</span>
|
||||
[] <span style="background:#F8FFD8;color:#7F5F00;">
|
||||
nizi gözden geçirebileceğiniz yeri arıyorsanız kampanya yönetimi ne </span>
|
||||
[] <span style="background:#F8FFD8;color:#7F5F00;">
|
||||
gidin kampanyanızı seçin ve kampanya ayarlarını düzenle yi tıklayın sunumu </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 1] tr 250B 351p 25000R,
|
||||
3 chunks scored<br>
|
||||
tr.100R(99%) 251 bytes = TURKISH <br><br>
|
||||
[uk] <span style="background:#D8FFE7;color:#7F5F00;">
|
||||
а більший бюджет щоб забезпечити собі максимум прибутків від пе</span>
|
||||
[] <span style="background:#D8FFE7;color:#7F5F00;">
|
||||
реходів відстежуйте свої об яви за датою географічним розташуванням </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 2] uk 244B 205p 24400R,
|
||||
2 chunks scored<br>
|
||||
uk.100R(99%) 245 bytes = UKRAINIAN <br><br>
|
||||
[ur] <span style="background:#D8FFE7;color:#6F7F00;">
|
||||
آپ کو کم سے کم ممکنہ رقم چارج کرتا ہے اس کی مثال کے طور پر فرض کریں </span>
|
||||
[] <span style="background:#D8FFE7;color:#6F7F00;">
|
||||
اگر آپ کی زیادہ سے زیادہ قیمت فی کلِک امریکی ڈالر اور کلِک کرنے کی شرح ہو تو </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 2] ur 254B 314p 25400R,
|
||||
2 chunks scored<br>
|
||||
ur.100R(99%) 255 bytes = URDU <br><br>
|
||||
[vi] <span style="background:#D8FFE7;color:#3F7F00;">
|
||||
adsense cho nội dung nhà cung cấp dịch vụ di động xác minh tín du</span>
|
||||
[] <span style="background:#D8FFE7;color:#3F7F00;">
|
||||
̣ng thay đổi nhãn kg các ô xem chi phí cho từ chối các đơn đặt hàng dạng cấp dữ liệu ác minh trang web của bạn để xem </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 2] vi 252B 202p 24365R,
|
||||
2 chunks scored<br>
|
||||
vi.96R(99%) 253 bytes = VIETNAMESE <br><br>
|
||||
[cy] <span style="background:#FFD8F7;color:#7F5F00;">
|
||||
a chofrestru eich cyfrif ymwelwch a unwaith i chi greu eich cyfrif mi fydd yn </span>
|
||||
[] <span style="background:#FFD8F7;color:#7F5F00;">
|
||||
cael ei hysbysu o ch cyfeiriad ebost newydd fel eich bod yn gallu cadw mewn </span>
|
||||
[] <span style="background:#FFD8F7;color:#7F5F00;">
|
||||
cysylltiad drwy gmail os nad ydych chi wedi clywed yn barod am gmail mae n gwasanaeth gwebost </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[10] cy 248B 426p 24800R,
|
||||
3 chunks scored<br>
|
||||
cy.100R(99%) 249 bytes = WELSH <br><br>
|
||||
[yi] <span style="background:#FFEBD8;color:#0F7F00;">
|
||||
און פאנטאזיע ער איז באקאנט צים מערסטן פאר זיינע באַלאַדעס ער האָט געוווינט אין ווארשע יעס פאריס ליווערפול און לאנדאן סוף כל סוף איז ער </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[11] yi 245B 215p 24500R,
|
||||
1 chunks scored<br>
|
||||
yi.100R(99%) 246 bytes = YIDDISH <br><br>
|
||||
[ms] <span style="background:#D8FFFF;color:#7F5F00;">
|
||||
sukiyaki wikipedia indonesia ensiklopedia bebas berbahasa bebas berbahasa </span>
|
||||
[id] <span style="background:#FFF7D8;color:#7F5F00;">
|
||||
indonesia langsung ke navigasi cari untuk pengertian lain dari sukiyaki lihat suki</span>
|
||||
[] <span style="background:#FFF7D8;color:#7F5F00;">
|
||||
yaki irisan tipis daging sapi sayur sayuran dan tahu di dalam panci besi yang </span>
|
||||
[] <span style="background:#FFF7D8;color:#7F5F00;">
|
||||
dimasak di atas meja makan dengan cara direbus sukiyaki dimakan dengan mence </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 6] id 237B 257p 23700R,
|
||||
[ 8] ms 74B 77p 7400R,
|
||||
4 chunks scored<br>
|
||||
{CloseLangPair: ms.100R,74B => id}<br>
|
||||
id.100R(99%) 312 bytes = INDONESIAN <br><br>
|
||||
[ms] <span style="background:#D8FFFF;color:#7F5F00;">
|
||||
sukiyaki wikipedia bahasa melayu ensiklopedia bebas sukiyaki dari wikipedia bahasa melayu </span>
|
||||
[] <span style="background:#D8FFFF;color:#7F5F00;">
|
||||
ensiklopedia bebas lompat ke navigasi gelintar sukiyaki sukiyaki hirisan tipis daging lembu sayur sayuran </span>
|
||||
[] <span style="background:#D8FFFF;color:#7F5F00;">
|
||||
dan tauhu di dalam periuk besi yang dimasak di atas meja makan dengan cara rebusan sukiyaki dimakan dengan mence </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 8] ms 309B 326p 29310R,
|
||||
3 chunks scored<br>
|
||||
ms.94R(99%) 310 bytes = MALAY <br><br>
|
||||
[fr] <span style="background:#EFD8FF;color:#000000;">
|
||||
a accès aux chiens et aux frontaux qui lui ont été il peut consulter et modifier ses co</span>
|
||||
[en] <span style="background:#FFFFF4;color:#000000;">
|
||||
llections et exporter this article is about the country france is the largest country </span>
|
||||
[] <span style="background:#FFFFF4;color:#000000;">
|
||||
in western europe and the third largest in europe as a whole cet article concerne le pays euro</span>
|
||||
[fr] <span style="background:#EFD8FF;color:#000000;">
|
||||
péen aujourd hui appelé république française pour d autres usages du nom france motoring events began soon </span>
|
||||
[en] <span style="background:#FFFFF4;color:#000000;">
|
||||
after the construction of the first successful gasoline fueled automobiles the quick brown fox jumped over the lazy dog </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 0] en 300B 424p 30000R,
|
||||
[ 4] fr 201B 230p 19380R,
|
||||
5 chunks scored<br>
|
||||
en.100R(59%) fr.96R(40%) 502 bytes = FRENCH <br><br>
|
||||
[sw] <span style="background:#D8E7FF;color:#6F7F00;">
|
||||
qpdbmrmxyzptlkuuddlrlrbas las les qpdbmrmx</span>
|
||||
[] <span style="background:#D8E7FF;color:#6F7F00;">
|
||||
yzptlkuuddlrlrbas el la qpdbmrmxyzptlkuuddlrlrbas </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[14] sw 92B 108p 9200R,
|
||||
2 chunks scored<br>
|
||||
sw.100R(98%) 93 bytes = SWAHILI <br><br>
|
||||
PASS
|
||||
|
||||
</span></body></html>
|
||||
886
internal/CLD2UnitTestOutput0614.html
Normal file
886
internal/CLD2UnitTestOutput0614.html
Normal file
@@ -0,0 +1,886 @@
|
||||
<html><meta charset="UTF-8"><body>
|
||||
<style media="print" type="text/css"> :root { -webkit-print-color-adjust: exact; } </style>
|
||||
<span style="font-size: 7pt">
|
||||
file = cld2_unittest<br>
|
||||
[en] <span style="background:#FFFFF4;color:#000000;">
|
||||
confiscation of goods is assigned as the penalty part most of the courts consist of </span>
|
||||
[] <span style="background:#FFFFF4;color:#000000;">
|
||||
members and when it is necessary to bring public cases before a jury of members two courts combine for the purpose the most important cases of all are brought jurors or </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 0] en 253B 370p 25300R,
|
||||
2 chunks scored<br>
|
||||
en.100R(99%) 254 bytes = ENGLISH <br><br>
|
||||
[hy] <span style="background:#F8FFD8;color:#007F1F;">
|
||||
ա յ եվ նա հիացած աչքերով նայում է հինգհարկանի շենքի տարօրինակ փոքրիկ քառակուսի պատուհաններին դեռ մենք շատ ենք հետամնաց ասում է նա այսպես է </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 1] hy 255B 255p 25500R,
|
||||
1 chunks scored<br>
|
||||
hy.100R(100%) 255 bytes = ARMENIAN <br><br>
|
||||
[chr] <span style="background:#FFEBD8;color:#007F1F;">
|
||||
ᎠᎢᏍᎩ ᎠᏟᎶᏍᏗ ᏥᏄᏍᏛᎩ ᎦᎫᏍᏛᏅᎯ ᎾᎥᎢ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[11] chr 75B 75p 7500R,
|
||||
1 chunks scored<br>
|
||||
chr.100R(100%) 75 bytes = CHEROKEE <br><br>
|
||||
[dv] <span style="background:#FFD8F7;color:#007F1F;">
|
||||
ހިންދީ ބަހުން ވާހަކަ ދައްކާއިރު ދެވަނަ ބަހެއްގެ ގޮތުގައާއި އެނޫން ގޮތްގޮތުން ހިންދީ ބަހުން ވާހަކަ ދައްކާ މީހުންގެ އަދަދު މިލިއަނަށް </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[10] dv 249B 249p 24900R,
|
||||
1 chunks scored<br>
|
||||
dv.100R(100%) 249 bytes = DHIVEHI <br><br>
|
||||
[ka] <span style="background:#FFEBD8;color:#3F7F00;">
|
||||
ა ბირთვიდან მიღებული ელემენტი მენდელეევის პერიოდულ სიტემაში გადაინაცვლებს ორი უჯრით </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[11] ka 233B 233p 23300R,
|
||||
1 chunks scored<br>
|
||||
ka.100R(100%) 233 bytes = GEORGIAN <br><br>
|
||||
[el] <span style="background:#D8FFE7;color:#7F2F00;">
|
||||
ή αρνητική αναζήτηση λέξης κλειδιού καταστήστε τις μεμονωμένες λέξεις κλειδιά περισσότερο στοχοθετημένες με τη μετατροπή τους σε </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 2] el 242B 242p 24200R,
|
||||
1 chunks scored<br>
|
||||
el.100R(100%) 242 bytes = GREEK <br><br>
|
||||
[gu] <span style="background:#EFD8FF;color:#6F7F00;">
|
||||
આના પરિણામ પ્રમાણસર ફોન્ટ અવતરણ ચિન્હવાળા પાઠને છુપાવો બધા સમૂહો શોધાયા હાલનો જ સંદેશ વિષયની </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 4] gu 250B 250p 25000R,
|
||||
1 chunks scored<br>
|
||||
gu.100R(100%) 250 bytes = GUJARATI <br><br>
|
||||
[iu] <span style="background:#D8FFF3;color:#007F7F;">
|
||||
ᐃᑯᒪᒻᒪᑦ ᕿᓈᖏᓐᓇᓲᖑᒻᒪᑦ ᑎᑎᖅᑕᓕᒫᖅᓃᕕᑦ ᑎᑦᕆᐊᑐᓐᖏᑦᑕᑎᑦ ᑎᑎᖅᑕᑉᐱᑦ ᓯᕗᓂᖓᓂ ᑎᑎᖅᖃᖅ ᑎᑎᕆᐊᑐᓐᖏᑕᐃᑦ ᕿᓂᓲᖑᔪᒍᑦ ᑎᑎᖅᑕᓕᒫᖅᓃᕕᑦ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[13] iu 254B 254p 25400R,
|
||||
1 chunks scored<br>
|
||||
iu.100R(100%) 254 bytes = INUKTITUT <br><br>
|
||||
[kn] <span style="background:#FFEBD8;color:#6F7F00;">
|
||||
ಂಠಯ್ಯನವರು ತುಮಕೂರು ಜಿಲ್ಲೆಯ ಚಿಕ್ಕನಾಯಕನಹಳ್ಳಿ ತಾಲ್ಲೂಕಿನ ತೀರ್ಥಪುರ ವೆಂಬ ಸಾಧಾರಣ ಹಳ್ಳಿಯ ಶ್ಯಾನುಭೋಗರ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[11] kn 254B 254p 25400R,
|
||||
1 chunks scored<br>
|
||||
kn.100R(100%) 254 bytes = KANNADA <br><br>
|
||||
[km] <span style="background:#D8FFFF;color:#007F1F;">
|
||||
ក ខ គ ឃ ង ច ឆ ជ ឈ ញ ដ ឋ ឌ ឍ ណ ត ថ ទ ធ ន ប ផ ព ភ ម យ រ ល វ ស ហ ឡ អ ឥ ឦ ឧ ឪ ឫ ឬ ឯ ឱ ទាំងអស់ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 8] km 187B 187p 18700R,
|
||||
1 chunks scored<br>
|
||||
km.100R(100%) 187 bytes = KHMER <br><br>
|
||||
[lo] <span style="background:#D8FFE7;color:#007F1F;">
|
||||
ກຫາທົ່ວທັງເວັບ ແລະໃນເວັບໄຮ້ສາຍ ທຳອິດໃຫ້ທຳການຊອກຫາກ່ອນ ຈາກນັ້ນ ໃຫ້ກົດປຸ່ມເມນູ ໃນໜ້າຜົນໄດ້ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 2] lo 256B 256p 25600R,
|
||||
1 chunks scored<br>
|
||||
lo.100R(100%) 256 bytes = LAOTHIAN <br><br>
|
||||
[lif] <span style="background:#D8FFF3;color:#007F1F;">
|
||||
ᤁᤡᤖᤠᤳ ᤕᤠᤰᤌᤢᤱ ᤆᤢᤶᤗᤢᤱᤖᤧ ᤛᤥᤎᤢᤱᤃᤧᤴ ᤀᤡᤔᤠᤴᤛᤡᤱ ᤆᤧᤶᤈᤱᤗᤧ ᤁᤢᤔᤡᤱᤅᤥ ᤏᤠᤈᤡᤖᤡ ᤋᤱᤒᤣ ᤒᤠ ᤈᤏᤘᤖᤡ ᤗᤠᤏᤢᤀᤠᤱ ᤁ᤹ᤏᤠ ᤋᤱᤒᤣ ᤁᤠᤰ ᤏᤠ᤺ᤳᤋᤢ ᤕᤢᤖᤢᤒᤠ ᤀᤡᤔᤠᤴᤛᤡᤱ ᤋᤱᤃᤡᤵᤛᤡᤱ ᤌᤡᤶᤒᤣᤴ ᤂᤠᤃᤴ ᤛᤡᤛᤣ᤺ᤰᤗᤠ ᤂᤧᤴ ᤀᤡᤛᤡᤰ ᤈᤏᤘᤖᤡ ᤀᤥ ᤏᤠᤛᤢᤵ ᤆᤥ᤺ᤰᤔᤠ ᤌᤡᤶᤒᤣ ᤋᤱᤃᤠᤶᤛᤡᤱᤗ ᤐᤳᤐᤠ ᤀᤡᤱᤄᤱ ᤘᤠ᤹ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[13] lif 580B 580p 58000R,
|
||||
1 chunks scored<br>
|
||||
lif.100R(100%) 580 bytes = LIMBU <br><br>
|
||||
[ml] <span style="background:#E3D8FF;color:#7F5F00;">
|
||||
ം അങ്ങനെ ഞങ്ങള് അവരുടെ മുമ്പില് നിന്നു ഔടും ഉടനെ നിങ്ങള് പതിയിരിപ്പില് നിന്നു എഴുന്നേറ്റു </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 9] ml 247B 247p 24700R,
|
||||
1 chunks scored<br>
|
||||
ml.100R(100%) 247 bytes = MALAYALAM <br><br>
|
||||
[or] <span style="background:#D8E7FF;color:#007F1F;">
|
||||
ଅକ୍ଟୋବର ଡିସେମ୍ବର </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[14] or 48B 48p 4800R,
|
||||
1 chunks scored<br>
|
||||
or.100R(100%) 48 bytes = ORIYA <br><br>
|
||||
[pa] <span style="background:#EFFFD8;color:#6F7F00;">
|
||||
ਂ ਦਿਨਾਂ ਵਿਚ ਭਾਈ ਸਾਹਿਬ ਦੀ ਬੁੱਚੜ ਗੋਬਿੰਦ ਰਾਮ ਨਾਲ ਅੜਫਸ ਚੱਲ ਰਹੀ ਸੀ ਗੋਬਿੰਦ ਰਾਮ ਨੇ ਭਾਈ ਸਾਹਿਬ ਦੀਆਂ ਭੈਣਾ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[12] pa 247B 247p 24700R,
|
||||
1 chunks scored<br>
|
||||
pa.100R(100%) 247 bytes = PUNJABI <br><br>
|
||||
[si] <span style="background:#F8D8FF;color:#3F7F00;">
|
||||
අනුරාධ මිහිඳුකුල නමින් සකුරා ට ලිපියක් තැපෑලෙන් එවා තිබුණා කි ් රස්ටි ෂෙල්ටන් ප ් රනාන්දු ද </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[15] si 243B 243p 24300R,
|
||||
1 chunks scored<br>
|
||||
si.100R(100%) 243 bytes = SINHALESE <br><br>
|
||||
[syr] <span style="background:#EFFFD8;color:#007F1F;">
|
||||
ܐܕܪܝܣ ܓܛܘ ܫܘܪܝܐ ܡܢ ܦܪܢܣܐ ܡܢ ܐܣܦܢܝܐ ܚܐܪܘܬܐ ܒܐܕܪ ܒܢܝܣܢ ܫܛܝܚܘܬܐ ܟܠܢܝܐ ܡܝ̈ܐ ܒܥܠܡܐ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[12] syr 143B 143p 14300R,
|
||||
1 chunks scored<br>
|
||||
syr.100R(100%) 143 bytes = SYRIAC <br><br>
|
||||
[tl] <span style="background:#FFD8D8;color:#7F5F00;">
|
||||
ᜋᜇ᜔ ᜐᜓᜎᜆ᜔ ᜃ ᜈᜅ᜔ ᜊᜌ᜔ᜊᜌᜒᜈ᜔ ᜂᜉᜅ᜔᜔ ᜋᜐᜈᜌ᜔ ᜎᜅ᜔ ᜁᜐ ᜉᜅ᜔ ᜀᜃ᜔ᜎᜆ᜔ ᜆᜓᜅ᜔ᜃᜓᜎ᜔ ᜐ ᜊᜌ᜔ᜊᜌᜒᜈ᜔ ᜐ ᜆᜒᜅᜒᜈ᜔ ᜃᜓ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 0] tl 228B 228p 22800R,
|
||||
1 chunks scored<br>
|
||||
tl.100R(100%) 228 bytes = TAGALOG <br><br>
|
||||
[ta] <span style="background:#D8E7FF;color:#7F5F00;">
|
||||
அங்கு ராஜேந்திர சோழனால் கட்டப்பட்ட பிரம்மாண்டமான சிவன் கோவில் ஒன்றும் உள்ளது தொகு </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[14] ta 227B 227p 22700R,
|
||||
1 chunks scored<br>
|
||||
ta.100R(100%) 227 bytes = TAMIL <br><br>
|
||||
[te] <span style="background:#EFFFD8;color:#7F5F00;">
|
||||
ఁ దనర జయించిన తత్వ మరసి చూడఁ దాన యగును రాజయోగి యిట్లు తేజరిల్లుచు నుండు విశ్వదాభిరామ వినర వేమ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[12] te 253B 253p 25300R,
|
||||
1 chunks scored<br>
|
||||
te.100R(100%) 253 bytes = TELUGU <br><br>
|
||||
[th] <span style="background:#FFD8EB;color:#6F7F00;">
|
||||
กฏในการค้นหา หรือหน้าเนื้อหา หากท่านเลือกลงโฆษณา ท่านอาจจะปรับต้องเพิ่มงบประมาณรายวันตา </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 5] th 257B 257p 25700R,
|
||||
1 chunks scored<br>
|
||||
th.100R(100%) 257 bytes = THAI <br><br>
|
||||
[zh] <span style="background:#FFD8D8;color:#7F2F00;">
|
||||
产品的简报和公告 提交该申请后无法进行更改 请确认您的选择是正确的 对于要提交的图书 我</span>
|
||||
[] <span style="background:#FFD8D8;color:#7F2F00;">
|
||||
确认 我是版权所有者或已得到版权所有者的授权 要更改您的国家 地区 请在此表的最上端更改您的 </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 0] zh 255B 506p 25500R,
|
||||
2 chunks scored<br>
|
||||
zh.100R(99%) 256 bytes = Chinese <br><br>
|
||||
[zh-Hant] <span style="background:#FFD8EB;color:#3F7F00;">
|
||||
之前為 帳單交易作業區 已變更 廣告內容 之前為 銷售代表 之前為 張貼日期為 百分比之前為 合約 為 目標對象條件已刪除 結束日期之前為 </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 5] zh-Hant 184B 343p 18400R,
|
||||
1 chunks scored<br>
|
||||
zh-Hant.100R(99%) 185 bytes = ChineseT <br><br>
|
||||
[ja] <span style="background:#D8FFFF;color:#000000;">
|
||||
このペ ジでは アカウントに指定された予算の履歴を一覧にしています それぞれの項目に</span>
|
||||
[] <span style="background:#D8FFFF;color:#000000;">
|
||||
は 予算額と特定期間のステ タスが表示されます 現在または今後の予算を設定するには </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 8] ja 238B 766p 23800R,
|
||||
2 chunks scored<br>
|
||||
ja.100R(99%) 239 bytes = Japanese <br><br>
|
||||
[ko] <span style="background:#E3D8FF;color:#000000;">
|
||||
개별적으로 리포트 액세스 권한을 부여할 수 있습니다 액세스 권한 부여사용자에게 프로필 리포</span>
|
||||
[] <span style="background:#E3D8FF;color:#000000;">
|
||||
트에 액세스할 수 있는 권한을 부여하시려면 가용 프로필 상자에서 프로필 이름을 선택한 다음 </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 9] ko 255B 924p 25500R,
|
||||
2 chunks scored<br>
|
||||
ko.100R(99%) 256 bytes = Korean <br><br>
|
||||
[af] <span style="background:#FFD8EB;color:#007F1F;">
|
||||
aam skukuza die naam beteken hy wat skoonvee of hy wat alles onderstebo keer wysig bosveldkampe boskampe is </span>
|
||||
[] <span style="background:#FFD8EB;color:#007F1F;">
|
||||
kleiner afgeleë ruskampe wat oor min fasiliteite beskik daar is geen restaurante </span>
|
||||
[] <span style="background:#FFD8EB;color:#007F1F;">
|
||||
of winkels nie en slegs oornagbesoekers word toegelaat bateleur </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 5] af 254B 248p 25400R,
|
||||
3 chunks scored<br>
|
||||
af.100R(99%) 255 bytes = AFRIKAANS <br><br>
|
||||
[sq] <span style="background:#D8FFF3;color:#7F5F00;">
|
||||
a do të kërkoni nga beogradi që të njohë pavarësinë e kosovës </span>
|
||||
[] <span style="background:#D8FFF3;color:#7F5F00;">
|
||||
zoti thaçi prishtina është gati ta njoh pavarësinë e serbisë ndërsa natyri</span>
|
||||
[] <span style="background:#D8FFF3;color:#7F5F00;">
|
||||
sht se do të kërkohet një gjë e tillë që edhe beogradi ta njoh shtetin e pavarur dhe sovran të </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[13] sq 253B 429p 24291R,
|
||||
3 chunks scored<br>
|
||||
sq.96R(99%) 254 bytes = ALBANIAN <br><br>
|
||||
[ar*.32/fa.10] <span style="background:#FFF7D8;color:#6F7F00;">
|
||||
احتيالية بيع أي حساب </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 6] ar 38B 32p 2736R,
|
||||
1 chunks scored<br>
|
||||
ar.72R(97%) 39 bytes = ARABIC <br><br>
|
||||
[az] <span style="background:#FFD8F7;color:#3F7F00;">
|
||||
a az qalıb breyn rinq intellektual oyunu üzrə yarışın zona mərhələləri keçiri</span>
|
||||
[] <span style="background:#FFD8F7;color:#3F7F00;">
|
||||
lib miq un qalıqlarının dənizdən çıxarılması davam edir məhəmməd </span>
|
||||
[] <span style="background:#FFD8F7;color:#3F7F00;">
|
||||
peyğəmbərin karikaturalarını çap edən qəzetin baş redaktoru iş otağında ölüb </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[10] az 256B 370p 25600R,
|
||||
3 chunks scored<br>
|
||||
az.100R(99%) 257 bytes = AZERBAIJANI <br><br>
|
||||
[eu] <span style="background:#E3D8FF;color:#6F7F00;">
|
||||
a den eraso bat honen kontra hortaz eragiketa bakarrik behar dituen eraso batek aes </span>
|
||||
[] <span style="background:#E3D8FF;color:#6F7F00;">
|
||||
apurtuko luke nahiz eta oraingoz eraso bideraezina izan gaur egungo teknologiaren mu</span>
|
||||
[] <span style="background:#E3D8FF;color:#6F7F00;">
|
||||
gak direla eta oraingoz kezka hauek alde batera utzi daitezke orain arteko indar </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 9] eu 249B 368p 24900R,
|
||||
3 chunks scored<br>
|
||||
eu.100R(99%) 250 bytes = BASQUE <br><br>
|
||||
[be] <span style="background:#F8D8FF;color:#7F5F00;">
|
||||
а друкаваць іх не было тэхнічна магчыма бліжэй за вільню тым самым ча</span>
|
||||
[] <span style="background:#F8D8FF;color:#7F5F00;">
|
||||
сам нямецкае кіраўніцтва прапаноўвала апроч ўвядзення лацінкі яе </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[15] be 248B 257p 24800R,
|
||||
2 chunks scored<br>
|
||||
be.100R(99%) 249 bytes = BELARUSIAN <br><br>
|
||||
[bn] <span style="background:#FFD8EB;color:#7F5F00;">
|
||||
ংখ্যা নমুনায়ন বিন্যাস পরিসংখ্যানিক মডেল পরিসংখ্যানিক সিদ্ধান্ত ফাংশন পরিসংখ্যানিক </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 5] bn 231B 144p 23100R,
|
||||
1 chunks scored<br>
|
||||
bn.100R(99%) 232 bytes = BENGALI <br><br>
|
||||
[hi] <span style="background:#D8F3FF;color:#7F5F00;">
|
||||
विकिपीडिया इंटरनेट आधारित एक मुक्त ज्ञानकोष परियोजना ह ई विकि के रुप मेँ बा </span>
|
||||
[bh] <span style="background:#D8F3FF;color:#6F7F00;">
|
||||
यानी एगो अईसन जाल पृष्ठ जे सभन के संपादन करे के छूट देवेला विकिपीडिया शब्द विकि अउर इनसाइक्लोपीड</span>
|
||||
[] <span style="background:#D8F3FF;color:#6F7F00;">
|
||||
िया ज्ञानकोष शब्दन के मिला के बनल बा विकिपीडिया एक बहुभाषीय प्रकल्प ह अउर स्</span>
|
||||
[] <span style="background:#D8F3FF;color:#6F7F00;">
|
||||
वयंसेवकन के सहकार से निर्मित बा जेहु के भी इंटरनेट तक पहुँच बा ऊ विकिपी</span>
|
||||
[] <span style="background:#D8F3FF;color:#6F7F00;">
|
||||
डिया पर लिख सकत बा अउर लेखन के संपादन कर सकत बा विकिपीडिया </span>
|
||||
[] <span style="background:#D8F3FF;color:#6F7F00;">
|
||||
के मुख्य सर्वर टैम्पा फ्लोरीडा में बा अतिरिक्त सर्वर एम्सटर्डम अउर सियोल में बा </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 3] hi 200B 105p 20000R,
|
||||
[11] bh 1002B 534p 100200R,
|
||||
6 chunks scored<br>
|
||||
{CloseLangPair: hi.100R,200B => bh}<br>
|
||||
bh.100R(99%) 1203 bytes = BIHARI <br><br>
|
||||
[bg] <span style="background:#FFEBD8;color:#7F2F00;">
|
||||
а дума попада в състояние на изпитание ключовите думи с пр</span>
|
||||
[] <span style="background:#FFEBD8;color:#7F2F00;">
|
||||
едсказана малко под то изискване на страниците за търсене в </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[11] bg 216B 183p 21600R,
|
||||
2 chunks scored<br>
|
||||
bg.100R(99%) 217 bytes = BULGARIAN <br><br>
|
||||
[ca] <span style="background:#E3FFD8;color:#6F7F00;">
|
||||
al final en un únic lloc nhorabona l correu electrònic està concebut com a eina de productivitat aleshores per què perdre </span>
|
||||
[] <span style="background:#E3FFD8;color:#6F7F00;">
|
||||
el temps arxivant missatges per després intentar recordar on els veu desar i per què heu d eliminar missatges importants per l </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 7] ca 255B 254p 22404R,
|
||||
2 chunks scored<br>
|
||||
ca.87R(99%) 256 bytes = CATALAN <br><br>
|
||||
[ceb] <span style="background:#FFD8EB;color:#001F7F;">
|
||||
ang sugbo usa sa mga labing ugmad nga lalawigan sa nasod kini ang sentro </span>
|
||||
[] <span style="background:#FFD8EB;color:#001F7F;">
|
||||
sa komersyo edukasyon ug industriya sa sentral ug habagatang dapit sa kapupod an ang mipada</span>
|
||||
[] <span style="background:#FFD8EB;color:#001F7F;">
|
||||
yag sa sugbo isip ikapito nga labing nindot nga pulo sa ang nag inusa</span>
|
||||
[tl*.42/ceb.39] <span style="background:#FFD8D8;color:#7F5F00;">
|
||||
rang pulo sa pilipinas nga napasidunggan sa maong magasin sukad pa sa tuig </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 0] tl 75B 42p 2025R,
|
||||
[ 5] ceb 233B 272p 23300R,
|
||||
4 chunks scored<br>
|
||||
{Unreli tl.27R,75B} ceb.100R(75%) 309 bytes = CEBUANO* <br><br>
|
||||
[sr] <span style="background:#D8FFF3;color:#7F2F00;">
|
||||
fender precision bass prva je bas gitara ikada napravljena i promovirana na tržištu model se pojavio a svoj </span>
|
||||
[] <span style="background:#D8FFF3;color:#7F2F00;">
|
||||
prepoznatljiv dizajn koji i dan danas ima dobio je godine autor tog instrumenta je leo fender koji je imao </span>
|
||||
[hr] <span style="background:#EFFFD8;color:#7F2F00;">
|
||||
namjeru napraviti instrument koji bi bio manji od kontrabasa i kojega bi mogli glazbenici staviti u automobil bas </span>
|
||||
[] <span style="background:#EFFFD8;color:#7F2F00;">
|
||||
gitara je zauvijek promijenila glazbu i model precision se i danas pojavljuje u svim žanrovima rock heavy metal pop i slično </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[12] hr 241B 212p 24100R,
|
||||
[13] sr 217B 176p 21486R,
|
||||
4 chunks scored<br>
|
||||
{CloseLangPair: sr.99R,217B => hr}<br>
|
||||
hr.99R(99%) 459 bytes = CROATIAN <br><br>
|
||||
[cs] <span style="background:#F8FFD8;color:#7F2F00;">
|
||||
a akci opakujte film uložen vykreslit gmail tokio smazat obsah adresáře nelze načíst sy</span>
|
||||
[] <span style="background:#F8FFD8;color:#7F2F00;">
|
||||
stémový profil jednotky smoot okud používáte pro určení polokoule značky z západ </span>
|
||||
[] <span style="background:#F8FFD8;color:#7F2F00;">
|
||||
nebo v východ používejte nezáporné hodnoty zeměpisné délky nelze </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 1] cs 255B 292p 23476R,
|
||||
3 chunks scored<br>
|
||||
cs.92R(99%) 256 bytes = CZECH <br><br>
|
||||
[da] <span style="background:#F8FFD8;color:#000000;">
|
||||
a z tallene og punktummer der er tilladte log ud angiv den ønskede adgangskode igen november gem personlige </span>
|
||||
[] <span style="background:#F8FFD8;color:#000000;">
|
||||
oplysninger kontrolspørgsmål det sidste tegn i dit brugernavn skal være et bogstav a z eller tal skriv de tegn du kan se i billedet nedenfor </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 1] da 253B 296p 22564R,
|
||||
2 chunks scored<br>
|
||||
da.89R(99%) 254 bytes = DANISH <br><br>
|
||||
[nl] <span style="background:#D8FFE7;color:#000000;">
|
||||
a als volgt te werk om een configuratiebestand te maken sitemap gen py ebruik filters om de s op te ge</span>
|
||||
[] <span style="background:#D8FFE7;color:#000000;">
|
||||
ven die moeten worden toegevoegd of uitgesloten op basis van de opmaaktaal elke sitemap mag alleen de s bevatten voor een bepaalde opmaaktaal dit </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 2] nl 248B 253p 24800R,
|
||||
2 chunks scored<br>
|
||||
nl.100R(99%) 249 bytes = DUTCH <br><br>
|
||||
[en] <span style="background:#FFFFF4;color:#000000;">
|
||||
a backup credit card by visiting your billing preferences page or visit the adwords </span>
|
||||
[] <span style="background:#FFFFF4;color:#000000;">
|
||||
help centre for more details https adwords google com support bin answer py answer hl en </span>
|
||||
[] <span style="background:#FFFFF4;color:#000000;">
|
||||
we were unable to process the payment of for your outstanding google adwords </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 0] en 250B 275p 24555R,
|
||||
3 chunks scored<br>
|
||||
en.98R(99%) 251 bytes = ENGLISH <br><br>
|
||||
[et] <span style="background:#D8FFFF;color:#7F2F00;">
|
||||
a niipea kui sinu maksimaalne igakuine krediidi limiit on meie poolt heaks kiidetud on sinu kohustuseks see krediidilimiit </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 8] et 123B 143p 12300R,
|
||||
1 chunks scored<br>
|
||||
et.100R(99%) 124 bytes = ESTONIAN <br><br>
|
||||
[fi] <span style="background:#D8F3FF;color:#000000;">
|
||||
a joilla olet käynyt tämä kerro meille kuka ä olet ei tunnistettavia käyttötietoja kuten virhe</span>
|
||||
[] <span style="background:#D8F3FF;color:#000000;">
|
||||
raportteja käytetään google desktopin parantamiseen etsi näyttää mu</span>
|
||||
[] <span style="background:#D8F3FF;color:#000000;">
|
||||
kautettuja uutisia google desktop keskivaihto leikkaa voit kaksoisnapsauttaa </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 3] fi 250B 308p 25000R,
|
||||
3 chunks scored<br>
|
||||
fi.100R(99%) 251 bytes = FINNISH <br><br>
|
||||
[fr] <span style="background:#EFD8FF;color:#000000;">
|
||||
a accès aux collections et aux frontaux qui lui ont été attribués il peut consulter </span>
|
||||
[] <span style="background:#EFD8FF;color:#000000;">
|
||||
et modifier ses collections et exporter des configurations de collection toutefois </span>
|
||||
[] <span style="background:#EFD8FF;color:#000000;">
|
||||
il ne peut pas créer ni supprimer des collections enfin il a accès aux fonctions </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 4] fr 254B 336p 24062R,
|
||||
3 chunks scored<br>
|
||||
fr.94R(99%) 255 bytes = FRENCH <br><br>
|
||||
[gl] <span style="background:#F8D8FF;color:#7F2F00;">
|
||||
debe ser como mínimo taranto tendas de venda polo miúdo cociñas servizos bordado ca</span>
|
||||
[] <span style="background:#F8D8FF;color:#7F2F00;">
|
||||
nadá viaxes parques de vehículos de recreo hotel oriental habitación recibir unha postal no enderezo indicado anteriormente </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[15] gl 213B 149p 21300R,
|
||||
2 chunks scored<br>
|
||||
gl.100R(99%) 214 bytes = GALICIAN <br><br>
|
||||
[lg] <span style="background:#D8E7FF;color:#004F7F;">
|
||||
abaana ba bani lukaaga mu ana mu babiri abaana ba bebayi lukaaga mu abiri mu ba</span>
|
||||
[] <span style="background:#D8E7FF;color:#004F7F;">
|
||||
satu abaana ba azugaadi lukumi mu ebikumi bibiri mu abiri mu babiri abaana ba adoni</span>
|
||||
[] <span style="background:#D8E7FF;color:#004F7F;">
|
||||
kamu lukaaga mu nltaaga mu mukaaga abaana ba biguvaayi enkumi bbiri mu ataano mu mukaaga </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[14] lg 251B 332p 25100R,
|
||||
3 chunks scored<br>
|
||||
lg.100R(99%) 252 bytes = GANDA <br><br>
|
||||
[de] <span style="background:#FFD8EB;color:#000000;">
|
||||
abschnitt ordner aktivieren werden die ordnereinstellungen im fa</span>
|
||||
[] <span style="background:#FFD8EB;color:#000000;">
|
||||
rbabschnitt deaktiviert öchten sie wirklich fortfahren eldtypen angeben optional n die</span>
|
||||
[] <span style="background:#FFD8EB;color:#000000;">
|
||||
sem schritt geben sie für jedesfeld aus dem datenset den typ an ieser schritt ist optional eldtypen </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 5] de 252B 308p 25200R,
|
||||
3 chunks scored<br>
|
||||
de.100R(99%) 253 bytes = GERMAN <br><br>
|
||||
[ht] <span style="background:#FFEBD8;color:#007F7F;">
|
||||
ak pitit tout sosyete a chita se pou sa leta dwe pwoteje yo nimewo leta fèt pou </span>
|
||||
[] <span style="background:#FFEBD8;color:#007F7F;">
|
||||
li pwoteje tout paran ak pitit nan peyi a menm jan kit paran yo marye kit yo pa marye </span>
|
||||
[] <span style="background:#FFEBD8;color:#007F7F;">
|
||||
tout manman ki fè pitit leta fèt pou ba yo konkoul menm jan tou pou timoun piti ak pou </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[11] ht 256B 475p 21936R,
|
||||
3 chunks scored<br>
|
||||
ht.85R(99%) 257 bytes = HAITIAN_CREOLE <br><br>
|
||||
[iw] <span style="background:#FFF7D8;color:#000000;">
|
||||
או לערוך את העדפות ההפצה אנא עקוב אחרי השלבים הבאים כנס לחשבון האישי שלך ב </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 6] iw 135B 106p 13500R,
|
||||
1 chunks scored<br>
|
||||
iw.100R(99%) 136 bytes = HEBREW <br><br>
|
||||
[hi] <span style="background:#D8F3FF;color:#7F5F00;">
|
||||
ं ऐडवर्ड्स विज्ञापनों के अनुभव पर आधारित हैं और इनकी मदद से आपको अपने विज्ञापनों का अधिकतम लाभ </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 3] hi 249B 176p 24900R,
|
||||
1 chunks scored<br>
|
||||
hi.100R(99%) 250 bytes = HINDI <br><br>
|
||||
[blu] <span style="background:#D8FFFF;color:#001F7F;">
|
||||
kuv hlub koj txawm lub ntuj yuav si ntshi nphaus los kuv tsis ua siab nkaug </span>
|
||||
[] <span style="background:#D8FFFF;color:#001F7F;">
|
||||
txawm ntiab teb yuav si ntshi nphaus los kuv tseem ua lon tsaug vim kuv hlub koj tag lub siab </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 8] blu 170B 384p 17000R,
|
||||
2 chunks scored<br>
|
||||
blu.100R(99%) 171 bytes = HMONG <br><br>
|
||||
[hu] <span style="background:#E3FFD8;color:#7F2F00;">
|
||||
a felhasználóim a google azonosító szöveget ikor látják a felhasználóim a google azono</span>
|
||||
[] <span style="background:#E3FFD8;color:#7F2F00;">
|
||||
sító szöveget felhasználók a google azonosító szöveget fogják látni </span>
|
||||
[] <span style="background:#E3FFD8;color:#7F2F00;">
|
||||
minden tranzakció után ha a vásárlását regisztrációját oldalunk </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 7] hu 246B 346p 24600R,
|
||||
3 chunks scored<br>
|
||||
hu.100R(99%) 247 bytes = HUNGARIAN <br><br>
|
||||
[is] <span style="background:#D8F3FF;color:#7F2F00;">
|
||||
a afköst leitarorða þinna leitarorð neikvæð leitarorð auglýsingahópa byggja upp aðallista yfir ný leita</span>
|
||||
[] <span style="background:#D8F3FF;color:#7F2F00;">
|
||||
rorð fyrir auglýsingahópana og skoða ítarleg gögn um árangur </span>
|
||||
[] <span style="background:#D8F3FF;color:#7F2F00;">
|
||||
leitarorða eins og samkeppni auglýsenda og leitarmagn er krafist notkun </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 3] is 256B 332p 25600R,
|
||||
3 chunks scored<br>
|
||||
is.100R(99%) 257 bytes = ICELANDIC <br><br>
|
||||
[ms] <span style="background:#D8FFFF;color:#7F5F00;">
|
||||
geng pengembaraan bermula adalah film animasi d cgi pertama yang diproduksi di mala</span>
|
||||
[id] <span style="background:#FFF7D8;color:#7F5F00;">
|
||||
ysia film ini dibuat oleh les copaque production lcp dan dirilis di bioskop bioskop seluruh ma</span>
|
||||
[ms] <span style="background:#D8FFFF;color:#7F5F00;">
|
||||
laysia pada februari film geng pertama kali diluncurkan dalam sebuah acara peluncuran pada </span>
|
||||
[id] <span style="background:#FFF7D8;color:#7F5F00;">
|
||||
september bersama dengan serial animasi pendek upin ipin yang berhubungan dengan film tersebut pembuatan film ini </span>
|
||||
[] <span style="background:#FFF7D8;color:#7F5F00;">
|
||||
didukung oleh berbagai pihak seperti kementerian sains teknologi dan inovasi malaysia mosti dengan memberi bantuan berupa dana sebesar rm juta </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 6] id 351B 427p 35100R,
|
||||
[ 8] ms 174B 207p 17400R,
|
||||
5 chunks scored<br>
|
||||
{CloseLangPair: ms.100R,174B => id}<br>
|
||||
id.100R(99%) 526 bytes = INDONESIAN <br><br>
|
||||
[ga] <span style="background:#D8E7FF;color:#7F2F00;">
|
||||
a bhfuil na focail go léir i do cheist le fáil orthu ní gá ach focail bre</span>
|
||||
[] <span style="background:#D8E7FF;color:#7F2F00;">
|
||||
ise a chur leis na cinn a cuardaíodh cheana chun an cuardach a bheachtú nó a chúngú má chu</span>
|
||||
[] <span style="background:#D8E7FF;color:#7F2F00;">
|
||||
irtear focal breise isteach aimseofar fo aicme ar leith de na torthaí a fuarthas </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[14] ga 255B 378p 25500R,
|
||||
3 chunks scored<br>
|
||||
ga.100R(99%) 256 bytes = IRISH <br><br>
|
||||
[it] <span style="background:#E3FFD8;color:#000000;">
|
||||
a causa di un intervento di manutenzione del sistema fino alle ore circa ora legale costa del pacifico del novembre le ca</span>
|
||||
[] <span style="background:#E3FFD8;color:#000000;">
|
||||
mpagne esistenti continueranno a essere pubblicate come di consueto anche durante questo breve periodo di inattività ci scusiamo per </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 7] it 255B 228p 23490R,
|
||||
2 chunks scored<br>
|
||||
it.92R(99%) 256 bytes = ITALIAN <br><br>
|
||||
[jw] <span style="background:#FFD8D8;color:#6F7F00;">
|
||||
account ten server niki kalian username meniko tanpo judul cacahe account nggonanmu wes pol pesen mu wes di</span>
|
||||
[] <span style="background:#FFD8D8;color:#6F7F00;">
|
||||
guwak pesenan mu wes di simpen sante wae pesenan mu wes ke kirim mbuh te</span>
|
||||
[jw*.49/id.42] <span style="background:#FFD8D8;color:#6F7F00;">
|
||||
kan ora pesenan e ke kethok pesenan mu wes ke kirim mbuh tekan ora pesenan </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 0] jw 254B 146p 22187R,
|
||||
3 chunks scored<br>
|
||||
jw.87R(99%) 255 bytes = JAVANESE <br><br>
|
||||
[rw] <span style="background:#F8D8FF;color:#007F7F;">
|
||||
dore ibyo ukeneye kumenya ukwo watubona ibibazo byinshi abandi babaza ububonero </span>
|
||||
[] <span style="background:#F8D8FF;color:#007F7F;">
|
||||
byibibina google onjela ho izina dyikyibina kyawe onjela ho yawe mulugo kulaho ibyandiko byawe sh</span>
|
||||
[] <span style="background:#F8D8FF;color:#007F7F;">
|
||||
yilaho tegula yawe tulubaka tukongeraho iyanya mishya buliko tulambula </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[15] rw 248B 294p 23680R,
|
||||
3 chunks scored<br>
|
||||
rw.95R(99%) 249 bytes = KINYARWANDA <br><br>
|
||||
[lv] <span style="background:#EFD8FF;color:#7F2F00;">
|
||||
a gadskārtējā izpārdošana slēpošana jāņi atlaide izmaiņas trafikā kas saistītas ar sezo</span>
|
||||
[] <span style="background:#EFD8FF;color:#7F2F00;">
|
||||
nas izpārdošanu speciālajām atlaidēm u c ir parastas un atslēgvārdi kas </span>
|
||||
[] <span style="background:#EFD8FF;color:#7F2F00;">
|
||||
ir populāri noteiktos laika posmos šajā laikā saņems lielāku klikšķu </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 4] lv 255B 304p 25500R,
|
||||
3 chunks scored<br>
|
||||
lv.100R(99%) 256 bytes = LATVIAN <br><br>
|
||||
[lt] <span style="background:#FFD8EB;color:#7F2F00;">
|
||||
a išsijungia mano idėja dėl geriausio laiko po pastarųjų savo santykių pasimo</span>
|
||||
[] <span style="background:#FFD8EB;color:#7F2F00;">
|
||||
kiau penki dalykai be kurių negaliu gyventi mano miegamajame tu surasi ide</span>
|
||||
[] <span style="background:#FFD8EB;color:#7F2F00;">
|
||||
ali pora išsilavinimas aukštoji mokykla koledžas universitetas pagrindinis laipsnis metai </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 5] lt 251B 311p 25100R,
|
||||
3 chunks scored<br>
|
||||
lt.100R(99%) 252 bytes = LITHUANIAN <br><br>
|
||||
[mk] <span style="background:#EFD8FF;color:#7F5F00;">
|
||||
гласовите коалицијата на вмро дпмне како партија со најмногу ос</span>
|
||||
[] <span style="background:#EFD8FF;color:#7F5F00;">
|
||||
воени гласови ќе добие евра а на сметката на коализијата за македонија </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 4] mk 247B 295p 24700R,
|
||||
2 chunks scored<br>
|
||||
mk.100R(99%) 248 bytes = MACEDONIAN <br><br>
|
||||
[ms] <span style="background:#D8FFFF;color:#7F5F00;">
|
||||
daripada dirinya hirako shinji seorang pemuda merujuk diri mereka seba</span>
|
||||
[] <span style="background:#D8FFFF;color:#7F5F00;">
|
||||
gai vizard shinji telah cuba untuk menyakinkan ichigo untuk menyertai ku</span>
|
||||
[] <span style="background:#D8FFFF;color:#7F5F00;">
|
||||
mpulan mereka mengatakan bahawa hanya dia sahaja yang mampu mengajar ichigo teknik untuk mengawal hollow </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 8] ms 247B 426p 24490R,
|
||||
3 chunks scored<br>
|
||||
ms.99R(99%) 248 bytes = MALAY <br><br>
|
||||
[mt] <span style="background:#F8FFD8;color:#3F7F00;">
|
||||
ata ikteb messaġġ lil indirizzi differenti billi tagħżilhom u tagħfas il buttuna ik</span>
|
||||
[] <span style="background:#F8FFD8;color:#3F7F00;">
|
||||
teb żid numri tfittxijja tal kotba mur print home kotba minn pagni ghal pagna minn </span>
|
||||
[] <span style="background:#F8FFD8;color:#3F7F00;">
|
||||
ghall ktieb ta aċċessa stieden habib iehor grazzi it tim tal gruppi google </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 1] mt 249B 319p 24900R,
|
||||
3 chunks scored<br>
|
||||
mt.100R(99%) 250 bytes = MALTESE <br><br>
|
||||
[mr] <span style="background:#FFD8D8;color:#3F7F00;">
|
||||
हैदराबाद उच्चार ऐका सहाय्य माहिती तेलुगू </span>
|
||||
[te] <span style="background:#EFFFD8;color:#7F5F00;">
|
||||
హైదరాబాదు </span>
|
||||
[hi] <span style="background:#D8F3FF;color:#7F5F00;">
|
||||
उर्दू </span>
|
||||
[ur*.14/fa.10] <span style="background:#D8FFE7;color:#6F7F00;">
|
||||
حیدر آباد </span>
|
||||
[mr] <span style="background:#FFD8D8;color:#3F7F00;">
|
||||
हे भारतातील आंध्र प्रदेश राज्याच्या राजधानीचे शहर आहे हैदराबादची लोकसंख</span>
|
||||
[] <span style="background:#FFD8D8;color:#3F7F00;">
|
||||
्या लाख हजार आहे मोत्यांचे शहर अशी एकेकाळी ओळख असलेल्या या शहराला ऐतिहासिक सांस्क</span>
|
||||
[] <span style="background:#FFD8D8;color:#3F7F00;">
|
||||
ृतिक आणि स्थापत्यशास्त्रीय वारसा लाभला आहे नंतर शिक्षण आणि माहिती तंत्रज्ञान त्याचप्र</span>
|
||||
[] <span style="background:#FFD8D8;color:#3F7F00;">
|
||||
माणे औषधनिर्मिती आणि जैवतंत्रज्ञान क्षेत्रातील उद्योगधंद्यांची वाढ शह</span>
|
||||
[] <span style="background:#FFD8D8;color:#3F7F00;">
|
||||
रात झाली दक्षिण मध्य भारतातील पर्यटन आणि तेलुगू चित्रपटनिर्मितीचे हैदराबाद हे केंद्र आहे </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 0] mr 1190B 730p 117890R,
|
||||
[ 2] ur 18B 14p 648R,
|
||||
[ 3] hi 16B 10p 1600R,
|
||||
[12] te 29B 29p 2900R,
|
||||
9 chunks scored<br>
|
||||
{CloseLangPair: hi.100R,16B => mr}<br>
|
||||
{Unreli ur.36R,18B} mr.99R(95%) te.100R(3%) 1257 bytes = MARATHI <br><br>
|
||||
[ne] <span style="background:#FFEBD8;color:#7F5F00;">
|
||||
अरू ठाऊँबाटपनि खुलेको छ यो खाता अर अरू ठाऊँबाटपनि खुलेको छ यो खाता अर ू </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[11] ne 186B 56p 17484R,
|
||||
1 chunks scored<br>
|
||||
ne.94R(99%) 187 bytes = NEPALI <br><br>
|
||||
[no] <span style="background:#FFD8F7;color:#000000;">
|
||||
a er obligatorisk tidsforskyvning plassering av katalogsøk planinformasjon loggfilbane gruppe</span>
|
||||
[] <span style="background:#FFD8F7;color:#000000;">
|
||||
navn kontoinformasjon passord domene gruppeinformasjon alle kampanjesporing alternativ bruker grupper oppgaveplanlegger oppgavehistorikk kontosammendrag antall </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[10] no 254B 239p 25400R,
|
||||
2 chunks scored<br>
|
||||
no.100R(99%) 255 bytes = NORWEGIAN <br><br>
|
||||
[fa] <span style="background:#D8FFF3;color:#3F7F00;">
|
||||
آب خوردن عجله می کردند به جای باز ی کتک کاری می کردند و همه چيز مثل قبل بود فقط من ماندم و يک دنيا حرف و انتظار تا عاقبت رسيد احضاريه ی ای با </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[13] fa 249B 225p 24900R,
|
||||
1 chunks scored<br>
|
||||
fa.100R(99%) 250 bytes = PERSIAN <br><br>
|
||||
[pl] <span style="background:#FFEBD8;color:#000000;">
|
||||
a australii będzie widział inne reklamy niż użytkownik z kanady kierowanie geograficzne spra</span>
|
||||
[] <span style="background:#FFEBD8;color:#000000;">
|
||||
wia że reklamy są lepiej dopasowane do użytkownika twojej strony oznacza </span>
|
||||
[] <span style="background:#FFEBD8;color:#000000;">
|
||||
to także że możesz nie zobaczyć wszystkich reklam które są wyświetlane na </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[11] pl 253B 444p 25300R,
|
||||
3 chunks scored<br>
|
||||
pl.100R(99%) 254 bytes = POLISH <br><br>
|
||||
[pt] <span style="background:#EFFFD8;color:#000000;">
|
||||
a abit prevê que a entrada desses produtos estrangeiros no mercado têxtil e vestuário do brasil possa redu</span>
|
||||
[] <span style="background:#EFFFD8;color:#000000;">
|
||||
zir os preços em cerca de a partir de má notícia para os empresários que terão que lutar para garantir suas margens de lucro mas boa notícia </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[12] pt 256B 234p 25600R,
|
||||
2 chunks scored<br>
|
||||
pt.100R(99%) 257 bytes = PORTUGUESE <br><br>
|
||||
[ro] <span style="background:#FFF7D8;color:#7F2F00;">
|
||||
a anunţurilor reţineţi nu plătiţi pentru clicuri sau impresii ci numai atunci când pe </span>
|
||||
[] <span style="background:#FFF7D8;color:#7F2F00;">
|
||||
site ul dvs survine o acţiune dorită site urile negative nu </span>
|
||||
[] <span style="background:#FFF7D8;color:#7F2F00;">
|
||||
pot avea uri de destinaţie daţi instrucţiuni societăţii dvs bancare sau constructoare să </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 6] ro 249B 259p 24716R,
|
||||
3 chunks scored<br>
|
||||
ro.99R(99%) 250 bytes = ROMANIAN <br><br>
|
||||
[ro] <span style="background:#FFF7D8;color:#7F2F00;">
|
||||
оперативэ а органелор ши институциилор екзекутиве ши а органелор жу</span>
|
||||
[] <span style="background:#FFF7D8;color:#7F2F00;">
|
||||
дичиаре але путерий де стат фиекэруй орган ал путерий де стат и се </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 6] ro 246B 224p 24600R,
|
||||
2 chunks scored<br>
|
||||
ro.100R(99%) 247 bytes = ROMANIAN <br><br>
|
||||
[ru] <span style="background:#D8FFF3;color:#000000;">
|
||||
а неправильный формат идентификатора дн назад </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[13] ru 86B 47p 8600R,
|
||||
1 chunks scored<br>
|
||||
ru.100R(98%) 87 bytes = RUSSIAN <br><br>
|
||||
[gd] <span style="background:#D8FFF3;color:#6F7F00;">
|
||||
air son is gum bi casg air a h uile briosgaid no gum faigh thu brath nuair a tha </span>
|
||||
[] <span style="background:#D8FFF3;color:#6F7F00;">
|
||||
briosgaid a tighinn gad rannsachadh ghoogle gu ceart mura bheil briosgaidean </span>
|
||||
[] <span style="background:#D8FFF3;color:#6F7F00;">
|
||||
ceadaichte cuiridh google briosgaid dha do neach cleachdaidh fa leth tha google a cleachdadh </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[13] gd 251B 370p 25100R,
|
||||
3 chunks scored<br>
|
||||
gd.100R(99%) 252 bytes = SCOTS_GAELIC <br><br>
|
||||
[sr] <span style="background:#D8FFF3;color:#7F2F00;">
|
||||
балчак балчак на мапи србије уреди демографија у насељу балчак живи пунолетна </span>
|
||||
[] <span style="background:#D8FFF3;color:#7F2F00;">
|
||||
становника а просечна старост становништва износи година </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[13] sr 251B 252p 24351R,
|
||||
2 chunks scored<br>
|
||||
sr.97R(99%) 252 bytes = SERBIAN <br><br>
|
||||
[sr] <span style="background:#D8FFF3;color:#7F2F00;">
|
||||
autonomnih pokrajina saveznim zakonom može se propisati poseban sastav organizacija i delokrug saveta za poslove narodne </span>
|
||||
[] <span style="background:#D8FFF3;color:#7F2F00;">
|
||||
odbrane članove saveta federacije bira na predlog predsedništva savezna skupština iz reda društveno političkih i drugih javnih </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[13] sr 254B 209p 25400R,
|
||||
2 chunks scored<br>
|
||||
sr.100R(99%) 255 bytes = SERBIAN <br><br>
|
||||
[sk] <span style="background:#EFD8FF;color:#3F7F00;">
|
||||
a aktivovať reklamnú kampaň ak chcete kampaň pred spustením ešte prispôsobiť uložte ju </span>
|
||||
[] <span style="background:#EFD8FF;color:#3F7F00;">
|
||||
ako šablónu a pokračujte v úprave vyberte si jednu z možností nižšie a kli</span>
|
||||
[] <span style="background:#EFD8FF;color:#3F7F00;">
|
||||
knite na tlačidlo uložiť kampaň nastavenia kampane môžete ľubovoľne </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 4] sk 254B 320p 25400R,
|
||||
3 chunks scored<br>
|
||||
sk.100R(99%) 255 bytes = SLOVAK <br><br>
|
||||
[sl] <span style="background:#F8D8FF;color:#6F7F00;">
|
||||
adsense stanje prijave za google adsense google adsense račun je bil začasno za</span>
|
||||
[] <span style="background:#F8D8FF;color:#6F7F00;">
|
||||
mrznjen pozdravljeni hvala za vaše zanimanje v google adsense po pregledu vaše prijavnice </span>
|
||||
[] <span style="background:#F8D8FF;color:#6F7F00;">
|
||||
so naši strokovnjaki ugotovili da spletna stran ki je trenutno povezana z vašim </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[15] sl 255B 235p 25500R,
|
||||
3 chunks scored<br>
|
||||
sl.100R(99%) 256 bytes = SLOVENIAN <br><br>
|
||||
[es] <span style="background:#D8E7FF;color:#000000;">
|
||||
a continuación haz clic en el botón obtener ruta también puedes desplazarte hasta el final de la página para </span>
|
||||
[] <span style="background:#D8E7FF;color:#000000;">
|
||||
cambiar tus opciones de búsqueda gráfico y detalles ésta es una lista de los vídeos que te recomendamos nuestras recomendaciones se basan </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[14] es 255B 297p 22675R,
|
||||
2 chunks scored<br>
|
||||
es.88R(99%) 256 bytes = SPANISH <br><br>
|
||||
[sw] <span style="background:#D8E7FF;color:#6F7F00;">
|
||||
a ujumbe mpya jumla unda tafuta na angalia vikundi vya kujadiliana na kushiriki </span>
|
||||
[] <span style="background:#D8E7FF;color:#6F7F00;">
|
||||
mawazo iliyopangwa kwa tarehe watumiaji wapya futa orodha hizi lugha </span>
|
||||
[] <span style="background:#D8E7FF;color:#6F7F00;">
|
||||
hoja vishikanisho vilivyo dhaminiwa ujumbe sanaa na tamasha toka udhibitisho wa neno kwa haraka fikia </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[14] sw 251B 405p 21897R,
|
||||
3 chunks scored<br>
|
||||
sw.87R(99%) 252 bytes = SWAHILI <br><br>
|
||||
[sv] <span style="background:#F8D8FF;color:#000000;">
|
||||
a bort objekt från google desktop post äldst meny öretag dress etaljer alternativ för vad är inne yaste google skrivbord plugin </span>
|
||||
[] <span style="background:#F8D8FF;color:#000000;">
|
||||
program för nyheter google visa nyheter som är anpassade efter de artiklar som du läser om du till exempel läser </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[15] sv 250B 196p 25000R,
|
||||
2 chunks scored<br>
|
||||
sv.100R(99%) 251 bytes = SWEDISH <br><br>
|
||||
[tl] <span style="background:#FFD8D8;color:#7F5F00;">
|
||||
a na ugma sa google ay nakaka bantog sa gitna nang kliks na nangyayari sa </span>
|
||||
[] <span style="background:#FFD8D8;color:#7F5F00;">
|
||||
pamamagitan nang ordinaryong paggagamit at sa kliks na likha nang pandaraya o </span>
|
||||
[] <span style="background:#FFD8D8;color:#7F5F00;">
|
||||
hindi tunay na paggamit bunga nito nasasala namin ang mga kliks na hindi kailangan o hindi gusto nang </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 0] tl 254B 282p 25400R,
|
||||
3 chunks scored<br>
|
||||
tl.100R(99%) 255 bytes = TAGALOG <br><br>
|
||||
[tr] <span style="background:#F8FFD8;color:#7F5F00;">
|
||||
a ayarlarınızı görmeniz ve yönetmeniz içindir eğer kampanyanız için günlük bütçe</span>
|
||||
[] <span style="background:#F8FFD8;color:#7F5F00;">
|
||||
nizi gözden geçirebileceğiniz yeri arıyorsanız kampanya yönetimi ne </span>
|
||||
[] <span style="background:#F8FFD8;color:#7F5F00;">
|
||||
gidin kampanyanızı seçin ve kampanya ayarlarını düzenle yi tıklayın sunumu </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 1] tr 250B 351p 25000R,
|
||||
3 chunks scored<br>
|
||||
tr.100R(99%) 251 bytes = TURKISH <br><br>
|
||||
[uk] <span style="background:#D8FFE7;color:#7F5F00;">
|
||||
а більший бюджет щоб забезпечити собі максимум прибутків від пе</span>
|
||||
[] <span style="background:#D8FFE7;color:#7F5F00;">
|
||||
реходів відстежуйте свої об яви за датою географічним розташуванням </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 2] uk 244B 205p 24400R,
|
||||
2 chunks scored<br>
|
||||
uk.100R(99%) 245 bytes = UKRAINIAN <br><br>
|
||||
[ur] <span style="background:#D8FFE7;color:#6F7F00;">
|
||||
آپ کو کم سے کم ممکنہ رقم چارج کرتا ہے اس کی مثال کے طور پر فرض کریں </span>
|
||||
[] <span style="background:#D8FFE7;color:#6F7F00;">
|
||||
اگر آپ کی زیادہ سے زیادہ قیمت فی کلِک امریکی ڈالر اور کلِک کرنے کی شرح ہو تو </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 2] ur 254B 314p 25400R,
|
||||
2 chunks scored<br>
|
||||
ur.100R(99%) 255 bytes = URDU <br><br>
|
||||
[vi] <span style="background:#D8FFE7;color:#3F7F00;">
|
||||
adsense cho nội dung nhà cung cấp dịch vụ di động xác minh tín du</span>
|
||||
[] <span style="background:#D8FFE7;color:#3F7F00;">
|
||||
̣ng thay đổi nhãn kg các ô xem chi phí cho từ chối các đơn đặt hàng dạng cấp dữ liệu ác minh trang web của bạn để xem </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 2] vi 252B 202p 24365R,
|
||||
2 chunks scored<br>
|
||||
vi.96R(99%) 253 bytes = VIETNAMESE <br><br>
|
||||
[cy] <span style="background:#FFD8F7;color:#7F5F00;">
|
||||
a chofrestru eich cyfrif ymwelwch a unwaith i chi greu eich cyfrif mi fydd yn </span>
|
||||
[] <span style="background:#FFD8F7;color:#7F5F00;">
|
||||
cael ei hysbysu o ch cyfeiriad ebost newydd fel eich bod yn gallu cadw mewn </span>
|
||||
[] <span style="background:#FFD8F7;color:#7F5F00;">
|
||||
cysylltiad drwy gmail os nad ydych chi wedi clywed yn barod am gmail mae n gwasanaeth gwebost </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[10] cy 248B 426p 24800R,
|
||||
3 chunks scored<br>
|
||||
cy.100R(99%) 249 bytes = WELSH <br><br>
|
||||
[yi] <span style="background:#FFEBD8;color:#0F7F00;">
|
||||
און פאנטאזיע ער איז באקאנט צים מערסטן פאר זיינע באַלאַדעס ער האָט געוווינט אין ווארשע יעס פאריס ליווערפול און לאנדאן סוף כל סוף איז ער </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[11] yi 245B 215p 24500R,
|
||||
1 chunks scored<br>
|
||||
yi.100R(99%) 246 bytes = YIDDISH <br><br>
|
||||
[ms] <span style="background:#D8FFFF;color:#7F5F00;">
|
||||
sukiyaki wikipedia indonesia ensiklopedia bebas berbahasa bebas berbahasa </span>
|
||||
[id] <span style="background:#FFF7D8;color:#7F5F00;">
|
||||
indonesia langsung ke navigasi cari untuk pengertian lain dari sukiyaki lihat suki</span>
|
||||
[] <span style="background:#FFF7D8;color:#7F5F00;">
|
||||
yaki irisan tipis daging sapi sayur sayuran dan tahu di dalam panci besi yang </span>
|
||||
[] <span style="background:#FFF7D8;color:#7F5F00;">
|
||||
dimasak di atas meja makan dengan cara direbus sukiyaki dimakan dengan mence </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 6] id 237B 250p 23700R,
|
||||
[ 8] ms 74B 77p 7400R,
|
||||
4 chunks scored<br>
|
||||
{CloseLangPair: ms.100R,74B => id}<br>
|
||||
id.100R(99%) 312 bytes = INDONESIAN <br><br>
|
||||
[ms] <span style="background:#D8FFFF;color:#7F5F00;">
|
||||
sukiyaki wikipedia bahasa melayu ensiklopedia bebas sukiyaki dari wikipedia bahasa melayu </span>
|
||||
[] <span style="background:#D8FFFF;color:#7F5F00;">
|
||||
ensiklopedia bebas lompat ke navigasi gelintar sukiyaki sukiyaki hirisan tipis daging lembu sayur sayuran </span>
|
||||
[] <span style="background:#D8FFFF;color:#7F5F00;">
|
||||
dan tauhu di dalam periuk besi yang dimasak di atas meja makan dengan cara rebusan sukiyaki dimakan dengan mence </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 8] ms 309B 326p 29310R,
|
||||
3 chunks scored<br>
|
||||
ms.94R(99%) 310 bytes = MALAY <br><br>
|
||||
[fr] <span style="background:#EFD8FF;color:#000000;">
|
||||
a accès aux chiens et aux frontaux qui lui ont été il peut consulter et modifier ses co</span>
|
||||
[en] <span style="background:#FFFFF4;color:#000000;">
|
||||
llections et exporter this article is about the country france is the largest country </span>
|
||||
[] <span style="background:#FFFFF4;color:#000000;">
|
||||
in western europe and the third largest in europe as a whole cet article concerne le pays euro</span>
|
||||
[fr] <span style="background:#EFD8FF;color:#000000;">
|
||||
péen aujourd hui appelé république française pour d autres usages du nom france motoring events began soon </span>
|
||||
[en] <span style="background:#FFFFF4;color:#000000;">
|
||||
after the construction of the first successful gasoline fueled automobiles the quick brown fox jumped over the lazy dog </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[ 0] en 300B 424p 30000R,
|
||||
[ 4] fr 201B 219p 19920R,
|
||||
5 chunks scored<br>
|
||||
en.100R(59%) fr.99R(40%) 502 bytes = FRENCH <br><br>
|
||||
[sw] <span style="background:#D8E7FF;color:#6F7F00;">
|
||||
qpdbmrmxyzptlkuuddlrlrbas las les qpdbmrmx</span>
|
||||
[] <span style="background:#D8E7FF;color:#6F7F00;">
|
||||
yzptlkuuddlrlrbas el la qpdbmrmxyzptlkuuddlrlrbas </span>
|
||||
<br>
|
||||
DocTote::Dump
|
||||
[14] sw 92B 108p 9200R,
|
||||
2 chunks scored<br>
|
||||
sw.100R(98%) 93 bytes = SWAHILI <br><br>
|
||||
PASS
|
||||
|
||||
</span></body></html>
|
||||
10089
internal/CLD2UnitTestOutputVerbose.html
Normal file
10089
internal/CLD2UnitTestOutputVerbose.html
Normal file
File diff suppressed because it is too large
Load Diff
276
internal/cld2_do_score.cc
Normal file
276
internal/cld2_do_score.cc
Normal file
@@ -0,0 +1,276 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Little program to read lines of sample text, calculate score per 1024 bytes
|
||||
// per language-script4 combination
|
||||
// Possible input file /export/hda3/cld/pre2010/b0_samp_prune_20100722.utf8
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
|
||||
#include "compact_lang_det_impl.h"
|
||||
#include "lang_script.h"
|
||||
|
||||
using namespace CLD2;
|
||||
|
||||
// Global table of doubles, NUM_LANGUAGES rows by 4 columns.
// NOTE(review): presumably accumulates text bytes per language and per
// "script4" slot (see the file header comment) -- the code that fills it is
// not shown here, confirm. ScoreOneLine() has a parameter that is also named
// `bytes`, which hides this global inside that function.
double bytes[NUM_LANGUAGES][4];
|
||||
// Global table of doubles, NUM_LANGUAGES rows by 4 columns.
// NOTE(review): presumably accumulates scores per language and per "script4"
// slot (see the file header comment) -- the code that fills it is not shown
// here, confirm.
double scores[NUM_LANGUAGES][4];
|
||||
|
||||
|
||||
// Return score per 1024 bytes for top language
|
||||
Language ScoreOneLine(const char* buffer, int buffer_length,
|
||||
int* bytes, double* score_per_1kb) {
|
||||
bool is_plain_text = true;
|
||||
const CLDHints* cld_hints = NULL;
|
||||
bool allow_extended_lang = true;
|
||||
int flags = 0;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
Language language3[3];
|
||||
int percent3[3];
|
||||
double normalized_score3[3];
|
||||
ResultChunkVector* resultchunkvector = NULL;
|
||||
int text_bytes;
|
||||
bool is_reliable;
|
||||
Language summary_lang;
|
||||
|
||||
summary_lang = DetectLanguageSummaryV2(
|
||||
buffer,
|
||||
buffer_length,
|
||||
is_plain_text,
|
||||
cld_hints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
resultchunkvector,
|
||||
&text_bytes,
|
||||
&is_reliable);
|
||||
*bytes = text_bytes;
|
||||
*score_per_1kb = normalized_score3[0];
|
||||
return language3[0];
|
||||
}
|
||||
|
||||
// Line-terminator bytes that ReadLine() trims from the end of each line.
#define LF 0x0a
|
||||
#define CR 0x0d
|
||||
// Size in bytes of the line buffer in main(); also the limit passed to
// ReadLine().
const int kMaxBuffer = 5 * 1024;
|
||||
|
||||
// Reads the next line of infile into buffer using fgets, so at most
// maxlen - 1 bytes are stored, followed by a terminating NUL. One trailing
// LF and then one trailing CR are removed.
// Returns false when fgets fails (end of file or read error), else true.
// A line longer than maxlen - 1 bytes comes back in pieces over several calls.
bool ReadLine(FILE* infile, char* buffer, size_t maxlen) {
  // fgets takes an int count.
  if (fgets(buffer, static_cast<int>(maxlen), infile) == NULL) {
    return false;
  }
  size_t len = strlen(buffer);

  // Trim CR LF ('\r' is 0x0d, '\n' is 0x0a). len must be checked before each
  // index: a bare "\n" line is empty once the LF is gone, and looking at
  // buffer[len - 1] then would touch the byte in front of the buffer.
  if ((len > 0) && (buffer[len - 1] == '\n')) {buffer[--len] = '\0';}
  if ((len > 0) && (buffer[len - 1] == '\r')) {buffer[--len] = '\0';}
  return true;
}
|
||||
|
||||
// Returns true for lines that are skipped as comments: an empty line, a line
// whose first character is '#', or a line whose first character is a space.
bool IsComment(const char* buffer) {
  switch (buffer[0]) {
    case '\0':    // Empty line
    case '#':
    case ' ':     // Any leading space is comment
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
// Starting at pos, steps over the rest of the current field and the run of
// spaces/tabs that follows it.
// Returns the position of the first character of the next field, or
// string::npos converted to int when pos is already that value, when no
// space/tab follows pos, or when nothing follows the spaces/tabs.
int SkipOneField(const std::string& src, int pos) {
  typedef std::string::size_type Pos;
  const int kNoMoreFields = static_cast<int>(std::string::npos);
  if (pos == kNoMoreFields) {return pos;}

  // End of the current field = first space or tab at or after pos
  const Pos field_end = src.find_first_of(" \t", static_cast<Pos>(pos));
  if (field_end == std::string::npos) {return kNoMoreFields;}

  // Start of the next field = first character past the separators
  const Pos next_field = src.find_first_not_of(" \t", field_end);
  return static_cast<int>(next_field);
}
|
||||
|
||||
// Return language and script from parsed line or defaults
|
||||
void GetLangScript(const string& src,
|
||||
Language default_lang, ULScript default_lscript,
|
||||
Language* target_lang, ULScript* target_lscript,
|
||||
string* tld) {
|
||||
*target_lang = default_lang;
|
||||
*target_lscript = default_lscript;
|
||||
*tld = "";
|
||||
int pos = 0;
|
||||
int pos2 = 0;
|
||||
if (src.substr(0,7) == "SAMPLE ") {
|
||||
// SAMPLE ll-Ssss
|
||||
pos = SkipOneField(src, pos);
|
||||
} else if (src.substr(0,5) == "SAMP ") {
|
||||
// SAMP ll-Ssss /tld2.tld/
|
||||
pos = SkipOneField(src, pos);
|
||||
pos2 = SkipOneField(src, pos);
|
||||
} else if (src.substr(0,5) == "Samp ") {
|
||||
// Samp ll-Ssss /tld2.tld/
|
||||
pos = SkipOneField(src, pos);
|
||||
pos2 = SkipOneField(src, pos);
|
||||
}
|
||||
if (pos == 0) {return;}
|
||||
if (pos == string::npos) {return;}
|
||||
|
||||
// Pos is at the first letter of language-script combination
|
||||
int end = src.find_first_of(" \t", pos); // find end of lang-script
|
||||
if (end == string::npos) {return;}
|
||||
*target_lang = GetLanguageFromName(src.substr(pos, end - pos).c_str());
|
||||
*target_lscript = GetULScriptFromName(src.substr(pos, end - pos).c_str());
|
||||
|
||||
// Pos2 is 0 or at the first letter of the tld string
|
||||
if (pos2 == 0) {return;}
|
||||
if (pos2 == string::npos) {return;}
|
||||
end = src.find_first_of(" \t", pos2);
|
||||
if (end == string::npos) {return;}
|
||||
*tld = src.substr(pos2, end - pos2);
|
||||
}
|
||||
|
||||
// Return position of start of text
|
||||
int GetTextBeginPos(const string& src) {
|
||||
int pos = 0;
|
||||
if (src.size() < 8) {return pos;}
|
||||
|
||||
if (src.substr(0,7) == "SAMPLE ") {
|
||||
// Skip SAMPLE ll-Ssss
|
||||
pos = SkipOneField(src, pos);
|
||||
pos = SkipOneField(src, pos);
|
||||
} else if (src.substr(0,5) == "SAMP ") {
|
||||
// Skip SAMP ll-Ssss /tld2.tld/
|
||||
pos = SkipOneField(src, pos);
|
||||
pos = SkipOneField(src, pos);
|
||||
pos = SkipOneField(src, pos);
|
||||
} else if (src.substr(0,5) == "Samp ") {
|
||||
// Skip Samp ll-Ssss /tld2.tld/
|
||||
pos = SkipOneField(src, pos);
|
||||
pos = SkipOneField(src, pos);
|
||||
pos = SkipOneField(src, pos);
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
// Avoid zdiv: returns x when it is strictly positive, otherwise 1.0, so the
// result is always safe to divide by.
inline double Divisor(double x) {
  if (x > 0.0) {
    return x;
  }
  return 1.0;
}
|
||||
|
||||
// Closes out one run of lines that share a claimed language and script.
// Adds the run's byte and score totals to the global bytes/scores tables and
// prints a SUMMARY line with the byte total, the score per 1024 bytes, and
// the percentage of bytes whose detected language was wrong.
// Does nothing when cur_lang is UNKNOWN_LANGUAGE.
void Flush(Language cur_lang, ULScript ulscript,
           double total_score_cur_lang,
           double total_bytes_cur_lang, double total_bad_bytes_cur_lang) {
  if (cur_lang == UNKNOWN_LANGUAGE) {return;}

  bytes[cur_lang][LScript4(ulscript)] += total_bytes_cur_lang;
  scores[cur_lang][LScript4(ulscript)] += total_score_cur_lang;

  const double all_bytes = total_bytes_cur_lang + total_bad_bytes_cur_lang;
  const double bad_percent =
      100.0 * total_bad_bytes_cur_lang / Divisor(all_bytes);
  const double score_per_kb =
      total_score_cur_lang * 1024.0 / Divisor(total_bytes_cur_lang);

  printf("%s-%s %7.0f %6.1f, %2.0f%% bad SUMMARY\n\n",
         LanguageCode(cur_lang),
         ULScriptCode(ulscript),
         total_bytes_cur_lang,
         score_per_kb,
         bad_percent);
}
|
||||
|
||||
int BytesPer1KB(int i, int j) {
|
||||
int bytes_per_1kb = ((scores[i][j] * 1024.0) / Divisor(bytes[i][j])) + 0.5;
|
||||
return bytes_per_1kb;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
Language cur_lang = UNKNOWN_LANGUAGE;
|
||||
ULScript cur_ulscript = ULScript_Common;
|
||||
double total_score_cur_lang = 0.0;
|
||||
double total_bytes_cur_lang = 0.0;
|
||||
double total_bad_bytes_cur_lang = 0.0;
|
||||
memset(bytes, 0, sizeof(bytes));
|
||||
memset(scores, 0, sizeof(bytes));
|
||||
|
||||
char buffer[kMaxBuffer];
|
||||
int buffer_length;
|
||||
const char* filename = NULL;
|
||||
FILE* infile = stdin;
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (argv[i][0] != '-') {
|
||||
filename = argv[i];
|
||||
}
|
||||
}
|
||||
|
||||
if (filename != NULL) {
|
||||
infile = fopen(filename, "r");
|
||||
if (infile == NULL) {
|
||||
fprintf(stderr, "%s did not open\n", filename);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
while (ReadLine(infile, buffer, kMaxBuffer)) {
|
||||
if (IsComment(buffer)) {continue;}
|
||||
|
||||
buffer_length = strlen(buffer);
|
||||
int bytes;
|
||||
double score_per_1kb;
|
||||
Language toplang;
|
||||
Language target_lang;
|
||||
ULScript target_ulscript;
|
||||
|
||||
string src(buffer, buffer_length);
|
||||
string tld("");
|
||||
int pos = GetTextBeginPos(src);
|
||||
GetLangScript(src, UNKNOWN_LANGUAGE, ULScript_Common,
|
||||
&target_lang, &target_ulscript, &tld);
|
||||
if ((cur_lang != target_lang) || (cur_ulscript != target_ulscript)) {
|
||||
Flush(cur_lang, cur_ulscript, total_score_cur_lang,
|
||||
total_bytes_cur_lang, total_bad_bytes_cur_lang);
|
||||
cur_lang = target_lang;
|
||||
cur_ulscript = target_ulscript;
|
||||
total_score_cur_lang = 0.0;
|
||||
total_bytes_cur_lang = 0.0;
|
||||
total_bad_bytes_cur_lang = 0.0;
|
||||
}
|
||||
|
||||
toplang = ScoreOneLine(&src[pos], src.size() - pos, &bytes, &score_per_1kb);
|
||||
|
||||
fprintf(stdout, "%s%c %d %4.1f %s\n",
|
||||
LanguageCode(toplang),
|
||||
(toplang == target_lang) ? ' ' : '*',
|
||||
bytes, score_per_1kb, buffer);
|
||||
// Only count when detected lang matches the claimed target lang
|
||||
if (toplang == target_lang) {
|
||||
total_bytes_cur_lang += bytes;
|
||||
total_score_cur_lang += (score_per_1kb * bytes) / 1024.0;
|
||||
} else {
|
||||
total_bad_bytes_cur_lang += bytes;
|
||||
}
|
||||
}
|
||||
Flush(cur_lang, cur_ulscript, total_score_cur_lang,
|
||||
total_bytes_cur_lang, total_bad_bytes_cur_lang);
|
||||
|
||||
for (int i = 0; i < NUM_LANGUAGES; ++i) {
|
||||
Language ilang = static_cast<Language>(i);
|
||||
fprintf(stdout, " {%4d, %4d, %4d, %4d}, // %d %s %s\n",
|
||||
BytesPer1KB(i, 0), BytesPer1KB(i, 1),
|
||||
BytesPer1KB(i, 2), BytesPer1KB(i, 3),
|
||||
i, LanguageName(ilang), LanguageCode(ilang));
|
||||
}
|
||||
|
||||
if (infile != stdin) {
|
||||
fclose(infile);
|
||||
}
|
||||
}
|
||||
297
internal/cld2_generated_cjk_compatible.cc
Normal file
297
internal/cld2_generated_cjk_compatible.cc
Normal file
@@ -0,0 +1,297 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// CJK compatible CLD2 scoring lookup table
|
||||
//
|
||||
#include "cld2tablesummary.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
static const int kCompatTableBuildDate = 20130128;    // yyyymmdd
static const int kCompatTableSizeOne = 0;             // One-langprob Bucket count
static const int kCompatTableSize = 239;              // Total Bucket count
// Kept unsigned: 0xffffff00 is larger than a 32-bit int can hold, so a
// signed constant would carry -256, and placing a negative constant into an
// unsigned member of a brace-initialized struct is a narrowing error in
// C++11.
// NOTE(review): assumes the key-mask member of CLD2TableSummary is an
// unsigned 32-bit value - confirm in cld2tablesummary.h.
static const unsigned int kCompatTableKeyMask = 0xffffff00;  // Mask hash key
static const char* const kCompatTableRecognizedLangScripts =
  "zh-Hans zh-Hant ja-Hani ko-Hani vi-Hani za-Hani ";
|
||||
|
||||
// Empty table
|
||||
static const IndirectProbBucket4 kCompatTable[kCompatTableSize] = {
|
||||
// key[4], words[4] in UTF-8
|
||||
// value[4]
|
||||
{ {0x00000000,0x00000000,0x00000000,0x00000000}}, // [000]
|
||||
};
|
||||
|
||||
// These are back-derived CTJKVZ probabilities from the table
|
||||
// kTargetCTJKVZProbs in cldutil.cc
|
||||
// This is all part of using one-byte mappings for CJK but wanting to
|
||||
// convert them to normal langprob values to share the scoring code.
|
||||
static const uint32 kCompatTableInd[239 * 2] = {
|
||||
// [0000]
|
||||
0x00000000, 0x00000000, // [0] zh.0 zhT.0 ja.0 ko.0 vi.0 za.0
|
||||
0x00006142, 0x00000000, // [1] zh.0 zhT.0 ja.0 ko.0 vi.0 za.12
|
||||
0x00002d42, 0x00000000, // [2] zh.0 zhT.0 ja.0 ko.0 vi.12 za.0
|
||||
0x00000342, 0x00000000, // [3] zh.0 zhT.0 ja.0 ko.12 vi.0 za.0
|
||||
0x00000242, 0x00000000, // [4] zh.0 zhT.0 ja.12 ko.0 vi.0 za.0
|
||||
0x00001d42, 0x00000000, // [5] zh.0 zhT.12 ja.0 ko.0 vi.0 za.0
|
||||
0x00000542, 0x00000000, // [6] zh.12 zhT.0 ja.0 ko.0 vi.0 za.0
|
||||
0x2d00051f, 0x00000000, // [7] zh.8 zhT.0 ja.0 ko.0 vi.4 za.0
|
||||
0x0300051f, 0x00000000, // [8] zh.8 zhT.0 ja.0 ko.4 vi.0 za.0
|
||||
0x0200051f, 0x00000000, // [9] zh.8 zhT.0 ja.4 ko.0 vi.0 za.0
|
||||
0x1d00051f, 0x00000000, // [10] zh.8 zhT.4 ja.0 ko.0 vi.0 za.0
|
||||
0x031d05ea, 0x00000000, // [11] zh.8 zhT.2 ja.0 ko.2 vi.0 za.0
|
||||
0x0000611c, 0x00000000, // [12] zh.0 zhT.0 ja.0 ko.0 vi.0 za.8
|
||||
0x1d00021f, 0x00000000, // [13] zh.0 zhT.4 ja.8 ko.0 vi.0 za.0
|
||||
0x0500611f, 0x00000000, // [14] zh.4 zhT.0 ja.0 ko.0 vi.0 za.8
|
||||
0x0000021c, 0x00000000, // [15] zh.0 zhT.0 ja.8 ko.0 vi.0 za.0
|
||||
0x021d05ea, 0x00000000, // [16] zh.8 zhT.2 ja.2 ko.0 vi.0 za.0
|
||||
0x02001d1f, 0x00000000, // [17] zh.0 zhT.8 ja.4 ko.0 vi.0 za.0
|
||||
0x6100051f, 0x00000000, // [18] zh.8 zhT.0 ja.0 ko.0 vi.0 za.4
|
||||
0x02001d1d, 0x00000000, // [19] zh.0 zhT.8 ja.2 ko.0 vi.0 za.0
|
||||
0x05001d1f, 0x00000000, // [20] zh.4 zhT.8 ja.0 ko.0 vi.0 za.0
|
||||
0x03051dea, 0x00000000, // [21] zh.2 zhT.8 ja.0 ko.2 vi.0 za.0
|
||||
0x051d02ea, 0x00000000, // [22] zh.2 zhT.2 ja.8 ko.0 vi.0 za.0
|
||||
0x00001d1c, 0x00000000, // [23] zh.0 zhT.8 ja.0 ko.0 vi.0 za.0
|
||||
0x1d00021d, 0x00000000, // [24] zh.0 zhT.2 ja.8 ko.0 vi.0 za.0
|
||||
0x02051dea, 0x00000000, // [25] zh.2 zhT.8 ja.2 ko.0 vi.0 za.0
|
||||
0x0000051c, 0x00000000, // [26] zh.8 zhT.0 ja.0 ko.0 vi.0 za.0
|
||||
0x05001d1d, 0x00000000, // [27] zh.2 zhT.8 ja.0 ko.0 vi.0 za.0
|
||||
0x1d00051d, 0x00000000, // [28] zh.8 zhT.2 ja.0 ko.0 vi.0 za.0
|
||||
0x2d021ded, 0x00000000, // [29] zh.0 zhT.6 ja.2 ko.0 vi.2 za.0
|
||||
0x05002d10, 0x00000000, // [30] zh.2 zhT.0 ja.0 ko.0 vi.6 za.0
|
||||
0x05002d12, 0x00000000, // [31] zh.4 zhT.0 ja.0 ko.0 vi.6 za.0
|
||||
0x2d051dec, 0x00000000, // [32] zh.4 zhT.6 ja.0 ko.0 vi.4 za.0
|
||||
0x02051d10, 0x00002d01, // [33] zh.4 zhT.6 ja.2 ko.0 vi.2 za.0
|
||||
0x02051dec, 0x00002d01, // [34] zh.4 zhT.6 ja.4 ko.0 vi.2 za.0
|
||||
0x1d050212, 0x00000000, // [35] zh.5 zhT.4 ja.6 ko.0 vi.0 za.0
|
||||
0x2d000512, 0x00000000, // [36] zh.6 zhT.0 ja.0 ko.0 vi.4 za.0
|
||||
0x022d0510, 0x00000000, // [37] zh.6 zhT.0 ja.2 ko.0 vi.4 za.0
|
||||
0x2d0205ec, 0x00000000, // [38] zh.6 zhT.0 ja.4 ko.0 vi.4 za.0
|
||||
0x1d2d0510, 0x00000000, // [39] zh.6 zhT.2 ja.0 ko.0 vi.4 za.0
|
||||
0x022d0510, 0x00001d01, // [40] zh.6 zhT.2 ja.2 ko.0 vi.4 za.0
|
||||
0x1d020510, 0x00002d01, // [41] zh.6 zhT.2 ja.4 ko.0 vi.2 za.0
|
||||
0x2d1d0510, 0x00000000, // [42] zh.6 zhT.4 ja.0 ko.0 vi.2 za.0
|
||||
0x021d0510, 0x00002d01, // [43] zh.6 zhT.4 ja.2 ko.0 vi.2 za.0
|
||||
0x03000210, 0x00000000, // [44] zh.0 zhT.0 ja.6 ko.2 vi.0 za.0
|
||||
0x61021ded, 0x00000000, // [45] zh.0 zhT.6 ja.2 ko.0 vi.0 za.2
|
||||
0x021d61ed, 0x00000501, // [46] zh.2 zhT.2 ja.2 ko.0 vi.0 za.6
|
||||
0x05030210, 0x00001d01, // [47] zh.2 zhT.2 ja.6 ko.4 vi.0 za.0
|
||||
0x051d6110, 0x00000000, // [48] zh.2 zhT.4 ja.0 ko.0 vi.0 za.6
|
||||
0x05031d10, 0x00000000, // [49] zh.2 zhT.6 ja.0 ko.4 vi.0 za.0
|
||||
0x02031d10, 0x00000501, // [50] zh.2 zhT.6 ja.2 ko.4 vi.0 za.0
|
||||
0x03021dec, 0x00000501, // [51] zh.2 zhT.6 ja.4 ko.4 vi.0 za.0
|
||||
0x02056110, 0x00000000, // [52] zh.4 zhT.0 ja.2 ko.0 vi.0 za.6
|
||||
0x1d050210, 0x00000301, // [53] zh.4 zhT.2 ja.6 ko.2 vi.0 za.0
|
||||
0x051d61ec, 0x00000201, // [54] zh.4 zhT.4 ja.2 ko.0 vi.0 za.6
|
||||
0x02051dec, 0x00006101, // [55] zh.4 zhT.6 ja.4 ko.0 vi.0 za.2
|
||||
0x610205ed, 0x00000000, // [56] zh.6 zhT.0 ja.2 ko.0 vi.0 za.2
|
||||
0x611d05ed, 0x00000000, // [57] zh.6 zhT.2 ja.0 ko.0 vi.0 za.2
|
||||
0x02610510, 0x00001d01, // [58] zh.6 zhT.2 ja.2 ko.0 vi.0 za.4
|
||||
0x1d020510, 0x00006101, // [59] zh.6 zhT.2 ja.4 ko.0 vi.0 za.2
|
||||
0x61051dec, 0x00000201, // [60] zh.4 zhT.6 ja.2 ko.0 vi.0 za.4
|
||||
0x611d05ec, 0x00000201, // [61] zh.6 zhT.4 ja.2 ko.0 vi.0 za.4
|
||||
0x05006110, 0x00000000, // [62] zh.2 zhT.0 ja.0 ko.0 vi.0 za.6
|
||||
0x031d05ed, 0x00000000, // [63] zh.6 zhT.2 ja.0 ko.2 vi.0 za.0
|
||||
0x051d61ed, 0x00000000, // [64] zh.2 zhT.2 ja.0 ko.0 vi.0 za.6
|
||||
0x1d0205eb, 0x00000000, // [65] zh.6 zhT.2 ja.6 ko.0 vi.0 za.0
|
||||
0x021d0510, 0x00006101, // [66] zh.6 zhT.4 ja.2 ko.0 vi.0 za.2
|
||||
0x021d0510, 0x00000301, // [67] zh.6 zhT.4 ja.2 ko.2 vi.0 za.0
|
||||
0x02051dec, 0x00000301, // [68] zh.4 zhT.6 ja.4 ko.2 vi.0 za.0
|
||||
0x02610510, 0x00000000, // [69] zh.6 zhT.0 ja.2 ko.0 vi.0 za.4
|
||||
0x61020510, 0x00000000, // [70] zh.6 zhT.0 ja.4 ko.0 vi.0 za.2
|
||||
0x02000514, 0x00000000, // [71] zh.6 zhT.0 ja.6 ko.0 vi.0 za.0
|
||||
0x021d05ed, 0x00000000, // [72] zh.6 zhT.2 ja.2 ko.0 vi.0 za.0
|
||||
0x611d0510, 0x00000000, // [73] zh.6 zhT.4 ja.0 ko.0 vi.0 za.2
|
||||
0x1d020512, 0x00000000, // [74] zh.6 zhT.4 ja.5 ko.0 vi.0 za.0
|
||||
0x03001d10, 0x00000000, // [75] zh.0 zhT.6 ja.0 ko.2 vi.0 za.0
|
||||
0x03021ded, 0x00000000, // [76] zh.0 zhT.6 ja.2 ko.2 vi.0 za.0
|
||||
0x03051ded, 0x00000000, // [77] zh.2 zhT.6 ja.0 ko.2 vi.0 za.0
|
||||
0x02051ded, 0x00000301, // [78] zh.2 zhT.6 ja.2 ko.2 vi.0 za.0
|
||||
0x1d056110, 0x00000000, // [79] zh.4 zhT.2 ja.0 ko.0 vi.0 za.6
|
||||
0x611d05ec, 0x00000000, // [80] zh.6 zhT.4 ja.0 ko.0 vi.0 za.4
|
||||
0x031d0510, 0x00000000, // [81] zh.6 zhT.4 ja.0 ko.2 vi.0 za.0
|
||||
0x031d05eb, 0x00000000, // [82] zh.6 zhT.6 ja.0 ko.2 vi.0 za.0
|
||||
0x610205ec, 0x00000000, // [83] zh.6 zhT.0 ja.4 ko.0 vi.0 za.4
|
||||
0x1d610510, 0x00000000, // [84] zh.6 zhT.2 ja.0 ko.0 vi.0 za.4
|
||||
0x021d05eb, 0x00000301, // [85] zh.6 zhT.6 ja.2 ko.2 vi.0 za.0
|
||||
0x61051d10, 0x00000000, // [86] zh.4 zhT.6 ja.0 ko.0 vi.0 za.2
|
||||
0x05021deb, 0x00000000, // [87] zh.2 zhT.6 ja.6 ko.0 vi.0 za.0
|
||||
0x051d0212, 0x00000000, // [88] zh.4 zhT.5 ja.6 ko.0 vi.0 za.0
|
||||
0x03051d10, 0x00000000, // [89] zh.4 zhT.6 ja.0 ko.2 vi.0 za.0
|
||||
0x1d6105eb, 0x00000000, // [90] zh.6 zhT.2 ja.0 ko.0 vi.0 za.6
|
||||
0x03021d10, 0x00000000, // [91] zh.0 zhT.6 ja.4 ko.2 vi.0 za.0
|
||||
0x05000212, 0x00000000, // [92] zh.4 zhT.0 ja.6 ko.0 vi.0 za.0
|
||||
0x05021d10, 0x00000301, // [93] zh.2 zhT.6 ja.4 ko.2 vi.0 za.0
|
||||
0x61051dec, 0x00000000, // [94] zh.4 zhT.6 ja.0 ko.0 vi.0 za.4
|
||||
0x021d05ed, 0x00000000, // [95] zh.6 zhT.2 ja.2 ko.0 vi.0 za.0
|
||||
0x02051d10, 0x00000301, // [96] zh.4 zhT.6 ja.2 ko.2 vi.0 za.0
|
||||
0x05021d12, 0x00000000, // [97] zh.4 zhT.6 ja.5 ko.0 vi.0 za.0
|
||||
0x02000510, 0x00000000, // [98] zh.6 zhT.0 ja.2 ko.0 vi.0 za.0
|
||||
0x021d05ec, 0x00000000, // [99] zh.6 zhT.4 ja.4 ko.0 vi.0 za.0
|
||||
0x1d050210, 0x00000000, // [100] zh.4 zhT.2 ja.6 ko.0 vi.0 za.0
|
||||
0x05000210, 0x00000000, // [101] zh.2 zhT.0 ja.6 ko.0 vi.0 za.0
|
||||
0x051d61ec, 0x00000000, // [102] zh.4 zhT.4 ja.0 ko.0 vi.0 za.6
|
||||
0x051d02ec, 0x00000000, // [103] zh.4 zhT.4 ja.6 ko.0 vi.0 za.0
|
||||
0x02051d10, 0x00006101, // [104] zh.4 zhT.6 ja.2 ko.0 vi.0 za.2
|
||||
0x051d02ed, 0x00000000, // [105] zh.2 zhT.2 ja.6 ko.0 vi.0 za.0
|
||||
0x051d0210, 0x00000000, // [106] zh.2 zhT.4 ja.6 ko.0 vi.0 za.0
|
||||
0x02001d14, 0x00000000, // [107] zh.0 zhT.6 ja.6 ko.0 vi.0 za.0
|
||||
0x1d020510, 0x00000000, // [108] zh.6 zhT.2 ja.4 ko.0 vi.0 za.0
|
||||
0x1d000212, 0x00000000, // [109] zh.0 zhT.4 ja.6 ko.0 vi.0 za.0
|
||||
0x05006112, 0x00000000, // [110] zh.4 zhT.0 ja.0 ko.0 vi.0 za.6
|
||||
0x02051dec, 0x00000000, // [111] zh.4 zhT.6 ja.4 ko.0 vi.0 za.0
|
||||
0x61000514, 0x00000000, // [112] zh.6 zhT.0 ja.0 ko.0 vi.0 za.6
|
||||
0x61000510, 0x00000000, // [113] zh.6 zhT.0 ja.0 ko.0 vi.0 za.2
|
||||
0x02000512, 0x00000000, // [114] zh.6 zhT.0 ja.4 ko.0 vi.0 za.0
|
||||
0x021d0512, 0x00000000, // [115] zh.6 zhT.5 ja.4 ko.0 vi.0 za.0
|
||||
0x1d000210, 0x00000000, // [116] zh.0 zhT.2 ja.6 ko.0 vi.0 za.0
|
||||
0x0000020f, 0x00000000, // [117] zh.0 zhT.0 ja.6 ko.0 vi.0 za.0
|
||||
0x021d05eb, 0x00000000, // [118] zh.6 zhT.6 ja.2 ko.0 vi.0 za.0
|
||||
0x05021d10, 0x00000000, // [119] zh.2 zhT.6 ja.4 ko.0 vi.0 za.0
|
||||
0x021d0510, 0x00000000, // [120] zh.6 zhT.4 ja.2 ko.0 vi.0 za.0
|
||||
0x02051ded, 0x00000000, // [121] zh.2 zhT.6 ja.2 ko.0 vi.0 za.0
|
||||
0x05001d10, 0x00000000, // [122] zh.2 zhT.6 ja.0 ko.0 vi.0 za.0
|
||||
0x61000512, 0x00000000, // [123] zh.6 zhT.0 ja.0 ko.0 vi.0 za.4
|
||||
0x1d000512, 0x00000000, // [124] zh.6 zhT.4 ja.0 ko.0 vi.0 za.0
|
||||
0x1d000514, 0x00000000, // [125] zh.6 zhT.6 ja.0 ko.0 vi.0 za.0
|
||||
0x02051d12, 0x00000000, // [126] zh.5 zhT.6 ja.4 ko.0 vi.0 za.0
|
||||
0x00001d0f, 0x00000000, // [127] zh.0 zhT.6 ja.0 ko.0 vi.0 za.0
|
||||
0x1d000510, 0x00000000, // [128] zh.6 zhT.2 ja.0 ko.0 vi.0 za.0
|
||||
0x02001d10, 0x00000000, // [129] zh.0 zhT.6 ja.2 ko.0 vi.0 za.0
|
||||
0x02051d10, 0x00000000, // [130] zh.4 zhT.6 ja.2 ko.0 vi.0 za.0
|
||||
0x02001d12, 0x00000000, // [131] zh.0 zhT.6 ja.4 ko.0 vi.0 za.0
|
||||
0x05001d12, 0x00000000, // [132] zh.4 zhT.6 ja.0 ko.0 vi.0 za.0
|
||||
0x0000050f, 0x00000000, // [133] zh.6 zhT.0 ja.0 ko.0 vi.0 za.0
|
||||
0x021d0513, 0x00000000, // [134] zh.6 zhT.6 ja.5 ko.0 vi.0 za.0
|
||||
0x1d020513, 0x00000000, // [135] zh.6 zhT.5 ja.6 ko.0 vi.0 za.0
|
||||
0x05021d13, 0x00000000, // [136] zh.5 zhT.6 ja.6 ko.0 vi.0 za.0
|
||||
0x051d02af, 0x00000000, // [137] zh.5 zhT.5 ja.6 ko.0 vi.0 za.0
|
||||
0x02051daf, 0x00000000, // [138] zh.5 zhT.6 ja.5 ko.0 vi.0 za.0
|
||||
0x021d05af, 0x00000000, // [139] zh.6 zhT.5 ja.5 ko.0 vi.0 za.0
|
||||
0x021d0514, 0x00000000, // [140] zh.6 zhT.6 ja.6 ko.0 vi.0 za.0
|
||||
0x1d000513, 0x00000000, // [141] zh.6 zhT.5 ja.0 ko.0 vi.0 za.0
|
||||
0x02000513, 0x00000000, // [142] zh.6 zhT.0 ja.5 ko.0 vi.0 za.0
|
||||
0x02001d13, 0x00000000, // [143] zh.0 zhT.6 ja.5 ko.0 vi.0 za.0
|
||||
0x05001d13, 0x00000000, // [144] zh.5 zhT.6 ja.0 ko.0 vi.0 za.0
|
||||
0x05000213, 0x00000000, // [145] zh.5 zhT.0 ja.6 ko.0 vi.0 za.0
|
||||
0x1d000213, 0x00000000, // [146] zh.0 zhT.5 ja.6 ko.0 vi.0 za.0
|
||||
0x00002d06, 0x00000000, // [147] zh.0 zhT.0 ja.0 ko.0 vi.4 za.0
|
||||
0x00000306, 0x00000000, // [148] zh.0 zhT.0 ja.0 ko.4 vi.0 za.0
|
||||
0x051d2dee, 0x00000000, // [149] zh.2 zhT.2 ja.0 ko.0 vi.4 za.0
|
||||
0x021d2dee, 0x00000501, // [150] zh.2 zhT.2 ja.2 ko.0 vi.4 za.0
|
||||
0x2d051dee, 0x00000000, // [151] zh.2 zhT.4 ja.0 ko.0 vi.2 za.0
|
||||
0x02051dee, 0x00002d01, // [152] zh.2 zhT.4 ja.2 ko.0 vi.2 za.0
|
||||
0x05021d55, 0x00002d01, // [153] zh.2 zhT.4 ja.4 ko.0 vi.2 za.0
|
||||
0x022d0555, 0x00000000, // [154] zh.4 zhT.0 ja.2 ko.0 vi.4 za.0
|
||||
0x2d020555, 0x00000000, // [155] zh.4 zhT.0 ja.4 ko.0 vi.2 za.0
|
||||
0x2d1d05ee, 0x00000000, // [156] zh.4 zhT.2 ja.0 ko.0 vi.2 za.0
|
||||
0x021d05ee, 0x00002d01, // [157] zh.4 zhT.2 ja.2 ko.0 vi.2 za.0
|
||||
0x2d1d0555, 0x00000000, // [158] zh.4 zhT.4 ja.0 ko.0 vi.2 za.0
|
||||
0x021d0555, 0x00002d01, // [159] zh.4 zhT.4 ja.2 ko.0 vi.2 za.0
|
||||
0x021d0509, 0x00002d01, // [160] zh.4 zhT.4 ja.4 ko.0 vi.2 za.0
|
||||
0x1d0203ee, 0x00000000, // [161] zh.0 zhT.2 ja.2 ko.4 vi.0 za.0
|
||||
0x051d02ee, 0x00000301, // [162] zh.2 zhT.2 ja.4 ko.2 vi.0 za.0
|
||||
0x05021d55, 0x00006101, // [163] zh.2 zhT.4 ja.4 ko.0 vi.0 za.2
|
||||
0x05021d55, 0x00000301, // [164] zh.2 zhT.4 ja.4 ko.2 vi.0 za.0
|
||||
0x61020555, 0x00000000, // [165] zh.4 zhT.0 ja.4 ko.0 vi.0 za.2
|
||||
0x61020509, 0x00000000, // [166] zh.4 zhT.0 ja.4 ko.0 vi.0 za.4
|
||||
0x02030555, 0x00001d01, // [167] zh.4 zhT.2 ja.2 ko.4 vi.0 za.0
|
||||
0x031d0555, 0x00000000, // [168] zh.4 zhT.4 ja.0 ko.2 vi.0 za.0
|
||||
0x051d03ee, 0x00000000, // [169] zh.2 zhT.2 ja.0 ko.4 vi.0 za.0
|
||||
0x02051dee, 0x00000301, // [170] zh.2 zhT.4 ja.2 ko.2 vi.0 za.0
|
||||
0x021d0555, 0x00000301, // [171] zh.4 zhT.4 ja.2 ko.2 vi.0 za.0
|
||||
0x02000509, 0x00000000, // [172] zh.4 zhT.0 ja.4 ko.0 vi.0 za.0
|
||||
0x021d0509, 0x00006106, // [173] zh.4 zhT.4 ja.4 ko.0 vi.0 za.4
|
||||
0x03001d07, 0x00000000, // [174] zh.0 zhT.4 ja.0 ko.2 vi.0 za.0
|
||||
0x03021dee, 0x00000000, // [175] zh.0 zhT.4 ja.2 ko.2 vi.0 za.0
|
||||
0x610205ee, 0x00000000, // [176] zh.4 zhT.0 ja.2 ko.0 vi.0 za.2
|
||||
0x1d610555, 0x00000000, // [177] zh.4 zhT.2 ja.0 ko.0 vi.0 za.4
|
||||
0x021d61ee, 0x00000501, // [178] zh.2 zhT.2 ja.2 ko.0 vi.0 za.4
|
||||
0x03000507, 0x00000000, // [179] zh.4 zhT.0 ja.0 ko.2 vi.0 za.0
|
||||
0x021d0509, 0x00006101, // [180] zh.4 zhT.4 ja.4 ko.0 vi.0 za.2
|
||||
0x61000509, 0x00000000, // [181] zh.4 zhT.0 ja.0 ko.0 vi.0 za.4
|
||||
0x02610555, 0x00000000, // [182] zh.4 zhT.0 ja.2 ko.0 vi.0 za.4
|
||||
0x611d05ee, 0x00000000, // [183] zh.4 zhT.2 ja.0 ko.0 vi.0 za.2
|
||||
0x021d05ee, 0x00006101, // [184] zh.4 zhT.2 ja.2 ko.0 vi.0 za.2
|
||||
0x03051dee, 0x00000000, // [185] zh.2 zhT.4 ja.0 ko.2 vi.0 za.0
|
||||
0x051d61ee, 0x00000000, // [186] zh.2 zhT.2 ja.0 ko.0 vi.0 za.4
|
||||
0x05611d55, 0x00000000, // [187] zh.2 zhT.4 ja.0 ko.0 vi.0 za.4
|
||||
0x02611d55, 0x00000501, // [188] zh.2 zhT.4 ja.2 ko.0 vi.0 za.4
|
||||
0x1d020555, 0x00000000, // [189] zh.4 zhT.2 ja.4 ko.0 vi.0 za.0
|
||||
0x05000207, 0x00000000, // [190] zh.2 zhT.0 ja.4 ko.0 vi.0 za.0
|
||||
0x02000507, 0x00000000, // [191] zh.4 zhT.0 ja.2 ko.0 vi.0 za.0
|
||||
0x611d0509, 0x00000000, // [192] zh.4 zhT.4 ja.0 ko.0 vi.0 za.4
|
||||
0x611d0509, 0x00000201, // [193] zh.4 zhT.4 ja.2 ko.0 vi.0 za.4
|
||||
0x02001d09, 0x00000000, // [194] zh.0 zhT.4 ja.4 ko.0 vi.0 za.0
|
||||
0x611d0555, 0x00000000, // [195] zh.4 zhT.4 ja.0 ko.0 vi.0 za.2
|
||||
0x61051dee, 0x00000000, // [196] zh.2 zhT.4 ja.0 ko.0 vi.0 za.2
|
||||
0x051d02ee, 0x00000000, // [197] zh.2 zhT.2 ja.4 ko.0 vi.0 za.0
|
||||
0x1d000207, 0x00000000, // [198] zh.0 zhT.2 ja.4 ko.0 vi.0 za.0
|
||||
0x021d05ee, 0x00000000, // [199] zh.4 zhT.2 ja.2 ko.0 vi.0 za.0
|
||||
0x02051dee, 0x00006101, // [200] zh.2 zhT.4 ja.2 ko.0 vi.0 za.2
|
||||
0x021d0509, 0x00000000, // [201] zh.4 zhT.4 ja.4 ko.0 vi.0 za.0
|
||||
0x05021d55, 0x00000000, // [202] zh.2 zhT.4 ja.4 ko.0 vi.0 za.0
|
||||
0x00000206, 0x00000000, // [203] zh.0 zhT.0 ja.4 ko.0 vi.0 za.0
|
||||
0x02001d07, 0x00000000, // [204] zh.0 zhT.4 ja.2 ko.0 vi.0 za.0
|
||||
0x021d0555, 0x00006101, // [205] zh.4 zhT.4 ja.2 ko.0 vi.0 za.2
|
||||
0x02051dee, 0x00000000, // [206] zh.2 zhT.4 ja.2 ko.0 vi.0 za.0
|
||||
0x1d000507, 0x00000000, // [207] zh.4 zhT.2 ja.0 ko.0 vi.0 za.0
|
||||
0x1d000509, 0x00000000, // [208] zh.4 zhT.4 ja.0 ko.0 vi.0 za.0
|
||||
0x021d0555, 0x00000000, // [209] zh.4 zhT.4 ja.2 ko.0 vi.0 za.0
|
||||
0x05001d07, 0x00000000, // [210] zh.2 zhT.4 ja.0 ko.0 vi.0 za.0
|
||||
0x00001d06, 0x00000000, // [211] zh.0 zhT.4 ja.0 ko.0 vi.0 za.0
|
||||
0x00000506, 0x00000000, // [212] zh.4 zhT.0 ja.0 ko.0 vi.0 za.0
|
||||
0x2d000309, 0x00000000, // [213] zh.0 zhT.0 ja.0 ko.4 vi.4 za.0
|
||||
0x2d000209, 0x00000000, // [214] zh.0 zhT.0 ja.4 ko.0 vi.4 za.0
|
||||
0x03000209, 0x00000000, // [215] zh.0 zhT.0 ja.4 ko.4 vi.0 za.0
|
||||
0x2d001d09, 0x00000000, // [216] zh.0 zhT.4 ja.0 ko.0 vi.4 za.0
|
||||
0x03001d09, 0x00000000, // [217] zh.0 zhT.4 ja.0 ko.4 vi.0 za.0
|
||||
0x2d000509, 0x00000000, // [218] zh.4 zhT.0 ja.0 ko.0 vi.4 za.0
|
||||
0x03000509, 0x00000000, // [219] zh.4 zhT.0 ja.0 ko.4 vi.0 za.0
|
||||
0x00000501, 0x00000000, // [220] zh.2 zhT.0 ja.0 ko.0 vi.0 za.0
|
||||
0x00001d01, 0x00000000, // [221] zh.0 zhT.2 ja.0 ko.0 vi.0 za.0
|
||||
0x2d031d02, 0x00000000, // [222] zh.0 zhT.2 ja.0 ko.2 vi.2 za.0
|
||||
0x2d021d02, 0x00000000, // [223] zh.0 zhT.2 ja.2 ko.0 vi.2 za.0
|
||||
0x2d030502, 0x00000000, // [224] zh.2 zhT.0 ja.0 ko.2 vi.2 za.0
|
||||
0x2d020502, 0x00000000, // [225] zh.2 zhT.0 ja.2 ko.0 vi.2 za.0
|
||||
0x03020502, 0x00000000, // [226] zh.2 zhT.0 ja.2 ko.2 vi.0 za.0
|
||||
0x2d1d0502, 0x00000000, // [227] zh.2 zhT.2 ja.0 ko.0 vi.2 za.0
|
||||
0x021d0502, 0x00000301, // [228] zh.2 zhT.2 ja.2 ko.2 vi.0 za.0
|
||||
0x031d0502, 0x00000000, // [229] zh.2 zhT.2 ja.0 ko.2 vi.0 za.0
|
||||
0x1d000502, 0x00000000, // [230] zh.2 zhT.2 ja.0 ko.0 vi.0 za.0
|
||||
0x00000201, 0x00000000, // [231] zh.0 zhT.0 ja.2 ko.0 vi.0 za.0
|
||||
0x02001d02, 0x00000000, // [232] zh.0 zhT.2 ja.2 ko.0 vi.0 za.0
|
||||
0x021d0502, 0x00000000, // [233] zh.2 zhT.2 ja.2 ko.0 vi.0 za.0
|
||||
0x00000301, 0x00000000, // [234] zh.0 zhT.0 ja.0 ko.2 vi.0 za.0
|
||||
0x02000502, 0x00000000, // [235] zh.2 zhT.0 ja.2 ko.0 vi.0 za.0
|
||||
0x03001d02, 0x00000000, // [236] zh.0 zhT.2 ja.0 ko.2 vi.0 za.0
|
||||
0x03000202, 0x00000000, // [237] zh.0 zhT.0 ja.2 ko.2 vi.0 za.0
|
||||
0x03021d02, 0x00000000, // [238] zh.0 zhT.2 ja.2 ko.2 vi.0 za.0
|
||||
};
|
||||
|
||||
// Summary object exported from this file: the CJK-compatible bucket table,
// its indirect array, and the parameters declared alongside them.
extern const CLD2TableSummary kCjkCompat_obj = {
  kCompatTable,                         // Bucket table (empty)
  kCompatTableInd,                      // Indirect array of langprob pairs
  kCompatTableSizeOne,                  // One-langprob Bucket count
  kCompatTableSize,                     // Total Bucket count
  kCompatTableKeyMask,                  // Mask hash key
  kCompatTableBuildDate,                // yyyymmdd
  kCompatTableRecognizedLangScripts,    // Lang-script names, space separated
};
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
// End of generated tables
|
||||
|
||||
|
||||
4547
internal/cld2_generated_deltaoctachrome0614.cc
Normal file
4547
internal/cld2_generated_deltaoctachrome0614.cc
Normal file
File diff suppressed because it is too large
Load Diff
2188
internal/cld2_generated_distinctoctachrome0604.cc
Normal file
2188
internal/cld2_generated_distinctoctachrome0604.cc
Normal file
File diff suppressed because it is too large
Load Diff
51
internal/cld2_generated_octa2_dummy.cc
Normal file
51
internal/cld2_generated_octa2_dummy.cc
Normal file
@@ -0,0 +1,51 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Degenerate CLD2 scoring lookup table, for use as placeholder
|
||||
//
|
||||
#include "cld2tablesummary.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
static const int kDummyTableBuildDate = 20130101; // yyyymmdd
|
||||
static const int kDummyTableSizeOne = 1; // One-langprob Bucket count
|
||||
static const int kDummyTableSize = 1; // Total Bucket count
|
||||
static const int kDummyTableKeyMask = 0xffffffff; // Mask hash key
|
||||
static const char* const kDummyTableRecognizedLangScripts = "";
|
||||
|
||||
// Empty table
|
||||
static const IndirectProbBucket4 kDummyTable[kDummyTableSize] = {
|
||||
// key[4], words[4] in UTF-8
|
||||
// value[4]
|
||||
{ {0x00000000,0x00000000,0x00000000,0x00000000}}, // [000]
|
||||
};
|
||||
|
||||
static const uint32 kDummyTableInd[1] = {
|
||||
// [0000]
|
||||
0x00000000, };
|
||||
|
||||
extern const CLD2TableSummary kOcta2_obj = {
|
||||
kDummyTable,
|
||||
kDummyTableInd,
|
||||
kDummyTableSizeOne,
|
||||
kDummyTableSize,
|
||||
kDummyTableKeyMask,
|
||||
kDummyTableBuildDate,
|
||||
kDummyTableRecognizedLangScripts,
|
||||
};
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
// End of generated tables
|
||||
82284
internal/cld2_generated_quadchrome0604.cc
Normal file
82284
internal/cld2_generated_quadchrome0604.cc
Normal file
File diff suppressed because it is too large
Load Diff
BIN
internal/cld2_unittest
Executable file
BIN
internal/cld2_unittest
Executable file
Binary file not shown.
352
internal/cld2_unittest.cc
Normal file
352
internal/cld2_unittest.cc
Normal file
@@ -0,0 +1,352 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
// Unit test compact language detector, CLD2
|
||||
// Compile with -Davoid_utf8_string_constants if your compiler cannot
|
||||
// handle UTF-8 string constants
|
||||
//
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "../public/compact_lang_det.h"
|
||||
#include "../public/encodings.h"
|
||||
#include "unittest_data.h"
|
||||
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Test strings.
|
||||
const char* kTeststr_en =
|
||||
"confiscation of goods is assigned as the penalty part most of the courts "
|
||||
"consist of members and when it is necessary to bring public cases before a "
|
||||
"jury of members two courts combine for the purpose the most important cases "
|
||||
"of all are brought jurors or";
|
||||
|
||||
|
||||
// UTF8 constants. Use a UTF-8 aware editor for this file
|
||||
#ifndef avoid_utf8_string_constants
|
||||
const char* kTeststr_ks =
|
||||
"नेपाल एसिया "
|
||||
"मंज अख मुलुक"
|
||||
" राजधानी काठ"
|
||||
"माडौं नेपाल "
|
||||
"अधिराज्य पेर"
|
||||
"ेग्वाय "
|
||||
"दक्षिण अमेरि"
|
||||
"का महाद्वीपे"
|
||||
" मध् यक्षेत्"
|
||||
"रे एक देश अस"
|
||||
"् ति फणीश्वर"
|
||||
" नाथ रेणु "
|
||||
"फिजी छु दक्ष"
|
||||
"िण प्रशान् त"
|
||||
" महासागर मंज"
|
||||
" अख देश बहाम"
|
||||
"ास छु केरेबि"
|
||||
"यन मंज "
|
||||
"अख मुलुख राज"
|
||||
"धानी नसौ सम्"
|
||||
" बद्घ विषय ब"
|
||||
"ुरुंडी अफ्री"
|
||||
"का महाद्वीपे"
|
||||
" मध् "
|
||||
"यक्षेत्रे दे"
|
||||
"श अस् ति सम्"
|
||||
" बद्घ विषय";
|
||||
|
||||
#else
|
||||
|
||||
const char* kTeststr_ks =
|
||||
|
||||
"\xE0\xA4\xA8\xE0\xA5\x87\xE0\xA4\xAA\xE0\xA4\xBE\xE0\xA4\xB2 \xE0\xA4\x8F\xE0\xA4\xB8\xE0\xA4\xBF\xE0\xA4\xAF\xE0\xA4\xBE "
|
||||
"\xE0\xA4\xAE\xE0\xA4\x82\xE0\xA4\x9C \xE0\xA4\x85\xE0\xA4\x96 \xE0\xA4\xAE\xE0\xA5\x81\xE0\xA4\xB2\xE0\xA5\x81\xE0\xA4\x95"
|
||||
" \xE0\xA4\xB0\xE0\xA4\xBE\xE0\xA4\x9C\xE0\xA4\xA7\xE0\xA4\xBE\xE0\xA4\xA8\xE0\xA5\x80 \xE0\xA4\x95\xE0\xA4\xBE\xE0\xA4\xA0"
|
||||
"\xE0\xA4\xAE\xE0\xA4\xBE\xE0\xA4\xA1\xE0\xA5\x8C\xE0\xA4\x82 \xE0\xA4\xA8\xE0\xA5\x87\xE0\xA4\xAA\xE0\xA4\xBE\xE0\xA4\xB2 "
|
||||
"\xE0\xA4\x85\xE0\xA4\xA7\xE0\xA4\xBF\xE0\xA4\xB0\xE0\xA4\xBE\xE0\xA4\x9C\xE0\xA5\x8D\xE0\xA4\xAF \xE0\xA4\xAA\xE0\xA5\x87\xE0\xA4\xB0"
|
||||
"\xE0\xA5\x87\xE0\xA4\x97\xE0\xA5\x8D\xE0\xA4\xB5\xE0\xA4\xBE\xE0\xA4\xAF "
|
||||
"\xE0\xA4\xA6\xE0\xA4\x95\xE0\xA5\x8D\xE0\xA4\xB7\xE0\xA4\xBF\xE0\xA4\xA3 \xE0\xA4\x85\xE0\xA4\xAE\xE0\xA5\x87\xE0\xA4\xB0\xE0\xA4\xBF"
|
||||
"\xE0\xA4\x95\xE0\xA4\xBE \xE0\xA4\xAE\xE0\xA4\xB9\xE0\xA4\xBE\xE0\xA4\xA6\xE0\xA5\x8D\xE0\xA4\xB5\xE0\xA5\x80\xE0\xA4\xAA\xE0\xA5\x87"
|
||||
" \xE0\xA4\xAE\xE0\xA4\xA7\xE0\xA5\x8D \xE0\xA4\xAF\xE0\xA4\x95\xE0\xA5\x8D\xE0\xA4\xB7\xE0\xA5\x87\xE0\xA4\xA4\xE0\xA5\x8D"
|
||||
"\xE0\xA4\xB0\xE0\xA5\x87 \xE0\xA4\x8F\xE0\xA4\x95 \xE0\xA4\xA6\xE0\xA5\x87\xE0\xA4\xB6 \xE0\xA4\x85\xE0\xA4\xB8"
|
||||
"\xE0\xA5\x8D \xE0\xA4\xA4\xE0\xA4\xBF \xE0\xA4\xAB\xE0\xA4\xA3\xE0\xA5\x80\xE0\xA4\xB6\xE0\xA5\x8D\xE0\xA4\xB5\xE0\xA4\xB0"
|
||||
" \xE0\xA4\xA8\xE0\xA4\xBE\xE0\xA4\xA5 \xE0\xA4\xB0\xE0\xA5\x87\xE0\xA4\xA3\xE0\xA5\x81 "
|
||||
"\xE0\xA4\xAB\xE0\xA4\xBF\xE0\xA4\x9C\xE0\xA5\x80 \xE0\xA4\x9B\xE0\xA5\x81 \xE0\xA4\xA6\xE0\xA4\x95\xE0\xA5\x8D\xE0\xA4\xB7"
|
||||
"\xE0\xA4\xBF\xE0\xA4\xA3 \xE0\xA4\xAA\xE0\xA5\x8D\xE0\xA4\xB0\xE0\xA4\xB6\xE0\xA4\xBE\xE0\xA4\xA8\xE0\xA5\x8D \xE0\xA4\xA4"
|
||||
" \xE0\xA4\xAE\xE0\xA4\xB9\xE0\xA4\xBE\xE0\xA4\xB8\xE0\xA4\xBE\xE0\xA4\x97\xE0\xA4\xB0 \xE0\xA4\xAE\xE0\xA4\x82\xE0\xA4\x9C"
|
||||
" \xE0\xA4\x85\xE0\xA4\x96 \xE0\xA4\xA6\xE0\xA5\x87\xE0\xA4\xB6 \xE0\xA4\xAC\xE0\xA4\xB9\xE0\xA4\xBE\xE0\xA4\xAE"
|
||||
"\xE0\xA4\xBE\xE0\xA4\xB8 \xE0\xA4\x9B\xE0\xA5\x81 \xE0\xA4\x95\xE0\xA5\x87\xE0\xA4\xB0\xE0\xA5\x87\xE0\xA4\xAC\xE0\xA4\xBF"
|
||||
"\xE0\xA4\xAF\xE0\xA4\xA8 \xE0\xA4\xAE\xE0\xA4\x82\xE0\xA4\x9C "
|
||||
"\xE0\xA4\x85\xE0\xA4\x96 \xE0\xA4\xAE\xE0\xA5\x81\xE0\xA4\xB2\xE0\xA5\x81\xE0\xA4\x96 \xE0\xA4\xB0\xE0\xA4\xBE\xE0\xA4\x9C"
|
||||
"\xE0\xA4\xA7\xE0\xA4\xBE\xE0\xA4\xA8\xE0\xA5\x80 \xE0\xA4\xA8\xE0\xA4\xB8\xE0\xA5\x8C \xE0\xA4\xB8\xE0\xA4\xAE\xE0\xA5\x8D"
|
||||
" \xE0\xA4\xAC\xE0\xA4\xA6\xE0\xA5\x8D\xE0\xA4\x98 \xE0\xA4\xB5\xE0\xA4\xBF\xE0\xA4\xB7\xE0\xA4\xAF \xE0\xA4\xAC"
|
||||
"\xE0\xA5\x81\xE0\xA4\xB0\xE0\xA5\x81\xE0\xA4\x82\xE0\xA4\xA1\xE0\xA5\x80 \xE0\xA4\x85\xE0\xA4\xAB\xE0\xA5\x8D\xE0\xA4\xB0\xE0\xA5\x80"
|
||||
"\xE0\xA4\x95\xE0\xA4\xBE \xE0\xA4\xAE\xE0\xA4\xB9\xE0\xA4\xBE\xE0\xA4\xA6\xE0\xA5\x8D\xE0\xA4\xB5\xE0\xA5\x80\xE0\xA4\xAA\xE0\xA5\x87"
|
||||
" \xE0\xA4\xAE\xE0\xA4\xA7\xE0\xA5\x8D "
|
||||
"\xE0\xA4\xAF\xE0\xA4\x95\xE0\xA5\x8D\xE0\xA4\xB7\xE0\xA5\x87\xE0\xA4\xA4\xE0\xA5\x8D\xE0\xA4\xB0\xE0\xA5\x87 \xE0\xA4\xA6\xE0\xA5\x87"
|
||||
"\xE0\xA4\xB6 \xE0\xA4\x85\xE0\xA4\xB8\xE0\xA5\x8D \xE0\xA4\xA4\xE0\xA4\xBF \xE0\xA4\xB8\xE0\xA4\xAE\xE0\xA5\x8D"
|
||||
" \xE0\xA4\xAC\xE0\xA4\xA6\xE0\xA5\x8D\xE0\xA4\x98 \xE0\xA4\xB5\xE0\xA4\xBF\xE0\xA4\xB7\xE0\xA4\xAF";
|
||||
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
Language lang;
|
||||
const char* text;
|
||||
} TestPair;
|
||||
|
||||
|
||||
static const TestPair kTestPair[] = {
|
||||
// A couple of simple cases to begin
|
||||
{ENGLISH, kTeststr_en},
|
||||
// Not Chrome subset {KASHMIRI, kTeststr_ks},
|
||||
|
||||
// 20 languages recognized via Unicode script
|
||||
{ARMENIAN, kTeststr_hy_Armn},
|
||||
{CHEROKEE, kTeststr_chr_Cher},
|
||||
{DHIVEHI, kTeststr_dv_Thaa},
|
||||
{GEORGIAN, kTeststr_ka_Geor},
|
||||
{GREEK, kTeststr_el_Grek},
|
||||
{GUJARATI, kTeststr_gu_Gujr},
|
||||
{INUKTITUT, kTeststr_iu_Cans},
|
||||
{KANNADA, kTeststr_kn_Knda},
|
||||
{KHMER, kTeststr_km_Khmr},
|
||||
{LAOTHIAN, kTeststr_lo_Laoo},
|
||||
{LIMBU, kTeststr_lif_Limb},
|
||||
{MALAYALAM, kTeststr_ml_Mlym},
|
||||
{ORIYA, kTeststr_or_Orya},
|
||||
{PUNJABI, kTeststr_pa_Guru},
|
||||
{SINHALESE, kTeststr_si_Sinh},
|
||||
{SYRIAC, kTeststr_syr_Syrc},
|
||||
{TAGALOG, kTeststr_tl_Tglg},
|
||||
{TAMIL, kTeststr_ta_Taml},
|
||||
{TELUGU, kTeststr_te_Telu},
|
||||
{THAI, kTeststr_th_Thai},
|
||||
|
||||
// 4 languages recognized via single letters
|
||||
{CHINESE, kTeststr_zh_Hans},
|
||||
{CHINESE_T, kTeststr_zh_Hant},
|
||||
{JAPANESE, kTeststr_ja_Hani},
|
||||
{KOREAN, kTeststr_ko_Hani},
|
||||
|
||||
// 60 languages recognized via combinations of four letters
|
||||
{AFRIKAANS, kTeststr_af_Latn},
|
||||
{ALBANIAN, kTeststr_sq_Latn},
|
||||
{ARABIC, kTeststr_ar_Arab},
|
||||
{AZERBAIJANI, kTeststr_az_Latn},
|
||||
{BASQUE, kTeststr_eu_Latn},
|
||||
{BELARUSIAN, kTeststr_be_Cyrl},
|
||||
{BENGALI, kTeststr_bn_Beng}, // No Assamese
|
||||
{BIHARI, kTeststr_bh_Deva},
|
||||
{BULGARIAN, kTeststr_bg_Cyrl},
|
||||
{CATALAN, kTeststr_ca_Latn},
|
||||
{CEBUANO, kTeststr_ceb_Latn},
|
||||
{CROATIAN, kTeststr_hr_Latn},
|
||||
{CZECH, kTeststr_cs_Latn},
|
||||
{DANISH, kTeststr_da_Latn},
|
||||
{DUTCH, kTeststr_nl_Latn},
|
||||
{ENGLISH, kTeststr_en_Latn},
|
||||
{ESTONIAN, kTeststr_et_Latn},
|
||||
{FINNISH, kTeststr_fi_Latn},
|
||||
{FRENCH, kTeststr_fr_Latn},
|
||||
{GALICIAN, kTeststr_gl_Latn},
|
||||
{GANDA, kTeststr_lg_Latn},
|
||||
{GERMAN, kTeststr_de_Latn},
|
||||
{HAITIAN_CREOLE, kTeststr_ht_Latn},
|
||||
{HEBREW, kTeststr_iw_Hebr},
|
||||
{HINDI, kTeststr_hi_Deva},
|
||||
{HMONG, kTeststr_blu_Latn},
|
||||
{HUNGARIAN, kTeststr_hu_Latn},
|
||||
{ICELANDIC, kTeststr_is_Latn},
|
||||
{INDONESIAN, kTeststr_id_Latn},
|
||||
{IRISH, kTeststr_ga_Latn},
|
||||
{ITALIAN, kTeststr_it_Latn},
|
||||
{JAVANESE, kTeststr_jw_Latn},
|
||||
{KINYARWANDA, kTeststr_rw_Latn},
|
||||
{LATVIAN, kTeststr_lv_Latn},
|
||||
{LITHUANIAN, kTeststr_lt_Latn},
|
||||
{MACEDONIAN, kTeststr_mk_Cyrl},
|
||||
{MALAY, kTeststr_ms_Latn},
|
||||
{MALTESE, kTeststr_mt_Latn},
|
||||
{MARATHI, kTeststr_mr_Deva},
|
||||
{NEPALI, kTeststr_ne_Deva},
|
||||
{NORWEGIAN, kTeststr_no_Latn},
|
||||
{PERSIAN, kTeststr_fa_Arab},
|
||||
{POLISH, kTeststr_pl_Latn},
|
||||
{PORTUGUESE, kTeststr_pt_Latn},
|
||||
{ROMANIAN, kTeststr_ro_Latn},
|
||||
{ROMANIAN, kTeststr_ro_Cyrl},
|
||||
{RUSSIAN, kTeststr_ru_Cyrl},
|
||||
{SCOTS_GAELIC, kTeststr_gd_Latn},
|
||||
{SERBIAN, kTeststr_sr_Cyrl},
|
||||
{SERBIAN, kTeststr_sr_Latn},
|
||||
{SLOVAK, kTeststr_sk_Latn},
|
||||
{SLOVENIAN, kTeststr_sl_Latn},
|
||||
{SPANISH, kTeststr_es_Latn},
|
||||
{SWAHILI, kTeststr_sw_Latn},
|
||||
{SWEDISH, kTeststr_sv_Latn},
|
||||
{TAGALOG, kTeststr_tl_Latn},
|
||||
{TURKISH, kTeststr_tr_Latn},
|
||||
{UKRAINIAN, kTeststr_uk_Cyrl},
|
||||
{URDU, kTeststr_ur_Arab},
|
||||
{VIETNAMESE, kTeststr_vi_Latn},
|
||||
{WELSH, kTeststr_cy_Latn},
|
||||
{YIDDISH, kTeststr_yi_Hebr},
|
||||
|
||||
// 2 statistically-close languages
|
||||
{INDONESIAN, kTeststr_id_close},
|
||||
{MALAY, kTeststr_ms_close},
|
||||
|
||||
// Simple intermixed French/English text
|
||||
{FRENCH, kTeststr_fr_en_Latn},
|
||||
|
||||
// Cross-check the main quadgram table build date
|
||||
// Change the expected language each time it is rebuilt
|
||||
{SWAHILI, kTeststr_version},
|
||||
|
||||
{UNKNOWN_LANGUAGE, NULL}, // Must be last
|
||||
};
|
||||
|
||||
|
||||
bool OneTest(int flags, bool get_vector,
|
||||
Language lang_expected, const char* buffer, int buffer_length) {
|
||||
bool is_plain_text = true;
|
||||
const char* tldhint = "";
|
||||
const Encoding enchint = UNKNOWN_ENCODING;
|
||||
const Language langhint = UNKNOWN_LANGUAGE;
|
||||
const CLDHints cldhints = {NULL, tldhint, enchint, langhint};
|
||||
Language language3[3];
|
||||
int percent3[3];
|
||||
double normalized_score3[3];
|
||||
ResultChunkVector resultchunkvector;
|
||||
int text_bytes;
|
||||
bool is_reliable;
|
||||
|
||||
Language lang_detected = ExtDetectLanguageSummary(
|
||||
buffer,
|
||||
buffer_length,
|
||||
is_plain_text,
|
||||
&cldhints,
|
||||
flags,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
get_vector ? &resultchunkvector : NULL,
|
||||
&text_bytes,
|
||||
&is_reliable);
|
||||
// expose DumpExtLang DumpLanguages
|
||||
|
||||
bool ok = (lang_detected == lang_expected);
|
||||
|
||||
if (!ok) {
|
||||
if ((flags & kCLDFlagHtml) != 0) {
|
||||
fprintf(stderr, "*** Wrong result. expected %s, detected %s<br>\n",
|
||||
LanguageName(lang_expected), LanguageName(lang_detected));
|
||||
}
|
||||
fprintf(stdout, "*** Wrong result. expected %s, detected %s\n",
|
||||
LanguageName(lang_expected), LanguageName(lang_detected));
|
||||
fprintf(stdout, "%s\n\n", buffer);
|
||||
}
|
||||
|
||||
if (get_vector) {
|
||||
DumpResultChunkVector(stderr, buffer, &resultchunkvector);
|
||||
}
|
||||
|
||||
#if 0
|
||||
DumpExtLang(flags, summary_lang, language3, percent3, normalized_score3,
|
||||
text_bytes, is_reliable, n);
|
||||
|
||||
if ((flags & kCLDFlagHtml) != 0) {
|
||||
DumpLanguages(summary_lang,
|
||||
language3, percent3, text_bytes, is_reliable, n);
|
||||
}
|
||||
|
||||
fprintf(stdout, " SummaryLanguage %s%s at %u of %d, %s\n",
|
||||
LanguageName(summary_lang),
|
||||
is_reliable ? "" : "(un-reliable)",
|
||||
bytes_consumed,
|
||||
n,
|
||||
argv[1]);
|
||||
#endif
|
||||
|
||||
return ok;
|
||||
}
|
||||
|
||||
void InitHtmlOut(int flags) {
|
||||
#if 1
|
||||
if ((flags & kCLDFlagHtml) != 0) {
|
||||
// Begin HTML file
|
||||
fprintf(stderr, "<html><meta charset=\"UTF-8\"><body>\n");
|
||||
// Encourage browsers to print background colors
|
||||
fprintf(stderr, "<style media=\"print\" type=\"text/css\"> "
|
||||
":root { -webkit-print-color-adjust: exact; } </style>\n");
|
||||
fprintf(stderr, "<span style=\"font-size: 7pt\">\n");
|
||||
fprintf(stderr, "file = %s<br>\n", "cld2_unittest");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void FinishHtmlOut(int flags) {
|
||||
#if 1
|
||||
if ((flags & kCLDFlagHtml) != 0) {
|
||||
fprintf(stderr, "\n</span></body></html>\n");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
int RunTests (int flags, bool get_vector) {
|
||||
fprintf(stdout, "CLD2 version: %s\n", CLD2::DetectLanguageVersion());
|
||||
InitHtmlOut(flags);
|
||||
bool any_fail = false;
|
||||
int i = 0;
|
||||
while (kTestPair[i].text != NULL) {
|
||||
Language lang_expected = kTestPair[i].lang;
|
||||
const char* buffer = kTestPair[i].text;
|
||||
int buffer_length = strlen(buffer);
|
||||
bool ok = OneTest(flags, get_vector, lang_expected, buffer, buffer_length);
|
||||
any_fail |= (!ok);
|
||||
++i;
|
||||
}
|
||||
if (any_fail) {
|
||||
fprintf(stderr, "FAIL\n");
|
||||
fprintf(stdout, "FAIL\n");
|
||||
} else {
|
||||
fprintf(stderr, "PASS\n");
|
||||
fprintf(stdout, "PASS\n");
|
||||
}
|
||||
|
||||
FinishHtmlOut(flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
// Get command-line flags
|
||||
int flags = 0;
|
||||
bool get_vector = false;
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (strcmp(argv[i], "--html") == 0) {flags |= CLD2::kCLDFlagHtml;}
|
||||
if (strcmp(argv[i], "--cr") == 0) {flags |= CLD2::kCLDFlagCr;}
|
||||
if (strcmp(argv[i], "--verbose") == 0) {flags |= CLD2::kCLDFlagVerbose;}
|
||||
if (strcmp(argv[i], "--quiet") == 0) {flags |= CLD2::kCLDFlagQuiet;}
|
||||
if (strcmp(argv[i], "--echo") == 0) {flags |= CLD2::kCLDFlagEcho;}
|
||||
if (strcmp(argv[i], "--vector") == 0) {get_vector = true;}
|
||||
}
|
||||
|
||||
return CLD2::RunTests(flags, get_vector);
|
||||
}
|
||||
|
||||
BIN
internal/cld2_unittest_avoid
Executable file
BIN
internal/cld2_unittest_avoid
Executable file
Binary file not shown.
55
internal/cld2tablesummary.h
Normal file
55
internal/cld2tablesummary.h
Normal file
@@ -0,0 +1,55 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
|
||||
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_CLD2TABLESUMMARY_H_
|
||||
#define I18N_ENCODINGS_CLD2_INTERNAL_CLD2TABLESUMMARY_H_
|
||||
|
||||
#include "integral_types.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Hash bucket for four-way associative lookup, indirect probabilities
|
||||
// 16 bytes per bucket, 4-byte entries
|
||||
typedef struct {
|
||||
uint32 keyvalue[4]; // Upper part of word is hash, lower is indirect prob
|
||||
} IndirectProbBucket4;
|
||||
|
||||
|
||||
// Expanded version December 2012.
|
||||
// Moves cutoff for 6-language vs. 3-language indirects
|
||||
// Has list of recognized lang-script combinations
|
||||
typedef struct {
|
||||
const IndirectProbBucket4* kCLDTable;
|
||||
// Each bucket has four entries, part
|
||||
// key and part indirect subscript
|
||||
const uint32* kCLDTableInd; // Each entry is three packed lang/prob
|
||||
uint32 kCLDTableSizeOne; // Indirect subscripts >= this: 2 entries
|
||||
uint32 kCLDTableSize; // Bucket count
|
||||
uint32 kCLDTableKeyMask; // Mask hash key
|
||||
uint32 kCLDTableBuildDate; // yyyymmdd
|
||||
const char* kRecognizedLangScripts; // Character string of lang-Scripts
|
||||
// recognized: "en-Latn az-Arab ..."
|
||||
// Single space delimiter, Random order
|
||||
} CLD2TableSummary;
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_CLD2_INTERNAL_CLD2TABLESUMMARY_H_
|
||||
|
||||
|
||||
1121
internal/cld_generated_cjk_delta_bi_4.cc
Normal file
1121
internal/cld_generated_cjk_delta_bi_4.cc
Normal file
File diff suppressed because it is too large
Load Diff
7133
internal/cld_generated_cjk_uni_prop_80.cc
Normal file
7133
internal/cld_generated_cjk_uni_prop_80.cc
Normal file
File diff suppressed because it is too large
Load Diff
677
internal/cld_generated_score_quad_octa_1024_256.cc
Normal file
677
internal/cld_generated_score_quad_octa_1024_256.cc
Normal file
@@ -0,0 +1,677 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Generated by ngram_merge on 2011-01-21 10:50:46 from:
|
||||
// score_me file /export/hda3/cld/pre2010/b0_samp_prune_20100722.utf8
|
||||
// cld_generated_quad.bin, built 20110121 bytes 4443812 hash 462c-16c4
|
||||
// cld_generated_deltaocta.bin, built 20110121 bytes 1053284 hash c834-81f5
|
||||
|
||||
// 1 # dsites Added text for
|
||||
// ak haw ig kha ks mfe mo nd nso ny ve
|
||||
// bs-Cyrl/Latn hr-Latn sr-Cyrl/Latn sr-ME-Latn
|
||||
|
||||
// score_me text [ 8] ja not found in tables
|
||||
// score_me text [ 9] ko not found in tables
|
||||
// score_me text [ 16] zh not found in tables
|
||||
// score_me text [ 18] el not found in tables
|
||||
// score_me text [ 41] ml not found in tables
|
||||
// score_me text [ 44] te not found in tables
|
||||
// score_me text [ 46] ta not found in tables
|
||||
// score_me text [ 52] gu not found in tables
|
||||
// score_me text [ 53] th not found in tables
|
||||
// score_me text [ 59] kn not found in tables
|
||||
// score_me text [ 60] pa not found in tables
|
||||
// score_me text [ 69] zhT not found in tables
|
||||
// score_me text [ 75] ka not found in tables
|
||||
// score_me text [ 79] si not found in tables
|
||||
// score_me text [ 90] tw not found in tables
|
||||
// score_me text [ 97] hy not found in tables
|
||||
// score_me text [ 98] lo not found in tables
|
||||
// score_me text [103] my not found in tables
|
||||
// score_me text [104] km not found in tables
|
||||
// score_me text [107] chr not found in tables
|
||||
// score_me text [109] sit-NP not found in tables
|
||||
// score_me text [110] or not found in tables
|
||||
// score_me text [141] iu not found in tables
|
||||
// score_me text [160] srM not found in tables
|
||||
// score_me text [185] zzb not found in tables
|
||||
// score_me text [187] zzh not found in tables
|
||||
// score_me text [189] zze not found in tables
|
||||
|
||||
// No score_me text for [ 25] xxx
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Average score per 1024 bytes
|
||||
extern const short kAvgDeltaOctaScore[614 * 4] = {
|
||||
// Latn Cyrl Arab Other script
|
||||
// Updated 2013.03.26 for CLD2 full
|
||||
1195, 0, 0, 0, // 0 ENGLISH en
|
||||
683, 0, 0, 0, // 1 DANISH da
|
||||
934, 0, 0, 0, // 2 DUTCH nl
|
||||
1221, 0, 0, 0, // 3 FINNISH fi
|
||||
837, 0, 0, 0, // 4 FRENCH fr
|
||||
1035, 0, 0, 0, // 5 GERMAN de
|
||||
0, 0, 0, 935, // 6 HEBREW iw
|
||||
609, 0, 0, 0, // 7 ITALIAN it
|
||||
0, 0, 0, 3348, // 8 Japanese ja
|
||||
0, 0, 0, 3688, // 9 Korean ko
|
||||
986, 0, 0, 0, // 10 NORWEGIAN no
|
||||
1364, 0, 0, 0, // 11 POLISH pl
|
||||
893, 0, 0, 0, // 12 PORTUGUESE pt
|
||||
0, 818, 0, 0, // 13 RUSSIAN ru
|
||||
680, 0, 0, 0, // 14 SPANISH es
|
||||
796, 0, 0, 0, // 15 SWEDISH sv
|
||||
0, 0, 0, 1989, // 16 Chinese zh
|
||||
1411, 0, 0, 0, // 17 CZECH cs
|
||||
0, 0, 0, 1024, // 18 GREEK el
|
||||
1220, 0, 0, 0, // 19 ICELANDIC is
|
||||
1329, 0, 0, 0, // 20 LATVIAN lv
|
||||
1205, 0, 0, 0, // 21 LITHUANIAN lt
|
||||
899, 1104, 0, 0, // 22 ROMANIAN ro
|
||||
1448, 0, 0, 0, // 23 HUNGARIAN hu
|
||||
987, 0, 0, 0, // 24 ESTONIAN et
|
||||
0, 0, 0, 0, // 25 Ignore xxx
|
||||
0, 0, 0, 0, // 26 Unknown un
|
||||
0, 788, 0, 0, // 27 BULGARIAN bg
|
||||
730, 0, 0, 0, // 28 CROATIAN hr
|
||||
717, 890, 0, 0, // 29 SERBIAN sr
|
||||
1318, 0, 0, 0, // 30 IRISH ga
|
||||
634, 0, 0, 0, // 31 GALICIAN gl
|
||||
1035, 0, 0, 0, // 32 TAGALOG tl
|
||||
1234, 0, 0, 0, // 33 TURKISH tr
|
||||
0, 821, 0, 0, // 34 UKRAINIAN uk
|
||||
0, 0, 0, 765, // 35 HINDI hi
|
||||
0, 843, 0, 0, // 36 MACEDONIAN mk
|
||||
0, 0, 0, 589, // 37 BENGALI bn
|
||||
1033, 0, 0, 0, // 38 INDONESIAN id
|
||||
626, 0, 0, 0, // 39 LATIN la
|
||||
1220, 0, 0, 0, // 40 MALAY ms
|
||||
0, 0, 0, 1024, // 41 MALAYALAM ml
|
||||
1538, 0, 0, 0, // 42 WELSH cy
|
||||
0, 0, 0, 502, // 43 NEPALI ne
|
||||
0, 0, 0, 1024, // 44 TELUGU te
|
||||
1113, 0, 0, 0, // 45 ALBANIAN sq
|
||||
0, 0, 0, 1024, // 46 TAMIL ta
|
||||
0, 983, 0, 0, // 47 BELARUSIAN be
|
||||
702, 0, 0, 0, // 48 JAVANESE jw
|
||||
467, 0, 0, 0, // 49 OCCITAN oc
|
||||
0, 0, 978, 0, // 50 URDU ur
|
||||
0, 0, 0, 643, // 51 BIHARI bh
|
||||
0, 0, 0, 1024, // 52 GUJARATI gu
|
||||
0, 0, 0, 1024, // 53 THAI th
|
||||
0, 0, 846, 0, // 54 ARABIC ar
|
||||
619, 0, 0, 0, // 55 CATALAN ca
|
||||
805, 0, 0, 0, // 56 ESPERANTO eo
|
||||
1219, 0, 0, 0, // 57 BASQUE eu
|
||||
403, 0, 0, 0, // 58 INTERLINGUA ia
|
||||
0, 0, 0, 1024, // 59 KANNADA kn
|
||||
0, 0, 0, 1024, // 60 PUNJABI pa
|
||||
1521, 0, 0, 0, // 61 SCOTS_GAELIC gd
|
||||
913, 0, 0, 0, // 62 SWAHILI sw
|
||||
938, 0, 0, 0, // 63 SLOVENIAN sl
|
||||
0, 0, 0, 590, // 64 MARATHI mr
|
||||
1045, 0, 0, 0, // 65 MALTESE mt
|
||||
1268, 0, 0, 0, // 66 VIETNAMESE vi
|
||||
810, 0, 0, 0, // 67 FRISIAN fy
|
||||
1416, 0, 0, 0, // 68 SLOVAK sk
|
||||
0, 0, 0, 1949, // 69 ChineseT zh-Hant
|
||||
1022, 0, 0, 0, // 70 FAROESE fo
|
||||
716, 0, 0, 0, // 71 SUNDANESE su
|
||||
1063, 0, 0, 0, // 72 UZBEK uz
|
||||
0, 0, 0, 391, // 73 AMHARIC am
|
||||
1373, 0, 0, 0, // 74 AZERBAIJANI az
|
||||
0, 0, 0, 1024, // 75 GEORGIAN ka
|
||||
0, 0, 0, 469, // 76 TIGRINYA ti
|
||||
0, 0, 899, 0, // 77 PERSIAN fa
|
||||
0, 0, 0, 0, // 78 BOSNIAN bs
|
||||
0, 0, 0, 1024, // 79 SINHALESE si
|
||||
840, 0, 0, 0, // 80 NORWEGIAN_N nn
|
||||
0, 0, 0, 0, // 81 81
|
||||
0, 0, 0, 0, // 82 82
|
||||
1136, 0, 0, 0, // 83 XHOSA xh
|
||||
1002, 0, 0, 0, // 84 ZULU zu
|
||||
1207, 0, 0, 0, // 85 GUARANI gn
|
||||
1014, 0, 0, 0, // 86 SESOTHO st
|
||||
1178, 812, 0, 0, // 87 TURKMEN tk
|
||||
0, 834, 1108, 0, // 88 KYRGYZ ky
|
||||
1144, 0, 0, 0, // 89 BRETON br
|
||||
0, 0, 0, 0, // 90 TWI tw
|
||||
0, 0, 0, 1004, // 91 YIDDISH yi
|
||||
0, 0, 0, 0, // 92 92
|
||||
1118, 0, 0, 0, // 93 SOMALI so
|
||||
0, 1030, 1171, 0, // 94 UIGHUR ug
|
||||
0, 0, 1073, 0, // 95 KURDISH ku
|
||||
0, 1138, 0, 0, // 96 MONGOLIAN mn
|
||||
0, 0, 0, 1024, // 97 ARMENIAN hy
|
||||
0, 0, 0, 1024, // 98 LAOTHIAN lo
|
||||
0, 0, 838, 0, // 99 SINDHI sd
|
||||
685, 0, 0, 0, // 100 RHAETO_ROMANCE rm
|
||||
921, 0, 0, 0, // 101 AFRIKAANS af
|
||||
917, 0, 0, 0, // 102 LUXEMBOURGISH lb
|
||||
0, 0, 0, 1024, // 103 BURMESE my
|
||||
0, 0, 0, 1024, // 104 KHMER km
|
||||
0, 0, 0, 632, // 105 TIBETAN bo
|
||||
0, 0, 0, 0, // 106 DHIVEHI dv
|
||||
0, 0, 0, 1024, // 107 CHEROKEE chr
|
||||
0, 0, 0, 0, // 108 SYRIAC syr
|
||||
0, 0, 0, 1024, // 109 LIMBU lif
|
||||
0, 0, 0, 1024, // 110 ORIYA or
|
||||
0, 0, 0, 378, // 111 ASSAMESE as
|
||||
539, 0, 0, 0, // 112 CORSICAN co
|
||||
544, 0, 0, 0, // 113 INTERLINGUE ie
|
||||
0, 938, 446, 0, // 114 KAZAKH kk
|
||||
658, 0, 0, 0, // 115 LINGALA ln
|
||||
0, 0, 0, 0, // 116 116
|
||||
0, 0, 790, 0, // 117 PASHTO ps
|
||||
1271, 0, 0, 0, // 118 QUECHUA qu
|
||||
945, 0, 0, 0, // 119 SHONA sn
|
||||
0, 978, 0, 0, // 120 TAJIK tg
|
||||
1211, 852, 0, 0, // 121 TATAR tt
|
||||
1013, 0, 0, 0, // 122 TONGA to
|
||||
724, 0, 0, 0, // 123 YORUBA yo
|
||||
0, 0, 0, 0, // 124 124
|
||||
0, 0, 0, 0, // 125 125
|
||||
0, 0, 0, 0, // 126 126
|
||||
0, 0, 0, 0, // 127 127
|
||||
1139, 0, 0, 0, // 128 MAORI mi
|
||||
849, 0, 0, 0, // 129 WOLOF wo
|
||||
0, 687, 0, 0, // 130 ABKHAZIAN ab
|
||||
623, 0, 0, 0, // 131 AFAR aa
|
||||
943, 0, 0, 0, // 132 AYMARA ay
|
||||
0, 797, 0, 0, // 133 BASHKIR ba
|
||||
1247, 0, 0, 0, // 134 BISLAMA bi
|
||||
0, 0, 0, 579, // 135 DZONGKHA dz
|
||||
1210, 0, 0, 0, // 136 FIJIAN fj
|
||||
1721, 0, 0, 0, // 137 GREENLANDIC kl
|
||||
871, 0, 0, 0, // 138 HAUSA ha
|
||||
1029, 0, 0, 0, // 139 HAITIAN_CREOLE ht
|
||||
447, 0, 0, 0, // 140 INUPIAK ik
|
||||
0, 0, 0, 1024, // 141 INUKTITUT iu
|
||||
0, 0, 925, 0, // 142 KASHMIRI ks
|
||||
1100, 0, 0, 0, // 143 KINYARWANDA rw
|
||||
1406, 0, 0, 0, // 144 MALAGASY mg
|
||||
927, 0, 0, 0, // 145 NAURU na
|
||||
1420, 0, 0, 0, // 146 OROMO om
|
||||
1123, 0, 0, 0, // 147 RUNDI rn
|
||||
944, 0, 0, 0, // 148 SAMOAN sm
|
||||
1526, 0, 0, 0, // 149 SANGO sg
|
||||
864, 0, 0, 423, // 150 SANSKRIT sa
|
||||
790, 0, 0, 0, // 151 SISWANT ss
|
||||
1045, 0, 0, 0, // 152 TSONGA ts
|
||||
1094, 0, 0, 0, // 153 TSWANA tn
|
||||
649, 0, 0, 0, // 154 VOLAPUK vo
|
||||
1737, 0, 0, 0, // 155 ZHUANG za
|
||||
1237, 0, 0, 0, // 156 KHASI kha
|
||||
1148, 0, 0, 0, // 157 SCOTS sco
|
||||
1082, 0, 0, 0, // 158 GANDA lg
|
||||
1381, 0, 0, 0, // 159 MANX gv
|
||||
0, 0, 0, 0, // 160 MONTENEGRIN sr-ME
|
||||
1030, 0, 0, 0, // 161 AKAN ak
|
||||
1049, 0, 0, 0, // 162 IGBO ig
|
||||
861, 0, 0, 0, // 163 MAURITIAN_CREOLE mfe
|
||||
754, 0, 0, 0, // 164 HAWAIIAN haw
|
||||
0, 0, 0, 0, // 165 CEBUANO ceb
|
||||
0, 0, 0, 0, // 166 EWE ee
|
||||
0, 0, 0, 0, // 167 GA gaa
|
||||
0, 0, 0, 0, // 168 HMONG blu
|
||||
0, 0, 0, 0, // 169 KRIO kri
|
||||
0, 0, 0, 0, // 170 LOZI loz
|
||||
0, 0, 0, 0, // 171 LUBA_LULUA lua
|
||||
0, 0, 0, 0, // 172 LUO_KENYA_AND_TANZANIA luo
|
||||
0, 0, 0, 0, // 173 NEWARI new
|
||||
809, 0, 0, 0, // 174 NYANJA ny
|
||||
0, 0, 0, 0, // 175 OSSETIAN os
|
||||
0, 0, 0, 0, // 176 PAMPANGA pam
|
||||
803, 0, 0, 0, // 177 PEDI nso
|
||||
0, 0, 0, 0, // 178 RAJASTHANI raj
|
||||
910, 0, 0, 0, // 179 SESELWA crs
|
||||
0, 0, 0, 0, // 180 TUMBUKA tum
|
||||
978, 0, 0, 0, // 181 VENDA ve
|
||||
0, 0, 0, 0, // 182 WARAY_PHILIPPINES war
|
||||
0, 0, 0, 0, // 183 183
|
||||
0, 0, 0, 0, // 184 184
|
||||
0, 0, 0, 0, // 185 185
|
||||
0, 0, 0, 0, // 186 186
|
||||
0, 0, 0, 0, // 187 187
|
||||
0, 0, 0, 0, // 188 188
|
||||
0, 0, 0, 0, // 189 189
|
||||
0, 0, 0, 0, // 190 190
|
||||
0, 0, 0, 0, // 191 191
|
||||
0, 0, 0, 0, // 192 192
|
||||
0, 0, 0, 0, // 193 193
|
||||
0, 0, 0, 0, // 194 194
|
||||
0, 0, 0, 0, // 195 195
|
||||
0, 0, 0, 0, // 196 196
|
||||
0, 0, 0, 0, // 197 197
|
||||
0, 0, 0, 0, // 198 198
|
||||
0, 0, 0, 0, // 199 199
|
||||
0, 0, 0, 0, // 200 200
|
||||
0, 0, 0, 0, // 201 201
|
||||
0, 0, 0, 0, // 202 202
|
||||
0, 0, 0, 0, // 203 203
|
||||
0, 0, 0, 0, // 204 204
|
||||
0, 0, 0, 0, // 205 205
|
||||
0, 0, 0, 0, // 206 206
|
||||
0, 0, 0, 0, // 207 207
|
||||
0, 0, 0, 0, // 208 208
|
||||
0, 0, 0, 0, // 209 209
|
||||
0, 0, 0, 0, // 210 210
|
||||
0, 0, 0, 0, // 211 211
|
||||
0, 0, 0, 0, // 212 212
|
||||
0, 0, 0, 0, // 213 213
|
||||
0, 0, 0, 0, // 214 214
|
||||
0, 0, 0, 0, // 215 215
|
||||
0, 0, 0, 0, // 216 216
|
||||
0, 0, 0, 0, // 217 217
|
||||
0, 0, 0, 0, // 218 218
|
||||
0, 0, 0, 0, // 219 219
|
||||
0, 0, 0, 0, // 220 220
|
||||
0, 0, 0, 0, // 221 221
|
||||
0, 0, 0, 0, // 222 222
|
||||
0, 0, 0, 0, // 223 223
|
||||
0, 0, 0, 0, // 224 224
|
||||
0, 0, 0, 0, // 225 225
|
||||
0, 0, 0, 0, // 226 226
|
||||
0, 0, 0, 0, // 227 227
|
||||
0, 0, 0, 0, // 228 228
|
||||
0, 0, 0, 0, // 229 229
|
||||
0, 0, 0, 0, // 230 230
|
||||
0, 0, 0, 0, // 231 231
|
||||
0, 0, 0, 0, // 232 232
|
||||
0, 0, 0, 0, // 233 233
|
||||
0, 0, 0, 0, // 234 234
|
||||
0, 0, 0, 0, // 235 235
|
||||
0, 0, 0, 0, // 236 236
|
||||
0, 0, 0, 0, // 237 237
|
||||
0, 0, 0, 0, // 238 238
|
||||
0, 0, 0, 0, // 239 239
|
||||
0, 0, 0, 0, // 240 240
|
||||
0, 0, 0, 0, // 241 241
|
||||
0, 0, 0, 0, // 242 242
|
||||
0, 0, 0, 0, // 243 243
|
||||
0, 0, 0, 0, // 244 244
|
||||
0, 0, 0, 0, // 245 245
|
||||
0, 0, 0, 0, // 246 246
|
||||
0, 0, 0, 0, // 247 247
|
||||
0, 0, 0, 0, // 248 248
|
||||
0, 0, 0, 0, // 249 249
|
||||
0, 0, 0, 0, // 250 250
|
||||
0, 0, 0, 0, // 251 251
|
||||
0, 0, 0, 0, // 252 252
|
||||
0, 0, 0, 0, // 253 253
|
||||
0, 0, 0, 0, // 254 254
|
||||
0, 0, 0, 0, // 255 255
|
||||
0, 0, 0, 0, // 256 256
|
||||
0, 0, 0, 0, // 257 257
|
||||
0, 0, 0, 0, // 258 258
|
||||
0, 0, 0, 0, // 259 259
|
||||
0, 0, 0, 0, // 260 260
|
||||
0, 0, 0, 0, // 261 261
|
||||
0, 0, 0, 0, // 262 262
|
||||
0, 0, 0, 0, // 263 263
|
||||
0, 0, 0, 0, // 264 264
|
||||
0, 0, 0, 0, // 265 265
|
||||
0, 0, 0, 0, // 266 266
|
||||
0, 0, 0, 0, // 267 267
|
||||
0, 0, 0, 0, // 268 268
|
||||
0, 0, 0, 0, // 269 269
|
||||
0, 0, 0, 0, // 270 270
|
||||
0, 0, 0, 0, // 271 271
|
||||
0, 0, 0, 0, // 272 272
|
||||
0, 0, 0, 0, // 273 273
|
||||
0, 0, 0, 0, // 274 274
|
||||
0, 0, 0, 0, // 275 275
|
||||
0, 0, 0, 0, // 276 276
|
||||
0, 0, 0, 0, // 277 277
|
||||
0, 0, 0, 0, // 278 278
|
||||
0, 0, 0, 0, // 279 279
|
||||
0, 0, 0, 0, // 280 280
|
||||
0, 0, 0, 0, // 281 281
|
||||
0, 0, 0, 0, // 282 282
|
||||
0, 0, 0, 0, // 283 283
|
||||
0, 0, 0, 0, // 284 284
|
||||
0, 0, 0, 0, // 285 285
|
||||
0, 0, 0, 0, // 286 286
|
||||
0, 0, 0, 0, // 287 287
|
||||
0, 0, 0, 0, // 288 288
|
||||
0, 0, 0, 0, // 289 289
|
||||
0, 0, 0, 0, // 290 290
|
||||
0, 0, 0, 0, // 291 291
|
||||
0, 0, 0, 0, // 292 292
|
||||
0, 0, 0, 0, // 293 293
|
||||
0, 0, 0, 0, // 294 294
|
||||
0, 0, 0, 0, // 295 295
|
||||
0, 0, 0, 0, // 296 296
|
||||
0, 0, 0, 0, // 297 297
|
||||
0, 0, 0, 0, // 298 298
|
||||
0, 0, 0, 0, // 299 299
|
||||
0, 0, 0, 0, // 300 300
|
||||
0, 0, 0, 0, // 301 301
|
||||
0, 0, 0, 0, // 302 302
|
||||
0, 0, 0, 0, // 303 303
|
||||
0, 0, 0, 0, // 304 304
|
||||
0, 0, 0, 0, // 305 305
|
||||
0, 0, 0, 0, // 306 306
|
||||
0, 0, 0, 0, // 307 307
|
||||
0, 0, 0, 0, // 308 308
|
||||
0, 0, 0, 0, // 309 309
|
||||
0, 0, 0, 0, // 310 310
|
||||
0, 0, 0, 0, // 311 311
|
||||
0, 0, 0, 0, // 312 312
|
||||
0, 0, 0, 0, // 313 313
|
||||
0, 0, 0, 0, // 314 314
|
||||
0, 0, 0, 0, // 315 315
|
||||
0, 0, 0, 0, // 316 316
|
||||
0, 0, 0, 0, // 317 317
|
||||
0, 0, 0, 0, // 318 318
|
||||
0, 0, 0, 0, // 319 319
|
||||
0, 0, 0, 0, // 320 320
|
||||
0, 0, 0, 0, // 321 321
|
||||
0, 0, 0, 0, // 322 322
|
||||
0, 0, 0, 0, // 323 323
|
||||
0, 0, 0, 0, // 324 324
|
||||
0, 0, 0, 0, // 325 325
|
||||
0, 0, 0, 0, // 326 326
|
||||
0, 0, 0, 0, // 327 327
|
||||
0, 0, 0, 0, // 328 328
|
||||
0, 0, 0, 0, // 329 329
|
||||
0, 0, 0, 0, // 330 330
|
||||
0, 0, 0, 0, // 331 331
|
||||
0, 0, 0, 0, // 332 332
|
||||
0, 0, 0, 0, // 333 333
|
||||
0, 0, 0, 0, // 334 334
|
||||
0, 0, 0, 0, // 335 335
|
||||
0, 0, 0, 0, // 336 336
|
||||
0, 0, 0, 0, // 337 337
|
||||
0, 0, 0, 0, // 338 338
|
||||
0, 0, 0, 0, // 339 339
|
||||
0, 0, 0, 0, // 340 340
|
||||
0, 0, 0, 0, // 341 341
|
||||
0, 0, 0, 0, // 342 342
|
||||
0, 0, 0, 0, // 343 343
|
||||
0, 0, 0, 0, // 344 344
|
||||
0, 0, 0, 0, // 345 345
|
||||
0, 0, 0, 0, // 346 346
|
||||
0, 0, 0, 0, // 347 347
|
||||
0, 0, 0, 0, // 348 348
|
||||
0, 0, 0, 0, // 349 349
|
||||
0, 0, 0, 0, // 350 350
|
||||
0, 0, 0, 0, // 351 351
|
||||
0, 0, 0, 0, // 352 352
|
||||
0, 0, 0, 0, // 353 353
|
||||
0, 0, 0, 0, // 354 354
|
||||
0, 0, 0, 0, // 355 355
|
||||
0, 0, 0, 0, // 356 356
|
||||
0, 0, 0, 0, // 357 357
|
||||
0, 0, 0, 0, // 358 358
|
||||
0, 0, 0, 0, // 359 359
|
||||
0, 0, 0, 0, // 360 360
|
||||
0, 0, 0, 0, // 361 361
|
||||
0, 0, 0, 0, // 362 362
|
||||
0, 0, 0, 0, // 363 363
|
||||
0, 0, 0, 0, // 364 364
|
||||
0, 0, 0, 0, // 365 365
|
||||
0, 0, 0, 0, // 366 366
|
||||
0, 0, 0, 0, // 367 367
|
||||
0, 0, 0, 0, // 368 368
|
||||
0, 0, 0, 0, // 369 369
|
||||
0, 0, 0, 0, // 370 370
|
||||
0, 0, 0, 0, // 371 371
|
||||
0, 0, 0, 0, // 372 372
|
||||
0, 0, 0, 0, // 373 373
|
||||
0, 0, 0, 0, // 374 374
|
||||
0, 0, 0, 0, // 375 375
|
||||
0, 0, 0, 0, // 376 376
|
||||
0, 0, 0, 0, // 377 377
|
||||
0, 0, 0, 0, // 378 378
|
||||
0, 0, 0, 0, // 379 379
|
||||
0, 0, 0, 0, // 380 380
|
||||
0, 0, 0, 0, // 381 381
|
||||
0, 0, 0, 0, // 382 382
|
||||
0, 0, 0, 0, // 383 383
|
||||
0, 0, 0, 0, // 384 384
|
||||
0, 0, 0, 0, // 385 385
|
||||
0, 0, 0, 0, // 386 386
|
||||
0, 0, 0, 0, // 387 387
|
||||
0, 0, 0, 0, // 388 388
|
||||
0, 0, 0, 0, // 389 389
|
||||
0, 0, 0, 0, // 390 390
|
||||
0, 0, 0, 0, // 391 391
|
||||
0, 0, 0, 0, // 392 392
|
||||
0, 0, 0, 0, // 393 393
|
||||
0, 0, 0, 0, // 394 394
|
||||
0, 0, 0, 0, // 395 395
|
||||
0, 0, 0, 0, // 396 396
|
||||
0, 0, 0, 0, // 397 397
|
||||
0, 0, 0, 0, // 398 398
|
||||
0, 0, 0, 0, // 399 399
|
||||
0, 0, 0, 0, // 400 400
|
||||
0, 0, 0, 0, // 401 401
|
||||
0, 0, 0, 0, // 402 402
|
||||
0, 0, 0, 0, // 403 403
|
||||
0, 0, 0, 0, // 404 404
|
||||
0, 0, 0, 0, // 405 405
|
||||
0, 0, 0, 0, // 406 406
|
||||
0, 0, 0, 0, // 407 407
|
||||
0, 0, 0, 0, // 408 408
|
||||
0, 0, 0, 0, // 409 409
|
||||
0, 0, 0, 0, // 410 410
|
||||
0, 0, 0, 0, // 411 411
|
||||
0, 0, 0, 0, // 412 412
|
||||
0, 0, 0, 0, // 413 413
|
||||
0, 0, 0, 0, // 414 414
|
||||
0, 0, 0, 0, // 415 415
|
||||
0, 0, 0, 0, // 416 416
|
||||
0, 0, 0, 0, // 417 417
|
||||
0, 0, 0, 0, // 418 418
|
||||
0, 0, 0, 0, // 419 419
|
||||
0, 0, 0, 0, // 420 420
|
||||
0, 0, 0, 0, // 421 421
|
||||
0, 0, 0, 0, // 422 422
|
||||
0, 0, 0, 0, // 423 423
|
||||
0, 0, 0, 0, // 424 424
|
||||
0, 0, 0, 0, // 425 425
|
||||
0, 0, 0, 0, // 426 426
|
||||
0, 0, 0, 0, // 427 427
|
||||
0, 0, 0, 0, // 428 428
|
||||
0, 0, 0, 0, // 429 429
|
||||
0, 0, 0, 0, // 430 430
|
||||
0, 0, 0, 0, // 431 431
|
||||
0, 0, 0, 0, // 432 432
|
||||
0, 0, 0, 0, // 433 433
|
||||
0, 0, 0, 0, // 434 434
|
||||
0, 0, 0, 0, // 435 435
|
||||
0, 0, 0, 0, // 436 436
|
||||
0, 0, 0, 0, // 437 437
|
||||
0, 0, 0, 0, // 438 438
|
||||
0, 0, 0, 0, // 439 439
|
||||
0, 0, 0, 0, // 440 440
|
||||
0, 0, 0, 0, // 441 441
|
||||
0, 0, 0, 0, // 442 442
|
||||
0, 0, 0, 0, // 443 443
|
||||
0, 0, 0, 0, // 444 444
|
||||
0, 0, 0, 0, // 445 445
|
||||
0, 0, 0, 0, // 446 446
|
||||
0, 0, 0, 0, // 447 447
|
||||
0, 0, 0, 0, // 448 448
|
||||
0, 0, 0, 0, // 449 449
|
||||
0, 0, 0, 0, // 450 450
|
||||
0, 0, 0, 0, // 451 451
|
||||
0, 0, 0, 0, // 452 452
|
||||
0, 0, 0, 0, // 453 453
|
||||
0, 0, 0, 0, // 454 454
|
||||
0, 0, 0, 0, // 455 455
|
||||
0, 0, 0, 0, // 456 456
|
||||
0, 0, 0, 0, // 457 457
|
||||
0, 0, 0, 0, // 458 458
|
||||
0, 0, 0, 0, // 459 459
|
||||
0, 0, 0, 0, // 460 460
|
||||
0, 0, 0, 0, // 461 461
|
||||
0, 0, 0, 0, // 462 462
|
||||
0, 0, 0, 0, // 463 463
|
||||
0, 0, 0, 0, // 464 464
|
||||
0, 0, 0, 0, // 465 465
|
||||
0, 0, 0, 0, // 466 466
|
||||
0, 0, 0, 0, // 467 467
|
||||
0, 0, 0, 0, // 468 468
|
||||
0, 0, 0, 0, // 469 469
|
||||
0, 0, 0, 0, // 470 470
|
||||
0, 0, 0, 0, // 471 471
|
||||
0, 0, 0, 0, // 472 472
|
||||
0, 0, 0, 0, // 473 473
|
||||
0, 0, 0, 0, // 474 474
|
||||
0, 0, 0, 0, // 475 475
|
||||
0, 0, 0, 0, // 476 476
|
||||
0, 0, 0, 0, // 477 477
|
||||
0, 0, 0, 0, // 478 478
|
||||
0, 0, 0, 0, // 479 479
|
||||
0, 0, 0, 0, // 480 480
|
||||
0, 0, 0, 0, // 481 481
|
||||
0, 0, 0, 0, // 482 482
|
||||
0, 0, 0, 0, // 483 483
|
||||
0, 0, 0, 0, // 484 484
|
||||
0, 0, 0, 0, // 485 485
|
||||
0, 0, 0, 0, // 486 486
|
||||
0, 0, 0, 0, // 487 487
|
||||
0, 0, 0, 0, // 488 488
|
||||
0, 0, 0, 0, // 489 489
|
||||
0, 0, 0, 0, // 490 490
|
||||
0, 0, 0, 0, // 491 491
|
||||
0, 0, 0, 0, // 492 492
|
||||
0, 0, 0, 0, // 493 493
|
||||
0, 0, 0, 0, // 494 494
|
||||
0, 0, 0, 0, // 495 495
|
||||
0, 0, 0, 0, // 496 496
|
||||
0, 0, 0, 0, // 497 497
|
||||
0, 0, 0, 0, // 498 498
|
||||
0, 0, 0, 0, // 499 499
|
||||
0, 0, 0, 0, // 500 500
|
||||
0, 0, 0, 0, // 501 501
|
||||
0, 0, 0, 0, // 502 502
|
||||
0, 0, 0, 0, // 503 503
|
||||
0, 0, 0, 0, // 504 504
|
||||
0, 0, 0, 0, // 505 505
|
||||
0, 0, 0, 0, // 506 NDEBELE nr
|
||||
0, 0, 0, 0, // 507 X_BORK_BORK_BORK zzb
|
||||
1669, 0, 0, 0, // 508 X_PIG_LATIN zzp
|
||||
0, 0, 0, 0, // 509 X_HACKER zzh
|
||||
1517, 0, 0, 0, // 510 X_KLINGON tlh
|
||||
0, 0, 0, 0, // 511 X_ELMER_FUDD zze
|
||||
0, 0, 0, 0, // 512 X_Common xx-Zyyy
|
||||
0, 0, 0, 0, // 513 X_Latin xx-Latn
|
||||
0, 0, 0, 0, // 514 X_Greek xx-Grek
|
||||
0, 0, 0, 0, // 515 X_Cyrillic xx-Cyrl
|
||||
0, 0, 0, 0, // 516 X_Armenian xx-Armn
|
||||
0, 0, 0, 0, // 517 X_Hebrew xx-Hebr
|
||||
0, 0, 0, 0, // 518 X_Arabic xx-Arab
|
||||
0, 0, 0, 0, // 519 X_Syriac xx-Syrc
|
||||
0, 0, 0, 0, // 520 X_Thaana xx-Thaa
|
||||
0, 0, 0, 0, // 521 X_Devanagari xx-Deva
|
||||
0, 0, 0, 0, // 522 X_Bengali xx-Beng
|
||||
0, 0, 0, 0, // 523 X_Gurmukhi xx-Guru
|
||||
0, 0, 0, 0, // 524 X_Gujarati xx-Gujr
|
||||
0, 0, 0, 0, // 525 X_Oriya xx-Orya
|
||||
0, 0, 0, 0, // 526 X_Tamil xx-Taml
|
||||
0, 0, 0, 0, // 527 X_Telugu xx-Telu
|
||||
0, 0, 0, 0, // 528 X_Kannada xx-Knda
|
||||
0, 0, 0, 0, // 529 X_Malayalam xx-Mlym
|
||||
0, 0, 0, 0, // 530 X_Sinhala xx-Sinh
|
||||
0, 0, 0, 0, // 531 X_Thai xx-Thai
|
||||
0, 0, 0, 0, // 532 X_Lao xx-Laoo
|
||||
0, 0, 0, 0, // 533 X_Tibetan xx-Tibt
|
||||
0, 0, 0, 0, // 534 X_Myanmar xx-Mymr
|
||||
0, 0, 0, 0, // 535 X_Georgian xx-Geor
|
||||
0, 0, 0, 0, // 536 X_Hangul xx-Hang
|
||||
0, 0, 0, 0, // 537 X_Ethiopic xx-Ethi
|
||||
0, 0, 0, 0, // 538 X_Cherokee xx-Cher
|
||||
0, 0, 0, 0, // 539 X_Canadian_Aboriginal xx-Cans
|
||||
0, 0, 0, 0, // 540 X_Ogham xx-Ogam
|
||||
0, 0, 0, 0, // 541 X_Runic xx-Runr
|
||||
0, 0, 0, 0, // 542 X_Khmer xx-Khmr
|
||||
0, 0, 0, 0, // 543 X_Mongolian xx-Mong
|
||||
0, 0, 0, 0, // 544 X_Hiragana xx-Hira
|
||||
0, 0, 0, 0, // 545 X_Katakana xx-Kana
|
||||
0, 0, 0, 0, // 546 X_Bopomofo xx-Bopo
|
||||
0, 0, 0, 0, // 547 X_Han xx-Hani
|
||||
0, 0, 0, 0, // 548 X_Yi xx-Yiii
|
||||
0, 0, 0, 0, // 549 X_Old_Italic xx-Ital
|
||||
0, 0, 0, 0, // 550 X_Gothic xx-Goth
|
||||
0, 0, 0, 0, // 551 X_Deseret xx-Dsrt
|
||||
0, 0, 0, 0, // 552 X_Inherited xx-Qaai
|
||||
0, 0, 0, 0, // 553 X_Tagalog xx-Tglg
|
||||
0, 0, 0, 0, // 554 X_Hanunoo xx-Hano
|
||||
0, 0, 0, 0, // 555 X_Buhid xx-Buhd
|
||||
0, 0, 0, 0, // 556 X_Tagbanwa xx-Tagb
|
||||
0, 0, 0, 0, // 557 X_Limbu xx-Limb
|
||||
0, 0, 0, 0, // 558 X_Tai_Le xx-Tale
|
||||
0, 0, 0, 0, // 559 X_Linear_B xx-Linb
|
||||
0, 0, 0, 0, // 560 X_Ugaritic xx-Ugar
|
||||
0, 0, 0, 0, // 561 X_Shavian xx-Shaw
|
||||
0, 0, 0, 0, // 562 X_Osmanya xx-Osma
|
||||
0, 0, 0, 0, // 563 X_Cypriot xx-Cprt
|
||||
0, 0, 0, 0, // 564 X_Braille xx-Brai
|
||||
0, 0, 0, 0, // 565 X_Buginese xx-Bugi
|
||||
0, 0, 0, 0, // 566 X_Coptic xx-Copt
|
||||
0, 0, 0, 0, // 567 X_New_Tai_Lue xx-Talu
|
||||
0, 0, 0, 0, // 568 X_Glagolitic xx-Glag
|
||||
0, 0, 0, 0, // 569 X_Tifinagh xx-Tfng
|
||||
0, 0, 0, 0, // 570 X_Syloti_Nagri xx-Sylo
|
||||
0, 0, 0, 0, // 571 X_Old_Persian xx-Xpeo
|
||||
0, 0, 0, 0, // 572 X_Kharoshthi xx-Khar
|
||||
0, 0, 0, 0, // 573 X_Balinese xx-Bali
|
||||
0, 0, 0, 0, // 574 X_Cuneiform xx-Xsux
|
||||
0, 0, 0, 0, // 575 X_Phoenician xx-Phnx
|
||||
0, 0, 0, 0, // 576 X_Phags_Pa xx-Phag
|
||||
0, 0, 0, 0, // 577 X_Nko xx-Nkoo
|
||||
0, 0, 0, 0, // 578 X_Sundanese xx-Sund
|
||||
0, 0, 0, 0, // 579 X_Lepcha xx-Lepc
|
||||
0, 0, 0, 0, // 580 X_Ol_Chiki xx-Olck
|
||||
0, 0, 0, 0, // 581 X_Vai xx-Vaii
|
||||
0, 0, 0, 0, // 582 X_Saurashtra xx-Saur
|
||||
0, 0, 0, 0, // 583 X_Kayah_Li xx-Kali
|
||||
0, 0, 0, 0, // 584 X_Rejang xx-Rjng
|
||||
0, 0, 0, 0, // 585 X_Lycian xx-Lyci
|
||||
0, 0, 0, 0, // 586 X_Carian xx-Cari
|
||||
0, 0, 0, 0, // 587 X_Lydian xx-Lydi
|
||||
0, 0, 0, 0, // 588 X_Cham xx-Cham
|
||||
0, 0, 0, 0, // 589 X_Tai_Tham xx-Lana
|
||||
0, 0, 0, 0, // 590 X_Tai_Viet xx-Tavt
|
||||
0, 0, 0, 0, // 591 X_Avestan xx-Avst
|
||||
0, 0, 0, 0, // 592 X_Egyptian_Hieroglyphs xx-Egyp
|
||||
0, 0, 0, 0, // 593 X_Samaritan xx-Samr
|
||||
0, 0, 0, 0, // 594 X_Lisu xx-Lisu
|
||||
0, 0, 0, 0, // 595 X_Bamum xx-Bamu
|
||||
0, 0, 0, 0, // 596 X_Javanese xx-Java
|
||||
0, 0, 0, 0, // 597 X_Meetei_Mayek xx-Mtei
|
||||
0, 0, 0, 0, // 598 X_Imperial_Aramaic xx-Armi
|
||||
0, 0, 0, 0, // 599 X_Old_South_Arabian xx-Sarb
|
||||
0, 0, 0, 0, // 600 X_Inscriptional_Parthian xx-Prti
|
||||
0, 0, 0, 0, // 601 X_Inscriptional_Pahlavi xx-Phli
|
||||
0, 0, 0, 0, // 602 X_Old_Turkic xx-Orkh
|
||||
0, 0, 0, 0, // 603 X_Kaithi xx-Kthi
|
||||
0, 0, 0, 0, // 604 X_Batak xx-Batk
|
||||
0, 0, 0, 0, // 605 X_Brahmi xx-Brah
|
||||
0, 0, 0, 0, // 606 X_Mandaic xx-Mand
|
||||
0, 0, 0, 0, // 607 X_Chakma xx-Cakm
|
||||
0, 0, 0, 0, // 608 X_Meroitic_Cursive xx-Merc
|
||||
0, 0, 0, 0, // 609 X_Meroitic_Hieroglyphs xx-Mero
|
||||
0, 0, 0, 0, // 610 X_Miao xx-Plrd
|
||||
0, 0, 0, 0, // 611 X_Sharada xx-Shrd
|
||||
0, 0, 0, 0, // 612 X_Sora_Sompeng xx-Sora
|
||||
0, 0, 0, 0, // 613 X_Takri xx-Takr
|
||||
};
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
630
internal/cldutil.cc
Normal file
630
internal/cldutil.cc
Normal file
@@ -0,0 +1,630 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
#include "cldutil.h"
|
||||
#include <string>
|
||||
|
||||
#include "cld2tablesummary.h"
|
||||
#include "integral_types.h"
|
||||
#include "port.h"
|
||||
#include "utf8statetable.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Linker supplies the right tables
|
||||
// New tables from
|
||||
// cld2_generated_cjk_uni_prop_80.cc
|
||||
// cld2_generated_cjk_delta_bi_32.cc
|
||||
// cld2_generated_quad_1024.cc
|
||||
// cld2_generated_delta_octa_0.cc
|
||||
//
|
||||
// cld2_generated_score_cjk_uni_0.cc
|
||||
// cld2_generated_score_cjk_uni_bi_0.cc
|
||||
// cld2_generated_score_quad_1024.cc
|
||||
// cld2_generated_score_quad_octa_0.cc
|
||||
|
||||
// Caller supplies these.
|
||||
// extern const UTF8PropObj cld_generated_CjkUni_obj;
|
||||
// extern const CLDTableSummary kCjkDeltaBi_obj;
|
||||
// extern const CLDTableSummary kQuad_obj;
|
||||
// extern const CLDTableSummary kDeltaOcta_obj;
|
||||
|
||||
// Caller must supply these also
|
||||
// extern const short kAvgCjkUniScore[];
|
||||
// extern const short kAvgCjkUniBiScore[];
|
||||
// Declaration only: the array itself is defined in another translation unit
// and resolved at link time. The sibling score-table declarations around this
// line are commented out.
extern const short kAvgQuadScore[];
|
||||
// extern const short kAvgQuadOctaScore[];
|
||||
|
||||
|
||||
// Runtime routines for hashing, looking up, and scoring
|
||||
// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
|
||||
// Unigrams and bigrams are for CJK languages only, including simplified/
|
||||
// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
|
||||
// Zhuang Han characters. Surrounding spaces are not considered.
|
||||
// Quadgrams and octagrams are for non-CJK and include two bits indicating
|
||||
// preceding and trailing spaces (word boundaries).
|
||||
|
||||
|
||||
// Max number of bytes in a UTF-8 character. DoBigramScoreV3 stops scanning
// this many bytes before the end of its input.
static const int UTFmax = 4;

// A pair of characters is only looked up as a CJK bigram when its combined
// byte length is at least twice this value.
static const int kMinCJKUTF8CharBytes = 3;

// Lower and upper clamp applied by ReliabilityDelta to its
// "fully reliable" score-difference threshold.
static const int kMinGramCount = 3;
static const int kMaxGramCount = 16;
|
||||
|
||||
// 1 to skip ASCII space, vowels AEIOU aeiou and UTF-8 continuation bytes 80-BF
|
||||
static const uint8 kSkipSpaceVowelContinue[256] = {
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
|
||||
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
// 1 to skip ASCII space, and UTF-8 continuation bytes 80-BF
|
||||
static const uint8 kSkipSpaceContinue[256] = {
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
|
||||
// Always advances one UTF-8 character
|
||||
static const uint8 kAdvanceOneChar[256] = {
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
|
||||
3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,
|
||||
};
|
||||
|
||||
// Advances *only* on space (or illegal byte)
|
||||
static const uint8 kAdvanceOneCharSpace[256] = {
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
|
||||
// Routines to access a hash table of <key:wordhash, value:probs> pairs
|
||||
// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
|
||||
// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
|
||||
// bucket subscript.
|
||||
// Probs is packed: three languages plus a subscript for probability table
|
||||
// Buckets have all the keys together, then all the values. Key array never
|
||||
// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
|
||||
// Match case may sometimes take an additional cache miss on value access.
|
||||
//
|
||||
// Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
|
||||
// byte buckets with single cache miss.
|
||||
// Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Scoring single groups of letters //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// BIGRAM, QUADGRAM, OCTAGRAM score one => tote
|
||||
// Input: 4-byte entry of 3 language numbers and one probability subscript, plus
|
||||
// an accumulator tote. (language 0 means unused entry)
|
||||
// Output: running sums in tote updated
|
||||
void ProcessProbV2Tote(uint32 probs, Tote* tote) {
|
||||
uint8 prob123 = (probs >> 0) & 0xff;
|
||||
const uint8* prob123_entry = LgProb2TblEntry(prob123);
|
||||
|
||||
uint8 top1 = (probs >> 8) & 0xff;
|
||||
if (top1 > 0) {tote->Add(top1, LgProb3(prob123_entry, 0));}
|
||||
uint8 top2 = (probs >> 16) & 0xff;
|
||||
if (top2 > 0) {tote->Add(top2, LgProb3(prob123_entry, 1));}
|
||||
uint8 top3 = (probs >> 24) & 0xff;
|
||||
if (top3 > 0) {tote->Add(top3, LgProb3(prob123_entry, 2));}
|
||||
}
|
||||
|
||||
// Return score for a particular per-script language, or zero
|
||||
int GetLangScore(uint32 probs, uint8 pslang) {
|
||||
uint8 prob123 = (probs >> 0) & 0xff;
|
||||
const uint8* prob123_entry = LgProb2TblEntry(prob123);
|
||||
int retval = 0;
|
||||
uint8 top1 = (probs >> 8) & 0xff;
|
||||
if (top1 == pslang) {retval += LgProb3(prob123_entry, 0);}
|
||||
uint8 top2 = (probs >> 16) & 0xff;
|
||||
if (top2 == pslang) {retval += LgProb3(prob123_entry, 1);}
|
||||
uint8 top3 = (probs >> 24) & 0xff;
|
||||
if (top3 == pslang) {retval += LgProb3(prob123_entry, 2);}
|
||||
return retval;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Routines to accumulate probabilities //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
|
||||
// BIGRAM, using hash table, always advancing by 1 char
|
||||
// Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
|
||||
// Score all bigrams in isrc, using languages that have bigrams (CJK)
|
||||
// Return number of bigrams that hit in the hash table
|
||||
int DoBigramScoreV3(const CLD2TableSummary* bigram_obj,
|
||||
const char* isrc, int srclen, Tote* chunk_tote) {
|
||||
int hit_count = 0;
|
||||
const char* src = isrc;
|
||||
|
||||
// Hashtable-based CJK bigram lookup
|
||||
const uint8* usrc = reinterpret_cast<const uint8*>(src);
|
||||
const uint8* usrclimit1 = usrc + srclen - UTFmax;
|
||||
|
||||
while (usrc < usrclimit1) {
|
||||
int len = kAdvanceOneChar[usrc[0]];
|
||||
int len2 = kAdvanceOneChar[usrc[len]] + len;
|
||||
|
||||
if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible
|
||||
// Lookup and score this bigram
|
||||
// Always ignore pre/post spaces
|
||||
uint32 bihash = BiHashV2(reinterpret_cast<const char*>(usrc), len2);
|
||||
uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
|
||||
// Now go indirect on the subscript
|
||||
probs = bigram_obj->kCLDTableInd[probs &
|
||||
~bigram_obj->kCLDTableKeyMask];
|
||||
|
||||
// Process the bigram
|
||||
if (probs != 0) {
|
||||
ProcessProbV2Tote(probs, chunk_tote);
|
||||
++hit_count;
|
||||
}
|
||||
}
|
||||
usrc += len; // Advance by one char
|
||||
}
|
||||
|
||||
return hit_count;
|
||||
}
|
||||
|
||||
|
||||
// Score up to 64KB of a single script span in one pass
|
||||
// Make a dummy entry off the end to calc length of last span
|
||||
// Return offset of first unused input byte
|
||||
int GetUniHits(const char* text,
|
||||
int letter_offset, int letter_limit,
|
||||
ScoringContext* scoringcontext,
|
||||
ScoringHitBuffer* hitbuffer) {
|
||||
const char* isrc = &text[letter_offset];
|
||||
const char* src = isrc;
|
||||
// Limit is end, which has extra 20 20 20 00 past len
|
||||
const char* srclimit = &text[letter_limit];
|
||||
|
||||
// Local copies
|
||||
const UTF8PropObj* unigram_obj =
|
||||
scoringcontext->scoringtables->unigram_obj;
|
||||
int next_base = hitbuffer->next_base;
|
||||
int next_base_limit = hitbuffer->maxscoringhits;
|
||||
|
||||
// Visit all unigrams
|
||||
if (src[0] == ' ') {++src;} // skip any initial space
|
||||
while (src < srclimit) {
|
||||
const uint8* usrc = reinterpret_cast<const uint8*>(src);
|
||||
int len = kAdvanceOneChar[usrc[0]];
|
||||
src += len;
|
||||
// Look up property of one UTF-8 character and advance over it.
|
||||
// Updates usrc and len (bad interface design), hence increment above
|
||||
int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &len);
|
||||
if (propval > 0) {
|
||||
// Save indirect subscript for later scoring; 1 or 2 langprobs
|
||||
int indirect_subscr = propval;
|
||||
hitbuffer->base[next_base].offset = src - text; // Offset in text
|
||||
hitbuffer->base[next_base].indirect = indirect_subscr;
|
||||
++next_base;
|
||||
}
|
||||
|
||||
if (next_base >= next_base_limit) {break;}
|
||||
}
|
||||
|
||||
hitbuffer->next_base = next_base;
|
||||
|
||||
// Make a dummy entry off the end to calc length of last span
|
||||
int dummy_offset = src - text;
|
||||
hitbuffer->base[hitbuffer->next_base].offset = dummy_offset;
|
||||
hitbuffer->base[hitbuffer->next_base].indirect = 0;
|
||||
|
||||
return src - text;
|
||||
}
|
||||
|
||||
// Score up to 64KB of a single script span, doing both delta-bi and
|
||||
// distinct bis in one pass
|
||||
void GetBiHits(const char* text,
|
||||
int letter_offset, int letter_limit,
|
||||
ScoringContext* scoringcontext,
|
||||
ScoringHitBuffer* hitbuffer) {
|
||||
const char* isrc = &text[letter_offset];
|
||||
const char* src = isrc;
|
||||
// Limit is end
|
||||
const char* srclimit1 = &text[letter_limit];
|
||||
|
||||
// Local copies
|
||||
const CLD2TableSummary* deltabi_obj =
|
||||
scoringcontext->scoringtables->deltabi_obj;
|
||||
const CLD2TableSummary* distinctbi_obj =
|
||||
scoringcontext->scoringtables->distinctbi_obj;
|
||||
int next_delta = hitbuffer->next_delta;
|
||||
int next_delta_limit = hitbuffer->maxscoringhits;
|
||||
int next_distinct = hitbuffer->next_distinct;
|
||||
// We can do 2 inserts per loop, so -1
|
||||
int next_distinct_limit = hitbuffer->maxscoringhits - 1;
|
||||
|
||||
while (src < srclimit1) {
|
||||
const uint8* usrc = reinterpret_cast<const uint8*>(src);
|
||||
int len = kAdvanceOneChar[usrc[0]];
|
||||
int len2 = kAdvanceOneChar[usrc[len]] + len;
|
||||
|
||||
if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible
|
||||
// Lookup and this bigram and save <offset, indirect>
|
||||
uint32 bihash = BiHashV2(src, len2);
|
||||
uint32 probs = QuadHashV3Lookup4(deltabi_obj, bihash);
|
||||
// Now go indirect on the subscript
|
||||
if (probs != 0) {
|
||||
// Save indirect subscript for later scoring; 1 langprob
|
||||
int indirect_subscr = probs & ~deltabi_obj->kCLDTableKeyMask;
|
||||
hitbuffer->delta[next_delta].offset = src - text;
|
||||
hitbuffer->delta[next_delta].indirect = indirect_subscr;
|
||||
++next_delta;
|
||||
}
|
||||
// Lookup this distinct bigram and save <offset, indirect>
|
||||
probs = QuadHashV3Lookup4(distinctbi_obj, bihash);
|
||||
if (probs != 0) {
|
||||
int indirect_subscr = probs & ~distinctbi_obj->kCLDTableKeyMask;
|
||||
hitbuffer->distinct[next_distinct].offset = src - text;
|
||||
hitbuffer->distinct[next_distinct].indirect = indirect_subscr;
|
||||
++next_distinct;
|
||||
}
|
||||
}
|
||||
src += len; // Advance by one char (not two)
|
||||
|
||||
// Almost always srclimit hit first
|
||||
if (next_delta >= next_delta_limit) {break;}
|
||||
if (next_distinct >= next_distinct_limit) {break;}
|
||||
}
|
||||
|
||||
hitbuffer->next_delta = next_delta;
|
||||
hitbuffer->next_distinct = next_distinct;
|
||||
|
||||
// Make a dummy entry off the end to calc length of last span
|
||||
int dummy_offset = src - text;
|
||||
hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset;
|
||||
hitbuffer->delta[hitbuffer->next_delta].indirect = 0;
|
||||
hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset;
|
||||
hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0;
|
||||
}
|
||||
|
||||
// Score up to 64KB of a single script span in one pass
|
||||
// Make a dummy entry off the end to calc length of last span
|
||||
// Return offset of first unused input byte
|
||||
int GetQuadHits(const char* text,
|
||||
int letter_offset, int letter_limit,
|
||||
ScoringContext* scoringcontext,
|
||||
ScoringHitBuffer* hitbuffer) {
|
||||
const char* isrc = &text[letter_offset];
|
||||
const char* src = isrc;
|
||||
// Limit is end, which has extra 20 20 20 00 past len
|
||||
const char* srclimit = &text[letter_limit];
|
||||
|
||||
// Local copies
|
||||
const CLD2TableSummary* quadgram_obj =
|
||||
scoringcontext->scoringtables->quadgram_obj;
|
||||
int next_base = hitbuffer->next_base;
|
||||
int next_base_limit = hitbuffer->maxscoringhits;
|
||||
|
||||
// Run a little cache of last quad hits to catch overly-repetitive "text"
|
||||
// We don't care if we miss a couple repetitions at scriptspan boundaries
|
||||
int next_prior_quadhash = 0;
|
||||
uint32 prior_quadhash[2] = {0, 0};
|
||||
|
||||
// Visit all quadgrams
|
||||
if (src[0] == ' ') {++src;} // skip any initial space
|
||||
while (src < srclimit) {
|
||||
// Find one quadgram
|
||||
const char* src_end = src;
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
const char* src_mid = src_end;
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
int len = src_end - src;
|
||||
// Hash the quadgram
|
||||
uint32 quadhash = QuadHashV2(src, len);
|
||||
|
||||
// Filter out recent repeats
|
||||
if ((quadhash != prior_quadhash[0]) && (quadhash != prior_quadhash[1])) {
|
||||
// Look up this quadgram and save <offset, indirect>
|
||||
uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
|
||||
if (probs != 0) {
|
||||
// Round-robin two entries of actual hits
|
||||
prior_quadhash[next_prior_quadhash] = quadhash;
|
||||
next_prior_quadhash = (next_prior_quadhash + 1) & 1;
|
||||
|
||||
// Save indirect subscript for later scoring; 1 or 2 langprobs
|
||||
int indirect_subscr = probs & ~quadgram_obj->kCLDTableKeyMask;
|
||||
hitbuffer->base[next_base].offset = src - text; // Offset in text
|
||||
hitbuffer->base[next_base].indirect = indirect_subscr;
|
||||
++next_base;
|
||||
}
|
||||
}
|
||||
|
||||
// Advance: all the way past word if at end-of-word, else 2 chars
|
||||
if (src_end[0] == ' ') {
|
||||
src = src_end;
|
||||
} else {
|
||||
src = src_mid;
|
||||
}
|
||||
|
||||
// Skip over space at end of word, or ASCII vowel in middle of word
|
||||
// Use kAdvanceOneCharSpace instead to get rid of vowel hack
|
||||
if (src < srclimit) {
|
||||
src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
|
||||
} else {
|
||||
// Advancing by 4/8/16 can overshoot, but we are about to exit anyway
|
||||
src = srclimit;
|
||||
}
|
||||
|
||||
if (next_base >= next_base_limit) {break;}
|
||||
}
|
||||
|
||||
hitbuffer->next_base = next_base;
|
||||
|
||||
// Make a dummy entry off the end to calc length of last span
|
||||
int dummy_offset = src - text;
|
||||
hitbuffer->base[hitbuffer->next_base].offset = dummy_offset;
|
||||
hitbuffer->base[hitbuffer->next_base].indirect = 0;
|
||||
|
||||
return src - text;
|
||||
}
|
||||
|
||||
// inputs:
|
||||
// const tables
|
||||
// const char* isrc, int srclen (in sscriptbuffer)
|
||||
// intermediates:
|
||||
// vector of octa <offset, probs> (which need quadgram_obj indirect table to decode)
|
||||
// vector of distinct <offset, probs> (which need quadgram_obj indirect table to decode)
|
||||
|
||||
// Score up to 64KB of a single script span, doing both delta-octa and
|
||||
// distinct words in one pass
|
||||
void GetOctaHits(const char* text,
|
||||
int letter_offset, int letter_limit,
|
||||
ScoringContext* scoringcontext,
|
||||
ScoringHitBuffer* hitbuffer) {
|
||||
const char* isrc = &text[letter_offset];
|
||||
const char* src = isrc;
|
||||
// Limit is end+1, to include extra space char (0x20) off the end
|
||||
const char* srclimit = &text[letter_limit + 1];
|
||||
|
||||
// Local copies
|
||||
const CLD2TableSummary* deltaocta_obj =
|
||||
scoringcontext->scoringtables->deltaocta_obj;
|
||||
int next_delta = hitbuffer->next_delta;
|
||||
int next_delta_limit = hitbuffer->maxscoringhits;
|
||||
|
||||
const CLD2TableSummary* distinctocta_obj =
|
||||
scoringcontext->scoringtables->distinctocta_obj;
|
||||
int next_distinct = hitbuffer->next_distinct;
|
||||
// We can do 2 inserts per loop, so -1
|
||||
int next_distinct_limit = hitbuffer->maxscoringhits - 1;
|
||||
|
||||
// Run a little cache of last octa hits to catch overly-repetitive "text"
|
||||
// We don't care if we miss a couple repetitions at scriptspan boundaries
|
||||
int next_prior_octahash = 0;
|
||||
uint64 prior_octahash[2] = {0, 0};
|
||||
|
||||
// Score all words truncated to 8 characters
|
||||
int charcount = 0;
|
||||
// Skip any initial space
|
||||
if (src[0] == ' ') {++src;}
|
||||
|
||||
// Begin the first word
|
||||
const char* prior_word_start = src;
|
||||
const char* word_start = src;
|
||||
const char* word_end = word_start;
|
||||
while (src < srclimit) {
|
||||
// Terminate previous word or continue current word
|
||||
if (src[0] == ' ') {
|
||||
int len = word_end - word_start;
|
||||
// Hash the word
|
||||
uint64 wordhash40 = OctaHash40(word_start, len);
|
||||
uint32 probs;
|
||||
|
||||
// Filter out recent repeats. Unlike quads, we update even if no hit,
|
||||
// so we can get hits on same word if separated by non-hit words
|
||||
if ((wordhash40 != prior_octahash[0]) &&
|
||||
(wordhash40 != prior_octahash[1])) {
|
||||
// Round-robin two entries of words
|
||||
prior_octahash[next_prior_octahash] = wordhash40;
|
||||
next_prior_octahash = 1 - next_prior_octahash; // Alternates 0,1,0,1
|
||||
|
||||
// (1) Lookup distinct word PAIR. For a pair, we want an asymmetrical
|
||||
// function of the two word hashs. For words A B C, B-A and C-B are good
|
||||
// enough and fast. We use the same table as distinct single words
|
||||
// Do not look up a pair of identical words -- all pairs hash to zero
|
||||
// Both 1- and 2-word distinct lookups are in distinctocta_obj now
|
||||
// Do this first, because it has the lowest offset
|
||||
uint64 tmp_prior_hash = prior_octahash[next_prior_octahash];
|
||||
if ((tmp_prior_hash != 0) && (tmp_prior_hash != wordhash40)) {
|
||||
uint64 pair_hash = PairHash(tmp_prior_hash, wordhash40);
|
||||
probs = OctaHashV3Lookup4(distinctocta_obj, pair_hash);
|
||||
if (probs != 0) {
|
||||
int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask;
|
||||
hitbuffer->distinct[next_distinct].offset = prior_word_start - text;
|
||||
hitbuffer->distinct[next_distinct].indirect = indirect_subscr;
|
||||
++next_distinct;
|
||||
}
|
||||
}
|
||||
|
||||
// (2) Lookup this distinct word and save <offset, indirect>
|
||||
probs = OctaHashV3Lookup4(distinctocta_obj, wordhash40);
|
||||
if (probs != 0) {
|
||||
int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask;
|
||||
hitbuffer->distinct[next_distinct].offset = word_start - text;
|
||||
hitbuffer->distinct[next_distinct].indirect = indirect_subscr;
|
||||
++next_distinct;
|
||||
}
|
||||
|
||||
// (3) Lookup this word and save <offset, indirect>
|
||||
probs = OctaHashV3Lookup4(deltaocta_obj, wordhash40);
|
||||
if (probs != 0) {
|
||||
// Save indirect subscript for later scoring; 1 langprob
|
||||
int indirect_subscr = probs & ~deltaocta_obj->kCLDTableKeyMask;
|
||||
hitbuffer->delta[next_delta].offset = word_start - text;
|
||||
hitbuffer->delta[next_delta].indirect = indirect_subscr;
|
||||
++next_delta;
|
||||
}
|
||||
}
|
||||
|
||||
// Begin the next word
|
||||
charcount = 0;
|
||||
prior_word_start = word_start;
|
||||
word_start = src + 1; // Over the space
|
||||
word_end = word_start;
|
||||
} else {
|
||||
++charcount;
|
||||
}
|
||||
|
||||
// Advance to next char
|
||||
src += UTF8OneCharLen(src);
|
||||
if (charcount <= 8) {
|
||||
word_end = src;
|
||||
}
|
||||
// Almost always srclimit hit first
|
||||
if (next_delta >= next_delta_limit) {break;}
|
||||
if (next_distinct >= next_distinct_limit) {break;}
|
||||
}
|
||||
|
||||
hitbuffer->next_delta = next_delta;
|
||||
hitbuffer->next_distinct = next_distinct;
|
||||
|
||||
// Make a dummy entry off the end to calc length of last span
|
||||
int dummy_offset = src - text;
|
||||
hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset;
|
||||
hitbuffer->delta[hitbuffer->next_delta].indirect = 0;
|
||||
hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset;
|
||||
hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0;
|
||||
}
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Reliability calculations, for single language and between languages //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// Return reliablity of result 0..100 for top two scores
|
||||
// delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable
|
||||
// (on a scale where +1 is a factor of 2 ** 1.6 = 3.02)
|
||||
// Threshold is uni/quadgram increment count, bounded above and below.
|
||||
//
|
||||
// Requiring a factor of 3 improvement (e.g. +1 log base 3)
|
||||
// for each scored quadgram is too stringent, so I've backed this off to a
|
||||
// factor of 2 (e.g. +5/8 log base 3).
|
||||
//
|
||||
// I also somewhat lowered the Min/MaxGramCount limits above
|
||||
//
|
||||
// Added: if fewer than 8 quads/unis, max reliability is 12*n percent
|
||||
//
|
||||
int ReliabilityDelta(int value1, int value2, int gramcount) {
|
||||
int max_reliability_percent = 100;
|
||||
if (gramcount < 8) {
|
||||
max_reliability_percent = 12 * gramcount;
|
||||
}
|
||||
int fully_reliable_thresh = (gramcount * 5) >> 3; // see note above
|
||||
if (fully_reliable_thresh < kMinGramCount) { // Fully = 3..16
|
||||
fully_reliable_thresh = kMinGramCount;
|
||||
} else if (fully_reliable_thresh > kMaxGramCount) {
|
||||
fully_reliable_thresh = kMaxGramCount;
|
||||
}
|
||||
|
||||
int delta = value1 - value2;
|
||||
if (delta >= fully_reliable_thresh) {return max_reliability_percent;}
|
||||
if (delta <= 0) {return 0;}
|
||||
return minint(max_reliability_percent,
|
||||
(100 * delta) / fully_reliable_thresh);
|
||||
}
|
||||
|
||||
// Return reliability of result 0..100 for top score vs. expected mainstream score
|
||||
// Values are score per 1024 bytes of input
|
||||
// ratio = max(top/mainstream, mainstream/top)
|
||||
// ratio > 4.0 is 0% reliable, <= 1.5 is 100% reliable
|
||||
// Change: short-text word scoring can give unusually good results.
|
||||
// Let top exceed mainstream by 4x at 50% reliable
|
||||
//
|
||||
// dsites April 2010: These could be tightened up. It would be
|
||||
// reasonable with newer data and round-robin table allocation to start ramping
|
||||
// down at mean * 1.5 and mean/1.5, while letting mean*2 and mean/2 pass,
|
||||
// but just barely.
|
||||
//
|
||||
// dsites March 2013: Tightened up a bit.
|
||||
static const double kRatio100 = 1.5;    // Ratios up to this score 100%
static const double kRatio0 = 4.0;      // Ratios above this score 0%

// Compare an actual score against the expected score (both per 1024 bytes)
// and return a reliability percentage 0..100.
//   expected == 0       => 100 (no reliability data available yet)
//   actual == 0         => 0   (zero score = unreliable)
//   ratio <= kRatio100  => 100
//   ratio >  kRatio0    => 0
//   otherwise linear decline between the two (e.g. ratio 2.0 => 80)
// where ratio is the larger score divided by the smaller one.
int ReliabilityExpected(int actual_score_1kb, int expected_score_1kb) {
  if (expected_score_1kb == 0) {
    return 100;
  }
  if (actual_score_1kb == 0) {
    return 0;
  }

  // Put the larger score on top so over- and under-shoot look the same
  const bool expected_is_larger = (expected_score_1kb > actual_score_1kb);
  const double numer =
      1.0 * (expected_is_larger ? expected_score_1kb : actual_score_1kb);
  const int denom = expected_is_larger ? actual_score_1kb : expected_score_1kb;
  const double ratio = numer / denom;

  if (ratio <= kRatio100) {
    return 100;
  }
  if (ratio > kRatio0) {
    return 0;
  }

  // Truncates toward zero, as the implicit double => int conversion did
  return static_cast<int>(100.0 * (kRatio0 - ratio) / (kRatio0 - kRatio100));
}
|
||||
|
||||
// Create a langprob packed value from its parts.
|
||||
// qprob is quantized [0..12]
|
||||
// We use Latn script to represent any RTypeMany language
|
||||
uint32 MakeLangProb(Language lang, int qprob) {
|
||||
uint32 pslang = PerScriptNumber(ULScript_Latin, lang);
|
||||
uint32 retval = (pslang << 8) | kLgProbV2TblBackmap[qprob];
|
||||
return retval;
|
||||
}
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
80
internal/cldutil.h
Normal file
80
internal/cldutil.h
Normal file
@@ -0,0 +1,80 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
// Stuff used only by online detector, not used offline
|
||||
//
|
||||
|
||||
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_H__
|
||||
#define I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_H__
|
||||
|
||||
#include "cldutil_shared.h"
|
||||
#include "scoreonescriptspan.h"
|
||||
#include "tote.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Score up to 64KB of a single script span in one pass
|
||||
// Make a dummy entry off the end to calc length of last span
|
||||
// Return offset of first unused input byte
|
||||
int GetUniHits(const char* text,
|
||||
int letter_offset, int letter_limit,
|
||||
ScoringContext* scoringcontext,
|
||||
ScoringHitBuffer* hitbuffer);
|
||||
|
||||
// Score up to 64KB of a single script span, doing both delta-bi and
|
||||
// distinct bis in one pass
|
||||
void GetBiHits(const char* text,
|
||||
int letter_offset, int letter_limit,
|
||||
ScoringContext* scoringcontext,
|
||||
ScoringHitBuffer* hitbuffer);
|
||||
|
||||
// Score up to 64KB of a single script span in one pass
|
||||
// Make a dummy entry off the end to calc length of last span
|
||||
// Return offset of first unused input byte
|
||||
int GetQuadHits(const char* text,
|
||||
int letter_offset, int letter_limit,
|
||||
ScoringContext* scoringcontext,
|
||||
ScoringHitBuffer* hitbuffer);
|
||||
|
||||
// Score up to 64KB of a single script span, doing both delta-octa and
|
||||
// distinct words in one pass
|
||||
void GetOctaHits(const char* text,
|
||||
int letter_offset, int letter_limit,
|
||||
ScoringContext* scoringcontext,
|
||||
ScoringHitBuffer* hitbuffer);
|
||||
|
||||
// Not sure if these belong here or in scoreonescriptspan.cc
|
||||
int ReliabilityDelta(int value1, int value2, int gramcount);
|
||||
int ReliabilityExpected(int actual_score_1kb, int expected_score_1kb);
|
||||
|
||||
// Create a langprob packed value from its parts.
|
||||
uint32 MakeLangProb(Language lang, int qprob);
|
||||
|
||||
|
||||
void ProcessProbV2Tote(uint32 probs, Tote* tote);
|
||||
|
||||
// Return score for a particular per-script language, or zero
|
||||
int GetLangScore(uint32 probs, uint8 pslang);
|
||||
|
||||
// Smaller of two ints
static inline int minint(int a, int b) {return (b < a) ? b : a;}

// Larger of two ints
static inline int maxint(int a, int b) {return (b > a) ? b : a;}
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_H__
|
||||
|
||||
|
||||
217
internal/cldutil_offline.cc
Normal file
217
internal/cldutil_offline.cc
Normal file
@@ -0,0 +1,217 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
#include "cldutil_offline.h"
|
||||
#include "tote.h"
|
||||
#include <string>
|
||||
|
||||
static const int kMinCJKUTF8CharBytes = 3;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Offline: used by mapreduce or table construction
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// BIGRAM, QUADGRAM, OCTAGRAM score one => tote
|
||||
// Input: 4-byte entry of 3 language numbers and one probability subscript, plus
|
||||
// an accumulator tote. (language 0 means unused entry)
|
||||
// Output: running sums in tote updated
|
||||
void ProcessProbV2Tote(uint32 probs, Tote* tote) {
|
||||
uint8 prob123 = (probs >> 0) & 0xff;
|
||||
const uint8* prob123_entry = LgProb2TblEntry(prob123);
|
||||
|
||||
uint8 top1 = (probs >> 8) & 0xff;
|
||||
if (top1 > 0) {tote->Add(top1, LgProb3(prob123_entry, 0));}
|
||||
uint8 top2 = (probs >> 16) & 0xff;
|
||||
if (top2 > 0) {tote->Add(top2, LgProb3(prob123_entry, 1));}
|
||||
uint8 top3 = (probs >> 24) & 0xff;
|
||||
if (top3 > 0) {tote->Add(top3, LgProb3(prob123_entry, 2));}
|
||||
}
|
||||
|
||||
// Advances src, decrements len
|
||||
uint32 GetNextLangprob(ULScriptRType rtype,
|
||||
const CLD2TableSummary* wrt_unigram_obj,
|
||||
const CLD2TableSummary* wrt_quadgram_obj,
|
||||
const char** isrc, int* isrclen) {
|
||||
// fprintf(stderr, "GetNextLangprob '%s' %d<br>\n", *isrc, *isrclen);
|
||||
if (*isrclen <= 0) {return 0;}
|
||||
|
||||
// Find one quadgram
|
||||
const char* src = *isrc;
|
||||
const char* srclimit = src + *isrclen;
|
||||
if (*src == ' ') {++src;}
|
||||
const char* src_end = src;
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
const char* src_mid = src_end;
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
int len = src_end - src;
|
||||
// Hash the quadgram
|
||||
uint32 quadhash = QuadHashV2(src, len);
|
||||
uint32 probs = QuadHashV3Lookup4(wrt_quadgram_obj, quadhash);
|
||||
int indirect_subscr = probs & ~wrt_quadgram_obj->kCLDTableKeyMask;
|
||||
uint32 langprob;
|
||||
if (indirect_subscr < wrt_quadgram_obj->kCLDTableSizeOne) {
|
||||
// Up to three languages at indirect
|
||||
langprob = wrt_quadgram_obj->kCLDTableInd[indirect_subscr];
|
||||
} else {
|
||||
// Up to six languages at start + 2 * (indirect - start)
|
||||
indirect_subscr += (indirect_subscr - wrt_quadgram_obj->kCLDTableSizeOne);
|
||||
langprob = wrt_quadgram_obj->kCLDTableInd[indirect_subscr];
|
||||
}
|
||||
// Advance: all the way past word if at end-of-word, else 2 chars
|
||||
if (src_end[0] == ' ') {
|
||||
src = src_end;
|
||||
} else {
|
||||
src = src_mid;
|
||||
}
|
||||
if (src < srclimit) {
|
||||
src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
|
||||
} else {
|
||||
// Advancing by 4/8/16 can overshoot, but we are about to exit anyway
|
||||
src = srclimit;
|
||||
}
|
||||
int quadadvance = src - *isrc;
|
||||
*isrc = src;
|
||||
*isrclen -= quadadvance;
|
||||
return langprob;
|
||||
}
|
||||
|
||||
|
||||
// Find top two langs and scores for one word; underpins delta tables
|
||||
void DoWordScore(const char* isrc, int srclen, ULScript ulscript,
|
||||
const CLD2TableSummary* wrt_unigram_obj,
|
||||
const CLD2TableSummary* wrt_quadgram_obj,
|
||||
Language* lang1, int* score1,
|
||||
Language* lang2, int* score2) {
|
||||
ULScriptRType rtype = ULScriptRecognitionType(ulscript);
|
||||
|
||||
Tote word_tote;
|
||||
const char* src = isrc;
|
||||
int len = srclen;
|
||||
uint32 langprob;
|
||||
|
||||
// Advances src, decrements len
|
||||
langprob = GetNextLangprob(rtype, wrt_unigram_obj, wrt_quadgram_obj,
|
||||
&src, &len);
|
||||
ProcessProbV2Tote(langprob, &word_tote);
|
||||
|
||||
// Advances src, decrements len
|
||||
langprob = GetNextLangprob(rtype, wrt_unigram_obj, wrt_quadgram_obj,
|
||||
&src, &len);
|
||||
ProcessProbV2Tote(langprob, &word_tote);
|
||||
|
||||
int key3[3];
|
||||
word_tote.CurrentTopThreeKeys(key3);
|
||||
*lang1 = FromPerScriptNumber(ulscript, key3[0]);
|
||||
*lang2 = FromPerScriptNumber(ulscript, key3[1]);
|
||||
*score1 = word_tote.GetScore(key3[0]);
|
||||
*score2 = word_tote.GetScore(key3[1]);
|
||||
}
|
||||
|
||||
// Routines to store 3 or 5 log probabilities in a single byte.
|
||||
// Resolution/range = 2**1 to 2**12
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
// For constructing tables
|
||||
// Given a vector of 3 probabilities 1..12, find subscript of best table match.
|
||||
// Minimizes RMS error
|
||||
// Brute-force version
|
||||
uint8 FindBestProb3Match(const uint8* prob3) {
|
||||
int minsubscr = 0;
|
||||
int minrmserr = 9999;
|
||||
for (int i = 0; i < kLgProbV2TblSize; ++i) {
|
||||
int rmserr = 0;
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
// If target prob is zero, item is unused, so no errterm
|
||||
if (prob3[j] > 0) {
|
||||
int errterm = prob3[j] - LgProb3(LgProb2TblEntry(i), j);
|
||||
rmserr += (errterm * errterm);
|
||||
}
|
||||
}
|
||||
if (minrmserr > rmserr) {
|
||||
minrmserr = rmserr;
|
||||
minsubscr = i;
|
||||
}
|
||||
}
|
||||
return static_cast<uint8>(minsubscr);
|
||||
};
|
||||
|
||||
// Not sure who calls this...
|
||||
// Return the probability for given language, or 0
|
||||
int GetProb(Language lang, uint32 probs) {
|
||||
int prob123 = (probs >> 0) & 0xff;
|
||||
const uint8* prob123_entry = LgProb2TblEntry(prob123);
|
||||
|
||||
int ilang = PerScriptNumber(ULScript_Latin, lang);
|
||||
int top1 = (probs >> 8) & 0xff;
|
||||
if (ilang == top1) {return LgProb3(prob123_entry, 0);}
|
||||
int top2 = (probs >> 16) & 0xff;
|
||||
if (ilang == top2) {return LgProb3(prob123_entry, 1);}
|
||||
int top3 = (probs >> 16) & 0xff;
|
||||
if (ilang == top3) {return LgProb3(prob123_entry, 2);}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
// Converts a unigram prob/lang byte into an approximate prob/lang triple
|
||||
// Just keeps the largest value.
|
||||
// Now unused.
|
||||
uint32 ApproxProb3(int propval) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
// Take three packed languages and three probabilities 1..12 and put into uint32
|
||||
// For offline construction of tables
|
||||
uint32 ProbPackV2(uint8* plang3, uint8* prob3) {
|
||||
uint32 retval;
|
||||
// If < 3 entries, pack as top, 0, second, else pack as top, second, third
|
||||
// This allows FindBestProb3Match to always find a perfect match for < 3
|
||||
if (plang3[2] == 0) {
|
||||
// Swap [2] and [3]
|
||||
uint8 temp = plang3[2]; plang3[2] = plang3[1]; plang3[1] = temp;
|
||||
temp = prob3[2]; prob3[2] = prob3[1]; prob3[1] = temp;
|
||||
}
|
||||
retval = (plang3[2] << 24) |
|
||||
(plang3[1] << 16) |
|
||||
(plang3[0] << 8) |
|
||||
(FindBestProb3Match(prob3));
|
||||
return retval;
|
||||
}
|
||||
|
||||
// Take uint32 and unpack into three packed languages and three probabilities
|
||||
// For runtime use of tables
|
||||
void ProbUnpackV2(uint32 prob, uint8* plang3, uint8* prob3) {
|
||||
plang3[0] = (prob >> 8) & 0xff;
|
||||
plang3[1] = (prob >> 16) & 0xff;
|
||||
plang3[2] = (prob >> 24) & 0xff;
|
||||
|
||||
int prob123 = (prob >> 0) & 0xff;
|
||||
const uint8* prob123_entry = LgProb2TblEntry(prob123);
|
||||
prob3[0] = LgProb3(prob123_entry, 0);
|
||||
prob3[1] = LgProb3(prob123_entry, 1);
|
||||
prob3[2] = LgProb3(prob123_entry, 2);
|
||||
}
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
|
||||
|
||||
71
internal/cldutil_offline.h
Normal file
71
internal/cldutil_offline.h
Normal file
@@ -0,0 +1,71 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
// Just the stuff shared between offline table builder and online detector
|
||||
//
|
||||
|
||||
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_OFFLINE_H__
|
||||
#define I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_OFFLINE_H__
|
||||
|
||||
#include "cldutil_shared.h"
|
||||
#include "integral_types.h"
|
||||
#include "lang_script.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Offline: used by mapreduce or table construction
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
// Find top two langs and scores for one word; underpins delta tables
|
||||
void DoWordScore(const char* isrc, int srclen, ULScript ulscript,
|
||||
const CLD2TableSummary* wrt_unigram_obj,
|
||||
const CLD2TableSummary* wrt_quadgram_obj,
|
||||
Language* lang1, int* score1,
|
||||
Language* lang2, int* score2);
|
||||
|
||||
// For constructing tables
|
||||
// Given a vector of 3 probabilities 1..12, find subscript of best table match.
|
||||
// Minimizes RMS error
|
||||
// Brute-force version
|
||||
uint8 FindBestProb3Match(const uint8* prob3);
|
||||
|
||||
// Not sure who calls this...
|
||||
// Return the probability for given language, or 0
|
||||
int GetProb(Language lang, uint32 probs);
|
||||
|
||||
|
||||
// Converts a unigram prob/lang byte into an approximate prob/lang triple
|
||||
// Just keeps the largest value.
|
||||
// ONLY used in mapreduce, doing refinement of language boundaries.
|
||||
uint32 ApproxProb3(int propval);
|
||||
|
||||
|
||||
// Take three packed languages and three probabilities 1..12 and put into uint32
|
||||
// For offline construction of tables
|
||||
uint32 ProbPackV2(uint8* plang3, uint8* prob3);
|
||||
|
||||
// Take uint32 and unpack into three packed languages and three probabilities
|
||||
// For runtime use of tables
|
||||
void ProbUnpackV2(uint32 prob, uint8* plang3, uint8* prob3);
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_OFFLINE_H__
|
||||
|
||||
|
||||
|
||||
437
internal/cldutil_shared.cc
Normal file
437
internal/cldutil_shared.cc
Normal file
@@ -0,0 +1,437 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
#include "cldutil_shared.h"
|
||||
#include <string>
|
||||
|
||||
#include "cld2tablesummary.h"
|
||||
#include "integral_types.h"
|
||||
#include "port.h"
|
||||
#include "utf8statetable.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Runtime routines for hashing, looking up, and scoring
|
||||
// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
|
||||
// Unigrams and bigrams are for CJK languages only, including simplified/
|
||||
// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
|
||||
// Zhuang Han characters. Surrounding spaces are not considered.
|
||||
// Quadgrams and octagrams are for non-CJK and include two bits indicating
|
||||
// preceding and trailing spaces (word boundaries).
|
||||
|
||||
|
||||
// Indicator bits for leading/trailing space around quad/octagram
|
||||
// NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
|
||||
// 1-, 2-, or 3-bytes each.
|
||||
static const uint32 kPreSpaceIndicator = 0x00004444;
|
||||
static const uint32 kPostSpaceIndicator = 0x44440000;
|
||||
|
||||
// Little-endian masks for 0..24 bytes picked up as uint32's
|
||||
static const uint32 kWordMask0[4] = {
|
||||
0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
|
||||
};
|
||||
|
||||
static const int kMinCJKUTF8CharBytes = 3;
|
||||
|
||||
static const int kMinGramCount = 3;
|
||||
static const int kMaxGramCount = 16;
|
||||
|
||||
static const int UTFmax = 4; // Max number of bytes in a UTF-8 character
|
||||
|
||||
|
||||
// Routines to access a hash table of <key:wordhash, value:probs> pairs
|
||||
// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
|
||||
// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
|
||||
// bucket subscript.
|
||||
// Probs is a packed: three languages plus a subscript for probability table
|
||||
// Buckets have all the keys together, then all the values. Key array never
|
||||
// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
|
||||
// Match case may sometimes take an additional cache miss on value access.
|
||||
//
|
||||
// Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
|
||||
// byte buckets with single cache miss.
|
||||
// Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// Design principles for these hash functions
|
||||
// - Few operations
|
||||
// - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
|
||||
// Latin script expect 1- and 2-byte mixtures.
|
||||
// - Last byte of each character has about 5 bits of information
|
||||
// - Spread good bits around so they can interact in at least two ways
|
||||
// with other characters
|
||||
// - Use add for additional mixing through carries
|
||||
|
||||
// CJK Three-byte bigram
|
||||
// ....dddd..cccccc..bbbbbb....aaaa
|
||||
// ..................ffffff..eeeeee
|
||||
// make
|
||||
// ....dddd..cccccc..bbbbbb....aaaa
|
||||
// 000....dddd..cccccc..bbbbbb....a
|
||||
// ..................ffffff..eeeeee
|
||||
// ffffff..eeeeee000000000000000000
|
||||
//
|
||||
// CJK Four-byte bigram
|
||||
// ..dddddd..cccccc....bbbb....aaaa
|
||||
// ..hhhhhh..gggggg....ffff....eeee
|
||||
// make
|
||||
// ..dddddd..cccccc....bbbb....aaaa
|
||||
// 000..dddddd..cccccc....bbbb....a
|
||||
// ..hhhhhh..gggggg....ffff....eeee
|
||||
// ..ffff....eeee000000000000000000
|
||||
|
||||
// BIGRAM
|
||||
// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
|
||||
// OVERSHOOTS up to 3 bytes
|
||||
// For runtime use of tables
|
||||
// Does X86 unaligned loads
|
||||
uint32 BiHashV2(const char* word_ptr, int bytecount) {
|
||||
if (bytecount == 0) {return 0;}
|
||||
const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
|
||||
uint32 word0, word1;
|
||||
if (bytecount <= 4) {
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
return word0;
|
||||
}
|
||||
// Else do 8 bytes
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32);
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];
|
||||
word1 = word1 ^ (word1 << 18);
|
||||
return word0 + word1;
|
||||
}
|
||||
|
||||
//
|
||||
// Ascii-7 One-byte chars
|
||||
// ...ddddd...ccccc...bbbbb...aaaaa
|
||||
// make
|
||||
// ...ddddd...ccccc...bbbbb...aaaaa
|
||||
// 000...ddddd...ccccc...bbbbb...aa
|
||||
//
|
||||
// Latin 1- and 2-byte chars
|
||||
// ...ddddd...ccccc...bbbbb...aaaaa
|
||||
// ...................fffff...eeeee
|
||||
// make
|
||||
// ...ddddd...ccccc...bbbbb...aaaaa
|
||||
// 000...ddddd...ccccc...bbbbb...aa
|
||||
// ...................fffff...eeeee
|
||||
// ...............fffff...eeeee0000
|
||||
//
|
||||
// Non-CJK Two-byte chars
|
||||
// ...ddddd...........bbbbb........
|
||||
// ...hhhhh...........fffff........
|
||||
// make
|
||||
// ...ddddd...........bbbbb........
|
||||
// 000...ddddd...........bbbbb.....
|
||||
// ...hhhhh...........fffff........
|
||||
// hhhh...........fffff........0000
|
||||
//
|
||||
// Non-CJK Three-byte chars
|
||||
// ...........ccccc................
|
||||
// ...................fffff........
|
||||
// ...lllll...................iiiii
|
||||
// make
|
||||
// ...........ccccc................
|
||||
// 000...........ccccc.............
|
||||
// ...................fffff........
|
||||
// ...............fffff........0000
|
||||
// ...lllll...................iiiii
|
||||
// .lllll...................iiiii00
|
||||
//
|
||||
|
||||
// QUADGRAM
|
||||
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
|
||||
// OVERSHOOTS up to 3 bytes
|
||||
// For runtime use of tables
|
||||
// Does X86 unaligned loads
|
||||
uint32 QuadHashV2Mix(const char* word_ptr, int bytecount, uint32 prepost) {
|
||||
const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
|
||||
uint32 word0, word1, word2;
|
||||
if (bytecount <= 4) {
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
return word0 ^ prepost;
|
||||
} else if (bytecount <= 8) {
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32);
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];
|
||||
word1 = word1 ^ (word1 << 4);
|
||||
return (word0 ^ prepost) + word1;
|
||||
}
|
||||
// else do 12 bytes
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32);
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
|
||||
word1 = word1 ^ (word1 << 4);
|
||||
word2 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3];
|
||||
word2 = word2 ^ (word2 << 2);
|
||||
return (word0 ^ prepost) + word1 + word2;
|
||||
}
|
||||
|
||||
|
||||
// QUADGRAM wrapper with surrounding spaces
|
||||
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
|
||||
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
||||
// For runtime use of tables
|
||||
uint32 QuadHashV2(const char* word_ptr, int bytecount) {
|
||||
if (bytecount == 0) {return 0;}
|
||||
uint32 prepost = 0;
|
||||
if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
|
||||
if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
|
||||
return QuadHashV2Mix(word_ptr, bytecount, prepost);
|
||||
}
|
||||
|
||||
// QUADGRAM wrapper with surrounding underscores (offline use)
|
||||
// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
|
||||
// OVERSHOOTS up to 3 bytes
|
||||
// For offline construction of tables
|
||||
uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount) {
|
||||
if (bytecount == 0) {return 0;}
|
||||
const char* local_word_ptr = word_ptr;
|
||||
int local_bytecount = bytecount;
|
||||
uint32 prepost = 0;
|
||||
if (local_word_ptr[0] == '_') {
|
||||
prepost |= kPreSpaceIndicator;
|
||||
++local_word_ptr;
|
||||
--local_bytecount;
|
||||
}
|
||||
if (local_word_ptr[local_bytecount - 1] == '_') {
|
||||
prepost |= kPostSpaceIndicator;
|
||||
--local_bytecount;
|
||||
}
|
||||
return QuadHashV2Mix(local_word_ptr, local_bytecount, prepost);
|
||||
}
|
||||
|
||||
|
||||
// OCTAGRAM
|
||||
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
|
||||
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
||||
//
|
||||
// The low 32 bits follow the pattern from above, tuned to different scripts
|
||||
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
|
||||
// For runtime use of tables V3
|
||||
// Does X86 unaligned loads
|
||||
uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
|
||||
const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
|
||||
uint64 word0;
|
||||
uint64 word1;
|
||||
uint64 sum;
|
||||
|
||||
if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
|
||||
if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
|
||||
switch ((bytecount - 1) >> 2) {
|
||||
case 0: // 1..4 bytes
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];
|
||||
sum = word0;
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
break;
|
||||
case 1: // 5..8 bytes
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32);
|
||||
sum = word0;
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 << 4);
|
||||
word0 += word1;
|
||||
break;
|
||||
case 2: // 9..12 bytes
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32);
|
||||
sum = word0;
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 << 4);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3];
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 << 2);
|
||||
word0 += word1;
|
||||
break;
|
||||
case 3: // 13..16 bytes
|
||||
word0 =UNALIGNED_LOAD32(word_ptr32);
|
||||
sum = word0;
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 << 4);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 2);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 << 2);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 3) & kWordMask0[bytecount & 3];
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 >> 8);
|
||||
word0 += word1;
|
||||
break;
|
||||
case 4: // 17..20 bytes
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32);
|
||||
sum = word0;
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 << 4);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 2);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 << 2);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 3);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 >> 8);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 4) & kWordMask0[bytecount & 3];
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 >> 4);
|
||||
word0 += word1;
|
||||
break;
|
||||
default: // 21..24 bytes and higher (ignores beyond 24)
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32);
|
||||
sum = word0;
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 << 4);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 2);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 << 2);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 3);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 >> 8);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 4);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 >> 4);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 5) & kWordMask0[bytecount & 3];
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 >> 6);
|
||||
word0 += word1;
|
||||
break;
|
||||
}
|
||||
|
||||
sum += (sum >> 17); // extra 1-bit shift for bytes 2 & 3
|
||||
sum += (sum >> 9); // extra 1-bit shift for bytes 1 & 3
|
||||
sum = (sum & 0xff) << 32;
|
||||
return (word0 ^ prepost) + sum;
|
||||
}
|
||||
|
||||
// OCTAGRAM wrapper with surrounding spaces
|
||||
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
|
||||
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
||||
//
|
||||
// The low 32 bits follow the pattern from above, tuned to different scripts
|
||||
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
|
||||
// For runtime use of tables V3
|
||||
uint64 OctaHash40(const char* word_ptr, int bytecount) {
|
||||
if (bytecount == 0) {return 0;}
|
||||
uint64 prepost = 0;
|
||||
if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
|
||||
if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
|
||||
return OctaHash40Mix(word_ptr, bytecount, prepost);
|
||||
}
|
||||
|
||||
|
||||
// OCTAGRAM wrapper with surrounding underscores (offline use)
|
||||
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
|
||||
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
||||
//
|
||||
// The low 32 bits follow the pattern from above, tuned to different scripts
|
||||
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
|
||||
// For offline construction of tables
|
||||
uint64 OctaHash40underscore(const char* word_ptr, int bytecount) {
|
||||
if (bytecount == 0) {return 0;}
|
||||
const char* local_word_ptr = word_ptr;
|
||||
int local_bytecount = bytecount;
|
||||
uint64 prepost = 0;
|
||||
if (local_word_ptr[0] == '_') {
|
||||
prepost |= kPreSpaceIndicator;
|
||||
++local_word_ptr;
|
||||
--local_bytecount;
|
||||
}
|
||||
if (local_word_ptr[local_bytecount - 1] == '_') {
|
||||
prepost |= kPostSpaceIndicator;
|
||||
--local_bytecount;
|
||||
}
|
||||
return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);
|
||||
}
|
||||
|
||||
// Hash a consecutive pair of tokens/words A B
|
||||
// Old: hash is B - A, which gives too many false hits on one-char diffs
|
||||
// Now: rotate(A,13) + B
|
||||
uint64 PairHash(uint64 worda_hash, uint64 wordb_hash) {
|
||||
return ((worda_hash >> 13) | (worda_hash << (64 - 13))) + wordb_hash;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Finding groups of 1/2/4/8 letters //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// src points to a letter. Find the byte length of a unigram starting there.
|
||||
int UniLen(const char* src) {
|
||||
const char* src_end = src;
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
return src_end - src;
|
||||
}
|
||||
|
||||
// src points to a letter. Find the byte length of a bigram starting there.
|
||||
int BiLen(const char* src) {
|
||||
const char* src_end = src;
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
return src_end - src;
|
||||
}
|
||||
|
||||
// src points to a letter. Find the byte length of a quadgram starting there.
|
||||
int QuadLen(const char* src) {
|
||||
const char* src_end = src;
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
return src_end - src;
|
||||
}
|
||||
|
||||
// src points to a letter. Find the byte length of an octagram starting there.
|
||||
int OctaLen(const char* src) {
|
||||
const char* src_end = src;
|
||||
int charcount = 0;
|
||||
while (src_end[0] != ' ') {
|
||||
src_end += UTF8OneCharLen(src);
|
||||
++charcount;
|
||||
if (charcount == 8) {break;}
|
||||
}
|
||||
return src_end - src;
|
||||
}
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
509
internal/cldutil_shared.h
Normal file
509
internal/cldutil_shared.h
Normal file
@@ -0,0 +1,509 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
// Just the stuff shared between offline table builder and online detector
|
||||
//
|
||||
|
||||
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__
|
||||
#define I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__
|
||||
|
||||
#include "integral_types.h"
|
||||
#include "cld2tablesummary.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Runtime routines for hashing, looking up, and scoring
|
||||
// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
|
||||
// Unigrams and bigrams are for CJK languages only, including simplified/
|
||||
// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
|
||||
// Zhuang Han characters. Surrounding spaces are not considered.
|
||||
// Quadgrams and octagrams are for non-CJK and include two bits indicating
|
||||
// preceding and trailing spaces (word boundaries).
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Main quantized probability table //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// Table has 240 eight-byte entries. Each entry has a five-byte array and
|
||||
// a three-byte array of log base 2 probabilities in the range 1..12.
|
||||
// The intended use is to express five or three probabilities in a single-byte
|
||||
// subscript, then decode via this table. These probabilities are
|
||||
// intended to go with an array of five or three language numbers.
|
||||
//
|
||||
// The corresponding language numbers will have to be sorted by descending
|
||||
// probability, then the actual probability subscript chosen to match the
|
||||
// closest available entry in this table.
|
||||
//
|
||||
// Pattern of probability values:
|
||||
// hi 3/4 1/2 1/4 lo hi mid lo
|
||||
// where "3/4" is (hi*3+lo)/4, "1/2" is (hi+lo)/2, and "1/4" is (hi+lo*3)/4
|
||||
// and mid is one of 3/4 1/2 or 1/4.
|
||||
// There are three groups of 78 (=12*13/2) entries, with hi running 1..12 and
|
||||
// lo running 1..hi. Only the first group is used for five-entry lookups.
|
||||
// The mid value in the first group is 1/2, the second group 3/4, and the
|
||||
// third group 1/4. For three-entry lookups, this allows the mid entry to be
|
||||
// somewhat higher or lower than the midpoint, to allow a better match to the
|
||||
// original probabilities.
|
||||
static const int kLgProbV2TblSize = 240;
|
||||
static const uint8 kLgProbV2Tbl[kLgProbV2TblSize * 8] = {
|
||||
1,1,1,1,1, 1,1,1, // [0]
|
||||
2,2,2,1,1, 2,2,1, // [1]
|
||||
2,2,2,2,2, 2,2,2,
|
||||
3,3,2,2,1, 3,2,1, // [3]
|
||||
3,3,3,2,2, 3,3,2,
|
||||
3,3,3,3,3, 3,3,3,
|
||||
4,3,3,2,1, 4,3,1, // [6]
|
||||
4,4,3,3,2, 4,3,2,
|
||||
4,4,4,3,3, 4,4,3,
|
||||
4,4,4,4,4, 4,4,4,
|
||||
5,4,3,2,1, 5,3,1, // [10]
|
||||
5,4,4,3,2, 5,4,2,
|
||||
5,5,4,4,3, 5,4,3,
|
||||
5,5,5,4,4, 5,5,4,
|
||||
5,5,5,5,5, 5,5,5,
|
||||
6,5,4,2,1, 6,4,1, // [15]
|
||||
6,5,4,3,2, 6,4,2,
|
||||
6,5,5,4,3, 6,5,3,
|
||||
6,6,5,5,4, 6,5,4,
|
||||
6,6,6,5,5, 6,6,5,
|
||||
6,6,6,6,6, 6,6,6,
|
||||
7,6,4,3,1, 7,4,1, // [21]
|
||||
7,6,5,3,2, 7,5,2,
|
||||
7,6,5,4,3, 7,5,3,
|
||||
7,6,6,5,4, 7,6,4,
|
||||
7,7,6,6,5, 7,6,5,
|
||||
7,7,7,6,6, 7,7,6,
|
||||
7,7,7,7,7, 7,7,7,
|
||||
8,6,5,3,1, 8,5,1, // [28]
|
||||
8,7,5,4,2, 8,5,2,
|
||||
8,7,6,4,3, 8,6,3,
|
||||
8,7,6,5,4, 8,6,4,
|
||||
8,7,7,6,5, 8,7,5,
|
||||
8,8,7,7,6, 8,7,6,
|
||||
8,8,8,7,7, 8,8,7,
|
||||
8,8,8,8,8, 8,8,8,
|
||||
9,7,5,3,1, 9,5,1, // [36]
|
||||
9,7,6,4,2, 9,6,2,
|
||||
9,8,6,5,3, 9,6,3,
|
||||
9,8,7,5,4, 9,7,4,
|
||||
9,8,7,6,5, 9,7,5,
|
||||
9,8,8,7,6, 9,8,6,
|
||||
9,9,8,8,7, 9,8,7,
|
||||
9,9,9,8,8, 9,9,8,
|
||||
9,9,9,9,9, 9,9,9,
|
||||
10,8,6,3,1, 10,6,1, // [45]
|
||||
10,8,6,4,2, 10,6,2,
|
||||
10,8,7,5,3, 10,7,3,
|
||||
10,9,7,6,4, 10,7,4,
|
||||
10,9,8,6,5, 10,8,5,
|
||||
10,9,8,7,6, 10,8,6,
|
||||
10,9,9,8,7, 10,9,7,
|
||||
10,10,9,9,8, 10,9,8,
|
||||
10,10,10,9,9, 10,10,9,
|
||||
10,10,10,10,10, 10,10,10,
|
||||
11,9,6,4,1, 11,6,1, // [55]
|
||||
11,9,7,4,2, 11,7,2,
|
||||
11,9,7,5,3, 11,7,3,
|
||||
11,9,8,6,4, 11,8,4,
|
||||
11,10,8,7,5, 11,8,5,
|
||||
11,10,9,7,6, 11,9,6,
|
||||
11,10,9,8,7, 11,9,7,
|
||||
11,10,10,9,8, 11,10,8,
|
||||
11,11,10,10,9, 11,10,9,
|
||||
11,11,11,10,10, 11,11,10,
|
||||
11,11,11,11,11, 11,11,11,
|
||||
12,9,7,4,1, 12,7,1, // [66]
|
||||
12,10,7,5,2, 12,7,2,
|
||||
12,10,8,5,3, 12,8,3,
|
||||
12,10,8,6,4, 12,8,4,
|
||||
12,10,9,7,5, 12,9,5,
|
||||
12,11,9,8,6, 12,9,6,
|
||||
12,11,10,8,7, 12,10,7,
|
||||
12,11,10,9,8, 12,10,8,
|
||||
12,11,11,10,9, 12,11,9,
|
||||
12,12,11,11,10, 12,11,10,
|
||||
12,12,12,11,11, 12,12,11,
|
||||
12,12,12,12,12, 12,12,12,
|
||||
|
||||
1,1,1,1,1, 1,1,1,
|
||||
2,2,2,1,1, 2,2,1,
|
||||
2,2,2,2,2, 2,2,2,
|
||||
3,3,2,2,1, 3,3,1,
|
||||
3,3,3,2,2, 3,3,2,
|
||||
3,3,3,3,3, 3,3,3,
|
||||
4,3,3,2,1, 4,3,1,
|
||||
4,4,3,3,2, 4,4,2,
|
||||
4,4,4,3,3, 4,4,3,
|
||||
4,4,4,4,4, 4,4,4,
|
||||
5,4,3,2,1, 5,4,1,
|
||||
5,4,4,3,2, 5,4,2,
|
||||
5,5,4,4,3, 5,5,3,
|
||||
5,5,5,4,4, 5,5,4,
|
||||
5,5,5,5,5, 5,5,5,
|
||||
6,5,4,2,1, 6,5,1,
|
||||
6,5,4,3,2, 6,5,2,
|
||||
6,5,5,4,3, 6,5,3,
|
||||
6,6,5,5,4, 6,6,4,
|
||||
6,6,6,5,5, 6,6,5,
|
||||
6,6,6,6,6, 6,6,6,
|
||||
7,6,4,3,1, 7,6,1,
|
||||
7,6,5,3,2, 7,6,2,
|
||||
7,6,5,4,3, 7,6,3,
|
||||
7,6,6,5,4, 7,6,4,
|
||||
7,7,6,6,5, 7,7,5,
|
||||
7,7,7,6,6, 7,7,6,
|
||||
7,7,7,7,7, 7,7,7,
|
||||
8,6,5,3,1, 8,6,1,
|
||||
8,7,5,4,2, 8,7,2,
|
||||
8,7,6,4,3, 8,7,3,
|
||||
8,7,6,5,4, 8,7,4,
|
||||
8,7,7,6,5, 8,7,5,
|
||||
8,8,7,7,6, 8,8,6,
|
||||
8,8,8,7,7, 8,8,7,
|
||||
8,8,8,8,8, 8,8,8,
|
||||
9,7,5,3,1, 9,7,1,
|
||||
9,7,6,4,2, 9,7,2,
|
||||
9,8,6,5,3, 9,8,3,
|
||||
9,8,7,5,4, 9,8,4,
|
||||
9,8,7,6,5, 9,8,5,
|
||||
9,8,8,7,6, 9,8,6,
|
||||
9,9,8,8,7, 9,9,7,
|
||||
9,9,9,8,8, 9,9,8,
|
||||
9,9,9,9,9, 9,9,9,
|
||||
10,8,6,3,1, 10,8,1,
|
||||
10,8,6,4,2, 10,8,2,
|
||||
10,8,7,5,3, 10,8,3,
|
||||
10,9,7,6,4, 10,9,4,
|
||||
10,9,8,6,5, 10,9,5,
|
||||
10,9,8,7,6, 10,9,6,
|
||||
10,9,9,8,7, 10,9,7,
|
||||
10,10,9,9,8, 10,10,8,
|
||||
10,10,10,9,9, 10,10,9,
|
||||
10,10,10,10,10, 10,10,10,
|
||||
11,9,6,4,1, 11,9,1,
|
||||
11,9,7,4,2, 11,9,2,
|
||||
11,9,7,5,3, 11,9,3,
|
||||
11,9,8,6,4, 11,9,4,
|
||||
11,10,8,7,5, 11,10,5,
|
||||
11,10,9,7,6, 11,10,6,
|
||||
11,10,9,8,7, 11,10,7,
|
||||
11,10,10,9,8, 11,10,8,
|
||||
11,11,10,10,9, 11,11,9,
|
||||
11,11,11,10,10, 11,11,10,
|
||||
11,11,11,11,11, 11,11,11,
|
||||
12,9,7,4,1, 12,9,1,
|
||||
12,10,7,5,2, 12,10,2,
|
||||
12,10,8,5,3, 12,10,3,
|
||||
12,10,8,6,4, 12,10,4,
|
||||
12,10,9,7,5, 12,10,5,
|
||||
12,11,9,8,6, 12,11,6,
|
||||
12,11,10,8,7, 12,11,7,
|
||||
12,11,10,9,8, 12,11,8,
|
||||
12,11,11,10,9, 12,11,9,
|
||||
12,12,11,11,10, 12,12,10,
|
||||
12,12,12,11,11, 12,12,11,
|
||||
12,12,12,12,12, 12,12,12,
|
||||
|
||||
1,1,1,1,1, 1,1,1,
|
||||
2,2,2,1,1, 2,1,1,
|
||||
2,2,2,2,2, 2,2,2,
|
||||
3,3,2,2,1, 3,2,1,
|
||||
3,3,3,2,2, 3,2,2,
|
||||
3,3,3,3,3, 3,3,3,
|
||||
4,3,3,2,1, 4,2,1,
|
||||
4,4,3,3,2, 4,3,2,
|
||||
4,4,4,3,3, 4,3,3,
|
||||
4,4,4,4,4, 4,4,4,
|
||||
5,4,3,2,1, 5,2,1,
|
||||
5,4,4,3,2, 5,3,2,
|
||||
5,5,4,4,3, 5,4,3,
|
||||
5,5,5,4,4, 5,4,4,
|
||||
5,5,5,5,5, 5,5,5,
|
||||
6,5,4,2,1, 6,2,1,
|
||||
6,5,4,3,2, 6,3,2,
|
||||
6,5,5,4,3, 6,4,3,
|
||||
6,6,5,5,4, 6,5,4,
|
||||
6,6,6,5,5, 6,5,5,
|
||||
6,6,6,6,6, 6,6,6,
|
||||
7,6,4,3,1, 7,3,1,
|
||||
7,6,5,3,2, 7,3,2,
|
||||
7,6,5,4,3, 7,4,3,
|
||||
7,6,6,5,4, 7,5,4,
|
||||
7,7,6,6,5, 7,6,5,
|
||||
7,7,7,6,6, 7,6,6,
|
||||
7,7,7,7,7, 7,7,7,
|
||||
8,6,5,3,1, 8,3,1,
|
||||
8,7,5,4,2, 8,4,2,
|
||||
8,7,6,4,3, 8,4,3,
|
||||
8,7,6,5,4, 8,5,4,
|
||||
8,7,7,6,5, 8,6,5,
|
||||
8,8,7,7,6, 8,7,6,
|
||||
8,8,8,7,7, 8,7,7,
|
||||
8,8,8,8,8, 8,8,8,
|
||||
9,7,5,3,1, 9,3,1,
|
||||
9,7,6,4,2, 9,4,2,
|
||||
9,8,6,5,3, 9,5,3,
|
||||
9,8,7,5,4, 9,5,4,
|
||||
9,8,7,6,5, 9,6,5,
|
||||
9,8,8,7,6, 9,7,6,
|
||||
9,9,8,8,7, 9,8,7,
|
||||
9,9,9,8,8, 9,8,8,
|
||||
9,9,9,9,9, 9,9,9,
|
||||
10,8,6,3,1, 10,3,1,
|
||||
10,8,6,4,2, 10,4,2,
|
||||
10,8,7,5,3, 10,5,3,
|
||||
10,9,7,6,4, 10,6,4,
|
||||
10,9,8,6,5, 10,6,5,
|
||||
10,9,8,7,6, 10,7,6,
|
||||
10,9,9,8,7, 10,8,7,
|
||||
10,10,9,9,8, 10,9,8,
|
||||
10,10,10,9,9, 10,9,9,
|
||||
10,10,10,10,10, 10,10,10,
|
||||
11,9,6,4,1, 11,4,1,
|
||||
11,9,7,4,2, 11,4,2,
|
||||
11,9,7,5,3, 11,5,3,
|
||||
11,9,8,6,4, 11,6,4,
|
||||
11,10,8,7,5, 11,7,5,
|
||||
11,10,9,7,6, 11,7,6,
|
||||
11,10,9,8,7, 11,8,7,
|
||||
11,10,10,9,8, 11,9,8,
|
||||
11,11,10,10,9, 11,10,9,
|
||||
11,11,11,10,10, 11,10,10,
|
||||
11,11,11,11,11, 11,11,11,
|
||||
12,9,7,4,1, 12,4,1,
|
||||
12,10,7,5,2, 12,5,2,
|
||||
12,10,8,5,3, 12,5,3,
|
||||
12,10,8,6,4, 12,6,4,
|
||||
12,10,9,7,5, 12,7,5,
|
||||
12,11,9,8,6, 12,8,6,
|
||||
12,11,10,8,7, 12,8,7,
|
||||
12,11,10,9,8, 12,9,8,
|
||||
12,11,11,10,9, 12,10,9,
|
||||
12,12,11,11,10, 12,11,10,
|
||||
12,12,12,11,11, 12,11,11,
|
||||
12,12,12,12,12, 12,12,12,
|
||||
|
||||
// Added 2013.01.28 for CJK compatible mapping
|
||||
8,5,2,2,2, 8,2,2,
|
||||
6,6,6,4,2, 6,6,2,
|
||||
6,5,4,4,4, 6,4,4,
|
||||
6,4,2,2,2, 6,2,2,
|
||||
4,3,2,2,2, 4,2,2,
|
||||
2,2,2,2,2, 2,2,2,
|
||||
};
|
||||
|
||||
// Backmap a single desired probability into an entry in kLgProbV2Tbl
// Indexed by the desired probability (0..12). For 1..12 the value is the
// subscript of the first-group entry whose hi equals that probability and
// whose lo is 1 (the entries annotated [0], [1], [3], [6], ... in the table).
// Index 0 maps to entry 0 as well.
|
||||
static const uint8 kLgProbV2TblBackmap[13] = {
|
||||
0,
|
||||
0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66,
|
||||
};
|
||||
|
||||
// Return address of 8-byte entry[i]
|
||||
inline const uint8* LgProb2TblEntry(int i) {
|
||||
return &kLgProbV2Tbl[i * 8];
|
||||
}
|
||||
|
||||
// Return one of three probabilities in an entry
|
||||
inline uint8 LgProb3(const uint8* entry, int j) {
|
||||
return entry[j + 5];
|
||||
}
|
||||
|
||||
|
||||
// Routines to access a hash table of <key:wordhash, value:probs> pairs
|
||||
// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
|
||||
// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
|
||||
// bucket subscript.
|
||||
// Probs is a packed: three languages plus a subscript for probability table
|
||||
// Buckets have all the keys together, then all the values. Key array never
|
||||
// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
|
||||
// Match case may sometimes take an additional cache miss on value access.
|
||||
//
|
||||
// Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
|
||||
// byte buckets with single cache miss.
|
||||
// Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// BIGRAM
|
||||
// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
|
||||
// OVERSHOOTS up to 3 bytes
|
||||
// For runtime use of tables
|
||||
// Does X86 unaligned loads via UNALIGNED_LOAD32(_p) if !defined(NEED_ALIGNED_LOADS)
|
||||
uint32 BiHashV2(const char* word_ptr, int bytecount);
|
||||
|
||||
// QUADGRAM wrapper with surrounding spaces
|
||||
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
|
||||
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
||||
// For runtime use of tables
|
||||
uint32 QuadHashV2(const char* word_ptr, int bytecount);
|
||||
|
||||
// QUADGRAM wrapper with surrounding underscores (offline use)
|
||||
// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
|
||||
// OVERSHOOTS up to 3 bytes
|
||||
// For offline construction of tables
|
||||
uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount);
|
||||
|
||||
// OCTAGRAM wrapper with surrounding spaces
|
||||
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
|
||||
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
||||
uint64 OctaHash40(const char* word_ptr, int bytecount);
|
||||
|
||||
|
||||
// OCTAGRAM wrapper with surrounding underscores (offline use)
|
||||
// Pick up 1..24 bytes plus pre/post '_' and hash them via mask/shift/add
|
||||
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
||||
uint64 OctaHash40underscore(const char* word_ptr, int bytecount);
|
||||
|
||||
// Hash a consecutive pair of tokens/words A B
|
||||
uint64 PairHash(uint64 worda_hash, uint64 wordb_hash);
|
||||
|
||||
|
||||
// From 32-bit gram FP, return hash table subscript and remaining key
|
||||
inline void QuadFPJustHash(uint32 quadhash,
|
||||
uint32 keymask,
|
||||
int bucketcount,
|
||||
uint32* subscr, uint32* hashkey) {
|
||||
*subscr = (quadhash + (quadhash >> 12)) & (bucketcount - 1);
|
||||
*hashkey = quadhash & keymask;
|
||||
}
|
||||
|
||||
// From 40-bit gram FP, return hash table subscript and remaining key
|
||||
inline void OctaFPJustHash(uint64 longwordhash,
|
||||
uint32 keymask,
|
||||
int bucketcount,
|
||||
uint32* subscr, uint32* hashkey) {
|
||||
uint32 temp = (longwordhash + (longwordhash >> 12)) & (bucketcount - 1);
|
||||
*subscr = temp;
|
||||
temp = longwordhash >> 4;
|
||||
*hashkey = temp & keymask;
|
||||
}
|
||||
|
||||
|
||||
// Look up 32-bit gram FP in caller-passed table
|
||||
// Typical size 256K entries (1.5MB)
|
||||
// Two-byte hashkey
|
||||
inline const uint32 QuadHashV3Lookup4(const CLD2TableSummary* gram_obj,
|
||||
uint32 quadhash) {
|
||||
uint32 subscr, hashkey;
|
||||
const IndirectProbBucket4* quadtable = gram_obj->kCLDTable;
|
||||
uint32 keymask = gram_obj->kCLDTableKeyMask;
|
||||
int bucketcount = gram_obj->kCLDTableSize;
|
||||
QuadFPJustHash(quadhash, keymask, bucketcount, &subscr, &hashkey);
|
||||
const IndirectProbBucket4* bucket_ptr = &quadtable[subscr];
|
||||
// Four-way associative, 4 compares
|
||||
if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) {
|
||||
return bucket_ptr->keyvalue[0];
|
||||
}
|
||||
if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) {
|
||||
return bucket_ptr->keyvalue[1];
|
||||
}
|
||||
if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) {
|
||||
return bucket_ptr->keyvalue[2];
|
||||
}
|
||||
if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) {
|
||||
return bucket_ptr->keyvalue[3];
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Look up 40-bit gram FP in caller-passed table
|
||||
// Typical size 256K-4M entries (1-16MB)
|
||||
// 24-12 bit hashkey packed with 8-20 bit indirect lang/probs
|
||||
// keymask is 0xfffff000 for 20-bit hashkey and 12-bit indirect
|
||||
inline const uint32 OctaHashV3Lookup4(const CLD2TableSummary* gram_obj,
|
||||
uint64 longwordhash) {
|
||||
uint32 subscr, hashkey;
|
||||
const IndirectProbBucket4* octatable = gram_obj->kCLDTable;
|
||||
uint32 keymask = gram_obj->kCLDTableKeyMask;
|
||||
int bucketcount = gram_obj->kCLDTableSize;
|
||||
OctaFPJustHash(longwordhash, keymask, bucketcount,
|
||||
&subscr, &hashkey);
|
||||
const IndirectProbBucket4* bucket_ptr = &octatable[subscr];
|
||||
// Four-way associative, 4 compares
|
||||
if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) {
|
||||
return bucket_ptr->keyvalue[0];
|
||||
}
|
||||
if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) {
|
||||
return bucket_ptr->keyvalue[1];
|
||||
}
|
||||
if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) {
|
||||
return bucket_ptr->keyvalue[2];
|
||||
}
|
||||
if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) {
|
||||
return bucket_ptr->keyvalue[3];
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Finding groups of 1/2/4/8 letters //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// Does not advance past space or tab/cr/lf/nul
// Indexed by a byte value; the entry is the number of bytes to step forward
// when that byte is at the current position:
//   0x00..0x20  0  (control bytes and space: stay put)
//   0x21..0xBF  1
//   0xC0..0xDF  2
//   0xE0..0xEF  3
//   0xF0..0xFF  4
// Each row below covers 32 byte values.
|
||||
static const uint8 kAdvanceOneCharButSpace[256] = {
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
|
||||
3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,
|
||||
};
|
||||
|
||||
|
||||
// Advances *only* on space or ASCII vowel (or illegal byte)
// Indexed by a byte value; the entry is 1 to step over that byte, 0 to stay:
//   0x00..0x20  1  (control bytes and space)
//   0x21..0x7F  0, except A E I O U a e i o u which are 1
//   0x80..0xBF  1
//   0xC0..0xFF  0
// Each row below covers 32 byte values.
|
||||
static const uint8 kAdvanceOneCharSpaceVowel[256] = {
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
|
||||
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
|
||||
// src points to a letter. Find the byte length of a unigram starting there.
|
||||
int UniLen(const char* src);
|
||||
|
||||
// src points to a letter. Find the byte length of a bigram starting there.
|
||||
int BiLen(const char* src);
|
||||
|
||||
// src points to a letter. Find the byte length of a quadgram starting there.
|
||||
int QuadLen(const char* src);
|
||||
|
||||
// src points to a letter. Find the byte length of an octagram starting there.
|
||||
int OctaLen(const char* src);
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
328
internal/compact_lang_det.cc
Normal file
328
internal/compact_lang_det.cc
Normal file
@@ -0,0 +1,328 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "../public/compact_lang_det.h"
|
||||
#include "../public/encodings.h"
|
||||
#include "compact_lang_det_impl.h"
|
||||
#include "integral_types.h"
|
||||
#include "lang_script.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Version text handed back by DetectLanguageVersion().
// String is "code_version - data_scrape_date"
|
||||
static const char* kDetectLanguageVersion = "V2.0 - 20130614";
|
||||
|
||||
// Large-table version for all ~160 languages
|
||||
// Small-table version for all ~60 languages
|
||||
|
||||
// Scan interchange-valid UTF-8 bytes and detect most likely language
|
||||
Language DetectLanguage(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
bool* is_reliable) {
|
||||
bool allow_extended_lang = false;
|
||||
Language language3[3];
|
||||
int percent3[3];
|
||||
double normalized_score3[3];
|
||||
int text_bytes;
|
||||
int flags = 0;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
const char* tld_hint = "";
|
||||
int encoding_hint = UNKNOWN_ENCODING;
|
||||
Language language_hint = UNKNOWN_LANGUAGE;
|
||||
CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
|
||||
|
||||
Language lang = DetectLanguageSummaryV2(
|
||||
buffer,
|
||||
buffer_length,
|
||||
is_plain_text,
|
||||
&cldhints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
NULL,
|
||||
&text_bytes,
|
||||
is_reliable);
|
||||
// Default to English
|
||||
if (lang == UNKNOWN_LANGUAGE) {
|
||||
lang = ENGLISH;
|
||||
}
|
||||
return lang;
|
||||
}
|
||||
|
||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
|
||||
Language DetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable) {
|
||||
double normalized_score3[3];
|
||||
bool allow_extended_lang = false;
|
||||
int flags = 0;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
const char* tld_hint = "";
|
||||
int encoding_hint = UNKNOWN_ENCODING;
|
||||
Language language_hint = UNKNOWN_LANGUAGE;
|
||||
CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
|
||||
|
||||
Language lang = DetectLanguageSummaryV2(
|
||||
buffer,
|
||||
buffer_length,
|
||||
is_plain_text,
|
||||
&cldhints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
NULL,
|
||||
text_bytes,
|
||||
is_reliable);
|
||||
// Default to English
|
||||
if (lang == UNKNOWN_LANGUAGE) {
|
||||
lang = ENGLISH;
|
||||
}
|
||||
return lang;
|
||||
}
|
||||
|
||||
// Same as above, with hints supplied
|
||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
|
||||
Language DetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
const char* tld_hint, // "id" boosts Indonesian
|
||||
int encoding_hint, // SJS boosts Japanese
|
||||
Language language_hint, // ITALIAN boosts it
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable) {
|
||||
double normalized_score3[3];
|
||||
bool allow_extended_lang = false;
|
||||
int flags = 0;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
|
||||
|
||||
Language lang = DetectLanguageSummaryV2(
|
||||
buffer,
|
||||
buffer_length,
|
||||
is_plain_text,
|
||||
&cldhints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
NULL,
|
||||
text_bytes,
|
||||
is_reliable);
|
||||
// Default to English
|
||||
if (lang == UNKNOWN_LANGUAGE) {
|
||||
lang = ENGLISH;
|
||||
}
|
||||
return lang;
|
||||
}
|
||||
|
||||
|
||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
|
||||
// languages.
|
||||
// Extended languages are additional Google interface languages and Unicode
|
||||
// single-language scripts, from ext_lang_enc.h
|
||||
Language ExtDetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable) {
|
||||
double normalized_score3[3];
|
||||
bool allow_extended_lang = true;
|
||||
int flags = 0;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
const char* tld_hint = "";
|
||||
int encoding_hint = UNKNOWN_ENCODING;
|
||||
Language language_hint = UNKNOWN_LANGUAGE;
|
||||
CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
|
||||
|
||||
Language lang = DetectLanguageSummaryV2(
|
||||
buffer,
|
||||
buffer_length,
|
||||
is_plain_text,
|
||||
&cldhints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
NULL,
|
||||
text_bytes,
|
||||
is_reliable);
|
||||
// Do not default to English
|
||||
return lang;
|
||||
}
|
||||
|
||||
// Same as above, with hints supplied
|
||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
|
||||
// languages.
|
||||
// Extended languages are additional Google interface languages and Unicode
|
||||
// single-language scripts, from ext_lang_enc.h
|
||||
Language ExtDetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
const char* tld_hint, // "id" boosts Indonesian
|
||||
int encoding_hint, // SJS boosts Japanese
|
||||
Language language_hint, // ITALIAN boosts it
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable) {
|
||||
double normalized_score3[3];
|
||||
bool allow_extended_lang = true;
|
||||
int flags = 0;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
|
||||
|
||||
Language lang = DetectLanguageSummaryV2(
|
||||
buffer,
|
||||
buffer_length,
|
||||
is_plain_text,
|
||||
&cldhints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
NULL,
|
||||
text_bytes,
|
||||
is_reliable);
|
||||
// Do not default to English
|
||||
return lang;
|
||||
}
|
||||
|
||||
// Same as above, and also returns internal language scores as a ratio to
|
||||
// normal score for real text in that language. Scores close to 1.0 indicate
|
||||
// normal text, while scores far away from 1.0 indicate badly-skewed text or
|
||||
// gibberish
|
||||
//
|
||||
Language ExtDetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
const char* tld_hint, // "id" boosts Indonesian
|
||||
int encoding_hint, // SJS boosts Japanese
|
||||
Language language_hint, // ITALIAN boosts it
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
double* normalized_score3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable) {
|
||||
bool allow_extended_lang = true;
|
||||
int flags = 0;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
|
||||
|
||||
Language lang = DetectLanguageSummaryV2(
|
||||
buffer,
|
||||
buffer_length,
|
||||
is_plain_text,
|
||||
&cldhints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
NULL,
|
||||
text_bytes,
|
||||
is_reliable);
|
||||
// Do not default to English
|
||||
return lang;
|
||||
}
|
||||
|
||||
// Use this one.
|
||||
// Hints are collected into a struct.
|
||||
// Flags are passed in (normally zero).
|
||||
//
|
||||
// Also returns 3 internal language scores as a ratio to
|
||||
// normal score for real text in that language. Scores close to 1.0 indicate
|
||||
// normal text, while scores far away from 1.0 indicate badly-skewed text or
|
||||
// gibberish
|
||||
//
|
||||
// Returns a vector of chunks in different languages, so that caller may
|
||||
// spell-check, translate, or otherwaise process different parts of the input
|
||||
// buffer in language-dependant ways.
|
||||
//
|
||||
Language ExtDetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
const CLDHints* cld_hints,
|
||||
int flags,
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
double* normalized_score3,
|
||||
ResultChunkVector* resultchunkvector,
|
||||
int* text_bytes,
|
||||
bool* is_reliable) {
|
||||
bool allow_extended_lang = true;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
|
||||
Language lang = DetectLanguageSummaryV2(
|
||||
buffer,
|
||||
buffer_length,
|
||||
is_plain_text,
|
||||
cld_hints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
resultchunkvector,
|
||||
text_bytes,
|
||||
is_reliable);
|
||||
// Do not default to English
|
||||
return lang;
|
||||
}
|
||||
|
||||
|
||||
// Return version text string
|
||||
// String is "code_version - data_build_date"
|
||||
const char* DetectLanguageVersion() {
|
||||
return kDetectLanguageVersion;
|
||||
}
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
1647
internal/compact_lang_det_hint_code.cc
Normal file
1647
internal/compact_lang_det_hint_code.cc
Normal file
File diff suppressed because it is too large
Load Diff
94
internal/compact_lang_det_hint_code.h
Normal file
94
internal/compact_lang_det_hint_code.h
Normal file
@@ -0,0 +1,94 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
#ifndef I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_HINT_CODE_H__
|
||||
#define I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_HINT_CODE_H__
|
||||
|
||||
|
||||
#include <string>
|
||||
#include "integral_types.h"
|
||||
#include "lang_script.h"
|
||||
#include "../public/encodings.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Packed <Language, weight>, weight in [-32..31] (powers of 2**1.6 ~=3.03)
|
||||
// Full language in bottom 10 bits, weight in top 6 bits
|
||||
typedef int16 OneCLDLangPrior;
|
||||
|
||||
typedef struct {
|
||||
int32 n;
|
||||
OneCLDLangPrior prior[14];
|
||||
} CLDLangPriors;
|
||||
|
||||
// Reading exposed here; setting hidden in .cc
|
||||
inline int GetCLDPriorWeight(OneCLDLangPrior olp) {
  // Drop the 10 language bits; what remains is the weight from the top 6
  // bits. NOTE(review): a negative weight relies on >> of a negative value
  // being an arithmetic shift -- confirm for any new compiler/target.
  const int weight = olp >> 10;
  return weight;
}
|
||||
inline Language GetCLDPriorLang(OneCLDLangPrior olp) {
  // The language number occupies the bottom 10 bits of the packed prior.
  const int language_bits = olp & 0x3ff;
  return static_cast<Language>(language_bits);
}
|
||||
|
||||
// Returns the number of priors currently recorded in lps.
// Takes a pointer-to-const because it only reads: this lets code that holds a
// const CLDLangPriors* (e.g. DumpCLDLangPriors) call it without a cast.
// Existing callers that pass a non-const pointer are unaffected.
inline int32 GetCLDLangPriorCount(const CLDLangPriors* lps) {
  return lps->n;
}
|
||||
|
||||
inline void InitCLDLangPriors(CLDLangPriors* lps) {
  // Only the entry count is reset; the prior[] slots are left as they were.
  CLDLangPriors& priors = *lps;
  priors.n = 0;
}
|
||||
|
||||
// Trim language priors to no more than max_entries, keeping largest abs weights
|
||||
void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps);
|
||||
|
||||
// Trim language tag string to canonical form for each language
|
||||
// Input is from GetLangTagsFromHtml(), already lowercased
|
||||
std::string TrimCLDLangTagsHint(const std::string& langtags);
|
||||
|
||||
// Add hints to vector of langpriors
|
||||
// Input is from GetLangTagsFromHtml(), already lowercased
|
||||
void SetCLDLangTagsHint(const std::string& langtags, CLDLangPriors* langpriors);
|
||||
|
||||
// Add hints to vector of langpriors
|
||||
// Input is from HTTP content-language
|
||||
void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors);
|
||||
|
||||
// Add hints to vector of langpriors
|
||||
// Input is from GetTLD(), already lowercased
|
||||
void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors);
|
||||
|
||||
// Add hints to vector of langpriors
|
||||
// Input is from DetectEncoding()
|
||||
void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors);
|
||||
|
||||
// Add hints to vector of langpriors
|
||||
// Input is from random source
|
||||
void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors);
|
||||
|
||||
// Make printable string of priors
|
||||
std::string DumpCLDLangPriors(const CLDLangPriors* langpriors);
|
||||
|
||||
|
||||
// Get language tag hints from HTML body
|
||||
// Normalize: remove spaces and make lowercase comma list
|
||||
std::string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len,
|
||||
int32 max_scan_bytes);
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_HINT_CODE_H__
|
||||
|
||||
1880
internal/compact_lang_det_impl.cc
Normal file
1880
internal/compact_lang_det_impl.cc
Normal file
File diff suppressed because it is too large
Load Diff
183
internal/compact_lang_det_impl.h
Normal file
183
internal/compact_lang_det_impl.h
Normal file
@@ -0,0 +1,183 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
#ifndef I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_
|
||||
#define I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "../public/compact_lang_det.h" // For CLDHints, ResultChunkVector
|
||||
#include "integral_types.h"
|
||||
#include "lang_script.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Internal use flags
|
||||
static const int kCLDFlagFinish = 1;
|
||||
static const int kCLDFlagSqueeze = 2;
|
||||
static const int kCLDFlagRepeats = 4;
|
||||
static const int kCLDFlagTop40 = 8;
|
||||
static const int kCLDFlagShort = 16;
|
||||
static const int kCLDFlagHint = 32;
|
||||
static const int kCLDFlagUseWords = 64;
|
||||
static const int kCLDFlagUNUSED = 128;
|
||||
|
||||
// Public use flags, debug output controls, defined in compact_lang_det.h
|
||||
// 0x0100 and above
|
||||
|
||||
/***
|
||||
|
||||
Flag meanings:
|
||||
|
||||
Flags are used in the context of a recursive call from Detect to itself,
|
||||
trying to deal in a more restrictive way with input that was not reliably
|
||||
identified in the top-level call.
|
||||
|
||||
Finish -- Do not further recurse; return whatever result ensues, even if it is
|
||||
unreliable. Typically set in any recursive call to take a second try
|
||||
on unreliable text.
|
||||
|
||||
Squeeze -- For each text run, do an inplace cheapsqueeze to remove chunks of
|
||||
highly repetitive text and chunks of text with too many 1- and
|
||||
2-letter words. This avoids scoring repetitive or useless non-text
|
||||
crap in large files such as bogus JPEGs within an HTML file.
|
||||
|
||||
Repeats -- When scoring a text run, do a cheap prediction of each character
|
||||
and do not score a unigram/quadgram if the last character of same is
|
||||
correctly predicted. This is a slower, finer-grained form of
|
||||
cheapsqueeze, typically used when the first pass got unreliable
|
||||
results.
|
||||
|
||||
Top40 -- Restrict the set of scored languages to the Google "Top 40", which is
|
||||
actually 38 languages. This gets rid of about 110 languages that
|
||||
represent about 0.7% of the web. Typically used when the first pass
|
||||
got unreliable results.
|
||||
|
||||
Short -- DEPRECATED, unused
|
||||
|
||||
Hint -- EXPERIMENTAL flag for compact_lang_det_test.cc to indicate a language
|
||||
hint supplied in parameter plus_one.
|
||||
|
||||
UseWords -- In addition to scoring quad/uni/nil-grams, score complete words
|
||||
|
||||
|
||||
|
||||
Tentative decision logic:
|
||||
|
||||
In the middle of first pass -- After 4KB of text, look at the front 256 bytes
|
||||
of every full 4KB buffer. If it compresses very well (say 3:1) or has
|
||||
lots of spaces (say 1 of every 4 bytes), assume that the input is
|
||||
large and contains lots of bogus non-text. Recurse, passing the
|
||||
Squeeze flag to strip out chunks of this non-text.
|
||||
|
||||
At the end of the first pass --
|
||||
If the top language is reliable and >= 70% of the document, return.
|
||||
Else if the top language is reliable and top+2nd >= say 94%, return.
|
||||
Else, either the top language is not reliable or there is a lot of
|
||||
other crap.
|
||||
***/
|
||||
|
||||
|
||||
// Scan interchange-valid UTF-8 bytes and detect most likely language,
|
||||
// or set of languages.
|
||||
//
|
||||
// Design goals:
|
||||
// Skip over big stretches of HTML tags
|
||||
// Able to return ranges of different languages
|
||||
// Relatively small tables and relatively fast processing
|
||||
// Thread safe
|
||||
//
|
||||
|
||||
typedef struct {
|
||||
int perscript_count;
|
||||
const Language* perscript_lang;
|
||||
} PerScriptPair;
|
||||
|
||||
typedef struct {
|
||||
// Constants for hashing 4-7 byte quadgram to 32 bits
|
||||
const int kQuadHashB4Shift;
|
||||
const int kQuadHashB4bShift;
|
||||
const int kQuadHashB5Shift;
|
||||
const int kQuadHashB5bShift;
|
||||
// Constants for hashing 32 bits to kQuadKeyTable subscript/key
|
||||
const int kHashvalToSubShift;
|
||||
const uint32 kHashvalToSubMask;
|
||||
const int kHashvalToKeyShift;
|
||||
const uint32 kHashvalToKeyMask;
|
||||
const int kHashvalAssociativity;
|
||||
// Pointers to the actual tables
|
||||
const PerScriptPair* kPerScriptPair;
|
||||
const uint16* kQuadKeyTable;
|
||||
const uint32* kQuadValueTable;
|
||||
} LangDetObj;
|
||||
|
||||
// For HTML documents, tags are skipped, along with <script> ... </script>
|
||||
// and <style> ... </style> sequences, and entities are expanded.
|
||||
//
|
||||
// We distinguish between bytes of the raw input buffer and bytes of non-tag
|
||||
// text letters. Since tags can be over 50% of the bytes of an HTML Page,
|
||||
// and are nearly all seven-bit ASCII English, we prefer to distinguish
|
||||
// language mixture fractions based on just the non-tag text.
|
||||
//
|
||||
// Inputs: text and text_length
|
||||
// is_plain_text if true says to NOT parse/skip HTML tags nor entities
|
||||
// Outputs:
|
||||
// language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
|
||||
// percent3 is an array of the text percentages 0..100 of the top 3 languages
|
||||
// normalized_score3 is an array of internal scores, normalized to the
|
||||
// average score for each language over a body of training text. A
|
||||
// normalized score significantly away from 1.0 indicates very skewed text
|
||||
// or gibberish.
|
||||
//
|
||||
// text_bytes is the amount of non-tag/letters-only text found
|
||||
// is_reliable set true if the returned Language is at least 2**30 times more
|
||||
// probable than the second-best Language
|
||||
//
|
||||
// Return value: the most likely Language for the majority of the input text
|
||||
// Length 0 input and text with no reliable letter sequences returns
|
||||
// UNKNOWN_LANGUAGE
|
||||
//
|
||||
// Subsetting: For fast detection over large documents, these routines will
|
||||
// only scan up to a fixed limit (currently 160KB of non-tag letters).
|
||||
//
|
||||
|
||||
Language DetectLanguageSummaryV2(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
const CLDHints* cld_hints,
|
||||
bool allow_extended_lang,
|
||||
int flags,
|
||||
Language plus_one,
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
double* normalized_score3,
|
||||
ResultChunkVector* resultchunkvector,
|
||||
int* text_bytes,
|
||||
bool* is_reliable);
|
||||
|
||||
// For unit testing:
|
||||
// Remove portions of text that have a high density of spaces, or that are
|
||||
// overly repetitive, squeezing the remaining text in-place to the front
|
||||
// of the input buffer.
|
||||
// Return the new, possibly-shorter length
|
||||
int CheapSqueezeInplace(char* isrc, int srclen, int ichunksize);
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_IMPL_H_
|
||||
351
internal/compact_lang_det_test.cc
Normal file
351
internal/compact_lang_det_test.cc
Normal file
@@ -0,0 +1,351 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
// Test: Do language detection on input file
|
||||
// --line treat each line as a separate detection problem
|
||||
|
||||
#include <math.h> // for sqrt
|
||||
#include <stdlib.h> // for exit
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <sys/time.h> // for gettimeofday
|
||||
#include <string>
|
||||
|
||||
#include "cld2tablesummary.h"
|
||||
#include "compact_lang_det_impl.h"
|
||||
#include "debug.h"
|
||||
#include "integral_types.h"
|
||||
#include "lang_script.h"
|
||||
#include "utf8statetable.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Scaffolding
|
||||
typedef int32 Encoding;
|
||||
static const Encoding UNKNOWN_ENCODING = 0;
|
||||
|
||||
|
||||
// Linker supplies the right tables; see ScoringTables compact_lang_det_impl.cc
|
||||
// These are here JUST for printing versions
|
||||
extern const UTF8PropObj cld_generated_CjkUni_obj;
|
||||
extern const CLD2TableSummary kCjkDeltaBi_obj;
|
||||
extern const CLD2TableSummary kDistinctBiTable_obj;
|
||||
extern const CLD2TableSummary kQuad_obj;
|
||||
extern const CLD2TableSummary kDeltaOcta_obj;
|
||||
extern const CLD2TableSummary kDistinctOcta_obj;
|
||||
extern const CLD2TableSummary kOcta2_obj;
|
||||
extern const short kAvgDeltaOctaScore[];
|
||||
|
||||
bool FLAGS_cld_version = false;
|
||||
bool FLAGS_cld_html = true;
|
||||
int32 FLAGS_repeat = 1;
|
||||
bool FLAGS_plain = false;
|
||||
bool FLAGS_dbgscore = true;
|
||||
|
||||
|
||||
// Convert GetTimeOfDay output to 64-bit usec
|
||||
static inline uint64 Microseconds(const struct timeval& t) {
|
||||
// Convert to (uint64) microseconds, not (double) seconds.
|
||||
return t.tv_sec * 1000000ULL + t.tv_usec;
|
||||
}
|
||||
|
||||
#define LF 0x0a
#define CR 0x0d

// Reads one line (at most 64KB - 1 bytes) from infile into buffer and strips
// one trailing LF and then one trailing CR, so "abc\n", "abc\r\n" and "abc"
// all yield "abc".
//   infile: open stream to read from
//   buffer: caller-supplied storage of at least 64 * 1024 bytes
// Returns false when fgets() reports end-of-file or an error, true otherwise.
bool Readline(FILE* infile, char* buffer) {
  if (fgets(buffer, 64 * 1024, infile) == NULL) {
    return false;
  }
  size_t len = strlen(buffer);

  // trim CR LF
  // Each test is guarded by len > 0: an empty line ("\n") becomes length 0
  // after the LF is removed, and looking at buffer[len - 1] would then read
  // (and possibly overwrite) the byte just before the caller's buffer.
  if ((len > 0) && (buffer[len - 1] == LF)) {buffer[--len] = '\0';}
  if ((len > 0) && (buffer[len - 1] == CR)) {buffer[--len] = '\0';}
  return true;
}
|
||||
|
||||
// A line counts as a comment when it is empty, starts with '#', or starts
// with a space. Only the first byte of buffer is examined.
bool IsComment(char* buffer) {
  switch (buffer[0]) {
    case '\0':              // empty line
    case '#':
    case ' ':               // Any leading space is comment
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
|
||||
|
||||
void DumpExtLang(int flags,
|
||||
Language summary_lang,
|
||||
Language* language3, int* percent3,
|
||||
double* normalized_score3,
|
||||
int text_bytes, bool is_reliable, int in_size) {
|
||||
char temp[160];
|
||||
char* tp = temp;
|
||||
int tp_left = sizeof(temp);
|
||||
snprintf(tp, tp_left, "ExtLanguage");
|
||||
|
||||
if (language3[0] != UNKNOWN_LANGUAGE) {
|
||||
tp = temp + strlen(temp);
|
||||
tp_left = sizeof(temp) - strlen(temp);
|
||||
snprintf(tp, tp_left, " %s(%d%% %3.0fp)",
|
||||
LanguageName(language3[0]),
|
||||
percent3[0],
|
||||
normalized_score3[0]);
|
||||
|
||||
}
|
||||
if (language3[1] != UNKNOWN_LANGUAGE) {
|
||||
tp = temp + strlen(temp);
|
||||
tp_left = sizeof(temp) - strlen(temp);
|
||||
snprintf(tp, tp_left, ", %s(%d%% %3.0fp)",
|
||||
LanguageName(language3[1]),
|
||||
percent3[1],
|
||||
normalized_score3[1]);
|
||||
}
|
||||
if (language3[2] != UNKNOWN_LANGUAGE) {
|
||||
tp = temp + strlen(temp);
|
||||
tp_left = sizeof(temp) - strlen(temp);
|
||||
snprintf(tp, tp_left, ", %s(%d%% %3.0fp)",
|
||||
LanguageName(language3[2]),
|
||||
percent3[2],
|
||||
normalized_score3[2]);
|
||||
}
|
||||
|
||||
if (text_bytes > 9999) {
|
||||
tp = temp + strlen(temp);
|
||||
tp_left = sizeof(temp) - strlen(temp);
|
||||
snprintf(tp, tp_left, ", %d/%d KB of non-tag letters",
|
||||
text_bytes >> 10, in_size >> 10);
|
||||
} else {
|
||||
tp = temp + strlen(temp);
|
||||
tp_left = sizeof(temp) - strlen(temp);
|
||||
snprintf(tp, tp_left, ", %d/%d bytes of non-tag letters",
|
||||
text_bytes, in_size);
|
||||
}
|
||||
|
||||
tp = temp + strlen(temp);
|
||||
tp_left = sizeof(temp) - strlen(temp);
|
||||
snprintf(tp, tp_left, ", Summary: %s%s",
|
||||
LanguageName(summary_lang),
|
||||
is_reliable ? "" : "*");
|
||||
|
||||
printf("%s\n", temp);
|
||||
|
||||
// Also put into optional HTML output
|
||||
if ((flags & kCLDFlagHtml) != 0) {
|
||||
fprintf(stderr, "%s\n", temp);
|
||||
}
|
||||
}
|
||||
|
||||
// Writes an HTML-flavoured "Languages ..." line to stderr: up to three
// languages with their percentages, the leftover "other" percentage, the
// amount of text examined, and the summary language ("*" marks unreliable).
void DumpLanguages(Language summary_lang,
                   Language* language3, int* percent3,
                   int text_bytes, bool is_reliable, int in_size) {
  int covered_percent = 0;

  // The heading is always emitted; the top language, if any, rides with it.
  if (language3[0] == UNKNOWN_LANGUAGE) {
    fprintf(stderr, "\n<br>Languages ");
  } else {
    fprintf(stderr, "\n<br>Languages %s(%d%%)",
            LanguageName(language3[0]),
            percent3[0]);
    covered_percent += percent3[0];
  }

  // Second and third languages are comma-separated and skipped when unknown.
  for (int rank = 1; rank < 3; ++rank) {
    if (language3[rank] == UNKNOWN_LANGUAGE) {continue;}
    fprintf(stderr, ", %s(%d%%)",
            LanguageName(language3[rank]),
            percent3[rank]);
    covered_percent += percent3[rank];
  }

  fprintf(stderr, ", other(%d%%)", 100 - covered_percent);

  const bool report_in_kb = (text_bytes > 9999);
  if (report_in_kb) {
    fprintf(stderr, ", %d/%d KB of non-tag letters",
            text_bytes >> 10, in_size >> 10);
  } else {
    fprintf(stderr, ", %d/%d bytes of non-tag letters",
            text_bytes, in_size);
  }

  fprintf(stderr, ", Summary: %s%s ",
          LanguageName(summary_lang),
          is_reliable ? "" : "*");
  fprintf(stderr, "<br>\n");
}
|
||||
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
if (FLAGS_cld_version) {
|
||||
printf("%s %4dKB uni build date, bytes\n",
|
||||
"........",
|
||||
cld_generated_CjkUni_obj.total_size >> 10);
|
||||
printf("%d %4ldKB delta_bi build date, bytes\n",
|
||||
kCjkDeltaBi_obj.kCLDTableBuildDate,
|
||||
(kCjkDeltaBi_obj.kCLDTableSize *
|
||||
sizeof(IndirectProbBucket4)) >> 10);
|
||||
printf("%d %4ldKB quad build date, bytes\n",
|
||||
kQuad_obj.kCLDTableBuildDate,
|
||||
(kQuad_obj.kCLDTableSize *
|
||||
sizeof(IndirectProbBucket4)) >> 10);
|
||||
printf("%d %4ldKB delta_octa build date, bytes\n",
|
||||
kDeltaOcta_obj.kCLDTableBuildDate,
|
||||
(kDeltaOcta_obj.kCLDTableSize *
|
||||
sizeof(IndirectProbBucket4)) >> 10);
|
||||
exit(0);
|
||||
} // End FLAGS_cld_version
|
||||
|
||||
int flags = 0;
|
||||
bool get_vector = false;
|
||||
const char* fname = NULL;
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (argv[i][0] != '-') {fname = argv[i];}
|
||||
if (strcmp(argv[i], "--scoreasquads") == 0) {flags |= kCLDFlagScoreAsQuads;}
|
||||
if (strcmp(argv[i], "--html") == 0) {flags |= kCLDFlagHtml;}
|
||||
if (strcmp(argv[i], "--cr") == 0) {flags |= kCLDFlagCr;}
|
||||
if (strcmp(argv[i], "--verbose") == 0) {flags |= kCLDFlagVerbose;}
|
||||
if (strcmp(argv[i], "--echo") == 0) {flags |= kCLDFlagEcho;}
|
||||
if (strcmp(argv[i], "--vector") == 0) {get_vector = true;}
|
||||
}
|
||||
|
||||
FILE* fin;
|
||||
if (fname == NULL) {
|
||||
fin = stdin;
|
||||
} else {
|
||||
fin = fopen(fname, "rb");
|
||||
if (fin == NULL) {
|
||||
fprintf(stderr, "%s did not open\n", fname);
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
const char* tldhint = "";
|
||||
Encoding enchint = UNKNOWN_ENCODING;
|
||||
Language langhint = UNKNOWN_LANGUAGE;
|
||||
|
||||
int bytes_consumed;
|
||||
int bytes_filled;
|
||||
int error_char_count;
|
||||
bool is_reliable;
|
||||
int usec;
|
||||
char* buffer = new char[10000000]; // Max 10MB of input for this test program
|
||||
struct timeval news, newe;
|
||||
|
||||
|
||||
if ((flags & kCLDFlagHtml) != 0) {
|
||||
// Begin HTML file
|
||||
fprintf(stderr, "<html><meta charset=\"UTF-8\"><body>\n");
|
||||
fprintf(stderr, "<style media=\"print\" type=\"text/css\"> "
|
||||
":root { -webkit-print-color-adjust: exact; } </style>\n");
|
||||
fprintf(stderr, "<span style=\"font-size: 7pt\">\n");
|
||||
}
|
||||
|
||||
if ((flags & kCLDFlagHtml) != 0) {
|
||||
//// fprintf(stderr, "<html><body><span style=\"font-size: 7pt\">\n");
|
||||
//// fprintf(stderr, "<html><body><span style=\"font-size: 6pt\"><pre>\n");
|
||||
fprintf(stderr, "file = %s<br>\n", fname ? fname : "stdin");
|
||||
}
|
||||
|
||||
// Full-blown flag-bit and hints interface
|
||||
bool allow_extended_lang = true;
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
|
||||
int n = fread(buffer, 1, 10000000, fin);
|
||||
|
||||
bool ignore_7bit = false;
|
||||
|
||||
|
||||
// Detect language
|
||||
Language summary_lang = UNKNOWN_LANGUAGE;
|
||||
|
||||
Language language3[3];
|
||||
int percent3[3];
|
||||
double normalized_score3[3];
|
||||
ResultChunkVector resultchunkvector;
|
||||
bool is_plain_text = FLAGS_plain;
|
||||
int text_bytes;
|
||||
|
||||
CLDHints cldhints = {NULL, tldhint, enchint, langhint};
|
||||
|
||||
gettimeofday(&news, NULL);
|
||||
for (int i = 0; i < FLAGS_repeat; ++i) {
|
||||
summary_lang = CLD2::DetectLanguageSummaryV2(
|
||||
buffer,
|
||||
n,
|
||||
is_plain_text,
|
||||
&cldhints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
get_vector ? &resultchunkvector : NULL,
|
||||
&text_bytes,
|
||||
&is_reliable);
|
||||
}
|
||||
gettimeofday(&newe, NULL);
|
||||
|
||||
if (get_vector) {
|
||||
DumpResultChunkVector(stderr, buffer, &resultchunkvector);
|
||||
}
|
||||
|
||||
DumpExtLang(flags, summary_lang, language3, percent3, normalized_score3,
|
||||
text_bytes, is_reliable, n);
|
||||
|
||||
if ((flags & kCLDFlagHtml) != 0) {
|
||||
DumpLanguages(summary_lang,
|
||||
language3, percent3, text_bytes, is_reliable, n);
|
||||
}
|
||||
|
||||
usec = static_cast<int>(Microseconds(newe) - Microseconds(news));
|
||||
if (usec == 0) {usec = 1;}
|
||||
printf(" SummaryLanguage %s%s at %u of %d %uus (%d MB/sec), %s\n",
|
||||
LanguageName(summary_lang),
|
||||
is_reliable ? "" : "(un-reliable)",
|
||||
bytes_consumed,
|
||||
n,
|
||||
usec,
|
||||
n / usec,
|
||||
argv[1]);
|
||||
|
||||
if ((flags & kCLDFlagHtml) != 0) {
|
||||
fprintf(stderr, "\n</span></body></html><br>");
|
||||
}
|
||||
|
||||
fclose(fin);
|
||||
delete[] buffer;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
return CLD2::main(argc, argv);
|
||||
}
|
||||
|
||||
BIN
internal/compact_lang_det_test_chrome0614
Executable file
BIN
internal/compact_lang_det_test_chrome0614
Executable file
Binary file not shown.
39
internal/compile.sh
Executable file
39
internal/compile.sh
Executable file
@@ -0,0 +1,39 @@
|
||||
#!/bin/sh
# Builds the three CLD2 test binaries. They share every library and table
# source; only the driver file, extra compiler flags and output name differ.
#
# Each build reports "compiled" only if g++ actually succeeded. A failed
# build is reported on stderr, the remaining builds are still attempted, and
# the script exits non-zero if any build failed.

# Sources common to all three binaries (split on whitespace when used).
CLD2_SOURCES="cldutil.cc cldutil_shared.cc compact_lang_det.cc compact_lang_det_hint_code.cc \
  compact_lang_det_impl.cc debug.cc fixunicodevalue.cc \
  generated_entities.cc generated_language.cc generated_ulscript.cc \
  getonescriptspan.cc lang_script.cc offsetmap.cc scoreonescriptspan.cc \
  tote.cc utf8statetable.cc \
  cld_generated_cjk_uni_prop_80.cc cld2_generated_cjk_compatible.cc \
  cld_generated_cjk_delta_bi_4.cc generated_distinct_bi_0.cc \
  cld2_generated_quadchrome0604.cc cld2_generated_deltaoctachrome0614.cc \
  cld2_generated_distinctoctachrome0604.cc cld_generated_score_quad_octa_1024_256.cc"

status=0

# build OUTPUT LABEL [EXTRA_FLAGS...] DRIVER.cc
#   OUTPUT: name of the binary to produce
#   LABEL:  text used in the progress message
#   rest:   passed to g++ ahead of the common sources
build() {
  output="$1"
  label="$2"
  shift 2
  # $CLD2_SOURCES is intentionally unquoted so it expands to one word per file.
  if g++ -O2 -m64 "$@" $CLD2_SOURCES -o "$output"; then
    echo " $label compiled"
  else
    echo " $label FAILED to compile" >&2
    status=1
  fi
}

build compact_lang_det_test_chrome0614 "compact_lang_det_test_chrome0614" \
  compact_lang_det_test.cc

build cld2_unittest "cld2_unittest" \
  cld2_unittest.cc

build cld2_unittest_avoid "cld2_unittest -Davoid_utf8_string_constants" \
  -Davoid_utf8_string_constants cld2_unittest.cc

exit $status
|
||||
|
||||
468
internal/debug.cc
Normal file
468
internal/debug.cc
Normal file
@@ -0,0 +1,468 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
#include "debug.h"
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
|
||||
#include "cldutil.h"
|
||||
#include "getonescriptspan.h"
|
||||
#include "lang_script.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Debug output string of one unigram
|
||||
string GetUniAt(const char* text) {
|
||||
string retval;
|
||||
retval.clear();
|
||||
int uni_len = UniLen(text);
|
||||
retval.append(text, uni_len);
|
||||
return retval;
|
||||
}
|
||||
|
||||
// Debug output string of one bigram
|
||||
string GetBiAt(const char* text) {
|
||||
string retval;
|
||||
retval.clear();
|
||||
int bi_len = BiLen(text);
|
||||
retval.append(text, bi_len);
|
||||
return retval;
|
||||
}
|
||||
|
||||
// Debug output string of one quadgram, including underscores
|
||||
string GetQuadAt(const char* text) {
|
||||
string retval;
|
||||
retval.clear();
|
||||
if (text[-1] == ' ') {retval.append("_");}
|
||||
int quad_len = QuadLen(text);
|
||||
retval.append(text, quad_len);
|
||||
if (text[quad_len] == ' ') {retval.append("_");}
|
||||
return retval;
|
||||
}
|
||||
|
||||
// Debug output string of one octagram, including underscores
|
||||
string GetOctaAt(const char* text) {
|
||||
string retval;
|
||||
retval.clear();
|
||||
if (text[-1] == ' ') {retval.append("_");}
|
||||
int octa_len = OctaLen(text);
|
||||
retval.append(text, octa_len);
|
||||
if (text[octa_len] == ' ') {retval.append("_");}
|
||||
return retval;
|
||||
}
|
||||
|
||||
// Debug output string of two octagrams, including underscores
|
||||
string GetOcta2At(const char* text) {
|
||||
string retval;
|
||||
retval.clear();
|
||||
if (text[-1] == ' ') {retval.append("_");}
|
||||
int octa_len = OctaLen(text);
|
||||
retval.append(text, octa_len);
|
||||
if (text[octa_len] == ' ') {retval.append("_");}
|
||||
text += (octa_len + 1);
|
||||
int octa2_len = OctaLen(text);
|
||||
retval.append(text, octa2_len);
|
||||
if (text[octa2_len] == ' ') {retval.append("_");}
|
||||
return retval;
|
||||
}
|
||||
|
||||
// Debug output string of one formatted pslang,qprob pair
|
||||
string FmtLP(ULScript ulscript, uint8 pslang, uint8 qprob) {
|
||||
string retval;
|
||||
retval.clear();
|
||||
Language lang = FromPerScriptNumber(ulscript, pslang);
|
||||
char temp[16];
|
||||
sprintf(temp, "%s.%d", LanguageCode(lang), qprob);
|
||||
retval.append(temp);
|
||||
return retval;
|
||||
}
|
||||
|
||||
// Debug output string of one formatted langprob
|
||||
// Returns "en.24 fr.10 es.4"
|
||||
string GetLangProbTxt(const ScoringContext* scoringcontext, uint32 langprob) {
|
||||
/*const uint16* pslangtolang = scoringcontext->pslangtolang;*/
|
||||
string retval;
|
||||
retval.clear();
|
||||
uint8 prob123 = (langprob >> 0) & 0xff;
|
||||
const uint8* prob123_entry = LgProb2TblEntry(prob123);
|
||||
uint8 top1 = (langprob >> 8) & 0xff;
|
||||
if (top1 > 0) {
|
||||
retval.append(FmtLP(scoringcontext->ulscript,
|
||||
top1, LgProb3(prob123_entry, 0)));
|
||||
}
|
||||
uint8 top2 = (langprob >> 16) & 0xff;
|
||||
if (top2 > 0) {
|
||||
if (!retval.empty()) {retval.append("~");}
|
||||
retval.append(FmtLP(scoringcontext->ulscript,
|
||||
top2, LgProb3(prob123_entry, 1)));
|
||||
}
|
||||
uint8 top3 = (langprob >> 24) & 0xff;
|
||||
if (top3 > 0) {
|
||||
if (!retval.empty()) {retval.append("~");}
|
||||
retval.append(FmtLP(scoringcontext->ulscript,
|
||||
top3, LgProb3(prob123_entry, 2)));
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
|
||||
// Debug output string of one or two formatted quadgram langprobs
|
||||
string GetScoreTxt(const ScoringContext* scoringcontext,
|
||||
const CLD2TableSummary* base_obj, int indirect) {
|
||||
string retval;
|
||||
retval.clear();
|
||||
if (indirect < base_obj->kCLDTableSizeOne) {
|
||||
// Up to three languages at indirect
|
||||
uint32 langprob = base_obj->kCLDTableInd[indirect];
|
||||
retval.append(GetLangProbTxt(scoringcontext, langprob));
|
||||
} else {
|
||||
// Up to six languages at start + 2 * (indirect - start)
|
||||
indirect += (indirect - base_obj->kCLDTableSizeOne);
|
||||
uint32 langprob = base_obj->kCLDTableInd[indirect];
|
||||
uint32 langprob2 = base_obj->kCLDTableInd[indirect + 1];
|
||||
retval.append(GetLangProbTxt(scoringcontext, langprob));
|
||||
if (!retval.empty()) {retval.append("~");}
|
||||
retval.append(GetLangProbTxt(scoringcontext, langprob2));
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
|
||||
// 16 background colors, perhaps from the low 4 bits of the language number
|
||||
static const int kLangBackground[16] = {
|
||||
0xffd8d8, 0xf8ffd8, 0xd8ffe7, 0xd8f3ff,
|
||||
0xefd8ff, 0xffd8eb, 0xfff7d8, 0xe3ffd8,
|
||||
0xd8ffff, 0xe3d8ff, 0xffd8f7, 0xffebd8,
|
||||
0xefffd8, 0xd8fff3, 0xd8e7ff, 0xf8d8ff,
|
||||
};
|
||||
|
||||
// 16 text colors, perhaps from the high 4 bits of the language number
|
||||
// 00..7f
|
||||
static const int kLangColor[16] = {
|
||||
0x000000, 0x7f2f00, 0x7f5f00, 0x6f7f00, // first 16 lang: black text
|
||||
0x3f7f00, 0x0f7f00, 0x007f1f, 0x007f4f,
|
||||
0x007f7f, 0x004f7f, 0x001f7f, 0x0f007f,
|
||||
0x3f007f, 0x6f007f, 0x7f005f, 0x7f002f,
|
||||
};
|
||||
|
||||
static const int kUnscoredText = 0xb0b0b0; // medium-light gray
|
||||
static const int kUnscoredBackground = 0xffffff; // white
|
||||
static const int kIgnoremeText = 0x8090a0; // medium-light green-gray
|
||||
static const int kIgnoremeBackground = 0xffeecc; // light orange
|
||||
static const int kEnglishBackground = 0xfffff4; // very light yellow
|
||||
|
||||
// Returns the 0xRRGGBB background color used when displaying text in lang.
// ENGLISH and TG_UNKNOWN_LANGUAGE have dedicated colors, UNKNOWN_LANGUAGE and
// negative values get the unscored background, and every other language picks
// from kLangBackground by its low 4 bits.
// If lighten is true, each channel c of the result becomes 0x80 | (c >> 1).
static int GetBackColor(Language lang, bool lighten) {
  // Default covers UNKNOWN_LANGUAGE and negative language numbers
  int color = kUnscoredBackground;
  if (lang == ENGLISH) {
    color = kEnglishBackground;
  } else if (lang == TG_UNKNOWN_LANGUAGE) {
    color = kIgnoremeBackground;
  } else if ((lang != UNKNOWN_LANGUAGE) && (lang >= 0)) {
    color = kLangBackground[lang & 0x0f];
  }

  // Make 1/2 as far away from white. Bits shifted across a channel boundary
  // land in a top bit that the OR forces to 1 anyway.
  return lighten ? ((color >> 1) | 0x808080) : color;
}
|
||||
|
||||
// Returns the 0xRRGGBB text color used when displaying text in lang.
// TG_UNKNOWN_LANGUAGE has a dedicated color, UNKNOWN_LANGUAGE and negative
// values get the unscored gray, and every other language picks from
// kLangColor by bits 4..7 of its number.
// If lighten is true, each channel c of the result becomes 0x80 | (c >> 1).
static int GetTextColor(Language lang, bool lighten) {
  // Default covers UNKNOWN_LANGUAGE and negative language numbers
  int color = kUnscoredText;
  if (lang == TG_UNKNOWN_LANGUAGE) {
    color = kIgnoremeText;
  } else if ((lang != UNKNOWN_LANGUAGE) && (lang >= 0)) {
    color = kLangColor[(lang >> 4) & 0x0f];
  }

  // Make 1/2 as far away from white
  return lighten ? ((color >> 1) | 0x808080) : color;
}
|
||||
|
||||
// Returns a copy of txt in which every line feed ('\n') and carriage return
// ('\r') is replaced by a single space, so the text prints on one line.
// All other bytes are copied unchanged.
string GetPlainEscapedText(const string& txt) {
  string retval(txt);
  // size_type index: avoids comparing a signed int against the unsigned size()
  for (string::size_type i = 0; i < retval.size(); ++i) {
    if ((retval[i] == '\n') || (retval[i] == '\r')) {
      retval[i] = ' ';
    }
  }
  return retval;
}
|
||||
|
||||
// Returns a copy of txt that is safe to embed in HTML output.
// The five markup-significant characters are replaced by named character
// references, each spelled ampersand + name + semicolon:
//   '<' -> lt, '>' -> gt, '&' -> amp, '\'' -> apos, '"' -> quot
// Line feed and carriage return each become a single space; every other byte
// is copied unchanged.
string GetHtmlEscapedText(const string& txt) {
  string retval;
  retval.reserve(txt.size());
  // size_type index: avoids comparing a signed int against the unsigned size()
  for (string::size_type i = 0; i < txt.size(); ++i) {
    const char c = txt[i];
    switch (c) {
      case '<':
        retval.append("&lt;");
        break;
      case '>':
        retval.append("&gt;");
        break;
      case '&':
        retval.append("&amp;");
        break;
      case '\'':
        retval.append("&apos;");
        break;
      case '"':
        retval.append("&quot;");
        break;
      case '\n':
      case '\r':
        retval.append(" ");
        break;
      default:
        retval.append(1, c);
        break;
    }
  }
  return retval;
}
|
||||
|
||||
// Returns txt, HTML-escaped, wrapped in a <span> whose background and text
// colors are the unlightened colors for lang.
string GetColorHtmlEscapedText(Language lang, const string& txt) {
  const int back_color = GetBackColor(lang, false);
  const int text_color = GetTextColor(lang, false);

  // The formatted opening tag is about 50 characters, well within 64
  char open_tag[64];
  sprintf(open_tag, " <span style=\"background:#%06X;color:#%06X;\">\n",
          back_color,
          text_color);

  string html(open_tag);
  html += GetHtmlEscapedText(txt);
  html += "</span>";
  return html;
}
|
||||
|
||||
// Returns "[<language code>]" followed by txt as a colored, HTML-escaped span
// (see GetColorHtmlEscapedText).
string GetLangColorHtmlEscapedText(Language lang, const string& txt) {
  // Build the tag with string appends instead of sprintf("%s") into a
  // fixed-size stack buffer, so the length of the code cannot overflow it.
  string esc_txt("[");
  esc_txt.append(LanguageCode(lang));
  esc_txt.append("]");
  esc_txt.append(GetColorHtmlEscapedText(lang, txt));
  return esc_txt;
}
|
||||
|
||||
|
||||
// For showing one chunk
|
||||
// Print debug output for one scored chunk
|
||||
// Optionally print out per-chunk scoring information
|
||||
// In degenerate cases, hitbuffer and cspan can be NULL
|
||||
void CLD2_Debug(const char* text,
|
||||
int lo_offset,
|
||||
int hi_offset,
|
||||
bool more_to_come, bool score_cjk,
|
||||
const ScoringHitBuffer* hitbuffer,
|
||||
const ScoringContext* scoringcontext,
|
||||
const ChunkSpan* cspan,
|
||||
const ChunkSummary* chunksummary) {
|
||||
FILE* df = scoringcontext->debug_file;
|
||||
if (df == NULL) {return;}
|
||||
|
||||
if (scoringcontext->flags_cld2_verbose &&
|
||||
(hitbuffer != NULL) &&
|
||||
(cspan != NULL) && (hitbuffer->next_linear > 0)) {
|
||||
int base_limit = cspan->chunk_base + cspan->base_len;
|
||||
for (int i = cspan->chunk_base; i < base_limit; ++i) {
|
||||
int ngram_start = hitbuffer->linear[i].offset;
|
||||
uint32 langprob = hitbuffer->linear[i].langprob;
|
||||
string ngram_text;
|
||||
switch (hitbuffer->linear[i].type) {
|
||||
case UNIHIT:
|
||||
ngram_text = GetUniAt(&text[ngram_start]);
|
||||
break;
|
||||
case QUADHIT:
|
||||
ngram_text = GetQuadAt(&text[ngram_start]);
|
||||
break;
|
||||
case DELTAHIT:
|
||||
case DISTINCTHIT:
|
||||
if (score_cjk) {
|
||||
ngram_text = GetBiAt(&text[ngram_start]);
|
||||
} else {
|
||||
// TODO: figure out how to display optional two words
|
||||
ngram_text = GetOctaAt(&text[ngram_start]);
|
||||
}
|
||||
break;
|
||||
}
|
||||
string score_text = GetLangProbTxt(scoringcontext, langprob);
|
||||
fprintf(df, "%c:%s=%s ",
|
||||
"UQLD"[hitbuffer->linear[i].type],
|
||||
ngram_text.c_str(),
|
||||
score_text.c_str());
|
||||
}
|
||||
fprintf(df, "<br>\n");
|
||||
|
||||
// Score boosts for langprior and distinct tokens
|
||||
// Get boosts for current script
|
||||
const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
|
||||
const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
|
||||
if (scoringcontext->ulscript != ULScript_Latin) {
|
||||
langprior_boost = &scoringcontext->langprior_boost.othr;
|
||||
distinct_boost = &scoringcontext->distinct_boost.othr;
|
||||
}
|
||||
fprintf(df, "LangPrior_boost: ");
|
||||
for (int k = 0; k < kMaxBoosts; ++k) {
|
||||
uint32 langprob = langprior_boost->langprob[k];
|
||||
if (langprob > 0) {
|
||||
fprintf(df, "%s ",
|
||||
GetLangProbTxt(scoringcontext, langprob).c_str());
|
||||
}
|
||||
}
|
||||
fprintf(df, "Distinct_boost: ");
|
||||
for (int k = 0; k < kMaxBoosts; ++k) {
|
||||
uint32 langprob = distinct_boost->langprob[k];
|
||||
if (langprob > 0) {
|
||||
fprintf(df, "%s ",
|
||||
GetLangProbTxt(scoringcontext, langprob).c_str());
|
||||
}
|
||||
}
|
||||
fprintf(df, "<br>\n");
|
||||
|
||||
// Print chunksummary
|
||||
fprintf(df, "%s.%d %s.%d %dB %d# %s %dRd %dRs<br>\n",
|
||||
LanguageCode(static_cast<Language>(chunksummary->lang1)),
|
||||
chunksummary->score1,
|
||||
LanguageCode(static_cast<Language>(chunksummary->lang2)),
|
||||
chunksummary->score2,
|
||||
chunksummary->bytes,
|
||||
chunksummary->grams,
|
||||
ULScriptCode(static_cast<ULScript>(chunksummary->ulscript)),
|
||||
chunksummary->reliability_delta,
|
||||
chunksummary->reliability_score);
|
||||
} // End flags_cld2_verbose linear
|
||||
|
||||
|
||||
// Print annotated colored text of this chunk
|
||||
bool is_reliable = true;
|
||||
bool match_prior = false;
|
||||
int reliable = CLD2::minint(chunksummary->reliability_delta,
|
||||
chunksummary->reliability_score);
|
||||
is_reliable = (reliable >= 75);
|
||||
match_prior = (chunksummary->lang1 == scoringcontext->prior_chunk_lang);
|
||||
if (!is_reliable) {match_prior = false;}
|
||||
|
||||
if (match_prior) {
|
||||
fprintf(df, "[]");
|
||||
} else if (is_reliable) {
|
||||
fprintf(df, "[%s]",
|
||||
LanguageCode(static_cast<Language>(chunksummary->lang1)));
|
||||
} else {
|
||||
fprintf(df, "[%s*.%d/%s.%d]",
|
||||
LanguageCode(static_cast<Language>(chunksummary->lang1)),
|
||||
chunksummary->score1,
|
||||
LanguageCode(static_cast<Language>(chunksummary->lang2)),
|
||||
chunksummary->score2);
|
||||
}
|
||||
|
||||
int chunktext_len = hi_offset - lo_offset;
|
||||
if (chunktext_len < 0) {
|
||||
chunktext_len = 0;
|
||||
fprintf(df, " LEN_ERR hi %d lo %d<br>\n", hi_offset, lo_offset);
|
||||
}
|
||||
string chunk_text(&text[lo_offset], chunktext_len);
|
||||
|
||||
Language lang = static_cast<Language>(chunksummary->lang1);
|
||||
fprintf(df, " <span style=\"background:#%06X;color:#%06X;\">\n",
|
||||
GetBackColor(lang, false),
|
||||
GetTextColor(lang, false));
|
||||
fprintf(df, "%s", chunk_text.c_str());
|
||||
if (scoringcontext->flags_cld2_cr) {
|
||||
fprintf(df, "</span><br>\n");
|
||||
} else {
|
||||
fprintf(df, "</span> \n");
|
||||
}
|
||||
}
|
||||
|
||||
// For showing all chunks
|
||||
void CLD2_Debug2(const char* text,
|
||||
bool more_to_come, bool score_cjk,
|
||||
const ScoringHitBuffer* hitbuffer,
|
||||
const ScoringContext* scoringcontext,
|
||||
const SummaryBuffer* summarybuffer) {
|
||||
FILE* df = scoringcontext->debug_file;
|
||||
if (df == NULL) {return;}
|
||||
uint16 prior_chunk_lang = static_cast<uint16>(UNKNOWN_LANGUAGE);
|
||||
|
||||
for (int i = 0; i < summarybuffer->n; ++i) {
|
||||
fprintf(df, "Debug2[%d] ", i);
|
||||
const ChunkSummary* chunksummary = &summarybuffer->chunksummary[i];
|
||||
// Print annotated colored text of this chunk
|
||||
bool is_reliable = true;
|
||||
bool match_prior = false;
|
||||
int reliable = CLD2::minint(chunksummary->reliability_delta,
|
||||
chunksummary->reliability_score);
|
||||
is_reliable = (reliable >= 75);
|
||||
match_prior = (chunksummary->lang1 == prior_chunk_lang);
|
||||
if (!is_reliable) {match_prior = false;}
|
||||
|
||||
if (match_prior) {
|
||||
fprintf(df, "[]");
|
||||
} else if (is_reliable) {
|
||||
fprintf(df, "[%s]",
|
||||
LanguageCode(static_cast<Language>(chunksummary->lang1)));
|
||||
} else {
|
||||
fprintf(df, "[%s*.%d/%s.%d]",
|
||||
LanguageCode(static_cast<Language>(chunksummary->lang1)),
|
||||
chunksummary->score1,
|
||||
LanguageCode(static_cast<Language>(chunksummary->lang2)),
|
||||
chunksummary->score2);
|
||||
}
|
||||
|
||||
int lo_offset = chunksummary->offset;
|
||||
int chunktext_len = chunksummary->bytes;
|
||||
string chunk_text(&text[lo_offset], chunktext_len);
|
||||
|
||||
Language lang = static_cast<Language>(chunksummary->lang1);
|
||||
fprintf(df, " <span style=\"background:#%06X;color:#%06X;\">\n",
|
||||
GetBackColor(lang, false),
|
||||
GetTextColor(lang, false));
|
||||
fprintf(df, "%s", chunk_text.c_str());
|
||||
if (scoringcontext->flags_cld2_cr) {
|
||||
fprintf(df, "</span><br>\n");
|
||||
} else {
|
||||
fprintf(df, "</span> \n");
|
||||
}
|
||||
prior_chunk_lang = chunksummary->lang1;
|
||||
}
|
||||
}
|
||||
|
||||
void DumpResultChunkVector(FILE* f, const char* src,
|
||||
ResultChunkVector* resultchunkvector) {
|
||||
fprintf(f, "DumpResultChunkVector[%ld]<br>\n", resultchunkvector->size());
|
||||
for (int i = 0; i < resultchunkvector->size(); ++i) {
|
||||
ResultChunk* rc = &(*resultchunkvector)[i];
|
||||
Language lang1 = static_cast<Language>(rc->lang1);
|
||||
string this_chunk = string(src, rc->offset, rc->bytes);
|
||||
fprintf(f, "[%d]{%d %d %s} ", i, rc->offset, rc->bytes, LanguageCode(lang1));
|
||||
fprintf(f, "%s<br>\n", GetColorHtmlEscapedText(lang1, this_chunk).c_str());
|
||||
}
|
||||
fprintf(f, "<br>\n");
|
||||
}
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
|
||||
58
internal/debug.h
Normal file
58
internal/debug.h
Normal file
@@ -0,0 +1,58 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
// Produces debugging output for CLD2. See debug_empty.cc for suppressing this.
|
||||
|
||||
|
||||
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_DEBUG_H_
|
||||
#define I18N_ENCODINGS_CLD2_INTERNAL_DEBUG_H_
|
||||
|
||||
#include <string>
|
||||
#include "scoreonescriptspan.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// For showing one chunk
|
||||
void CLD2_Debug(const char* text,
|
||||
int lo_offset,
|
||||
int hi_offset,
|
||||
bool more_to_come, bool score_cjk,
|
||||
const ScoringHitBuffer* hitbuffer,
|
||||
const ScoringContext* scoringcontext,
|
||||
const ChunkSpan* cspan,
|
||||
const ChunkSummary* chunksummary);
|
||||
|
||||
// For showing all chunks
|
||||
void CLD2_Debug2(const char* text,
|
||||
bool more_to_come, bool score_cjk,
|
||||
const ScoringHitBuffer* hitbuffer,
|
||||
const ScoringContext* scoringcontext,
|
||||
const SummaryBuffer* summarybuffer);
|
||||
|
||||
std::string GetPlainEscapedText(const std::string& txt);
|
||||
std::string GetHtmlEscapedText(const std::string& txt);
|
||||
std::string GetColorHtmlEscapedText(Language lang, const std::string& txt);
|
||||
std::string GetLangColorHtmlEscapedText(Language lang, const std::string& txt);
|
||||
|
||||
void DumpResultChunkVector(FILE* f, const char* src,
|
||||
ResultChunkVector* resultchunkvector);
|
||||
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_CLD2_INTERNAL_DEBUG_H_
|
||||
|
||||
64
internal/debug_empty.cc
Normal file
64
internal/debug_empty.cc
Normal file
@@ -0,0 +1,64 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
// Compile this in instead of debug.cc to remove code for debug output
|
||||
//
|
||||
|
||||
#include "debug.h"
|
||||
#include <string>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
string GetPlainEscapedText(const string& txt) {return string("");}
|
||||
|
||||
string GetHtmlEscapedText(const string& txt) {return string("");}
|
||||
|
||||
string GetColorHtmlEscapedText(Language lang, const string& txt) {
|
||||
return string("");
|
||||
}
|
||||
|
||||
string GetLangColorHtmlEscapedText(Language lang, const string& txt) {
|
||||
return string("");
|
||||
}
|
||||
|
||||
|
||||
// For showing one chunk
|
||||
// Print debug output for one scored chunk
|
||||
// Optionally print out per-chunk scoring information
|
||||
// In degenerate cases, hitbuffer and cspan can be NULL
|
||||
void CLD2_Debug(const char* text,
|
||||
int lo_offset,
|
||||
int hi_offset,
|
||||
bool more_to_come, bool score_cjk,
|
||||
const ScoringHitBuffer* hitbuffer,
|
||||
const ScoringContext* scoringcontext,
|
||||
const ChunkSpan* cspan,
|
||||
const ChunkSummary* chunksummary) {}
|
||||
|
||||
// For showing all chunks
|
||||
void CLD2_Debug2(const char* text,
|
||||
bool more_to_come, bool score_cjk,
|
||||
const ScoringHitBuffer* hitbuffer,
|
||||
const ScoringContext* scoringcontext,
|
||||
const SummaryBuffer* summarybuffer) {}
|
||||
|
||||
void DumpResultChunkVector(FILE* f, const char* src,
|
||||
ResultChunkVector* resultchunkvector) {}
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
54
internal/fixunicodevalue.cc
Normal file
54
internal/fixunicodevalue.cc
Normal file
@@ -0,0 +1,54 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Routine that maps a Unicode code point to an interchange-valid one
|
||||
//
|
||||
|
||||
#include "fixunicodevalue.h"
|
||||
#include "integral_types.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Guarantees that the resulting output value is interchange valid
|
||||
// 00-FF; map to spaces or MS CP1252
|
||||
// D800-DFFF; surrogates
|
||||
// FDD0-FDEF; non-characters
|
||||
// xxFFFE-xxFFFF; non-characters
|
||||
char32 FixUnicodeValue(char32 uv) {
|
||||
uint32 uuv = static_cast<uint32>(uv);
|
||||
if (uuv < 0x0100) {
|
||||
return kMapFullMicrosoft1252OrSpace[uuv];
|
||||
}
|
||||
if (uuv < 0xD800) {
|
||||
return uv;
|
||||
}
|
||||
if ((uuv & ~0x0F) == 0xFDD0) { // non-characters
|
||||
return 0xFFFD;
|
||||
}
|
||||
if ((uuv & ~0x0F) == 0xFDE0) { // non-characters
|
||||
return 0xFFFD;
|
||||
}
|
||||
if ((uuv & 0x00FFFE) == 0xFFFE) { // non-characters
|
||||
return 0xFFFD;
|
||||
}
|
||||
if ((0xE000 <= uuv) && (uuv <= 0x10FFFF)) {
|
||||
return uv;
|
||||
}
|
||||
// surrogates and negative and > 0x10FFFF all land here
|
||||
return 0xFFFD;
|
||||
}
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
68
internal/fixunicodevalue.h
Normal file
68
internal/fixunicodevalue.h
Normal file
@@ -0,0 +1,68 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Routine that maps a Unicode code point to an interchange-valid one
|
||||
//
|
||||
// Table that maps MS CP1252 bytes 00-FF to their corresponding Unicode
|
||||
// code points. C0 and C1 control codes that are not interchange-valid
|
||||
// are mapped to spaces.
|
||||
|
||||
|
||||
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_FIXUNICODEVALUE_H__
|
||||
#define I18N_ENCODINGS_CLD2_INTERNAL_FIXUNICODEVALUE_H__
|
||||
|
||||
#include "integral_types.h" // for char32
|
||||
#include "port.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Map byte value 0000-00FF to char32
|
||||
// Maps C0 control codes (other than CR LF HT FF) to space [29 instances including DEL=0x7F]
|
||||
// Maps C1 control codes to CP1252 [27 instances] or space [5 instances]
|
||||
static const char32 kMapFullMicrosoft1252OrSpace[256] = {
|
||||
0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x09,0x0a,0x20, 0x0c,0x0d,0x20,0x20, // 00
|
||||
0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20, 0x20,0x20,0x20,0x20,
|
||||
0x20,0x21,0x22,0x23, 0x24,0x25,0x26,0x27, 0x28,0x29,0x2a,0x2b, 0x2c,0x2d,0x2e,0x2f,
|
||||
0x30,0x31,0x32,0x33, 0x34,0x35,0x36,0x37, 0x38,0x39,0x3a,0x3b, 0x3c,0x3d,0x3e,0x3f,
|
||||
|
||||
0x40,0x41,0x42,0x43, 0x44,0x45,0x46,0x47, 0x48,0x49,0x4a,0x4b, 0x4c,0x4d,0x4e,0x4f, // 40
|
||||
0x50,0x51,0x52,0x53, 0x54,0x55,0x56,0x57, 0x58,0x59,0x5a,0x5b, 0x5c,0x5d,0x5e,0x5f,
|
||||
0x60,0x61,0x62,0x63, 0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b, 0x6c,0x6d,0x6e,0x6f,
|
||||
0x70,0x71,0x72,0x73, 0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x7b, 0x7c,0x7d,0x7e,0x20,
|
||||
|
||||
0x20ac,0x20,0x201a,0x0192, 0x201e,0x2026,0x2020,0x2021, // 80
|
||||
0x02c6,0x2030,0x0160,0x2039, 0x0152,0x20,0x017d,0x20,
|
||||
0x20,0x2018,0x2019,0x201c, 0x201d,0x2022,0x2013,0x2014,
|
||||
0x02dc,0x2122,0x0161,0x203a, 0x0153,0x20,0x017e,0x0178,
|
||||
0xa0,0xa1,0xa2,0xa3, 0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab, 0xac,0xad,0xae,0xaf, // A0
|
||||
0xb0,0xb1,0xb2,0xb3, 0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb, 0xbc,0xbd,0xbe,0xbf,
|
||||
|
||||
0xc0,0xc1,0xc2,0xc3, 0xc4,0xc5,0xc6,0xc7, 0xc8,0xc9,0xca,0xcb, 0xcc,0xcd,0xce,0xcf, // C0
|
||||
0xd0,0xd1,0xd2,0xd3, 0xd4,0xd5,0xd6,0xd7, 0xd8,0xd9,0xda,0xdb, 0xdc,0xdd,0xde,0xdf,
|
||||
0xe0,0xe1,0xe2,0xe3, 0xe4,0xe5,0xe6,0xe7, 0xe8,0xe9,0xea,0xeb, 0xec,0xed,0xee,0xef,
|
||||
0xf0,0xf1,0xf2,0xf3, 0xf4,0xf5,0xf6,0xf7, 0xf8,0xf9,0xfa,0xfb, 0xfc,0xfd,0xfe,0xff,
|
||||
};
|
||||
|
||||
// Guarantees that the resulting output value is interchange valid
|
||||
// 00-FF; map to spaces or MS CP1252
|
||||
// D800-DFFF; surrogates
|
||||
// FDD0-FDEF; non-characters
|
||||
// xxFFFE-xxFFFF; non-characters
|
||||
char32 FixUnicodeValue(char32 uv);
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_CLD2_INTERNAL_FIXUNICODEVALUE_H__
|
||||
|
||||
51
internal/generated_distinct_bi_0.cc
Normal file
51
internal/generated_distinct_bi_0.cc
Normal file
@@ -0,0 +1,51 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Degenerate CLD2 scoring lookup table, for use as placeholder
|
||||
//
|
||||
#include "cld2tablesummary.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
static const int kDistinctBiTableBuildDate = 20130101; // yyyymmdd
|
||||
static const int kDistinctBiTableSizeOne = 1; // One-langprob Bucket count
|
||||
static const int kDistinctBiTableSize = 1; // Total Bucket count
|
||||
static const int kDistinctBiTableKeyMask = 0xffffffff; // Mask hash key
|
||||
static const char* const kDistinctBiTableRecognizedLangScripts = "";
|
||||
|
||||
// Empty table
|
||||
static const IndirectProbBucket4 kDistinctBiTable[kDistinctBiTableSize] = {
|
||||
// key[4], words[4] in UTF-8
|
||||
// value[4]
|
||||
{ {0x00000000,0x00000000,0x00000000,0x00000000}}, // [000]
|
||||
};
|
||||
|
||||
static const uint32 kDistinctBiTableInd[1] = {
|
||||
// [0000]
|
||||
0x00000000, };
|
||||
|
||||
extern const CLD2TableSummary kDistinctBiTable_obj = {
|
||||
kDistinctBiTable,
|
||||
kDistinctBiTableInd,
|
||||
kDistinctBiTableSizeOne,
|
||||
kDistinctBiTableSize,
|
||||
kDistinctBiTableKeyMask,
|
||||
kDistinctBiTableBuildDate,
|
||||
kDistinctBiTableRecognizedLangScripts,
|
||||
};
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
// End of generated tables
|
||||
294
internal/generated_entities.cc
Normal file
294
internal/generated_entities.cc
Normal file
@@ -0,0 +1,294 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// generated_entities.cc
|
||||
// Machine generated. Do Not Edit.
|
||||
//
|
||||
// Declarations for HTML entities recognized by CLD2
|
||||
//
|
||||
#include "generated_ulscript.h" // for CharIntPair
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Alphabetical order for binary search
|
||||
extern const int kNameToEntitySize = 265;
|
||||
extern const CharIntPair kNameToEntity[kNameToEntitySize] = {
|
||||
{"AElig", 198},
|
||||
{"AMP", 38},
|
||||
{"Aacute", 193},
|
||||
{"Acirc", 194},
|
||||
{"Agrave", 192},
|
||||
{"Alpha", 913},
|
||||
{"Aring", 197},
|
||||
{"Atilde", 195},
|
||||
{"Auml", 196},
|
||||
{"Beta", 914},
|
||||
{"Ccaron", 268},
|
||||
{"Ccedil", 199},
|
||||
{"Chi", 935},
|
||||
{"Dagger", 8225},
|
||||
{"Delta", 916},
|
||||
{"ETH", 208},
|
||||
{"Eacute", 201},
|
||||
{"Ecaron", 282},
|
||||
{"Ecirc", 202},
|
||||
{"Egrave", 200},
|
||||
{"Epsilon", 917},
|
||||
{"Eta", 919},
|
||||
{"Euml", 203},
|
||||
{"GT", 62},
|
||||
{"Gamma", 915},
|
||||
{"Iacute", 205},
|
||||
{"Icirc", 206},
|
||||
{"Igrave", 204},
|
||||
{"Iota", 921},
|
||||
{"Iuml", 207},
|
||||
{"Kappa", 922},
|
||||
{"LT", 60},
|
||||
{"Lambda", 923},
|
||||
{"Mu", 924},
|
||||
{"Ntilde", 209},
|
||||
{"Nu", 925},
|
||||
{"OElig", 338},
|
||||
{"Oacute", 211},
|
||||
{"Ocirc", 212},
|
||||
{"Ograve", 210},
|
||||
{"Omega", 937},
|
||||
{"Omicron", 927},
|
||||
{"Oslash", 216},
|
||||
{"Otilde", 213},
|
||||
{"Ouml", 214},
|
||||
{"Phi", 934},
|
||||
{"Pi", 928},
|
||||
{"Prime", 8243},
|
||||
{"Psi", 936},
|
||||
{"QUOT", 34},
|
||||
{"Rcaron", 344},
|
||||
{"Rho", 929},
|
||||
{"Scaron", 352},
|
||||
{"Sigma", 931},
|
||||
{"THORN", 222},
|
||||
{"Tau", 932},
|
||||
{"Theta", 920},
|
||||
{"Uacute", 218},
|
||||
{"Ucirc", 219},
|
||||
{"Ugrave", 217},
|
||||
{"Upsilon", 933},
|
||||
{"Uuml", 220},
|
||||
{"Xi", 926},
|
||||
{"Yacute", 221},
|
||||
{"Yuml", 376},
|
||||
{"Zeta", 918},
|
||||
{"aacute", 225},
|
||||
{"acirc", 226},
|
||||
{"acute", 180},
|
||||
{"aelig", 230},
|
||||
{"agrave", 224},
|
||||
{"alefsym", 8501},
|
||||
{"alpha", 945},
|
||||
{"amp", 38},
|
||||
{"and", 8743},
|
||||
{"ang", 8736},
|
||||
{"apos", 39},
|
||||
{"aring", 229},
|
||||
{"asymp", 8776},
|
||||
{"atilde", 227},
|
||||
{"auml", 228},
|
||||
{"bdquo", 8222},
|
||||
{"beta", 946},
|
||||
{"brvbar", 166},
|
||||
{"bull", 8226},
|
||||
{"cap", 8745},
|
||||
{"ccaron", 269},
|
||||
{"ccedil", 231},
|
||||
{"cedil", 184},
|
||||
{"cent", 162},
|
||||
{"chi", 967},
|
||||
{"circ", 710},
|
||||
{"clubs", 9827},
|
||||
{"cong", 8773},
|
||||
{"copy", 169},
|
||||
{"crarr", 8629},
|
||||
{"cup", 8746},
|
||||
{"curren", 164},
|
||||
{"dArr", 8659},
|
||||
{"dagger", 8224},
|
||||
{"darr", 8595},
|
||||
{"deg", 176},
|
||||
{"delta", 948},
|
||||
{"diams", 9830},
|
||||
{"divide", 247},
|
||||
{"eacute", 233},
|
||||
{"ecaron", 283},
|
||||
{"ecirc", 234},
|
||||
{"egrave", 232},
|
||||
{"emdash", 8212},
|
||||
{"empty", 8709},
|
||||
{"emsp", 8195},
|
||||
{"endash", 8211},
|
||||
{"ensp", 8194},
|
||||
{"epsilon", 949},
|
||||
{"equiv", 8801},
|
||||
{"eta", 951},
|
||||
{"eth", 240},
|
||||
{"euml", 235},
|
||||
{"euro", 8364},
|
||||
{"exist", 8707},
|
||||
{"fnof", 402},
|
||||
{"forall", 8704},
|
||||
{"frac12", 189},
|
||||
{"frac14", 188},
|
||||
{"frac34", 190},
|
||||
{"frasl", 8260},
|
||||
{"gamma", 947},
|
||||
{"ge", 8805},
|
||||
{"gt", 62},
|
||||
{"hArr", 8660},
|
||||
{"harr", 8596},
|
||||
{"hearts", 9829},
|
||||
{"hellip", 8230},
|
||||
{"iacute", 237},
|
||||
{"icirc", 238},
|
||||
{"iexcl", 161},
|
||||
{"igrave", 236},
|
||||
{"image", 8465},
|
||||
{"infin", 8734},
|
||||
{"int", 8747},
|
||||
{"iota", 953},
|
||||
{"iquest", 191},
|
||||
{"isin", 8712},
|
||||
{"iuml", 239},
|
||||
{"kappa", 954},
|
||||
{"lArr", 8656},
|
||||
{"lambda", 955},
|
||||
{"lang", 9001},
|
||||
{"laquo", 171},
|
||||
{"larr", 8592},
|
||||
{"lceil", 8968},
|
||||
{"ldquo", 8220},
|
||||
{"le", 8804},
|
||||
{"lfloor", 8970},
|
||||
{"lowast", 8727},
|
||||
{"loz", 9674},
|
||||
{"lrm", 8206},
|
||||
{"lsaquo", 8249},
|
||||
{"lsquo", 8216},
|
||||
{"lt", 60},
|
||||
{"macr", 175},
|
||||
{"mdash", 8212},
|
||||
{"micro", 181},
|
||||
{"middot", 183},
|
||||
{"minus", 8722},
|
||||
{"mu", 956},
|
||||
{"nabla", 8711},
|
||||
{"nbsp", 160},
|
||||
{"ndash", 8211},
|
||||
{"ne", 8800},
|
||||
{"ni", 8715},
|
||||
{"not", 172},
|
||||
{"notin", 8713},
|
||||
{"nsub", 8836},
|
||||
{"ntilde", 241},
|
||||
{"nu", 957},
|
||||
{"oacute", 243},
|
||||
{"ocirc", 244},
|
||||
{"oelig", 339},
|
||||
{"ograve", 242},
|
||||
{"oline", 8254},
|
||||
{"omega", 969},
|
||||
{"omicron", 959},
|
||||
{"oplus", 8853},
|
||||
{"or", 8744},
|
||||
{"ordf", 170},
|
||||
{"ordm", 186},
|
||||
{"oslash", 248},
|
||||
{"otilde", 245},
|
||||
{"otimes", 8855},
|
||||
{"ouml", 246},
|
||||
{"para", 182},
|
||||
{"part", 8706},
|
||||
{"permil", 8240},
|
||||
{"perp", 8869},
|
||||
{"phi", 966},
|
||||
{"pi", 960},
|
||||
{"piv", 982},
|
||||
{"plusmn", 177},
|
||||
{"pound", 163},
|
||||
{"prime", 8242},
|
||||
{"prod", 8719},
|
||||
{"prop", 8733},
|
||||
{"psi", 968},
|
||||
{"quot", 34},
|
||||
{"rArr", 8658},
|
||||
{"radic", 8730},
|
||||
{"rang", 9002},
|
||||
{"raquo", 187},
|
||||
{"rarr", 8594},
|
||||
{"rcaron", 345},
|
||||
{"rceil", 8969},
|
||||
{"rdquo", 8221},
|
||||
{"real", 8476},
|
||||
{"reg", 174},
|
||||
{"rfloor", 8971},
|
||||
{"rho", 961},
|
||||
{"rlm", 8207},
|
||||
{"rsaquo", 8250},
|
||||
{"rsquo", 8217},
|
||||
{"sbquo", 8218},
|
||||
{"scaron", 353},
|
||||
{"sdot", 8901},
|
||||
{"sect", 167},
|
||||
{"shy", 173},
|
||||
{"sigma", 963},
|
||||
{"sigmaf", 962},
|
||||
{"sim", 8764},
|
||||
{"spades", 9824},
|
||||
{"sub", 8834},
|
||||
{"sube", 8838},
|
||||
{"sum", 8721},
|
||||
{"sup", 8835},
|
||||
{"sup1", 185},
|
||||
{"sup2", 178},
|
||||
{"sup3", 179},
|
||||
{"supe", 8839},
|
||||
{"szlig", 223},
|
||||
{"tau", 964},
|
||||
{"there4", 8756},
|
||||
{"theta", 952},
|
||||
{"thetasym", 977},
|
||||
{"thinsp", 8201},
|
||||
{"thorn", 254},
|
||||
{"tilde", 732},
|
||||
{"times", 215},
|
||||
{"trade", 8482},
|
||||
{"uArr", 8657},
|
||||
{"uacute", 250},
|
||||
{"uarr", 8593},
|
||||
{"ucirc", 251},
|
||||
{"ugrave", 249},
|
||||
{"uml", 168},
|
||||
{"upsih", 978},
|
||||
{"upsilon", 965},
|
||||
{"uuml", 252},
|
||||
{"weierp", 8472},
|
||||
{"xi", 958},
|
||||
{"yacute", 253},
|
||||
{"yen", 165},
|
||||
{"yuml", 255},
|
||||
{"zeta", 950},
|
||||
{"zwj", 8205},
|
||||
{"zwnj", 8204},
|
||||
};
|
||||
|
||||
} // namespace CLD2
|
||||
4679
internal/generated_language.cc
Normal file
4679
internal/generated_language.cc
Normal file
File diff suppressed because it is too large
Load Diff
651
internal/generated_language.h
Normal file
651
internal/generated_language.h
Normal file
@@ -0,0 +1,651 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// generated_language.h
|
||||
// Machine generated. Do Not Edit.
|
||||
//
|
||||
// Declarations for languages recognized by CLD2
|
||||
//
|
||||
|
||||
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_GENERATED_LANGUAGE_H__
|
||||
#define I18N_ENCODINGS_CLD2_INTERNAL_GENERATED_LANGUAGE_H__
|
||||
|
||||
#include "generated_ulscript.h"
|
||||
#include "integral_types.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// FourScripts: array type holding exactly four uint16 values.
// NOTE(review): presumably one slot per script a language can be written
// in -- confirm against the tables that use this type.
typedef uint16 FourScripts[4];
|
||||
|
||||
// Language: numeric identifier for each language recognized by CLD2.
// The trailing comment on every enumerator is that language's short code;
// an empty comment means no code is assigned to that value.
// Layout of the value space, as written out below:
//   0..182    named languages, plus TG_UNKNOWN_LANGUAGE (25) and
//             UNKNOWN_LANGUAGE (26). X_81, X_82, X_92, X_116 and
//             X_124..X_127 carry no code (they look like unused slots).
//   183..505  X_nnn placeholders with no code.
//   506..511  NDEBELE and X_BORK_BORK_BORK .. X_ELMER_FUDD.
//   512..613  X_<Script> entries whose codes have the form xx-<script code>.
// NUM_LANGUAGES has no explicit value, so it equals 614 (X_Takri + 1).
// NOTE(review): every value is spelled out explicitly, presumably because
// other generated tables depend on the numbering -- confirm before
// renumbering or reordering anything here.
typedef enum {
|
||||
ENGLISH = 0, // en
|
||||
DANISH = 1, // da
|
||||
DUTCH = 2, // nl
|
||||
FINNISH = 3, // fi
|
||||
FRENCH = 4, // fr
|
||||
GERMAN = 5, // de
|
||||
HEBREW = 6, // iw
|
||||
ITALIAN = 7, // it
|
||||
JAPANESE = 8, // ja
|
||||
KOREAN = 9, // ko
|
||||
NORWEGIAN = 10, // no
|
||||
POLISH = 11, // pl
|
||||
PORTUGUESE = 12, // pt
|
||||
RUSSIAN = 13, // ru
|
||||
SPANISH = 14, // es
|
||||
SWEDISH = 15, // sv
|
||||
CHINESE = 16, // zh
|
||||
CZECH = 17, // cs
|
||||
GREEK = 18, // el
|
||||
ICELANDIC = 19, // is
|
||||
LATVIAN = 20, // lv
|
||||
LITHUANIAN = 21, // lt
|
||||
ROMANIAN = 22, // ro
|
||||
HUNGARIAN = 23, // hu
|
||||
ESTONIAN = 24, // et
|
||||
TG_UNKNOWN_LANGUAGE = 25, // xxx
|
||||
UNKNOWN_LANGUAGE = 26, // un
|
||||
BULGARIAN = 27, // bg
|
||||
CROATIAN = 28, // hr
|
||||
SERBIAN = 29, // sr
|
||||
IRISH = 30, // ga
|
||||
GALICIAN = 31, // gl
|
||||
TAGALOG = 32, // tl
|
||||
TURKISH = 33, // tr
|
||||
UKRAINIAN = 34, // uk
|
||||
HINDI = 35, // hi
|
||||
MACEDONIAN = 36, // mk
|
||||
BENGALI = 37, // bn
|
||||
INDONESIAN = 38, // id
|
||||
LATIN = 39, // la
|
||||
MALAY = 40, // ms
|
||||
MALAYALAM = 41, // ml
|
||||
WELSH = 42, // cy
|
||||
NEPALI = 43, // ne
|
||||
TELUGU = 44, // te
|
||||
ALBANIAN = 45, // sq
|
||||
TAMIL = 46, // ta
|
||||
BELARUSIAN = 47, // be
|
||||
JAVANESE = 48, // jw
|
||||
OCCITAN = 49, // oc
|
||||
URDU = 50, // ur
|
||||
BIHARI = 51, // bh
|
||||
GUJARATI = 52, // gu
|
||||
THAI = 53, // th
|
||||
ARABIC = 54, // ar
|
||||
CATALAN = 55, // ca
|
||||
ESPERANTO = 56, // eo
|
||||
BASQUE = 57, // eu
|
||||
INTERLINGUA = 58, // ia
|
||||
KANNADA = 59, // kn
|
||||
PUNJABI = 60, // pa
|
||||
SCOTS_GAELIC = 61, // gd
|
||||
SWAHILI = 62, // sw
|
||||
SLOVENIAN = 63, // sl
|
||||
MARATHI = 64, // mr
|
||||
MALTESE = 65, // mt
|
||||
VIETNAMESE = 66, // vi
|
||||
FRISIAN = 67, // fy
|
||||
SLOVAK = 68, // sk
|
||||
CHINESE_T = 69, // zh-Hant
|
||||
FAROESE = 70, // fo
|
||||
SUNDANESE = 71, // su
|
||||
UZBEK = 72, // uz
|
||||
AMHARIC = 73, // am
|
||||
AZERBAIJANI = 74, // az
|
||||
GEORGIAN = 75, // ka
|
||||
TIGRINYA = 76, // ti
|
||||
PERSIAN = 77, // fa
|
||||
BOSNIAN = 78, // bs
|
||||
SINHALESE = 79, // si
|
||||
NORWEGIAN_N = 80, // nn
|
||||
X_81 = 81, //
|
||||
X_82 = 82, //
|
||||
XHOSA = 83, // xh
|
||||
ZULU = 84, // zu
|
||||
GUARANI = 85, // gn
|
||||
SESOTHO = 86, // st
|
||||
TURKMEN = 87, // tk
|
||||
KYRGYZ = 88, // ky
|
||||
BRETON = 89, // br
|
||||
TWI = 90, // tw
|
||||
YIDDISH = 91, // yi
|
||||
X_92 = 92, //
|
||||
SOMALI = 93, // so
|
||||
UIGHUR = 94, // ug
|
||||
KURDISH = 95, // ku
|
||||
MONGOLIAN = 96, // mn
|
||||
ARMENIAN = 97, // hy
|
||||
LAOTHIAN = 98, // lo
|
||||
SINDHI = 99, // sd
|
||||
RHAETO_ROMANCE = 100, // rm
|
||||
AFRIKAANS = 101, // af
|
||||
LUXEMBOURGISH = 102, // lb
|
||||
BURMESE = 103, // my
|
||||
KHMER = 104, // km
|
||||
TIBETAN = 105, // bo
|
||||
DHIVEHI = 106, // dv
|
||||
CHEROKEE = 107, // chr
|
||||
SYRIAC = 108, // syr
|
||||
LIMBU = 109, // lif
|
||||
ORIYA = 110, // or
|
||||
ASSAMESE = 111, // as
|
||||
CORSICAN = 112, // co
|
||||
INTERLINGUE = 113, // ie
|
||||
KAZAKH = 114, // kk
|
||||
LINGALA = 115, // ln
|
||||
X_116 = 116, //
|
||||
PASHTO = 117, // ps
|
||||
QUECHUA = 118, // qu
|
||||
SHONA = 119, // sn
|
||||
TAJIK = 120, // tg
|
||||
TATAR = 121, // tt
|
||||
TONGA = 122, // to
|
||||
YORUBA = 123, // yo
|
||||
X_124 = 124, //
|
||||
X_125 = 125, //
|
||||
X_126 = 126, //
|
||||
X_127 = 127, //
|
||||
MAORI = 128, // mi
|
||||
WOLOF = 129, // wo
|
||||
ABKHAZIAN = 130, // ab
|
||||
AFAR = 131, // aa
|
||||
AYMARA = 132, // ay
|
||||
BASHKIR = 133, // ba
|
||||
BISLAMA = 134, // bi
|
||||
DZONGKHA = 135, // dz
|
||||
FIJIAN = 136, // fj
|
||||
GREENLANDIC = 137, // kl
|
||||
HAUSA = 138, // ha
|
||||
HAITIAN_CREOLE = 139, // ht
|
||||
INUPIAK = 140, // ik
|
||||
INUKTITUT = 141, // iu
|
||||
KASHMIRI = 142, // ks
|
||||
KINYARWANDA = 143, // rw
|
||||
MALAGASY = 144, // mg
|
||||
NAURU = 145, // na
|
||||
OROMO = 146, // om
|
||||
RUNDI = 147, // rn
|
||||
SAMOAN = 148, // sm
|
||||
SANGO = 149, // sg
|
||||
SANSKRIT = 150, // sa
|
||||
SISWANT = 151, // ss
|
||||
TSONGA = 152, // ts
|
||||
TSWANA = 153, // tn
|
||||
VOLAPUK = 154, // vo
|
||||
ZHUANG = 155, // za
|
||||
KHASI = 156, // kha
|
||||
SCOTS = 157, // sco
|
||||
GANDA = 158, // lg
|
||||
MANX = 159, // gv
|
||||
MONTENEGRIN = 160, // sr-ME
|
||||
AKAN = 161, // ak
|
||||
IGBO = 162, // ig
|
||||
MAURITIAN_CREOLE = 163, // mfe
|
||||
HAWAIIAN = 164, // haw
|
||||
CEBUANO = 165, // ceb
|
||||
EWE = 166, // ee
|
||||
GA = 167, // gaa
|
||||
HMONG = 168, // blu
|
||||
KRIO = 169, // kri
|
||||
LOZI = 170, // loz
|
||||
LUBA_LULUA = 171, // lua
|
||||
LUO_KENYA_AND_TANZANIA = 172, // luo
|
||||
NEWARI = 173, // new
|
||||
NYANJA = 174, // ny
|
||||
OSSETIAN = 175, // os
|
||||
PAMPANGA = 176, // pam
|
||||
PEDI = 177, // nso
|
||||
RAJASTHANI = 178, // raj
|
||||
SESELWA = 179, // crs
|
||||
TUMBUKA = 180, // tum
|
||||
VENDA = 181, // ve
|
||||
WARAY_PHILIPPINES = 182, // war
|
||||
// 183..505: placeholder enumerators with no language code.
X_183 = 183, //
|
||||
X_184 = 184, //
|
||||
X_185 = 185, //
|
||||
X_186 = 186, //
|
||||
X_187 = 187, //
|
||||
X_188 = 188, //
|
||||
X_189 = 189, //
|
||||
X_190 = 190, //
|
||||
X_191 = 191, //
|
||||
X_192 = 192, //
|
||||
X_193 = 193, //
|
||||
X_194 = 194, //
|
||||
X_195 = 195, //
|
||||
X_196 = 196, //
|
||||
X_197 = 197, //
|
||||
X_198 = 198, //
|
||||
X_199 = 199, //
|
||||
X_200 = 200, //
|
||||
X_201 = 201, //
|
||||
X_202 = 202, //
|
||||
X_203 = 203, //
|
||||
X_204 = 204, //
|
||||
X_205 = 205, //
|
||||
X_206 = 206, //
|
||||
X_207 = 207, //
|
||||
X_208 = 208, //
|
||||
X_209 = 209, //
|
||||
X_210 = 210, //
|
||||
X_211 = 211, //
|
||||
X_212 = 212, //
|
||||
X_213 = 213, //
|
||||
X_214 = 214, //
|
||||
X_215 = 215, //
|
||||
X_216 = 216, //
|
||||
X_217 = 217, //
|
||||
X_218 = 218, //
|
||||
X_219 = 219, //
|
||||
X_220 = 220, //
|
||||
X_221 = 221, //
|
||||
X_222 = 222, //
|
||||
X_223 = 223, //
|
||||
X_224 = 224, //
|
||||
X_225 = 225, //
|
||||
X_226 = 226, //
|
||||
X_227 = 227, //
|
||||
X_228 = 228, //
|
||||
X_229 = 229, //
|
||||
X_230 = 230, //
|
||||
X_231 = 231, //
|
||||
X_232 = 232, //
|
||||
X_233 = 233, //
|
||||
X_234 = 234, //
|
||||
X_235 = 235, //
|
||||
X_236 = 236, //
|
||||
X_237 = 237, //
|
||||
X_238 = 238, //
|
||||
X_239 = 239, //
|
||||
X_240 = 240, //
|
||||
X_241 = 241, //
|
||||
X_242 = 242, //
|
||||
X_243 = 243, //
|
||||
X_244 = 244, //
|
||||
X_245 = 245, //
|
||||
X_246 = 246, //
|
||||
X_247 = 247, //
|
||||
X_248 = 248, //
|
||||
X_249 = 249, //
|
||||
X_250 = 250, //
|
||||
X_251 = 251, //
|
||||
X_252 = 252, //
|
||||
X_253 = 253, //
|
||||
X_254 = 254, //
|
||||
X_255 = 255, //
|
||||
X_256 = 256, //
|
||||
X_257 = 257, //
|
||||
X_258 = 258, //
|
||||
X_259 = 259, //
|
||||
X_260 = 260, //
|
||||
X_261 = 261, //
|
||||
X_262 = 262, //
|
||||
X_263 = 263, //
|
||||
X_264 = 264, //
|
||||
X_265 = 265, //
|
||||
X_266 = 266, //
|
||||
X_267 = 267, //
|
||||
X_268 = 268, //
|
||||
X_269 = 269, //
|
||||
X_270 = 270, //
|
||||
X_271 = 271, //
|
||||
X_272 = 272, //
|
||||
X_273 = 273, //
|
||||
X_274 = 274, //
|
||||
X_275 = 275, //
|
||||
X_276 = 276, //
|
||||
X_277 = 277, //
|
||||
X_278 = 278, //
|
||||
X_279 = 279, //
|
||||
X_280 = 280, //
|
||||
X_281 = 281, //
|
||||
X_282 = 282, //
|
||||
X_283 = 283, //
|
||||
X_284 = 284, //
|
||||
X_285 = 285, //
|
||||
X_286 = 286, //
|
||||
X_287 = 287, //
|
||||
X_288 = 288, //
|
||||
X_289 = 289, //
|
||||
X_290 = 290, //
|
||||
X_291 = 291, //
|
||||
X_292 = 292, //
|
||||
X_293 = 293, //
|
||||
X_294 = 294, //
|
||||
X_295 = 295, //
|
||||
X_296 = 296, //
|
||||
X_297 = 297, //
|
||||
X_298 = 298, //
|
||||
X_299 = 299, //
|
||||
X_300 = 300, //
|
||||
X_301 = 301, //
|
||||
X_302 = 302, //
|
||||
X_303 = 303, //
|
||||
X_304 = 304, //
|
||||
X_305 = 305, //
|
||||
X_306 = 306, //
|
||||
X_307 = 307, //
|
||||
X_308 = 308, //
|
||||
X_309 = 309, //
|
||||
X_310 = 310, //
|
||||
X_311 = 311, //
|
||||
X_312 = 312, //
|
||||
X_313 = 313, //
|
||||
X_314 = 314, //
|
||||
X_315 = 315, //
|
||||
X_316 = 316, //
|
||||
X_317 = 317, //
|
||||
X_318 = 318, //
|
||||
X_319 = 319, //
|
||||
X_320 = 320, //
|
||||
X_321 = 321, //
|
||||
X_322 = 322, //
|
||||
X_323 = 323, //
|
||||
X_324 = 324, //
|
||||
X_325 = 325, //
|
||||
X_326 = 326, //
|
||||
X_327 = 327, //
|
||||
X_328 = 328, //
|
||||
X_329 = 329, //
|
||||
X_330 = 330, //
|
||||
X_331 = 331, //
|
||||
X_332 = 332, //
|
||||
X_333 = 333, //
|
||||
X_334 = 334, //
|
||||
X_335 = 335, //
|
||||
X_336 = 336, //
|
||||
X_337 = 337, //
|
||||
X_338 = 338, //
|
||||
X_339 = 339, //
|
||||
X_340 = 340, //
|
||||
X_341 = 341, //
|
||||
X_342 = 342, //
|
||||
X_343 = 343, //
|
||||
X_344 = 344, //
|
||||
X_345 = 345, //
|
||||
X_346 = 346, //
|
||||
X_347 = 347, //
|
||||
X_348 = 348, //
|
||||
X_349 = 349, //
|
||||
X_350 = 350, //
|
||||
X_351 = 351, //
|
||||
X_352 = 352, //
|
||||
X_353 = 353, //
|
||||
X_354 = 354, //
|
||||
X_355 = 355, //
|
||||
X_356 = 356, //
|
||||
X_357 = 357, //
|
||||
X_358 = 358, //
|
||||
X_359 = 359, //
|
||||
X_360 = 360, //
|
||||
X_361 = 361, //
|
||||
X_362 = 362, //
|
||||
X_363 = 363, //
|
||||
X_364 = 364, //
|
||||
X_365 = 365, //
|
||||
X_366 = 366, //
|
||||
X_367 = 367, //
|
||||
X_368 = 368, //
|
||||
X_369 = 369, //
|
||||
X_370 = 370, //
|
||||
X_371 = 371, //
|
||||
X_372 = 372, //
|
||||
X_373 = 373, //
|
||||
X_374 = 374, //
|
||||
X_375 = 375, //
|
||||
X_376 = 376, //
|
||||
X_377 = 377, //
|
||||
X_378 = 378, //
|
||||
X_379 = 379, //
|
||||
X_380 = 380, //
|
||||
X_381 = 381, //
|
||||
X_382 = 382, //
|
||||
X_383 = 383, //
|
||||
X_384 = 384, //
|
||||
X_385 = 385, //
|
||||
X_386 = 386, //
|
||||
X_387 = 387, //
|
||||
X_388 = 388, //
|
||||
X_389 = 389, //
|
||||
X_390 = 390, //
|
||||
X_391 = 391, //
|
||||
X_392 = 392, //
|
||||
X_393 = 393, //
|
||||
X_394 = 394, //
|
||||
X_395 = 395, //
|
||||
X_396 = 396, //
|
||||
X_397 = 397, //
|
||||
X_398 = 398, //
|
||||
X_399 = 399, //
|
||||
X_400 = 400, //
|
||||
X_401 = 401, //
|
||||
X_402 = 402, //
|
||||
X_403 = 403, //
|
||||
X_404 = 404, //
|
||||
X_405 = 405, //
|
||||
X_406 = 406, //
|
||||
X_407 = 407, //
|
||||
X_408 = 408, //
|
||||
X_409 = 409, //
|
||||
X_410 = 410, //
|
||||
X_411 = 411, //
|
||||
X_412 = 412, //
|
||||
X_413 = 413, //
|
||||
X_414 = 414, //
|
||||
X_415 = 415, //
|
||||
X_416 = 416, //
|
||||
X_417 = 417, //
|
||||
X_418 = 418, //
|
||||
X_419 = 419, //
|
||||
X_420 = 420, //
|
||||
X_421 = 421, //
|
||||
X_422 = 422, //
|
||||
X_423 = 423, //
|
||||
X_424 = 424, //
|
||||
X_425 = 425, //
|
||||
X_426 = 426, //
|
||||
X_427 = 427, //
|
||||
X_428 = 428, //
|
||||
X_429 = 429, //
|
||||
X_430 = 430, //
|
||||
X_431 = 431, //
|
||||
X_432 = 432, //
|
||||
X_433 = 433, //
|
||||
X_434 = 434, //
|
||||
X_435 = 435, //
|
||||
X_436 = 436, //
|
||||
X_437 = 437, //
|
||||
X_438 = 438, //
|
||||
X_439 = 439, //
|
||||
X_440 = 440, //
|
||||
X_441 = 441, //
|
||||
X_442 = 442, //
|
||||
X_443 = 443, //
|
||||
X_444 = 444, //
|
||||
X_445 = 445, //
|
||||
X_446 = 446, //
|
||||
X_447 = 447, //
|
||||
X_448 = 448, //
|
||||
X_449 = 449, //
|
||||
X_450 = 450, //
|
||||
X_451 = 451, //
|
||||
X_452 = 452, //
|
||||
X_453 = 453, //
|
||||
X_454 = 454, //
|
||||
X_455 = 455, //
|
||||
X_456 = 456, //
|
||||
X_457 = 457, //
|
||||
X_458 = 458, //
|
||||
X_459 = 459, //
|
||||
X_460 = 460, //
|
||||
X_461 = 461, //
|
||||
X_462 = 462, //
|
||||
X_463 = 463, //
|
||||
X_464 = 464, //
|
||||
X_465 = 465, //
|
||||
X_466 = 466, //
|
||||
X_467 = 467, //
|
||||
X_468 = 468, //
|
||||
X_469 = 469, //
|
||||
X_470 = 470, //
|
||||
X_471 = 471, //
|
||||
X_472 = 472, //
|
||||
X_473 = 473, //
|
||||
X_474 = 474, //
|
||||
X_475 = 475, //
|
||||
X_476 = 476, //
|
||||
X_477 = 477, //
|
||||
X_478 = 478, //
|
||||
X_479 = 479, //
|
||||
X_480 = 480, //
|
||||
X_481 = 481, //
|
||||
X_482 = 482, //
|
||||
X_483 = 483, //
|
||||
X_484 = 484, //
|
||||
X_485 = 485, //
|
||||
X_486 = 486, //
|
||||
X_487 = 487, //
|
||||
X_488 = 488, //
|
||||
X_489 = 489, //
|
||||
X_490 = 490, //
|
||||
X_491 = 491, //
|
||||
X_492 = 492, //
|
||||
X_493 = 493, //
|
||||
X_494 = 494, //
|
||||
X_495 = 495, //
|
||||
X_496 = 496, //
|
||||
X_497 = 497, //
|
||||
X_498 = 498, //
|
||||
X_499 = 499, //
|
||||
X_500 = 500, //
|
||||
X_501 = 501, //
|
||||
X_502 = 502, //
|
||||
X_503 = 503, //
|
||||
X_504 = 504, //
|
||||
X_505 = 505, //
|
||||
NDEBELE = 506, // nr
|
||||
X_BORK_BORK_BORK = 507, // zzb
|
||||
X_PIG_LATIN = 508, // zzp
|
||||
X_HACKER = 509, // zzh
|
||||
X_KLINGON = 510, // tlh
|
||||
X_ELMER_FUDD = 511, // zze
|
||||
// 512..613: X_<Script> entries, coded xx-<script code>.
X_Common = 512, // xx-Zyyy
|
||||
X_Latin = 513, // xx-Latn
|
||||
X_Greek = 514, // xx-Grek
|
||||
X_Cyrillic = 515, // xx-Cyrl
|
||||
X_Armenian = 516, // xx-Armn
|
||||
X_Hebrew = 517, // xx-Hebr
|
||||
X_Arabic = 518, // xx-Arab
|
||||
X_Syriac = 519, // xx-Syrc
|
||||
X_Thaana = 520, // xx-Thaa
|
||||
X_Devanagari = 521, // xx-Deva
|
||||
X_Bengali = 522, // xx-Beng
|
||||
X_Gurmukhi = 523, // xx-Guru
|
||||
X_Gujarati = 524, // xx-Gujr
|
||||
X_Oriya = 525, // xx-Orya
|
||||
X_Tamil = 526, // xx-Taml
|
||||
X_Telugu = 527, // xx-Telu
|
||||
X_Kannada = 528, // xx-Knda
|
||||
X_Malayalam = 529, // xx-Mlym
|
||||
X_Sinhala = 530, // xx-Sinh
|
||||
X_Thai = 531, // xx-Thai
|
||||
X_Lao = 532, // xx-Laoo
|
||||
X_Tibetan = 533, // xx-Tibt
|
||||
X_Myanmar = 534, // xx-Mymr
|
||||
X_Georgian = 535, // xx-Geor
|
||||
X_Hangul = 536, // xx-Hang
|
||||
X_Ethiopic = 537, // xx-Ethi
|
||||
X_Cherokee = 538, // xx-Cher
|
||||
X_Canadian_Aboriginal = 539, // xx-Cans
|
||||
X_Ogham = 540, // xx-Ogam
|
||||
X_Runic = 541, // xx-Runr
|
||||
X_Khmer = 542, // xx-Khmr
|
||||
X_Mongolian = 543, // xx-Mong
|
||||
X_Hiragana = 544, // xx-Hira
|
||||
X_Katakana = 545, // xx-Kana
|
||||
X_Bopomofo = 546, // xx-Bopo
|
||||
X_Han = 547, // xx-Hani
|
||||
X_Yi = 548, // xx-Yiii
|
||||
X_Old_Italic = 549, // xx-Ital
|
||||
X_Gothic = 550, // xx-Goth
|
||||
X_Deseret = 551, // xx-Dsrt
|
||||
X_Inherited = 552, // xx-Qaai
|
||||
X_Tagalog = 553, // xx-Tglg
|
||||
X_Hanunoo = 554, // xx-Hano
|
||||
X_Buhid = 555, // xx-Buhd
|
||||
X_Tagbanwa = 556, // xx-Tagb
|
||||
X_Limbu = 557, // xx-Limb
|
||||
X_Tai_Le = 558, // xx-Tale
|
||||
X_Linear_B = 559, // xx-Linb
|
||||
X_Ugaritic = 560, // xx-Ugar
|
||||
X_Shavian = 561, // xx-Shaw
|
||||
X_Osmanya = 562, // xx-Osma
|
||||
X_Cypriot = 563, // xx-Cprt
|
||||
X_Braille = 564, // xx-Brai
|
||||
X_Buginese = 565, // xx-Bugi
|
||||
X_Coptic = 566, // xx-Copt
|
||||
X_New_Tai_Lue = 567, // xx-Talu
|
||||
X_Glagolitic = 568, // xx-Glag
|
||||
X_Tifinagh = 569, // xx-Tfng
|
||||
X_Syloti_Nagri = 570, // xx-Sylo
|
||||
X_Old_Persian = 571, // xx-Xpeo
|
||||
X_Kharoshthi = 572, // xx-Khar
|
||||
X_Balinese = 573, // xx-Bali
|
||||
X_Cuneiform = 574, // xx-Xsux
|
||||
X_Phoenician = 575, // xx-Phnx
|
||||
X_Phags_Pa = 576, // xx-Phag
|
||||
X_Nko = 577, // xx-Nkoo
|
||||
X_Sundanese = 578, // xx-Sund
|
||||
X_Lepcha = 579, // xx-Lepc
|
||||
X_Ol_Chiki = 580, // xx-Olck
|
||||
X_Vai = 581, // xx-Vaii
|
||||
X_Saurashtra = 582, // xx-Saur
|
||||
X_Kayah_Li = 583, // xx-Kali
|
||||
X_Rejang = 584, // xx-Rjng
|
||||
X_Lycian = 585, // xx-Lyci
|
||||
X_Carian = 586, // xx-Cari
|
||||
X_Lydian = 587, // xx-Lydi
|
||||
X_Cham = 588, // xx-Cham
|
||||
X_Tai_Tham = 589, // xx-Lana
|
||||
X_Tai_Viet = 590, // xx-Tavt
|
||||
X_Avestan = 591, // xx-Avst
|
||||
X_Egyptian_Hieroglyphs = 592, // xx-Egyp
|
||||
X_Samaritan = 593, // xx-Samr
|
||||
X_Lisu = 594, // xx-Lisu
|
||||
X_Bamum = 595, // xx-Bamu
|
||||
X_Javanese = 596, // xx-Java
|
||||
X_Meetei_Mayek = 597, // xx-Mtei
|
||||
X_Imperial_Aramaic = 598, // xx-Armi
|
||||
X_Old_South_Arabian = 599, // xx-Sarb
|
||||
X_Inscriptional_Parthian = 600, // xx-Prti
|
||||
X_Inscriptional_Pahlavi = 601, // xx-Phli
|
||||
X_Old_Turkic = 602, // xx-Orkh
|
||||
X_Kaithi = 603, // xx-Kthi
|
||||
X_Batak = 604, // xx-Batk
|
||||
X_Brahmi = 605, // xx-Brah
|
||||
X_Mandaic = 606, // xx-Mand
|
||||
X_Chakma = 607, // xx-Cakm
|
||||
X_Meroitic_Cursive = 608, // xx-Merc
|
||||
X_Meroitic_Hieroglyphs = 609, // xx-Mero
|
||||
X_Miao = 610, // xx-Plrd
|
||||
X_Sharada = 611, // xx-Shrd
|
||||
X_Sora_Sompeng = 612, // xx-Sora
|
||||
X_Takri = 613, // xx-Takr
|
||||
// Implicitly 614: the number of enumerators above.
NUM_LANGUAGES
|
||||
} Language;
|
||||
|
||||
} // namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_CLD2_INTERNAL_GENERATED_LANGUAGE_H__
|
||||
781
internal/generated_ulscript.cc
Normal file
781
internal/generated_ulscript.cc
Normal file
@@ -0,0 +1,781 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// generated_ulscript.cc
|
||||
// Machine generated. Do Not Edit.
|
||||
//
|
||||
// Definitions of the lookup tables for scripts recognized by CLD2
|
||||
//
|
||||
|
||||
#include "generated_ulscript.h"
|
||||
#include "generated_language.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// kULScriptToName: display name of each script (e.g. "Latin" at index 1).
// Subscripted by enum ULScript. The trailing comment on each row gives the
// row index and the script's four-letter code.
// Rows 32, 33 and 35 are empty strings: no script name is assigned there.
// kULScriptToNameSize (102) is the declared length of the array; the
// initializer below supplies exactly rows 0..101.
// NOTE(review): row 24 uses "Hani" (its code) as the name, while
// generated_language.h names the matching pseudo-language X_Han -- confirm
// this is intended by the generator.
|
||||
extern const int kULScriptToNameSize = 102;
|
||||
extern const char* const kULScriptToName[kULScriptToNameSize] = {
|
||||
"Common", // 0 Zyyy
|
||||
"Latin", // 1 Latn
|
||||
"Greek", // 2 Grek
|
||||
"Cyrillic", // 3 Cyrl
|
||||
"Armenian", // 4 Armn
|
||||
"Hebrew", // 5 Hebr
|
||||
"Arabic", // 6 Arab
|
||||
"Syriac", // 7 Syrc
|
||||
"Thaana", // 8 Thaa
|
||||
"Devanagari", // 9 Deva
|
||||
"Bengali", // 10 Beng
|
||||
"Gurmukhi", // 11 Guru
|
||||
"Gujarati", // 12 Gujr
|
||||
"Oriya", // 13 Orya
|
||||
"Tamil", // 14 Taml
|
||||
"Telugu", // 15 Telu
|
||||
"Kannada", // 16 Knda
|
||||
"Malayalam", // 17 Mlym
|
||||
"Sinhala", // 18 Sinh
|
||||
"Thai", // 19 Thai
|
||||
"Lao", // 20 Laoo
|
||||
"Tibetan", // 21 Tibt
|
||||
"Myanmar", // 22 Mymr
|
||||
"Georgian", // 23 Geor
|
||||
"Hani", // 24 Hani
|
||||
"Ethiopic", // 25 Ethi
|
||||
"Cherokee", // 26 Cher
|
||||
"Canadian_Aboriginal", // 27 Cans
|
||||
"Ogham", // 28 Ogam
|
||||
"Runic", // 29 Runr
|
||||
"Khmer", // 30 Khmr
|
||||
"Mongolian", // 31 Mong
|
||||
"", // 32
|
||||
"", // 33
|
||||
"Bopomofo", // 34 Bopo
|
||||
"", // 35
|
||||
"Yi", // 36 Yiii
|
||||
"Old_Italic", // 37 Ital
|
||||
"Gothic", // 38 Goth
|
||||
"Deseret", // 39 Dsrt
|
||||
"Inherited", // 40 Zinh
|
||||
"Tagalog", // 41 Tglg
|
||||
"Hanunoo", // 42 Hano
|
||||
"Buhid", // 43 Buhd
|
||||
"Tagbanwa", // 44 Tagb
|
||||
"Limbu", // 45 Limb
|
||||
"Tai_Le", // 46 Tale
|
||||
"Linear_B", // 47 Linb
|
||||
"Ugaritic", // 48 Ugar
|
||||
"Shavian", // 49 Shaw
|
||||
"Osmanya", // 50 Osma
|
||||
"Cypriot", // 51 Cprt
|
||||
"Braille", // 52 Brai
|
||||
"Buginese", // 53 Bugi
|
||||
"Coptic", // 54 Copt
|
||||
"New_Tai_Lue", // 55 Talu
|
||||
"Glagolitic", // 56 Glag
|
||||
"Tifinagh", // 57 Tfng
|
||||
"Syloti_Nagri", // 58 Sylo
|
||||
"Old_Persian", // 59 Xpeo
|
||||
"Kharoshthi", // 60 Khar
|
||||
"Balinese", // 61 Bali
|
||||
"Cuneiform", // 62 Xsux
|
||||
"Phoenician", // 63 Phnx
|
||||
"Phags_Pa", // 64 Phag
|
||||
"Nko", // 65 Nkoo
|
||||
"Sundanese", // 66 Sund
|
||||
"Lepcha", // 67 Lepc
|
||||
"Ol_Chiki", // 68 Olck
|
||||
"Vai", // 69 Vaii
|
||||
"Saurashtra", // 70 Saur
|
||||
"Kayah_Li", // 71 Kali
|
||||
"Rejang", // 72 Rjng
|
||||
"Lycian", // 73 Lyci
|
||||
"Carian", // 74 Cari
|
||||
"Lydian", // 75 Lydi
|
||||
"Cham", // 76 Cham
|
||||
"Tai_Tham", // 77 Lana
|
||||
"Tai_Viet", // 78 Tavt
|
||||
"Avestan", // 79 Avst
|
||||
"Egyptian_Hieroglyphs", // 80 Egyp
|
||||
"Samaritan", // 81 Samr
|
||||
"Lisu", // 82 Lisu
|
||||
"Bamum", // 83 Bamu
|
||||
"Javanese", // 84 Java
|
||||
"Meetei_Mayek", // 85 Mtei
|
||||
"Imperial_Aramaic", // 86 Armi
|
||||
"Old_South_Arabian", // 87 Sarb
|
||||
"Inscriptional_Parthian", // 88 Prti
|
||||
"Inscriptional_Pahlavi", // 89 Phli
|
||||
"Old_Turkic", // 90 Orkh
|
||||
"Kaithi", // 91 Kthi
|
||||
"Batak", // 92 Batk
|
||||
"Brahmi", // 93 Brah
|
||||
"Mandaic", // 94 Mand
|
||||
"Chakma", // 95 Cakm
|
||||
"Meroitic_Cursive", // 96 Merc
|
||||
"Meroitic_Hieroglyphs", // 97 Mero
|
||||
"Miao", // 98 Plrd
|
||||
"Sharada", // 99 Shrd
|
||||
"Sora_Sompeng", // 100 Sora
|
||||
"Takri", // 101 Takr
|
||||
};
|
||||
|
||||
// kULScriptToCode: four-letter code of each script (e.g. "Latn" at index 1).
// Subscripted by enum ULScript. The trailing comment on each row gives the
// row index and the script's display name.
// Rows 32, 33 and 35 are empty strings: no script code is assigned there.
// kULScriptToCodeSize (102) is the declared length of the array; the
// initializer below supplies exactly rows 0..101.
// NOTE(review): the codes look like ISO 15924 script codes -- confirm.
|
||||
extern const int kULScriptToCodeSize = 102;
|
||||
extern const char* const kULScriptToCode[kULScriptToCodeSize] = {
|
||||
"Zyyy", // 0 Common
|
||||
"Latn", // 1 Latin
|
||||
"Grek", // 2 Greek
|
||||
"Cyrl", // 3 Cyrillic
|
||||
"Armn", // 4 Armenian
|
||||
"Hebr", // 5 Hebrew
|
||||
"Arab", // 6 Arabic
|
||||
"Syrc", // 7 Syriac
|
||||
"Thaa", // 8 Thaana
|
||||
"Deva", // 9 Devanagari
|
||||
"Beng", // 10 Bengali
|
||||
"Guru", // 11 Gurmukhi
|
||||
"Gujr", // 12 Gujarati
|
||||
"Orya", // 13 Oriya
|
||||
"Taml", // 14 Tamil
|
||||
"Telu", // 15 Telugu
|
||||
"Knda", // 16 Kannada
|
||||
"Mlym", // 17 Malayalam
|
||||
"Sinh", // 18 Sinhala
|
||||
"Thai", // 19 Thai
|
||||
"Laoo", // 20 Lao
|
||||
"Tibt", // 21 Tibetan
|
||||
"Mymr", // 22 Myanmar
|
||||
"Geor", // 23 Georgian
|
||||
"Hani", // 24 Hani
|
||||
"Ethi", // 25 Ethiopic
|
||||
"Cher", // 26 Cherokee
|
||||
"Cans", // 27 Canadian_Aboriginal
|
||||
"Ogam", // 28 Ogham
|
||||
"Runr", // 29 Runic
|
||||
"Khmr", // 30 Khmer
|
||||
"Mong", // 31 Mongolian
|
||||
"", // 32
|
||||
"", // 33
|
||||
"Bopo", // 34 Bopomofo
|
||||
"", // 35
|
||||
"Yiii", // 36 Yi
|
||||
"Ital", // 37 Old_Italic
|
||||
"Goth", // 38 Gothic
|
||||
"Dsrt", // 39 Deseret
|
||||
"Zinh", // 40 Inherited
|
||||
"Tglg", // 41 Tagalog
|
||||
"Hano", // 42 Hanunoo
|
||||
"Buhd", // 43 Buhid
|
||||
"Tagb", // 44 Tagbanwa
|
||||
"Limb", // 45 Limbu
|
||||
"Tale", // 46 Tai_Le
|
||||
"Linb", // 47 Linear_B
|
||||
"Ugar", // 48 Ugaritic
|
||||
"Shaw", // 49 Shavian
|
||||
"Osma", // 50 Osmanya
|
||||
"Cprt", // 51 Cypriot
|
||||
"Brai", // 52 Braille
|
||||
"Bugi", // 53 Buginese
|
||||
"Copt", // 54 Coptic
|
||||
"Talu", // 55 New_Tai_Lue
|
||||
"Glag", // 56 Glagolitic
|
||||
"Tfng", // 57 Tifinagh
|
||||
"Sylo", // 58 Syloti_Nagri
|
||||
"Xpeo", // 59 Old_Persian
|
||||
"Khar", // 60 Kharoshthi
|
||||
"Bali", // 61 Balinese
|
||||
"Xsux", // 62 Cuneiform
|
||||
"Phnx", // 63 Phoenician
|
||||
"Phag", // 64 Phags_Pa
|
||||
"Nkoo", // 65 Nko
|
||||
"Sund", // 66 Sundanese
|
||||
"Lepc", // 67 Lepcha
|
||||
"Olck", // 68 Ol_Chiki
|
||||
"Vaii", // 69 Vai
|
||||
"Saur", // 70 Saurashtra
|
||||
"Kali", // 71 Kayah_Li
|
||||
"Rjng", // 72 Rejang
|
||||
"Lyci", // 73 Lycian
|
||||
"Cari", // 74 Carian
|
||||
"Lydi", // 75 Lydian
|
||||
"Cham", // 76 Cham
|
||||
"Lana", // 77 Tai_Tham
|
||||
"Tavt", // 78 Tai_Viet
|
||||
"Avst", // 79 Avestan
|
||||
"Egyp", // 80 Egyptian_Hieroglyphs
|
||||
"Samr", // 81 Samaritan
|
||||
"Lisu", // 82 Lisu
|
||||
"Bamu", // 83 Bamum
|
||||
"Java", // 84 Javanese
|
||||
"Mtei", // 85 Meetei_Mayek
|
||||
"Armi", // 86 Imperial_Aramaic
|
||||
"Sarb", // 87 Old_South_Arabian
|
||||
"Prti", // 88 Inscriptional_Parthian
|
||||
"Phli", // 89 Inscriptional_Pahlavi
|
||||
"Orkh", // 90 Old_Turkic
|
||||
"Kthi", // 91 Kaithi
|
||||
"Batk", // 92 Batak
|
||||
"Brah", // 93 Brahmi
|
||||
"Mand", // 94 Mandaic
|
||||
"Cakm", // 95 Chakma
|
||||
"Merc", // 96 Meroitic_Cursive
|
||||
"Mero", // 97 Meroitic_Hieroglyphs
|
||||
"Plrd", // 98 Miao
|
||||
"Shrd", // 99 Sharada
|
||||
"Sora", // 100 Sora_Sompeng
|
||||
"Takr", // 101 Takri
|
||||
};
|
||||
|
||||
// kULScriptToCName: identifier-style name of each script, spelled
// "ULScript_<Name>" (e.g. "ULScript_Latin" at index 1).
// Subscripted by enum ULScript. The trailing comment on each row gives the
// row index and the script's four-letter code.
// Rows 32, 33 and 35 have no script name and use the row number instead
// ("ULScript_32", "ULScript_33", "ULScript_35").
// kULScriptToCNameSize (102) is the declared length of the array; the
// initializer below supplies exactly rows 0..101.
// NOTE(review): these strings presumably mirror the enumerator names of
// enum ULScript in generated_ulscript.h -- confirm.
|
||||
extern const int kULScriptToCNameSize = 102;
|
||||
extern const char* const kULScriptToCName[kULScriptToCNameSize] = {
|
||||
"ULScript_Common", // 0 Zyyy
|
||||
"ULScript_Latin", // 1 Latn
|
||||
"ULScript_Greek", // 2 Grek
|
||||
"ULScript_Cyrillic", // 3 Cyrl
|
||||
"ULScript_Armenian", // 4 Armn
|
||||
"ULScript_Hebrew", // 5 Hebr
|
||||
"ULScript_Arabic", // 6 Arab
|
||||
"ULScript_Syriac", // 7 Syrc
|
||||
"ULScript_Thaana", // 8 Thaa
|
||||
"ULScript_Devanagari", // 9 Deva
|
||||
"ULScript_Bengali", // 10 Beng
|
||||
"ULScript_Gurmukhi", // 11 Guru
|
||||
"ULScript_Gujarati", // 12 Gujr
|
||||
"ULScript_Oriya", // 13 Orya
|
||||
"ULScript_Tamil", // 14 Taml
|
||||
"ULScript_Telugu", // 15 Telu
|
||||
"ULScript_Kannada", // 16 Knda
|
||||
"ULScript_Malayalam", // 17 Mlym
|
||||
"ULScript_Sinhala", // 18 Sinh
|
||||
"ULScript_Thai", // 19 Thai
|
||||
"ULScript_Lao", // 20 Laoo
|
||||
"ULScript_Tibetan", // 21 Tibt
|
||||
"ULScript_Myanmar", // 22 Mymr
|
||||
"ULScript_Georgian", // 23 Geor
|
||||
"ULScript_Hani", // 24 Hani
|
||||
"ULScript_Ethiopic", // 25 Ethi
|
||||
"ULScript_Cherokee", // 26 Cher
|
||||
"ULScript_Canadian_Aboriginal", // 27 Cans
|
||||
"ULScript_Ogham", // 28 Ogam
|
||||
"ULScript_Runic", // 29 Runr
|
||||
"ULScript_Khmer", // 30 Khmr
|
||||
"ULScript_Mongolian", // 31 Mong
|
||||
"ULScript_32", // 32
|
||||
"ULScript_33", // 33
|
||||
"ULScript_Bopomofo", // 34 Bopo
|
||||
"ULScript_35", // 35
|
||||
"ULScript_Yi", // 36 Yiii
|
||||
"ULScript_Old_Italic", // 37 Ital
|
||||
"ULScript_Gothic", // 38 Goth
|
||||
"ULScript_Deseret", // 39 Dsrt
|
||||
"ULScript_Inherited", // 40 Zinh
|
||||
"ULScript_Tagalog", // 41 Tglg
|
||||
"ULScript_Hanunoo", // 42 Hano
|
||||
"ULScript_Buhid", // 43 Buhd
|
||||
"ULScript_Tagbanwa", // 44 Tagb
|
||||
"ULScript_Limbu", // 45 Limb
|
||||
"ULScript_Tai_Le", // 46 Tale
|
||||
"ULScript_Linear_B", // 47 Linb
|
||||
"ULScript_Ugaritic", // 48 Ugar
|
||||
"ULScript_Shavian", // 49 Shaw
|
||||
"ULScript_Osmanya", // 50 Osma
|
||||
"ULScript_Cypriot", // 51 Cprt
|
||||
"ULScript_Braille", // 52 Brai
|
||||
"ULScript_Buginese", // 53 Bugi
|
||||
"ULScript_Coptic", // 54 Copt
|
||||
"ULScript_New_Tai_Lue", // 55 Talu
|
||||
"ULScript_Glagolitic", // 56 Glag
|
||||
"ULScript_Tifinagh", // 57 Tfng
|
||||
"ULScript_Syloti_Nagri", // 58 Sylo
|
||||
"ULScript_Old_Persian", // 59 Xpeo
|
||||
"ULScript_Kharoshthi", // 60 Khar
|
||||
"ULScript_Balinese", // 61 Bali
|
||||
"ULScript_Cuneiform", // 62 Xsux
|
||||
"ULScript_Phoenician", // 63 Phnx
|
||||
"ULScript_Phags_Pa", // 64 Phag
|
||||
"ULScript_Nko", // 65 Nkoo
|
||||
"ULScript_Sundanese", // 66 Sund
|
||||
"ULScript_Lepcha", // 67 Lepc
|
||||
"ULScript_Ol_Chiki", // 68 Olck
|
||||
"ULScript_Vai", // 69 Vaii
|
||||
"ULScript_Saurashtra", // 70 Saur
|
||||
"ULScript_Kayah_Li", // 71 Kali
|
||||
"ULScript_Rejang", // 72 Rjng
|
||||
"ULScript_Lycian", // 73 Lyci
|
||||
"ULScript_Carian", // 74 Cari
|
||||
"ULScript_Lydian", // 75 Lydi
|
||||
"ULScript_Cham", // 76 Cham
|
||||
"ULScript_Tai_Tham", // 77 Lana
|
||||
"ULScript_Tai_Viet", // 78 Tavt
|
||||
"ULScript_Avestan", // 79 Avst
|
||||
"ULScript_Egyptian_Hieroglyphs", // 80 Egyp
|
||||
"ULScript_Samaritan", // 81 Samr
|
||||
"ULScript_Lisu", // 82 Lisu
|
||||
"ULScript_Bamum", // 83 Bamu
|
||||
"ULScript_Javanese", // 84 Java
|
||||
"ULScript_Meetei_Mayek", // 85 Mtei
|
||||
"ULScript_Imperial_Aramaic", // 86 Armi
|
||||
"ULScript_Old_South_Arabian", // 87 Sarb
|
||||
"ULScript_Inscriptional_Parthian", // 88 Prti
|
||||
"ULScript_Inscriptional_Pahlavi", // 89 Phli
|
||||
"ULScript_Old_Turkic", // 90 Orkh
|
||||
"ULScript_Kaithi", // 91 Kthi
|
||||
"ULScript_Batak", // 92 Batk
|
||||
"ULScript_Brahmi", // 93 Brah
|
||||
"ULScript_Mandaic", // 94 Mand
|
||||
"ULScript_Chakma", // 95 Cakm
|
||||
"ULScript_Meroitic_Cursive", // 96 Merc
|
||||
"ULScript_Meroitic_Hieroglyphs", // 97 Mero
|
||||
"ULScript_Miao", // 98 Plrd
|
||||
"ULScript_Sharada", // 99 Shrd
|
||||
"ULScript_Sora_Sompeng", // 100 Sora
|
||||
"ULScript_Takri", // 101 Takr
|
||||
};
|
||||
|
||||
// kULScriptToRtype: the ULScriptRType assigned to each script.
// Subscripted by enum ULScript. The trailing comment on each row gives the
// row index and the script's four-letter code (blank for rows 32, 33, 35).
// Four values appear in this table: RTypeNone, RTypeOne, RTypeMany and
// RTypeCJK (the latter only at row 24, Hani).
// kULScriptToRtypeSize (102) is the declared length of the array; the
// initializer below supplies exactly rows 0..101.
// NOTE(review): the names suggest None = no language recognized for the
// script, One = the script implies a single language, Many = several
// languages share the script, CJK = special handling for Han text --
// confirm against the declaration of ULScriptRType.
|
||||
extern const int kULScriptToRtypeSize = 102;
|
||||
extern const ULScriptRType kULScriptToRtype[kULScriptToRtypeSize] = {
|
||||
RTypeNone, // 0 Zyyy
|
||||
RTypeMany, // 1 Latn
|
||||
RTypeOne, // 2 Grek
|
||||
RTypeMany, // 3 Cyrl
|
||||
RTypeOne, // 4 Armn
|
||||
RTypeMany, // 5 Hebr
|
||||
RTypeMany, // 6 Arab
|
||||
RTypeOne, // 7 Syrc
|
||||
RTypeOne, // 8 Thaa
|
||||
RTypeMany, // 9 Deva
|
||||
RTypeMany, // 10 Beng
|
||||
RTypeOne, // 11 Guru
|
||||
RTypeOne, // 12 Gujr
|
||||
RTypeOne, // 13 Orya
|
||||
RTypeOne, // 14 Taml
|
||||
RTypeOne, // 15 Telu
|
||||
RTypeOne, // 16 Knda
|
||||
RTypeOne, // 17 Mlym
|
||||
RTypeOne, // 18 Sinh
|
||||
RTypeOne, // 19 Thai
|
||||
RTypeOne, // 20 Laoo
|
||||
RTypeMany, // 21 Tibt
|
||||
RTypeOne, // 22 Mymr
|
||||
RTypeOne, // 23 Geor
|
||||
RTypeCJK, // 24 Hani
|
||||
RTypeMany, // 25 Ethi
|
||||
RTypeOne, // 26 Cher
|
||||
RTypeOne, // 27 Cans
|
||||
RTypeNone, // 28 Ogam
|
||||
RTypeNone, // 29 Runr
|
||||
RTypeOne, // 30 Khmr
|
||||
RTypeOne, // 31 Mong
|
||||
RTypeNone, // 32
|
||||
RTypeNone, // 33
|
||||
RTypeNone, // 34 Bopo
|
||||
RTypeNone, // 35
|
||||
RTypeNone, // 36 Yiii
|
||||
RTypeNone, // 37 Ital
|
||||
RTypeNone, // 38 Goth
|
||||
RTypeNone, // 39 Dsrt
|
||||
RTypeNone, // 40 Zinh
|
||||
RTypeOne, // 41 Tglg
|
||||
RTypeNone, // 42 Hano
|
||||
RTypeNone, // 43 Buhd
|
||||
RTypeNone, // 44 Tagb
|
||||
RTypeOne, // 45 Limb
|
||||
RTypeNone, // 46 Tale
|
||||
RTypeNone, // 47 Linb
|
||||
RTypeNone, // 48 Ugar
|
||||
RTypeNone, // 49 Shaw
|
||||
RTypeNone, // 50 Osma
|
||||
RTypeNone, // 51 Cprt
|
||||
RTypeNone, // 52 Brai
|
||||
RTypeNone, // 53 Bugi
|
||||
RTypeNone, // 54 Copt
|
||||
RTypeNone, // 55 Talu
|
||||
RTypeNone, // 56 Glag
|
||||
RTypeNone, // 57 Tfng
|
||||
RTypeNone, // 58 Sylo
|
||||
RTypeNone, // 59 Xpeo
|
||||
RTypeNone, // 60 Khar
|
||||
RTypeNone, // 61 Bali
|
||||
RTypeNone, // 62 Xsux
|
||||
RTypeNone, // 63 Phnx
|
||||
RTypeNone, // 64 Phag
|
||||
RTypeNone, // 65 Nkoo
|
||||
RTypeNone, // 66 Sund
|
||||
RTypeNone, // 67 Lepc
|
||||
RTypeNone, // 68 Olck
|
||||
RTypeNone, // 69 Vaii
|
||||
RTypeNone, // 70 Saur
|
||||
RTypeNone, // 71 Kali
|
||||
RTypeNone, // 72 Rjng
|
||||
RTypeNone, // 73 Lyci
|
||||
RTypeNone, // 74 Cari
|
||||
RTypeNone, // 75 Lydi
|
||||
RTypeNone, // 76 Cham
|
||||
RTypeNone, // 77 Lana
|
||||
RTypeNone, // 78 Tavt
|
||||
RTypeNone, // 79 Avst
|
||||
RTypeNone, // 80 Egyp
|
||||
RTypeNone, // 81 Samr
|
||||
RTypeNone, // 82 Lisu
|
||||
RTypeNone, // 83 Bamu
|
||||
RTypeNone, // 84 Java
|
||||
RTypeNone, // 85 Mtei
|
||||
RTypeNone, // 86 Armi
|
||||
RTypeNone, // 87 Sarb
|
||||
RTypeNone, // 88 Prti
|
||||
RTypeNone, // 89 Phli
|
||||
RTypeNone, // 90 Orkh
|
||||
RTypeNone, // 91 Kthi
|
||||
RTypeNone, // 92 Batk
|
||||
RTypeNone, // 93 Brah
|
||||
RTypeNone, // 94 Mand
|
||||
RTypeNone, // 95 Cakm
|
||||
RTypeNone, // 96 Merc
|
||||
RTypeNone, // 97 Mero
|
||||
RTypeNone, // 98 Plrd
|
||||
RTypeNone, // 99 Shrd
|
||||
RTypeNone, // 100 Sora
|
||||
RTypeNone, // 101 Takr
|
||||
};
|
||||
|
||||
// Subscripted by enum ULScript
|
||||
extern const int kULScriptToDefaultLangSize = 102;
|
||||
extern const Language kULScriptToDefaultLang[kULScriptToDefaultLangSize] = {
|
||||
X_Common, // 0 Zyyy RTypeNone
|
||||
ENGLISH, // 1 Latn RTypeMany
|
||||
GREEK, // 2 Grek RTypeOne
|
||||
RUSSIAN, // 3 Cyrl RTypeMany
|
||||
ARMENIAN, // 4 Armn RTypeOne
|
||||
HEBREW, // 5 Hebr RTypeMany
|
||||
ARABIC, // 6 Arab RTypeMany
|
||||
SYRIAC, // 7 Syrc RTypeOne
|
||||
DHIVEHI, // 8 Thaa RTypeOne
|
||||
HINDI, // 9 Deva RTypeMany
|
||||
BENGALI, // 10 Beng RTypeMany
|
||||
PUNJABI, // 11 Guru RTypeOne
|
||||
GUJARATI, // 12 Gujr RTypeOne
|
||||
ORIYA, // 13 Orya RTypeOne
|
||||
TAMIL, // 14 Taml RTypeOne
|
||||
TELUGU, // 15 Telu RTypeOne
|
||||
KANNADA, // 16 Knda RTypeOne
|
||||
MALAYALAM, // 17 Mlym RTypeOne
|
||||
SINHALESE, // 18 Sinh RTypeOne
|
||||
THAI, // 19 Thai RTypeOne
|
||||
LAOTHIAN, // 20 Laoo RTypeOne
|
||||
TIBETAN, // 21 Tibt RTypeMany
|
||||
BURMESE, // 22 Mymr RTypeOne
|
||||
GEORGIAN, // 23 Geor RTypeOne
|
||||
JAPANESE, // 24 Hani RTypeCJK
|
||||
AMHARIC, // 25 Ethi RTypeMany
|
||||
CHEROKEE, // 26 Cher RTypeOne
|
||||
INUKTITUT, // 27 Cans RTypeOne
|
||||
X_Ogham, // 28 Ogam RTypeNone
|
||||
X_Runic, // 29 Runr RTypeNone
|
||||
KHMER, // 30 Khmr RTypeOne
|
||||
MONGOLIAN, // 31 Mong RTypeOne
|
||||
UNKNOWN_LANGUAGE, // 32 RTypeNone
|
||||
UNKNOWN_LANGUAGE, // 33 RTypeNone
|
||||
X_Bopomofo, // 34 Bopo RTypeNone
|
||||
UNKNOWN_LANGUAGE, // 35 RTypeNone
|
||||
X_Yi, // 36 Yiii RTypeNone
|
||||
X_Old_Italic, // 37 Ital RTypeNone
|
||||
X_Gothic, // 38 Goth RTypeNone
|
||||
X_Deseret, // 39 Dsrt RTypeNone
|
||||
X_Inherited, // 40 Zinh RTypeNone
|
||||
TAGALOG, // 41 Tglg RTypeOne
|
||||
X_Hanunoo, // 42 Hano RTypeNone
|
||||
X_Buhid, // 43 Buhd RTypeNone
|
||||
X_Tagbanwa, // 44 Tagb RTypeNone
|
||||
LIMBU, // 45 Limb RTypeOne
|
||||
X_Tai_Le, // 46 Tale RTypeNone
|
||||
X_Linear_B, // 47 Linb RTypeNone
|
||||
X_Ugaritic, // 48 Ugar RTypeNone
|
||||
X_Shavian, // 49 Shaw RTypeNone
|
||||
X_Osmanya, // 50 Osma RTypeNone
|
||||
X_Cypriot, // 51 Cprt RTypeNone
|
||||
X_Braille, // 52 Brai RTypeNone
|
||||
X_Buginese, // 53 Bugi RTypeNone
|
||||
X_Coptic, // 54 Copt RTypeNone
|
||||
X_New_Tai_Lue, // 55 Talu RTypeNone
|
||||
X_Glagolitic, // 56 Glag RTypeNone
|
||||
X_Tifinagh, // 57 Tfng RTypeNone
|
||||
X_Syloti_Nagri, // 58 Sylo RTypeNone
|
||||
X_Old_Persian, // 59 Xpeo RTypeNone
|
||||
X_Kharoshthi, // 60 Khar RTypeNone
|
||||
X_Balinese, // 61 Bali RTypeNone
|
||||
X_Cuneiform, // 62 Xsux RTypeNone
|
||||
X_Phoenician, // 63 Phnx RTypeNone
|
||||
X_Phags_Pa, // 64 Phag RTypeNone
|
||||
X_Nko, // 65 Nkoo RTypeNone
|
||||
X_Sundanese, // 66 Sund RTypeNone
|
||||
X_Lepcha, // 67 Lepc RTypeNone
|
||||
X_Ol_Chiki, // 68 Olck RTypeNone
|
||||
X_Vai, // 69 Vaii RTypeNone
|
||||
X_Saurashtra, // 70 Saur RTypeNone
|
||||
X_Kayah_Li, // 71 Kali RTypeNone
|
||||
X_Rejang, // 72 Rjng RTypeNone
|
||||
X_Lycian, // 73 Lyci RTypeNone
|
||||
X_Carian, // 74 Cari RTypeNone
|
||||
X_Lydian, // 75 Lydi RTypeNone
|
||||
X_Cham, // 76 Cham RTypeNone
|
||||
X_Tai_Tham, // 77 Lana RTypeNone
|
||||
X_Tai_Viet, // 78 Tavt RTypeNone
|
||||
X_Avestan, // 79 Avst RTypeNone
|
||||
X_Egyptian_Hieroglyphs, // 80 Egyp RTypeNone
|
||||
X_Samaritan, // 81 Samr RTypeNone
|
||||
X_Lisu, // 82 Lisu RTypeNone
|
||||
X_Bamum, // 83 Bamu RTypeNone
|
||||
X_Javanese, // 84 Java RTypeNone
|
||||
X_Meetei_Mayek, // 85 Mtei RTypeNone
|
||||
X_Imperial_Aramaic, // 86 Armi RTypeNone
|
||||
X_Old_South_Arabian, // 87 Sarb RTypeNone
|
||||
X_Inscriptional_Parthian, // 88 Prti RTypeNone
|
||||
X_Inscriptional_Pahlavi, // 89 Phli RTypeNone
|
||||
X_Old_Turkic, // 90 Orkh RTypeNone
|
||||
X_Kaithi, // 91 Kthi RTypeNone
|
||||
X_Batak, // 92 Batk RTypeNone
|
||||
X_Brahmi, // 93 Brah RTypeNone
|
||||
X_Mandaic, // 94 Mand RTypeNone
|
||||
X_Chakma, // 95 Cakm RTypeNone
|
||||
X_Meroitic_Cursive, // 96 Merc RTypeNone
|
||||
X_Meroitic_Hieroglyphs, // 97 Mero RTypeNone
|
||||
X_Miao, // 98 Plrd RTypeNone
|
||||
X_Sharada, // 99 Shrd RTypeNone
|
||||
X_Sora_Sompeng, // 100 Sora RTypeNone
|
||||
X_Takri, // 101 Takr RTypeNone
|
||||
};
|
||||
|
||||
// Alphabetical order for binary search
|
||||
extern const int kNameToULScriptSize = 105;
|
||||
extern const CharIntPair kNameToULScript[kNameToULScriptSize] = {
|
||||
{"Arabic", 6}, // Arab
|
||||
{"Armenian", 4}, // Armn
|
||||
{"Avestan", 79}, // Avst
|
||||
{"Balinese", 61}, // Bali
|
||||
{"Bamum", 83}, // Bamu
|
||||
{"Batak", 92}, // Batk
|
||||
{"Bengali", 10}, // Beng
|
||||
{"Bopomofo", 34}, // Bopo
|
||||
{"Brahmi", 93}, // Brah
|
||||
{"Braille", 52}, // Brai
|
||||
{"Buginese", 53}, // Bugi
|
||||
{"Buhid", 43}, // Buhd
|
||||
{"Canadian_Aboriginal", 27}, // Cans
|
||||
{"Carian", 74}, // Cari
|
||||
{"Chakma", 95}, // Cakm
|
||||
{"Cham", 76}, // Cham
|
||||
{"Cherokee", 26}, // Cher
|
||||
{"Common", 0}, // Zyyy
|
||||
{"Coptic", 54}, // Copt
|
||||
{"Cuneiform", 62}, // Xsux
|
||||
{"Cypriot", 51}, // Cprt
|
||||
{"Cyrillic", 3}, // Cyrl
|
||||
{"Deseret", 39}, // Dsrt
|
||||
{"Devanagari", 9}, // Deva
|
||||
{"Egyptian_Hieroglyphs", 80}, // Egyp
|
||||
{"Ethiopic", 25}, // Ethi
|
||||
{"Georgian", 23}, // Geor
|
||||
{"Glagolitic", 56}, // Glag
|
||||
{"Gothic", 38}, // Goth
|
||||
{"Greek", 2}, // Grek
|
||||
{"Gujarati", 12}, // Gujr
|
||||
{"Gurmukhi", 11}, // Guru
|
||||
{"Han", 24}, // Hant
|
||||
{"Han", 24}, // Hans
|
||||
{"Han", 24}, // Hani
|
||||
{"Hangul", 24}, // Hang
|
||||
{"Hani", 24}, // Hani
|
||||
{"Hanunoo", 42}, // Hano
|
||||
{"Hebrew", 5}, // Hebr
|
||||
{"Hiragana", 24}, // Hira
|
||||
{"Imperial_Aramaic", 86}, // Armi
|
||||
{"Inherited", 40}, // Zinh
|
||||
{"Inscriptional_Pahlavi", 89}, // Phli
|
||||
{"Inscriptional_Parthian", 88}, // Prti
|
||||
{"Javanese", 84}, // Java
|
||||
{"Kaithi", 91}, // Kthi
|
||||
{"Kannada", 16}, // Knda
|
||||
{"Katakana", 24}, // Kana
|
||||
{"Kayah_Li", 71}, // Kali
|
||||
{"Kharoshthi", 60}, // Khar
|
||||
{"Khmer", 30}, // Khmr
|
||||
{"Lao", 20}, // Laoo
|
||||
{"Latin", 1}, // Latn
|
||||
{"Lepcha", 67}, // Lepc
|
||||
{"Limbu", 45}, // Limb
|
||||
{"Linear_B", 47}, // Linb
|
||||
{"Lisu", 82}, // Lisu
|
||||
{"Lycian", 73}, // Lyci
|
||||
{"Lydian", 75}, // Lydi
|
||||
{"Malayalam", 17}, // Mlym
|
||||
{"Mandaic", 94}, // Mand
|
||||
{"Meetei_Mayek", 85}, // Mtei
|
||||
{"Meroitic_Cursive", 96}, // Merc
|
||||
{"Meroitic_Hieroglyphs", 97}, // Mero
|
||||
{"Miao", 98}, // Plrd
|
||||
{"Mongolian", 31}, // Mong
|
||||
{"Myanmar", 22}, // Mymr
|
||||
{"New_Tai_Lue", 55}, // Talu
|
||||
{"Nko", 65}, // Nkoo
|
||||
{"Ogham", 28}, // Ogam
|
||||
{"Ol_Chiki", 68}, // Olck
|
||||
{"Old_Italic", 37}, // Ital
|
||||
{"Old_Persian", 59}, // Xpeo
|
||||
{"Old_South_Arabian", 87}, // Sarb
|
||||
{"Old_Turkic", 90}, // Orkh
|
||||
{"Oriya", 13}, // Orya
|
||||
{"Osmanya", 50}, // Osma
|
||||
{"Phags_Pa", 64}, // Phag
|
||||
{"Phoenician", 63}, // Phnx
|
||||
{"Rejang", 72}, // Rjng
|
||||
{"Runic", 29}, // Runr
|
||||
{"Samaritan", 81}, // Samr
|
||||
{"Saurashtra", 70}, // Saur
|
||||
{"Sharada", 99}, // Shrd
|
||||
{"Shavian", 49}, // Shaw
|
||||
{"Sinhala", 18}, // Sinh
|
||||
{"Sora_Sompeng", 100}, // Sora
|
||||
{"Sundanese", 66}, // Sund
|
||||
{"Syloti_Nagri", 58}, // Sylo
|
||||
{"Syriac", 7}, // Syrc
|
||||
{"Tagalog", 41}, // Tglg
|
||||
{"Tagbanwa", 44}, // Tagb
|
||||
{"Tai_Le", 46}, // Tale
|
||||
{"Tai_Tham", 77}, // Lana
|
||||
{"Tai_Viet", 78}, // Tavt
|
||||
{"Takri", 101}, // Takr
|
||||
{"Tamil", 14}, // Taml
|
||||
{"Telugu", 15}, // Telu
|
||||
{"Thaana", 8}, // Thaa
|
||||
{"Thai", 19}, // Thai
|
||||
{"Tibetan", 21}, // Tibt
|
||||
{"Tifinagh", 57}, // Tfng
|
||||
{"Ugaritic", 48}, // Ugar
|
||||
{"Vai", 69}, // Vaii
|
||||
{"Yi", 36}, // Yiii
|
||||
};
|
||||
|
||||
// Alphabetical order for binary search
|
||||
extern const int kCodeToULScriptSize = 105;
|
||||
extern const CharIntPair kCodeToULScript[kNameToULScriptSize] = {
|
||||
{"Arab", 6}, // Arab
|
||||
{"Armi", 86}, // Armi
|
||||
{"Armn", 4}, // Armn
|
||||
{"Avst", 79}, // Avst
|
||||
{"Bali", 61}, // Bali
|
||||
{"Bamu", 83}, // Bamu
|
||||
{"Batk", 92}, // Batk
|
||||
{"Beng", 10}, // Beng
|
||||
{"Bopo", 34}, // Bopo
|
||||
{"Brah", 93}, // Brah
|
||||
{"Brai", 52}, // Brai
|
||||
{"Bugi", 53}, // Bugi
|
||||
{"Buhd", 43}, // Buhd
|
||||
{"Cakm", 95}, // Cakm
|
||||
{"Cans", 27}, // Cans
|
||||
{"Cari", 74}, // Cari
|
||||
{"Cham", 76}, // Cham
|
||||
{"Cher", 26}, // Cher
|
||||
{"Copt", 54}, // Copt
|
||||
{"Cprt", 51}, // Cprt
|
||||
{"Cyrl", 3}, // Cyrl
|
||||
{"Deva", 9}, // Deva
|
||||
{"Dsrt", 39}, // Dsrt
|
||||
{"Egyp", 80}, // Egyp
|
||||
{"Ethi", 25}, // Ethi
|
||||
{"Geor", 23}, // Geor
|
||||
{"Glag", 56}, // Glag
|
||||
{"Goth", 38}, // Goth
|
||||
{"Grek", 2}, // Grek
|
||||
{"Gujr", 12}, // Gujr
|
||||
{"Guru", 11}, // Guru
|
||||
{"Hang", 24}, // Hang
|
||||
{"Hani", 24}, // Hani
|
||||
{"Hani", 24}, // Hani
|
||||
{"Hano", 42}, // Hano
|
||||
{"Hans", 24}, // Hans
|
||||
{"Hant", 24}, // Hant
|
||||
{"Hebr", 5}, // Hebr
|
||||
{"Hira", 24}, // Hira
|
||||
{"Ital", 37}, // Ital
|
||||
{"Java", 84}, // Java
|
||||
{"Kali", 71}, // Kali
|
||||
{"Kana", 24}, // Kana
|
||||
{"Khar", 60}, // Khar
|
||||
{"Khmr", 30}, // Khmr
|
||||
{"Knda", 16}, // Knda
|
||||
{"Kthi", 91}, // Kthi
|
||||
{"Lana", 77}, // Lana
|
||||
{"Laoo", 20}, // Laoo
|
||||
{"Latn", 1}, // Latn
|
||||
{"Lepc", 67}, // Lepc
|
||||
{"Limb", 45}, // Limb
|
||||
{"Linb", 47}, // Linb
|
||||
{"Lisu", 82}, // Lisu
|
||||
{"Lyci", 73}, // Lyci
|
||||
{"Lydi", 75}, // Lydi
|
||||
{"Mand", 94}, // Mand
|
||||
{"Merc", 96}, // Merc
|
||||
{"Mero", 97}, // Mero
|
||||
{"Mlym", 17}, // Mlym
|
||||
{"Mong", 31}, // Mong
|
||||
{"Mtei", 85}, // Mtei
|
||||
{"Mymr", 22}, // Mymr
|
||||
{"Nkoo", 65}, // Nkoo
|
||||
{"Ogam", 28}, // Ogam
|
||||
{"Olck", 68}, // Olck
|
||||
{"Orkh", 90}, // Orkh
|
||||
{"Orya", 13}, // Orya
|
||||
{"Osma", 50}, // Osma
|
||||
{"Phag", 64}, // Phag
|
||||
{"Phli", 89}, // Phli
|
||||
{"Phnx", 63}, // Phnx
|
||||
{"Plrd", 98}, // Plrd
|
||||
{"Prti", 88}, // Prti
|
||||
{"Rjng", 72}, // Rjng
|
||||
{"Runr", 29}, // Runr
|
||||
{"Samr", 81}, // Samr
|
||||
{"Sarb", 87}, // Sarb
|
||||
{"Saur", 70}, // Saur
|
||||
{"Shaw", 49}, // Shaw
|
||||
{"Shrd", 99}, // Shrd
|
||||
{"Sinh", 18}, // Sinh
|
||||
{"Sora", 100}, // Sora
|
||||
{"Sund", 66}, // Sund
|
||||
{"Sylo", 58}, // Sylo
|
||||
{"Syrc", 7}, // Syrc
|
||||
{"Tagb", 44}, // Tagb
|
||||
{"Takr", 101}, // Takr
|
||||
{"Tale", 46}, // Tale
|
||||
{"Talu", 55}, // Talu
|
||||
{"Taml", 14}, // Taml
|
||||
{"Tavt", 78}, // Tavt
|
||||
{"Telu", 15}, // Telu
|
||||
{"Tfng", 57}, // Tfng
|
||||
{"Tglg", 41}, // Tglg
|
||||
{"Thaa", 8}, // Thaa
|
||||
{"Thai", 19}, // Thai
|
||||
{"Tibt", 21}, // Tibt
|
||||
{"Ugar", 48}, // Ugar
|
||||
{"Vaii", 69}, // Vaii
|
||||
{"Xpeo", 59}, // Xpeo
|
||||
{"Xsux", 62}, // Xsux
|
||||
{"Yiii", 36}, // Yiii
|
||||
{"Zinh", 40}, // Zinh
|
||||
{"Zyyy", 0}, // Zyyy
|
||||
};
|
||||
|
||||
} // namespace CLD2
|
||||
140
internal/generated_ulscript.h
Normal file
140
internal/generated_ulscript.h
Normal file
@@ -0,0 +1,140 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// generated_ulscript.h
|
||||
// Machine generated. Do Not Edit.
|
||||
//
|
||||
// Declarations for scripts recognized by CLD2
|
||||
//
|
||||
|
||||
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_GENERATED_ULSCRIPT_H__
|
||||
#define I18N_ENCODINGS_CLD2_INTERNAL_GENERATED_ULSCRIPT_H__
|
||||
|
||||
namespace CLD2 {

// Recognition type of a script; the numeric values start at 0 and count up.
typedef enum {
  RTypeNone = 0,
  RTypeOne,
  RTypeMany,
  RTypeCJK
} ULScriptRType;

// A (string, integer) pair, used as the element type of lookup tables.
typedef struct {
  const char* s;
  int i;
} CharIntPair;

// Script numbers. Each enumerator is annotated with its four-letter script
// code; slots 32, 33 and 35 have no code. NUM_ULSCRIPTS follows the last
// script and therefore equals the number of script slots (102).
typedef enum {
  ULScript_Common                 = 0,    // Zyyy
  ULScript_Latin                  = 1,    // Latn
  ULScript_Greek                  = 2,    // Grek
  ULScript_Cyrillic               = 3,    // Cyrl
  ULScript_Armenian               = 4,    // Armn
  ULScript_Hebrew                 = 5,    // Hebr
  ULScript_Arabic                 = 6,    // Arab
  ULScript_Syriac                 = 7,    // Syrc
  ULScript_Thaana                 = 8,    // Thaa
  ULScript_Devanagari             = 9,    // Deva
  ULScript_Bengali                = 10,   // Beng
  ULScript_Gurmukhi               = 11,   // Guru
  ULScript_Gujarati               = 12,   // Gujr
  ULScript_Oriya                  = 13,   // Orya
  ULScript_Tamil                  = 14,   // Taml
  ULScript_Telugu                 = 15,   // Telu
  ULScript_Kannada                = 16,   // Knda
  ULScript_Malayalam              = 17,   // Mlym
  ULScript_Sinhala                = 18,   // Sinh
  ULScript_Thai                   = 19,   // Thai
  ULScript_Lao                    = 20,   // Laoo
  ULScript_Tibetan                = 21,   // Tibt
  ULScript_Myanmar                = 22,   // Mymr
  ULScript_Georgian               = 23,   // Geor
  ULScript_Hani                   = 24,   // Hani
  ULScript_Ethiopic               = 25,   // Ethi
  ULScript_Cherokee               = 26,   // Cher
  ULScript_Canadian_Aboriginal    = 27,   // Cans
  ULScript_Ogham                  = 28,   // Ogam
  ULScript_Runic                  = 29,   // Runr
  ULScript_Khmer                  = 30,   // Khmr
  ULScript_Mongolian              = 31,   // Mong
  ULScript_32                     = 32,   //
  ULScript_33                     = 33,   //
  ULScript_Bopomofo               = 34,   // Bopo
  ULScript_35                     = 35,   //
  ULScript_Yi                     = 36,   // Yiii
  ULScript_Old_Italic             = 37,   // Ital
  ULScript_Gothic                 = 38,   // Goth
  ULScript_Deseret                = 39,   // Dsrt
  ULScript_Inherited              = 40,   // Zinh
  ULScript_Tagalog                = 41,   // Tglg
  ULScript_Hanunoo                = 42,   // Hano
  ULScript_Buhid                  = 43,   // Buhd
  ULScript_Tagbanwa               = 44,   // Tagb
  ULScript_Limbu                  = 45,   // Limb
  ULScript_Tai_Le                 = 46,   // Tale
  ULScript_Linear_B               = 47,   // Linb
  ULScript_Ugaritic               = 48,   // Ugar
  ULScript_Shavian                = 49,   // Shaw
  ULScript_Osmanya                = 50,   // Osma
  ULScript_Cypriot                = 51,   // Cprt
  ULScript_Braille                = 52,   // Brai
  ULScript_Buginese               = 53,   // Bugi
  ULScript_Coptic                 = 54,   // Copt
  ULScript_New_Tai_Lue            = 55,   // Talu
  ULScript_Glagolitic             = 56,   // Glag
  ULScript_Tifinagh               = 57,   // Tfng
  ULScript_Syloti_Nagri           = 58,   // Sylo
  ULScript_Old_Persian            = 59,   // Xpeo
  ULScript_Kharoshthi             = 60,   // Khar
  ULScript_Balinese               = 61,   // Bali
  ULScript_Cuneiform              = 62,   // Xsux
  ULScript_Phoenician             = 63,   // Phnx
  ULScript_Phags_Pa               = 64,   // Phag
  ULScript_Nko                    = 65,   // Nkoo
  ULScript_Sundanese              = 66,   // Sund
  ULScript_Lepcha                 = 67,   // Lepc
  ULScript_Ol_Chiki               = 68,   // Olck
  ULScript_Vai                    = 69,   // Vaii
  ULScript_Saurashtra             = 70,   // Saur
  ULScript_Kayah_Li               = 71,   // Kali
  ULScript_Rejang                 = 72,   // Rjng
  ULScript_Lycian                 = 73,   // Lyci
  ULScript_Carian                 = 74,   // Cari
  ULScript_Lydian                 = 75,   // Lydi
  ULScript_Cham                   = 76,   // Cham
  ULScript_Tai_Tham               = 77,   // Lana
  ULScript_Tai_Viet               = 78,   // Tavt
  ULScript_Avestan                = 79,   // Avst
  ULScript_Egyptian_Hieroglyphs   = 80,   // Egyp
  ULScript_Samaritan              = 81,   // Samr
  ULScript_Lisu                   = 82,   // Lisu
  ULScript_Bamum                  = 83,   // Bamu
  ULScript_Javanese               = 84,   // Java
  ULScript_Meetei_Mayek           = 85,   // Mtei
  ULScript_Imperial_Aramaic       = 86,   // Armi
  ULScript_Old_South_Arabian      = 87,   // Sarb
  ULScript_Inscriptional_Parthian = 88,   // Prti
  ULScript_Inscriptional_Pahlavi  = 89,   // Phli
  ULScript_Old_Turkic             = 90,   // Orkh
  ULScript_Kaithi                 = 91,   // Kthi
  ULScript_Batak                  = 92,   // Batk
  ULScript_Brahmi                 = 93,   // Brah
  ULScript_Mandaic                = 94,   // Mand
  ULScript_Chakma                 = 95,   // Cakm
  ULScript_Meroitic_Cursive       = 96,   // Merc
  ULScript_Meroitic_Hieroglyphs   = 97,   // Mero
  ULScript_Miao                   = 98,   // Plrd
  ULScript_Sharada                = 99,   // Shrd
  ULScript_Sora_Sompeng           = 100,  // Sora
  ULScript_Takri                  = 101,  // Takr
  NUM_ULSCRIPTS
} ULScript;

// The "unknown" script is an alias for the common script (number 0).
#define UNKNOWN_ULSCRIPT ULScript_Common

}  // namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_CLD2_INTERNAL_GENERATED_ULSCRIPT_H__
|
||||
1094
internal/getonescriptspan.cc
Normal file
1094
internal/getonescriptspan.cc
Normal file
File diff suppressed because it is too large
Load Diff
110
internal/getonescriptspan.h
Normal file
110
internal/getonescriptspan.h
Normal file
@@ -0,0 +1,110 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
|
||||
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
|
||||
#define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
|
||||
|
||||
#include "integral_types.h"
|
||||
#include "langspan.h"
|
||||
#include "offsetmap.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
static const int kMaxScriptBuffer = 40960;
|
||||
static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
|
||||
static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
|
||||
static const int kWithinScriptTail = 32; // Stop at word space in last
|
||||
// N bytes of script buffer
|
||||
|
||||
|
||||
static inline bool IsContinuationByte(char c) {
|
||||
return static_cast<signed char>(c) < -64;
|
||||
}
|
||||
|
||||
// Gets lscript number for letters; always returns
|
||||
// 0 (common script) for non-letters
|
||||
int GetUTF8LetterScriptNum(const char* src);
|
||||
|
||||
// Update src pointer to point to next quadgram, +2..+5
|
||||
// Looks at src[0..4]
|
||||
const char* AdvanceQuad(const char* src);
|
||||
|
||||
|
||||
class ScriptScanner {
|
||||
public:
|
||||
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
|
||||
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
|
||||
bool any_text, bool any_script);
|
||||
~ScriptScanner();
|
||||
|
||||
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
||||
bool GetOneScriptSpan(LangSpan* span);
|
||||
|
||||
// Force Latin and Cyrillic scripts to be lowercase
|
||||
void LowerScriptSpan(LangSpan* span);
|
||||
|
||||
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
||||
// Force Latin and Cyrillic scripts to be lowercase
|
||||
bool GetOneScriptSpanLower(LangSpan* span);
|
||||
|
||||
// Copy next run of non-tag characters to buffer [NUL terminated]
|
||||
// This just removes tags and removes entities
|
||||
// Buffer has leading space
|
||||
bool GetOneTextSpan(LangSpan* span);
|
||||
|
||||
// Maps byte offset in most recent GetOneScriptSpan/Lower
|
||||
// span->text [0..text_bytes] into an additional byte offset from
|
||||
// span->offset, to get back to corresponding text in the original
|
||||
// input buffer.
|
||||
// text_offset must be the first byte
|
||||
// of a UTF-8 character, or just beyond the last character. Normally this
|
||||
// routine is called with the first byte of an interesting range and
|
||||
// again with the first byte of the following range.
|
||||
int MapBack(int text_offset);
|
||||
|
||||
const char* GetBufferStart() {return start_byte_;};
|
||||
|
||||
private:
|
||||
// Skip over tags and non-letters
|
||||
int SkipToFrontOfSpan(const char* src, int len, int* script);
|
||||
|
||||
const char* start_byte_; // Starting byte of buffer to scan
|
||||
const char* next_byte_; // First unscanned byte
|
||||
const char* next_byte_limit_; // Last byte + 1
|
||||
int byte_length_; // Bytes left: next_byte_limit_ - next_byte_
|
||||
|
||||
bool is_plain_text_; // true fo text, false for HTML
|
||||
char* script_buffer_; // Holds text with expanded entities
|
||||
char* script_buffer_lower_; // Holds lowercased text
|
||||
bool letters_marks_only_; // To distinguish scriptspan of one
|
||||
// letters/marks vs. any mixture of text
|
||||
bool one_script_only_; // To distinguish scriptspan of one
|
||||
// script vs. any mixture of scripts
|
||||
int exit_state_; // For tag parser kTagParseTbl_0, based
|
||||
// on letters_marks_only_
|
||||
public :
|
||||
// Expose for debugging
|
||||
OffsetMap map2original_; // map from script_buffer_ to buffer
|
||||
OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_
|
||||
};
|
||||
|
||||
} // namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
|
||||
|
||||
31
internal/integral_types.h
Normal file
31
internal/integral_types.h
Normal file
@@ -0,0 +1,31 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Cheap version
|
||||
namespace CLD2 {

// Signed integer aliases, named by their intended width in bits. The widths
// are those of the underlying built-in types on the target platform.
typedef signed char int8;
typedef short       int16;
typedef int         int32;
typedef long long   int64;

// Unsigned counterparts.
typedef unsigned char      uint8;
typedef unsigned short     uint16;
typedef unsigned int       uint32;
typedef unsigned long long uint64;

// char32 is an alias for the signed 32-bit alias above.
typedef int32 char32;

}  // End namespace CLD2
|
||||
|
||||
546
internal/lang_script.cc
Normal file
546
internal/lang_script.cc
Normal file
@@ -0,0 +1,546 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// File: lang_script.cc
|
||||
// ================
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
// This file declares language and script numbers and names for CLD2
|
||||
//
|
||||
|
||||
#include "lang_script.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "generated_language.h"
|
||||
#include "generated_ulscript.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Language tables
|
||||
// Subscripted by enum Language
|
||||
extern const int kLanguageToNameSize;
|
||||
extern const char* const kLanguageToName[];
|
||||
extern const int kLanguageToCodeSize;
|
||||
extern const char* const kLanguageToCode[];
|
||||
extern const int kLanguageToCNameSize;
|
||||
extern const char* const kLanguageToCName[];
|
||||
extern const int kLanguageToScriptsSize;
|
||||
extern const FourScripts kLanguageToScripts[];
|
||||
|
||||
// Subscripted by Language
|
||||
extern const int kLanguageToPLangSize;
|
||||
extern const uint8 kLanguageToPLang[];
|
||||
// Subscripted by per-script language
|
||||
extern const uint16 kPLangToLanguageLatn[];
|
||||
extern const uint16 kPLangToLanguageOthr[];
|
||||
|
||||
// Alphabetical order for binary search
|
||||
extern const int kNameToLanguageSize;
|
||||
extern const CharIntPair kNameToLanguage[];
|
||||
extern const int kCodeToLanguageSize;
|
||||
extern const CharIntPair kCodeToLanguage[];
|
||||
|
||||
// ULScript tables
|
||||
// Subscripted by enum ULScript
|
||||
extern const int kULScriptToNameSize;
|
||||
extern const char* const kULScriptToName[];
|
||||
extern const int kULScriptToCodeSize;
|
||||
extern const char* const kULScriptToCode[];
|
||||
extern const int kULScriptToCNameSize;
|
||||
extern const char* const kULScriptToCName[];
|
||||
extern const int kULScriptToRtypeSize;
|
||||
extern const ULScriptRType kULScriptToRtype[];
|
||||
extern const int kULScriptToDefaultLangSize;
|
||||
extern const Language kULScriptToDefaultLang[];
|
||||
|
||||
// Alphabetical order for binary search
|
||||
extern const int kNameToULScriptSize;
|
||||
extern const CharIntPair kNameToULScript[];
|
||||
extern const int kCodeToULScriptSize;
|
||||
extern const CharIntPair kCodeToULScript[];
|
||||
|
||||
|
||||
//
|
||||
// File: lang_script.h
|
||||
// ================
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
// This file declares language and script numbers and names for CLD2
|
||||
//
|
||||
|
||||
|
||||
// NOTE: The script numbers and language numbers here are not guaranteed to be
|
||||
// stable. If you want to record a result for posterity, save the ISO codes
|
||||
// as character strings.
|
||||
//
|
||||
//
|
||||
// The Unicode scripts recognized by CLD2 are numbered almost arbitrarily,
|
||||
// specified in an enum. Each script has human-readable script name and a
|
||||
// 4-letter ISO 15924 script code. Each has a C name (largely for use by
|
||||
// programs that generate declarations in cld2_generated_scripts.h). Each
|
||||
// also has a recognition type
|
||||
// r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK
|
||||
//
|
||||
// The declarations for a particular version of Unicode are machine-generated in
|
||||
// cld2_generated_scripts.h
|
||||
//
|
||||
// This file includes that one and declares the access routines. The type
|
||||
// involved is called "ULScript" to signify Unicode Letters-Marks Scripts,
|
||||
// which are not quite Unicode Scripts. In particular, the CJK scripts are
|
||||
// merged into a single number because CLD2 recognizes the CJK languages from
|
||||
// four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and
|
||||
// Katakana.
|
||||
|
||||
// Each script has one of these four recognition types.
|
||||
// RTypeNone: There is no language associated with this script. In extended
|
||||
// language recognition calls, return a fake language number that maps to
|
||||
// xx-Cham, with literally "xx" for the language code, and with the script
|
||||
// code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE.
|
||||
// RTypeOne: The script maps 1:1 to a single language. No letters are examined
|
||||
// during recognition and no lookups done.
|
||||
// RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring
|
||||
// is done to determine the languages involved.
|
||||
// RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the
|
||||
// languages involved.
|
||||
//
|
||||
// Note that the choice of recognition type is a function of script, not
|
||||
// language. In particular, some languages are recognized in multiple scripts
|
||||
// and those have different recognition types (Mongolian mn-Latn vs. mn-Mong
|
||||
// for example).
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Functions of ULScript //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// If the input is out of range or otherwise unrecognized, it is treated
|
||||
// as UNKNOWN_ULSCRIPT (which never participates in language recognition)
|
||||
const char* ULScriptName(ULScript ulscript) {
  // Values outside [0, NUM_ULSCRIPTS) are looked up as UNKNOWN_ULSCRIPT.
  const int raw = ulscript;
  const bool valid = (raw >= 0) && (raw < NUM_ULSCRIPTS);
  const int index = valid ? raw : static_cast<int>(UNKNOWN_ULSCRIPT);
  return kULScriptToName[index];
}
|
||||
|
||||
const char* ULScriptCode(ULScript ulscript) {
  // Values outside [0, NUM_ULSCRIPTS) are looked up as UNKNOWN_ULSCRIPT.
  const int raw = ulscript;
  const bool valid = (raw >= 0) && (raw < NUM_ULSCRIPTS);
  const int index = valid ? raw : static_cast<int>(UNKNOWN_ULSCRIPT);
  return kULScriptToCode[index];
}
|
||||
|
||||
const char* ULScriptDeclaredName(ULScript ulscript) {
  // Values outside [0, NUM_ULSCRIPTS) are looked up as UNKNOWN_ULSCRIPT.
  const int raw = ulscript;
  const bool valid = (raw >= 0) && (raw < NUM_ULSCRIPTS);
  const int index = valid ? raw : static_cast<int>(UNKNOWN_ULSCRIPT);
  return kULScriptToCName[index];
}
|
||||
|
||||
ULScriptRType ULScriptRecognitionType(ULScript ulscript) {
  // Values outside [0, NUM_ULSCRIPTS) are looked up as UNKNOWN_ULSCRIPT.
  const int raw = ulscript;
  const bool valid = (raw >= 0) && (raw < NUM_ULSCRIPTS);
  const int index = valid ? raw : static_cast<int>(UNKNOWN_ULSCRIPT);
  return kULScriptToRtype[index];
}
|
||||
|
||||
|
||||
|
||||
// The languages recognized by CLD2 are numbered almost arbitrarily,
|
||||
// specified in an enum. Each language has human-readable language name and a
|
||||
// 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by
|
||||
// programs that generate declarations in cld2_generated_languages.h).
|
||||
// Each has a list of up to four scripts in which it is currently recognized.
|
||||
//
|
||||
// The declarations for a particular set of recognized languages are
|
||||
// machine-generated in
|
||||
// cld2_generated_languages.h
|
||||
//
|
||||
// The Language enum is intended to match the internal Google Language enum
|
||||
// in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional
|
||||
// languages assigned above that. Over time, some languages may be renumbered
|
||||
// if they are moved into the Language enum.
|
||||
//
|
||||
// The Language enum includes the fake language numbers for RTypeNone above.
|
||||
//
|
||||
// In an open-source environment, the Google-specific Language enum is not
|
||||
// available. Language decouples the two environments while maintaining
|
||||
// internal compatibility.
|
||||
|
||||
|
||||
// If the input is out of range or otherwise unrecognized, it is treated
|
||||
// as UNKNOWN_LANGUAGE
|
||||
//
|
||||
// LanguageCode
|
||||
// ------------
|
||||
// Given the Language, return the language code, e.g. "ko"
|
||||
// This is determined by
|
||||
// the following (in order of preference):
|
||||
// - ISO-639-1 two-letter language code
|
||||
// (all except those mentioned below)
|
||||
// - ISO-639-2 three-letter bibliographic language code
|
||||
// (Tibetan, Dhivehi, Cherokee, Syriac)
|
||||
// - Google-specific language code
|
||||
// (ChineseT ("zh-TW"), Teragram Unknown, Unknown,
|
||||
// Portuguese-Portugal, Portuguese-Brazil, Limbu)
|
||||
// - Fake RTypeNone names.
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Functions of Language //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
const char* LanguageName(Language lang) {
  // Values outside [0, NUM_LANGUAGES) are looked up as UNKNOWN_LANGUAGE.
  const int raw = lang;
  const bool valid = (raw >= 0) && (raw < NUM_LANGUAGES);
  const int index = valid ? raw : static_cast<int>(UNKNOWN_LANGUAGE);
  return kLanguageToName[index];
}
|
||||
// Returns the language code of lang, e.g. "ko". Any value outside
// [0, NUM_LANGUAGES) is reported using the UNKNOWN_LANGUAGE table entry.
const char* LanguageCode(Language lang) {
  int index = static_cast<int>(lang);
  if ((index < 0) || (index >= NUM_LANGUAGES)) {
    index = UNKNOWN_LANGUAGE;
  }
  return kLanguageToCode[index];
}
|
||||
|
||||
// Returns the enum spelling of lang, e.g. "KOREAN". Any value outside
// [0, NUM_LANGUAGES) is reported using the UNKNOWN_LANGUAGE table entry.
const char* LanguageDeclaredName(Language lang) {
  const int raw = lang;
  const bool out_of_range = (raw < 0) || (NUM_LANGUAGES <= raw);
  const int index = out_of_range ? static_cast<int>(UNKNOWN_LANGUAGE) : raw;
  return kLanguageToCName[index];
}
|
||||
|
||||
// n is in 0..3. Trailing entries are filled with
|
||||
// ULScript_Common (which never participates in language recognition)
|
||||
// Returns the n-th script in which lang is recognized.
//   lang: any value outside [0, NUM_LANGUAGES) is treated as UNKNOWN_LANGUAGE,
//         matching the other Language accessors in this file.
//   n:    slot index, documented as 0..3. A value outside that range now
//         returns ULScript_Common instead of reading past the end of the
//         per-language row.
ULScript LanguageRecognizedScript(Language lang, int n) {
  // Each language row holds up to four scripts.
  const int kScriptsPerLanguage = 4;
  if ((n < 0) || (n >= kScriptsPerLanguage)) {return ULScript_Common;}

  int i_lang = lang;
  if ((i_lang < 0) || (i_lang >= NUM_LANGUAGES)) {i_lang = UNKNOWN_LANGUAGE;}
  return static_cast<ULScript>(kLanguageToScripts[i_lang][n]);
}
|
||||
|
||||
// Given the Language, returns its string name used as the output by
|
||||
// the lang/enc identifier, e.g. "Korean"
|
||||
// "invalid_language" if the input is invalid.
|
||||
// TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language,
|
||||
// used to subtract out HTML, link farms, DNA strings, and a little English porn
|
||||
const char* ExtLanguageName(const Language lang) {
|
||||
return LanguageName(lang);
|
||||
}
|
||||
|
||||
// Given the Language, return the language code, e.g. "ko"
|
||||
const char* ExtLanguageCode(const Language lang) {
|
||||
return LanguageCode(lang);
|
||||
}
|
||||
|
||||
|
||||
// Given the Language, returns its Language enum spelling, for use by
|
||||
// programs that create C declarations, e.g. "KOREAN"
|
||||
// "UNKNOWN_LANGUAGE" if the input is invalid.
|
||||
const char* ExtLanguageDeclaredName(const Language lang) {
|
||||
return LanguageDeclaredName(lang);
|
||||
}
|
||||
|
||||
// Returns which set of statistically-close languages lang is in. 0 means none.
|
||||
int LanguageCloseSet(Language lang) {
  // Membership table for the sets of statistically-close languages. Rows are
  // scanned in order and the first match wins; set 0 means "in no set".
  //
  // Scaffolding notes carried over from the original tuning data:
  //  id ms         # INDONESIAN MALAY coef=0.4698    Problematic w/o extra words
  //  bo dz         # TIBETAN DZONGKHA coef=0.4571
  //  cs sk         # CZECH SLOVAK coef=0.4273
  //  zu xh         # ZULU XHOSA coef=0.3716
  //
  //  bs hr sr srm  # BOSNIAN CROATIAN SERBIAN MONTENEGRIN
  //  hi mr bh ne   # HINDI MARATHI BIHARI NEPALI
  //  no nn da      # NORWEGIAN NORWEGIAN_N DANISH
  //  gl es pt      # GALICIAN SPANISH PORTUGUESE
  //  rw rn         # KINYARWANDA RUNDI
  struct CloseSetEntry {
    Language member;
    int set_number;
  };
  static const CloseSetEntry kCloseSets[] = {
    {INDONESIAN, 1}, {MALAY, 1},
    {TIBETAN, 2}, {DZONGKHA, 2},
    {CZECH, 3}, {SLOVAK, 3},
    {ZULU, 4}, {XHOSA, 4},
    {BOSNIAN, 5}, {CROATIAN, 5}, {SERBIAN, 5}, {MONTENEGRIN, 5},
    {HINDI, 6}, {MARATHI, 6}, {BIHARI, 6}, {NEPALI, 6},
    {NORWEGIAN, 7}, {NORWEGIAN_N, 7}, {DANISH, 7},
    {GALICIAN, 8}, {SPANISH, 8}, {PORTUGUESE, 8},
    {KINYARWANDA, 9}, {RUNDI, 9},
  };
  const int entry_count =
      static_cast<int>(sizeof(kCloseSets) / sizeof(kCloseSets[0]));

  for (int k = 0; k < entry_count; ++k) {
    if (kCloseSets[k].member == lang) {
      return kCloseSets[k].set_number;
    }
  }
  return 0;
}
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Functions of ULScript and Language //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// Returns the default language recorded for ulscript, or UNKNOWN_LANGUAGE
// when ulscript lies outside [0, NUM_ULSCRIPTS).
Language DefaultLanguage(ULScript ulscript) {
  const bool valid_script = (ulscript >= 0) && (ulscript < NUM_ULSCRIPTS);
  if (!valid_script) {
    return UNKNOWN_LANGUAGE;
  }
  return kULScriptToDefaultLang[ulscript];
}
|
||||
|
||||
// Maps (ulscript, lang) to the one-byte per-script language number.
// Returns:
//   0  if ulscript is outside [0, NUM_ULSCRIPTS), or lang is outside
//      [0, kLanguageToPLangSize)
//   1  if the script's recognition type is RTypeNone
//   kLanguageToPLang[lang] otherwise
// The negative-lang check guards the table read the same way the upper-bound
// check does; previously a negative lang could index before the table.
uint8 PerScriptNumber(ULScript ulscript, Language lang) {
  if ((ulscript < 0) || (ulscript >= NUM_ULSCRIPTS)) {return 0;}
  if (kULScriptToRtype[ulscript] == RTypeNone) {return 1;}
  if ((lang < 0) || (lang >= kLanguageToPLangSize)) {return 0;}
  return kLanguageToPLang[lang];
}
|
||||
|
||||
// Inverse of PerScriptNumber: converts a per-script number back to a Language.
// Out-of-range scripts give UNKNOWN_LANGUAGE; RTypeNone and RTypeOne scripts
// give the script's default language; otherwise the number indexes the Latin
// table for ULScript_Latin and the "other" table for every other script.
Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number) {
  const bool valid_script = (ulscript >= 0) && (ulscript < NUM_ULSCRIPTS);
  if (!valid_script) {
    return UNKNOWN_LANGUAGE;
  }

  const int rtype = kULScriptToRtype[ulscript];
  if ((rtype == RTypeNone) || (rtype == RTypeOne)) {
    return kULScriptToDefaultLang[ulscript];
  }

  const int language_number = (ulscript == ULScript_Latin)
      ? kPLangToLanguageLatn[perscript_number]
      : kPLangToLanguageOthr[perscript_number];
  return static_cast<Language>(language_number);
}
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Other //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// Returns mid if key found in lo <= mid < hi, else -1
|
||||
int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair) {
|
||||
// binary search
|
||||
while (lo < hi) {
|
||||
int mid = (lo + hi) >> 1;
|
||||
if (strcmp(key, cipair[mid].s) < 0) {
|
||||
hi = mid;
|
||||
} else if (strcmp(key, cipair[mid].s) > 0) {
|
||||
lo = mid + 1;
|
||||
} else {
|
||||
return mid;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Converts a raw table integer into the Language enum type (no range check).
Language MakeLang(int i) {
  const Language as_language = static_cast<Language>(i);
  return as_language;
}
|
||||
|
||||
// Name can be either full name or ISO code, or can be ISO code embedded in
|
||||
// a language-script combination such as "ABKHAZIAN", "en", "en-Latn-GB"
|
||||
Language GetLanguageFromName(const char* src) {
|
||||
const char* hyphen1 = strchr(src, '-');
|
||||
const char* hyphen2 = NULL;
|
||||
if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');}
|
||||
|
||||
int match = -1;
|
||||
if (hyphen1 == NULL) {
|
||||
// Bare name. Look at full name, then code
|
||||
match = BinarySearch(src, 0, kNameToLanguageSize, kNameToLanguage);
|
||||
if (match >= 0) {return MakeLang(kNameToLanguage[match].i);} // aa
|
||||
match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage);
|
||||
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa
|
||||
return UNKNOWN_LANGUAGE;
|
||||
}
|
||||
|
||||
if (hyphen2 == NULL) {
|
||||
// aa-bb. Not a full name; must be code-something. Try zh-TW then bare zh
|
||||
match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage);
|
||||
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb
|
||||
|
||||
int len = strlen(src);
|
||||
if (len >= 16) {return UNKNOWN_LANGUAGE;} // Real codes are shorter
|
||||
|
||||
char temp[16];
|
||||
int hyphen1_offset = hyphen1 - src;
|
||||
// Take off part after hyphen1
|
||||
memcpy(temp, src, len);
|
||||
temp[hyphen1_offset] = '\0';
|
||||
match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
|
||||
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa
|
||||
|
||||
return UNKNOWN_LANGUAGE;
|
||||
}
|
||||
|
||||
// aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en
|
||||
match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage);
|
||||
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb-cc
|
||||
|
||||
|
||||
int len = strlen(src);
|
||||
if (len >= 16) {return UNKNOWN_LANGUAGE;} // Real codes are shorter
|
||||
|
||||
char temp[16];
|
||||
int hyphen1_offset = hyphen1 - src;
|
||||
int hyphen2_offset = hyphen2 - src;
|
||||
// Take off part after hyphen2
|
||||
memcpy(temp, src, len);
|
||||
temp[hyphen2_offset] = '\0';
|
||||
match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
|
||||
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb
|
||||
|
||||
|
||||
// Take off part between hyphen1 and hyphen2
|
||||
int len2 = len - hyphen2_offset;
|
||||
memcpy(temp, src, len);
|
||||
memcpy(&temp[hyphen1_offset], hyphen2, len2);
|
||||
temp[hyphen1_offset + len2] = '\0';
|
||||
match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
|
||||
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-cc
|
||||
|
||||
|
||||
// Take off everything after hyphen1
|
||||
memcpy(temp, src, len);
|
||||
temp[hyphen1_offset] = '\0';
|
||||
match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
|
||||
if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa
|
||||
|
||||
|
||||
return UNKNOWN_LANGUAGE;
|
||||
}
|
||||
|
||||
|
||||
// Name can be either full name or ISO code, or can be ISO code embedded in
|
||||
// a language-script combination such as "en-Latn-GB"
|
||||
// MORE WORK to do here. also kLanguageToScripts [4] is bogus
|
||||
// if bare language name, no script, want zh, ja, ko to Hani, pt to Latn, etc.
|
||||
// Something like map code to Language, then Language to kLanguageToScripts[x][0]
|
||||
// ADD BIAS: kLanguageToScripts lists default script first
|
||||
// If total mismatch, return Latn
|
||||
// if (strcmp(src, "nd") == 0) {return NDEBELE;} // [nd was wrong]
|
||||
// if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;}
|
||||
|
||||
// Converts a raw table integer into the ULScript enum type (no range check).
ULScript MakeULScr(int i) {
  const ULScript as_script = static_cast<ULScript>(i);
  return as_script;
}
|
||||
|
||||
ULScript GetULScriptFromName(const char* src) {
|
||||
const char* hyphen1 = strchr(src, '-');
|
||||
const char* hyphen2 = NULL;
|
||||
if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');}
|
||||
|
||||
int match = -1;
|
||||
if (hyphen1 == NULL) {
|
||||
// Bare name. Look at full name, then code, then try backmapping as Language
|
||||
match = BinarySearch(src, 0, kNameToULScriptSize, kNameToULScript);
|
||||
if (match >= 0) {return MakeULScr(kNameToULScript[match].i);} // aa
|
||||
match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript);
|
||||
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa
|
||||
|
||||
Language backmap_me = GetLanguageFromName(src);
|
||||
if (backmap_me != UNKNOWN_LANGUAGE) {
|
||||
return static_cast<ULScript>(kLanguageToScripts[backmap_me][0]);
|
||||
}
|
||||
return ULScript_Latin;
|
||||
}
|
||||
|
||||
if (hyphen2 == NULL) {
|
||||
// aa-bb. Not a full name; must be code-something. Try en-Latn, bare Latn
|
||||
if (strcmp(src, "zh-TW") == 0) {return ULScript_Hani;}
|
||||
if (strcmp(src, "zh-CN") == 0) {return ULScript_Hani;}
|
||||
if (strcmp(src, "sit-NP") == 0) {return ULScript_Limbu;}
|
||||
if (strcmp(src, "sit-Limb") == 0) {return ULScript_Limbu;}
|
||||
if (strcmp(src, "sr-ME") == 0) {return ULScript_Latin;}
|
||||
match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript);
|
||||
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa-bb
|
||||
|
||||
int len = strlen(src);
|
||||
if (len >= 16) {return ULScript_Latin;} // Real codes are shorter
|
||||
|
||||
char temp[16];
|
||||
int hyphen1_offset = hyphen1 - src;
|
||||
int len1 = len - hyphen1_offset - 1; // Exclude the hyphen
|
||||
// Take off part before hyphen1
|
||||
memcpy(temp, hyphen1 + 1, len1);
|
||||
temp[len1] = '\0';
|
||||
match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
|
||||
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // bb
|
||||
|
||||
// Take off part after hyphen1
|
||||
memcpy(temp, src, len);
|
||||
temp[hyphen1_offset] = '\0';
|
||||
match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
|
||||
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa
|
||||
|
||||
return ULScript_Latin;
|
||||
}
|
||||
|
||||
// aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en
|
||||
if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;}
|
||||
if (strcmp(src, "sr-ME-Latn") == 0) {return ULScript_Latin;}
|
||||
if (strcmp(src, "sr-ME-Cyrl") == 0) {return ULScript_Cyrillic;}
|
||||
match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript);
|
||||
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa-bb-cc
|
||||
|
||||
int len = strlen(src);
|
||||
if (len >= 16) {return ULScript_Latin;} // Real codes are shorter
|
||||
|
||||
char temp[16];
|
||||
int hyphen1_offset = hyphen1 - src;
|
||||
int hyphen2_offset = hyphen2 - src;
|
||||
int len2 = len - hyphen2_offset - 1; // Exclude the hyphen
|
||||
int lenmid = hyphen2_offset - hyphen1_offset - 1; // Exclude the hyphen
|
||||
// Keep part between hyphen1 and hyphen2
|
||||
memcpy(temp, hyphen1 + 1, lenmid);
|
||||
temp[lenmid] = '\0';
|
||||
match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
|
||||
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // bb
|
||||
|
||||
// Keep part after hyphen2
|
||||
memcpy(temp, hyphen2 + 1, len2);
|
||||
temp[len2] = '\0';
|
||||
match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
|
||||
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // cc
|
||||
|
||||
// Keep part before hyphen1
|
||||
memcpy(temp, src, len);
|
||||
temp[hyphen1_offset] = '\0';
|
||||
match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
|
||||
if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa
|
||||
|
||||
return ULScript_Latin;
|
||||
}
|
||||
|
||||
// Map script into Latin, Cyrillic, Arabic, Other
|
||||
int LScript4(ULScript ulscript) {
  // Bucket numbering: 0 = Latin, 1 = Cyrillic, 2 = Arabic, 3 = everything else.
  switch (ulscript) {
    case ULScript_Latin:
      return 0;
    case ULScript_Cyrillic:
      return 1;
    case ULScript_Arabic:
      return 2;
    default:
      return 3;
  }
}
|
||||
|
||||
} // namespace CLD2
|
||||
|
||||
180
internal/lang_script.h
Normal file
180
internal/lang_script.h
Normal file
@@ -0,0 +1,180 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// File: lang_script.h
|
||||
// ================
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
// This file declares language and script numbers and names for CLD2,
|
||||
// plus routines that access side tables based on these
|
||||
//
|
||||
|
||||
#ifndef I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__
|
||||
#define I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__
|
||||
|
||||
#include "generated_language.h"
|
||||
#include "generated_ulscript.h"
|
||||
#include "integral_types.h"
|
||||
|
||||
|
||||
// NOTE: The script numbers and language numbers here are not guaranteed to be
|
||||
// stable. If you want to record a result for posterity, save the
|
||||
// ULScriptCode(ULScript ulscript) result as character strings.
|
||||
//
|
||||
// The Unicode scripts recognized by CLD2 are numbered almost arbitrarily,
|
||||
// specified in an enum. Each script has human-readable script name and a
|
||||
// 4-letter ISO 15924 script code. Each has a C name (largely for use by
|
||||
// programs that generate declarations in cld2_generated_scripts.h). Each
|
||||
// also has a recognition type
|
||||
// r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK
|
||||
//
|
||||
// The declarations for a particular version of Unicode are machine-generated in
|
||||
// generated_scripts.h
|
||||
//
|
||||
// This file includes that one and declares the access routines. The type
|
||||
// involved is called "ULScript" to signify Unicode Letters-Marks Scripts,
|
||||
// which are not quite Unicode Scripts. In particular, the CJK scripts are
|
||||
// merged into a single number because CLD2 recognizes the CJK languages from
|
||||
// four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and
|
||||
// Katakana.
|
||||
|
||||
// Each script has one of these four recognition types.
|
||||
// RTypeNone: There is no language associated with this script. In extended
|
||||
// language recognition calls, return a fake language number that maps to
|
||||
// xx-Cham, with literally "xx" for the language code, and with the script
|
||||
// code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE.
|
||||
// RTypeOne: The script maps 1:1 to a single language. No letters are examined
|
||||
// during recognition and no lookups done.
|
||||
// RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring
|
||||
// is done to determine the languages involved.
|
||||
// RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the
|
||||
// languages involved.
|
||||
//
|
||||
// Note that the choice of recognition type is a function of script, not
|
||||
// language. In particular, some languages are recognized in multiple scripts
|
||||
// and those have different recognition types (Mongolian mn-Latn vs. mn-Mong
|
||||
// for example).
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Functions of ULScript //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// If the input is out of range or otherwise unrecognized, it is treated
|
||||
// as ULScript_Common (which never participates in language recognition)
|
||||
const char* ULScriptName(ULScript ulscript);
|
||||
const char* ULScriptCode(ULScript ulscript);
|
||||
const char* ULScriptDeclaredName(ULScript ulscript);
|
||||
ULScriptRType ULScriptRecognitionType(ULScript ulscript);
|
||||
|
||||
// Name can be either full name or ISO code, or can be ISO code embedded in
|
||||
// a language-script combination such as "en-Latn-GB"
|
||||
ULScript GetULScriptFromName(const char* src);
|
||||
|
||||
// Map script into Latin, Cyrillic, Arabic, Other
|
||||
int LScript4(ULScript ulscript);
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Functions of Language //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// The languages recognized by CLD2 are numbered almost arbitrarily,
|
||||
// specified in an enum. Each language has human-readable language name and a
|
||||
// 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by
|
||||
// programs that generate declarations in cld2_generated_languages.h).
|
||||
// Each has a list of up to four scripts in which it is currently recognized.
|
||||
//
|
||||
// The declarations for a particular set of recognized languages are
|
||||
// machine-generated in
|
||||
// generated_languages.h
|
||||
//
|
||||
// The Language enum is intended to match the internal Google Language enum
|
||||
// in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional
|
||||
// languages assigned above that. Over time, some languages may be renumbered
|
||||
// if they are moved into the Language enum.
|
||||
//
|
||||
// The Language enum includes the fake language numbers for RTypeNone above.
|
||||
//
|
||||
// In an open-source environment, the Google-specific Language enum is not
|
||||
// available. Language decouples the two environments while maintaining
|
||||
// internal compatibility.
|
||||
|
||||
|
||||
// If the input is out of range or otherwise unrecognized, it is treated
|
||||
// as UNKNOWN_LANGUAGE
|
||||
//
|
||||
// LanguageCode
|
||||
// ------------
|
||||
// Given the Language, return the language code, e.g. "ko"
|
||||
// This is determined by
|
||||
// the following (in order of preference):
|
||||
// - ISO-639-1 two-letter language code
|
||||
// (all except those mentioned below)
|
||||
// - ISO-639-2 three-letter bibliographic language code
|
||||
// (Tibetan, Dhivehi, Cherokee, Syriac)
|
||||
// - Google-specific language code
|
||||
// (ChineseT ("zh-TW"), Teragram Unknown, Unknown,
|
||||
// Portuguese-Portugal, Portuguese-Brazil, Limbu)
|
||||
// - Fake RTypeNone names.
|
||||
|
||||
const char* LanguageName(Language lang);
|
||||
const char* LanguageCode(Language lang);
|
||||
const char* LanguageShortCode(Language lang);
|
||||
const char* LanguageDeclaredName(Language lang);
|
||||
|
||||
// n is in 0..3. Trailing entries are filled with
|
||||
// ULScript_Common (which never participates in language recognition)
|
||||
ULScript LanguageRecognizedScript(Language lang, int n);
|
||||
|
||||
// Name can be either full name or ISO code, or can be ISO code embedded in
|
||||
// a language-script combination such as "en-Latn-GB"
|
||||
Language GetLanguageFromName(const char* src);
|
||||
|
||||
// Returns which set of statistically-close languages lang is in. 0 means none.
|
||||
int LanguageCloseSet(Language lang);
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Functions of ULScript and Language //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// Most common language in each script
|
||||
Language DefaultLanguage(ULScript ulscript);
|
||||
|
||||
// For RTypeMany recognition,
|
||||
// the CLD2 lookup tables are kept small by encoding a language into one byte.
|
||||
// To avoid limiting CLD2 to at most 256 languages, a larger range of external
|
||||
// Language numbers is mapped to a smaller range of per-script numbers. At
|
||||
// the moment (January 2013) the Latin script has about 90 languages to be
|
||||
// recognized, while all the other scripts total about 50 more languages. In
|
||||
// addition, the RTypeNone scripts map to about 100 fake languages.
|
||||
// So we map all Latin-script languages to one range of 1..255 per-script
|
||||
// numbers and map all the other RTypeMany languages to an overlapping range
|
||||
// 1..255 of per-script numbers.
|
||||
|
||||
uint8 PerScriptNumber(ULScript ulscript, Language lang);
|
||||
Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number);
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Other //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// Utility routine to search alphabetical tables
|
||||
int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair);
|
||||
|
||||
} // namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__
|
||||
40
internal/langspan.h
Normal file
40
internal/langspan.h
Normal file
@@ -0,0 +1,40 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
|
||||
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_LANGSPAN_H_
|
||||
#define I18N_ENCODINGS_CLD2_INTERNAL_LANGSPAN_H_
|
||||
|
||||
#include "generated_language.h"
|
||||
#include "generated_ulscript.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
typedef struct {
|
||||
char* text; // Pointer to the span, somewhere
|
||||
int text_bytes; // Number of bytes of text in the span
|
||||
int offset; // Offset of start of span in original input buffer
|
||||
ULScript ulscript; // Unicode Letters Script of this span
|
||||
Language lang; // Language identified for this span
|
||||
bool truncated; // true if buffer filled up before a
|
||||
// different script or EOF was found
|
||||
} LangSpan;
|
||||
|
||||
} // namespace CLD2
|
||||
#endif // I18N_ENCODINGS_CLD2_INTERNAL_LANGSPAN_H_
|
||||
|
||||
569
internal/offsetmap.cc
Normal file
569
internal/offsetmap.cc
Normal file
@@ -0,0 +1,569 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
//
|
||||
|
||||
#include "offsetmap.h"
|
||||
|
||||
#include <string.h> // for strcmp
|
||||
#include <stdio.h> // for fprintf, stderr, fclose, etc
|
||||
#include <algorithm> // for min
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Constructor, destructor
|
||||
OffsetMap::OffsetMap() {
  // A new map starts in the same state Clear() leaves behind.
  this->Clear();
}
|
||||
|
||||
OffsetMap::~OffsetMap() {
  // Nothing is released explicitly here.
}
|
||||
|
||||
// Clear the map
|
||||
// After:
|
||||
// next_diff_sub_ is 0
|
||||
// Windows are the a and a' ranges covered by diffs_[next_diff_sub_-1]
|
||||
// which is a fake range of width 0 mapping 0=>0
|
||||
void OffsetMap::Clear() {
  // Discard every recorded op and the op still being accumulated.
  diffs_.clear();
  pending_length_ = 0;
  pending_op_ = COPY_OP;

  // Rewind the read position and collapse both windows to the empty range
  // at offset 0.
  next_diff_sub_ = 0;
  current_diff_ = 0;
  current_lo_aoffset_ = 0;
  current_hi_aoffset_ = 0;
  current_lo_aprimeoffset_ = 0;
  current_hi_aprimeoffset_ = 0;

  // Largest offsets seen so far, in A and in A'.
  max_aoffset_ = 0;
  max_aprimeoffset_ = 0;
}
|
||||
|
||||
// Extracts the two-bit op field stored in the top two bits of an encoded byte.
static inline char OpPart(const char c) {
  const unsigned int bits = static_cast<unsigned char>(c);
  return static_cast<char>((bits >> 6) & 3u);
}
|
||||
// Extracts the six-bit length field stored in the low bits of an encoded byte.
static inline char LenPart(const char c) {
  const unsigned int bits = static_cast<unsigned char>(c);
  return static_cast<char>(bits & 0x3fu);
}
|
||||
|
||||
// Print map to file, for debugging
|
||||
void OffsetMap::Printmap(const char* filename) {
|
||||
FILE* fout;
|
||||
bool needs_close = false;
|
||||
if (strcmp(filename, "stdout") == 0) {
|
||||
fout = stdout;
|
||||
} else if (strcmp(filename, "stderr") == 0) {
|
||||
fout = stderr;
|
||||
} else {
|
||||
fout = fopen(filename, "w");
|
||||
needs_close = true;
|
||||
}
|
||||
if (fout == NULL) {
|
||||
fprintf(stderr, "%s did not open\n", filename);
|
||||
return;
|
||||
}
|
||||
|
||||
Flush(); // Make sure any pending entry gets printed
|
||||
fprintf(fout, "Offsetmap: %ld bytes\n", diffs_.size());
|
||||
for (int i = 0; i < diffs_.size(); ++i) {
|
||||
fprintf(fout, "%c%02d ", "&=+-"[OpPart(diffs_[i])], LenPart(diffs_[i]));
|
||||
if ((i % 20) == 19) {fprintf(fout, "\n");}
|
||||
}
|
||||
fprintf(fout, "\n");
|
||||
if (needs_close) {
|
||||
fclose(fout);
|
||||
}
|
||||
}
|
||||
|
||||
// Reset to offset 0
|
||||
void OffsetMap::Reset() {
  // Write out anything still pending before rewinding.
  MaybeFlushAll();

  // Back to the start: both windows empty at offset 0, reading from the first
  // encoded byte.
  current_diff_ = 0;
  current_lo_aprimeoffset_ = 0;
  current_hi_aprimeoffset_ = 0;
  current_lo_aoffset_ = 0;
  current_hi_aoffset_ = 0;
  next_diff_sub_ = 0;
}
|
||||
|
||||
// Add to mapping from A to A', specifying how many next bytes are
|
||||
// identical in A and A'
|
||||
void OffsetMap::Copy(int bytes) {
  if (bytes == 0) {return;}

  // A copy advances both A and A'.
  max_aoffset_ += bytes;            // Largest seen so far
  max_aprimeoffset_ += bytes;       // Largest seen so far

  // Extend a copy that is already pending.
  if (pending_op_ == COPY_OP) {
    pending_length_ += bytes;
    return;
  }

  // Otherwise write out the pending op and start a new copy.
  Flush();
  pending_op_ = COPY_OP;
  pending_length_ = bytes;
}
|
||||
|
||||
// Add to mapping from A to A', specifying how many next bytes are
|
||||
// inserted in A' while not advancing in A at all
|
||||
void OffsetMap::Insert(int bytes) {
  if (bytes == 0) {return;}

  // An insert advances A' only.
  max_aprimeoffset_ += bytes;       // Largest seen so far

  // Extend an insert that is already pending.
  if (pending_op_ == INSERT_OP) {
    pending_length_ += bytes;
    return;
  }

  // Special-case exactly delete(1) insert(1) => copy(1);
  // all others backmap inserts to after deletes
  const bool cancels_single_delete =
      (bytes == 1) && (pending_op_ == DELETE_OP) && (pending_length_ == 1);
  if (cancels_single_delete) {
    pending_op_ = COPY_OP;
    return;
  }

  // Otherwise write out the pending op and start a new insert.
  Flush();
  pending_op_ = INSERT_OP;
  pending_length_ = bytes;
}
|
||||
|
||||
// Add to mapping from A to A', specifying how many next bytes are
|
||||
// deleted from A while not advancing in A' at all
|
||||
void OffsetMap::Delete(int bytes) {
  if (bytes == 0) {return;}

  // A delete advances A only.
  max_aoffset_ += bytes;            // Largest seen so far

  // Extend a delete that is already pending.
  if (pending_op_ == DELETE_OP) {
    pending_length_ += bytes;
    return;
  }

  // Special-case exactly insert(1) delete(1) => copy(1);
  // all others backmap deletes to after inserts
  const bool cancels_single_insert =
      (bytes == 1) && (pending_op_ == INSERT_OP) && (pending_length_ == 1);
  if (cancels_single_insert) {
    pending_op_ = COPY_OP;
    return;
  }

  // Otherwise write out the pending op and start a new delete.
  Flush();
  pending_op_ = DELETE_OP;
  pending_length_ = bytes;
}
|
||||
|
||||
void OffsetMap::Flush() {
|
||||
if (pending_length_ == 0) {
|
||||
return;
|
||||
}
|
||||
// We may be emitting a copy op just after a copy op because +1 -1 cancelled
|
||||
// inbetween. If the lengths don't need a prefix byte, combine them
|
||||
if ((pending_op_ == COPY_OP) && !diffs_.empty()) {
|
||||
char c = diffs_[diffs_.size() - 1];
|
||||
MapOp prior_op = static_cast<MapOp>(OpPart(c));
|
||||
int prior_len = LenPart(c);
|
||||
if ((prior_op == COPY_OP) && ((prior_len + pending_length_) <= 0x3f)) {
|
||||
diffs_[diffs_.size() - 1] += pending_length_;
|
||||
pending_length_ = 0;
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (pending_length_ > 0x3f) {
|
||||
bool non_zero_emitted = false;
|
||||
for (int shift = 30; shift > 0; shift -= 6) {
|
||||
int prefix = (pending_length_ >> shift) & 0x3f;
|
||||
if ((prefix > 0) || non_zero_emitted) {
|
||||
Emit(PREFIX_OP, prefix);
|
||||
non_zero_emitted = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
Emit(pending_op_, pending_length_ & 0x3f);
|
||||
pending_length_ = 0;
|
||||
}
|
||||
|
||||
|
||||
// Add one more entry to copy one byte off the end, then flush
|
||||
void OffsetMap::FlushAll() {
  // Record a copy of one extra byte past the end, then write everything out.
  const int kTrailingCopyBytes = 1;
  Copy(kTrailingCopyBytes);
  Flush();
}
|
||||
|
||||
// Flush all if necessary
|
||||
void OffsetMap::MaybeFlushAll() {
|
||||
if ((0 < pending_length_) || diffs_.empty()) {
|
||||
FlushAll();
|
||||
}
|
||||
}
|
||||
|
||||
// Len may be 0, for example as the low piece of length=64
|
||||
// Appends one encoded byte to diffs_: op in the top two bits, the low six
// bits of len below it.
void OffsetMap::Emit(MapOp op, int len) {
  const int op_bits = static_cast<char>(op) << 6;
  const int len_bits = len & 0x3f;
  const char encoded = op_bits | len_bits;
  diffs_.push_back(encoded);
}
|
||||
|
||||
void OffsetMap::DumpString() {
|
||||
for (int i = 0; i < diffs_.size(); ++i) {
|
||||
fprintf(stderr, "%c%02d ", "&=+-"[OpPart(diffs_[i])], LenPart(diffs_[i]));
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
// Print running table of correspondences
|
||||
fprintf(stderr, " op A => A' (A forward-maps to A')\n");
|
||||
int aoffset = 0;
|
||||
int aprimeoffset = 0;
|
||||
int length = 0;
|
||||
for (int i = 0; i < diffs_.size(); ++i) {
|
||||
char c = diffs_[i];
|
||||
MapOp op = static_cast<MapOp>(OpPart(c));
|
||||
int len = LenPart(c);
|
||||
length = (length << 6) + len;
|
||||
if (op == COPY_OP) {
|
||||
aoffset += length;
|
||||
aprimeoffset += length;
|
||||
length = 0;
|
||||
} else if (op == INSERT_OP) {
|
||||
aoffset += 0;
|
||||
aprimeoffset += length;
|
||||
length = 0;
|
||||
} else if (op == DELETE_OP) {
|
||||
aoffset += length;
|
||||
aprimeoffset += 0;
|
||||
length = 0;
|
||||
} else { // (op == PREFIX_OP)
|
||||
// Do nothing else
|
||||
}
|
||||
fprintf(stderr, "[%3d] %c%02d %6d %6d%s\n",
|
||||
i, "&=+-"[op], len,
|
||||
aoffset, aprimeoffset,
|
||||
(next_diff_sub_ == i) ? " <==next_diff_sub_" : "");
|
||||
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
// Debugging aid; writes to stderr.
// Prints the mapping constants, the current A and A' windows, the current
// diff, and then the full DumpString() output.
void OffsetMap::DumpWindow() {
  fprintf(stderr, "DumpWindow(A => A'): max_aoffset_ = %d, "
                  "max_aprimeoffset_ = %d, next_diff_sub_ = %d<br>\n",
          max_aoffset_, max_aprimeoffset_, next_diff_sub_);
  // The window offsets are declared int, so they are printed with %d
  // (the previous %u did not match the argument type).
  fprintf(stderr, "A [%d..%d)\n",
          current_lo_aoffset_, current_hi_aoffset_);
  fprintf(stderr, "A' [%d..%d)\n",
          current_lo_aprimeoffset_, current_hi_aprimeoffset_);
  fprintf(stderr, " diff = %d\n", current_diff_);
  DumpString();
}
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// The guts of the 2013 design //
|
||||
// If there are three ranges a b c in diffs_, we can be in one of five //
|
||||
// states: LEFT of a, in ranges a b c, or RIGHT of c //
|
||||
// In each state, there are windows A[Alo..Ahi), A'[A'lo..A'hi) and diffs_ //
|
||||
// position next_diff_sub_ //
|
||||
// There also are mapping constants max_aoffset_ and max_aprimeoffset_ //
|
||||
// If LEFT, Alo=Ahi=0, A'lo=A'hi=0 and next_diff_sub_=0 //
|
||||
// If RIGHT, Alo=Ahi=max_aoffset_, A'lo=A'hi=max_aprimeoffset_ and //
|
||||
// next_diff_sub_=diffs_.size() //
|
||||
// Otherwise, at least one of A[) and A'[) is non-empty and the first bytes //
|
||||
// correspond to each other. If range i is active, next_diff_sub_ is at //
|
||||
// the first byte of range i+1. Because of the length-prefix operator, //
|
||||
// an individual range item in diffs_ may be multiple bytes //
|
||||
// In all cases aprimeoffset = aoffset + current_diff_ //
|
||||
// i.e. current_diff_ = aprimeoffset - aoffset //
|
||||
// //
|
||||
// In the degenerate case of diffs_.empty(), there are only two states //
|
||||
// LEFT and RIGHT and the mapping is the identity mapping. //
|
||||
// The initial state is LEFT. //
|
||||
// It is an error to move left into LEFT or right into RIGHT, but the code //
|
||||
// below is robust in these cases. //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// Put the window into the LEFT state: both windows empty at offset 0,
// zero diff, and the parse position at the start of diffs_.
void OffsetMap::SetLeft() {
  current_lo_aoffset_ = current_hi_aoffset_ = 0;
  current_lo_aprimeoffset_ = current_hi_aprimeoffset_ = 0;
  current_diff_ = 0;
  next_diff_sub_ = 0;
}
|
||||
|
||||
// Put the window into the RIGHT state: both windows empty at the maximum
// offsets, with the diff being the difference of the two maxima.
void OffsetMap::SetRight() {
  current_lo_aoffset_ = current_hi_aoffset_ = max_aoffset_;
  current_lo_aprimeoffset_ = current_hi_aprimeoffset_ = max_aprimeoffset_;
  current_diff_ = max_aprimeoffset_ - max_aoffset_;
  // NOTE(review): this resets the parse position to 0, whereas the design
  // notes for this file describe RIGHT as next_diff_sub_ == diffs_.size().
  // Left unchanged here; confirm which is intended before relying on
  // MoveLeft()/MoveRight() out of the RIGHT state.
  next_diff_sub_ = 0;
}
|
||||
|
||||
// Back up over previous range, 1..5 bytes
|
||||
// Return subscript at the beginning of that. Pins at 0
|
||||
int OffsetMap::Backup(int sub) {
|
||||
if (sub <= 0) {return 0;}
|
||||
--sub;
|
||||
while ((0 < sub) &&
|
||||
(static_cast<MapOp>(OpPart(diffs_[sub - 1]) == PREFIX_OP))) {
|
||||
--sub;
|
||||
}
|
||||
return sub;
|
||||
}
|
||||
|
||||
// Parse next range, 1..5 bytes
|
||||
// Return subscript just off the end of that
|
||||
int OffsetMap::ParseNext(int sub, MapOp* op, int* length) {
|
||||
*op = PREFIX_OP;
|
||||
*length = 0;
|
||||
char c;
|
||||
while ((sub < diffs_.size()) && (*op == PREFIX_OP)) {
|
||||
c = diffs_[sub++];
|
||||
*op = static_cast<MapOp>(OpPart(c));
|
||||
int len = LenPart(c);
|
||||
*length = (*length << 6) + len;
|
||||
}
|
||||
// If mal-formed or in RIGHT, this will return with op = PREFIX_OP
|
||||
// Mal-formed can include a trailing prefix byte with no following op
|
||||
return sub;
|
||||
}
|
||||
|
||||
// Parse previous range, 1..5 bytes
|
||||
// Return current subscript
|
||||
int OffsetMap::ParsePrevious(int sub, MapOp* op, int* length) {
|
||||
sub = Backup(sub);
|
||||
return ParseNext(sub, op, length);
|
||||
}
|
||||
|
||||
// Quick debugging dump; does not parse multi-byte items, so just length & 0x3f
|
||||
void OffsetMap::PrintPosition(const char* str) {
|
||||
MapOp op = PREFIX_OP;
|
||||
int length = 0;
|
||||
if ((0 < next_diff_sub_) && (next_diff_sub_ <= diffs_.size())) {
|
||||
op = static_cast<MapOp>(OpPart(diffs_[next_diff_sub_ - 1]));
|
||||
length = LenPart(diffs_[next_diff_sub_ - 1]);
|
||||
}
|
||||
fprintf(stderr, "%s[%d] %c%02d = A[%d..%d) ==> A'[%d..%d)\n",
|
||||
str,
|
||||
next_diff_sub_, "&=+-"[op], length,
|
||||
current_lo_aoffset_, current_hi_aoffset_,
|
||||
current_lo_aprimeoffset_, current_hi_aprimeoffset_);
|
||||
}
|
||||
|
||||
// Move active window one range to the right
|
||||
// Return true if move was OK
|
||||
bool OffsetMap::MoveRight() {
|
||||
// If at last range or RIGHT, set to RIGHT, return error
|
||||
if (next_diff_sub_ >= diffs_.size()) {
|
||||
SetRight();
|
||||
return false;
|
||||
}
|
||||
// Actually OK to move right
|
||||
MapOp op;
|
||||
int length;
|
||||
bool retval = true;
|
||||
// If mal-formed or in RIGHT, this will return with op = PREFIX_OP
|
||||
next_diff_sub_ = ParseNext(next_diff_sub_, &op, &length);
|
||||
|
||||
current_lo_aoffset_ = current_hi_aoffset_;
|
||||
current_lo_aprimeoffset_ = current_hi_aprimeoffset_;
|
||||
if (op == COPY_OP) {
|
||||
current_hi_aoffset_ = current_lo_aoffset_ + length;
|
||||
current_hi_aprimeoffset_ = current_lo_aprimeoffset_ + length;
|
||||
} else if (op == INSERT_OP) {
|
||||
current_hi_aoffset_ = current_lo_aoffset_ + 0;
|
||||
current_hi_aprimeoffset_ = current_lo_aprimeoffset_ + length;
|
||||
} else if (op == DELETE_OP) {
|
||||
current_hi_aoffset_ = current_lo_aoffset_ + length;
|
||||
current_hi_aprimeoffset_ = current_lo_aprimeoffset_ + 0;
|
||||
} else {
|
||||
SetRight();
|
||||
retval = false;
|
||||
}
|
||||
current_diff_ = current_lo_aprimeoffset_ - current_lo_aoffset_;
|
||||
return retval;
|
||||
}
|
||||
|
||||
// Move active window one range to the left
|
||||
// Return true if move was OK
|
||||
bool OffsetMap::MoveLeft() {
|
||||
// If at first range or LEFT, set to LEFT, return error
|
||||
if (next_diff_sub_ <= 0) {
|
||||
SetLeft();
|
||||
return false;
|
||||
}
|
||||
// Back up over current active window
|
||||
next_diff_sub_ = Backup(next_diff_sub_);
|
||||
if (next_diff_sub_ <= 0) {
|
||||
SetLeft();
|
||||
return false;
|
||||
}
|
||||
// Actually OK to move left
|
||||
MapOp op;
|
||||
int length;
|
||||
bool retval = true;
|
||||
// If mal-formed or in LEFT, this will return with op = PREFIX_OP
|
||||
next_diff_sub_ = ParsePrevious(next_diff_sub_, &op, &length);
|
||||
|
||||
current_hi_aoffset_ = current_lo_aoffset_;
|
||||
current_hi_aprimeoffset_ = current_lo_aprimeoffset_;
|
||||
if (op == COPY_OP) {
|
||||
current_lo_aoffset_ = current_hi_aoffset_ - length;
|
||||
current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - length;
|
||||
} else if (op == INSERT_OP) {
|
||||
current_lo_aoffset_ = current_hi_aoffset_ - 0;
|
||||
current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - length;
|
||||
} else if (op == DELETE_OP) {
|
||||
current_lo_aoffset_ = current_hi_aoffset_ - length;
|
||||
current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - 0;
|
||||
} else {
|
||||
SetLeft();
|
||||
retval = false;
|
||||
}
|
||||
current_diff_ = current_lo_aprimeoffset_ - current_lo_aoffset_;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Map an offset in A' to the corresponding offset in A
|
||||
int OffsetMap::MapBack(int aprimeoffset){
|
||||
MaybeFlushAll();
|
||||
if (aprimeoffset < 0) {return 0;}
|
||||
if (max_aprimeoffset_ <= aprimeoffset) {
|
||||
return (aprimeoffset - max_aprimeoffset_) + max_aoffset_;
|
||||
}
|
||||
|
||||
// If current_lo_aprimeoffset_ <= aprimeoffset < current_hi_aprimeoffset_,
|
||||
// use current mapping, else move window left/right
|
||||
bool ok = true;
|
||||
while (ok && (aprimeoffset < current_lo_aprimeoffset_)) {
|
||||
ok = MoveLeft();
|
||||
}
|
||||
while (ok && (current_hi_aprimeoffset_ <= aprimeoffset)) {
|
||||
ok = MoveRight();
|
||||
}
|
||||
// So now current_lo_aprimeoffset_ <= aprimeoffset < current_hi_aprimeoffset_
|
||||
|
||||
int aoffset = aprimeoffset - current_diff_;
|
||||
if (aoffset >= current_hi_aoffset_) {
|
||||
// A' is in an insert region, all bytes of which backmap to A=hi_aoffset_
|
||||
aoffset = current_hi_aoffset_;
|
||||
}
|
||||
return aoffset;
|
||||
}
|
||||
|
||||
// Map an offset in A to the corresponding offset in A'
|
||||
int OffsetMap::MapForward(int aoffset){
|
||||
MaybeFlushAll();
|
||||
if (aoffset < 0) {return 0;}
|
||||
if (max_aoffset_ <= aoffset) {
|
||||
return (aoffset - max_aoffset_) + max_aprimeoffset_;
|
||||
}
|
||||
|
||||
// If current_lo_aoffset_ <= aoffset < current_hi_aoffset_,
|
||||
// use current mapping, else move window left/right
|
||||
bool ok = true;
|
||||
while (ok && (aoffset < current_lo_aoffset_)) {
|
||||
ok = MoveLeft();
|
||||
}
|
||||
while (ok && (current_hi_aoffset_ <= aoffset)) {
|
||||
ok = MoveRight();
|
||||
}
|
||||
|
||||
int aprimeoffset = aoffset + current_diff_;
|
||||
if (aprimeoffset >= current_hi_aprimeoffset_) {
|
||||
// A is in a delete region, all bytes of which map to A'=hi_aprimeoffset_
|
||||
aprimeoffset = current_hi_aprimeoffset_;
|
||||
}
|
||||
return aprimeoffset;
|
||||
}
|
||||
|
||||
|
||||
// static
|
||||
bool OffsetMap::CopyInserts(OffsetMap* source, OffsetMap* dest) {
|
||||
bool ok = true;
|
||||
while (ok && (source->next_diff_sub_ != source->diffs_.size())) {
|
||||
ok = source->MoveRight();
|
||||
if (source->current_lo_aoffset_ != source->current_hi_aoffset_) {
|
||||
return false;
|
||||
}
|
||||
dest->Insert(
|
||||
source->current_hi_aprimeoffset_ - source->current_lo_aprimeoffset_);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// static
|
||||
bool OffsetMap::CopyDeletes(OffsetMap* source, OffsetMap* dest) {
|
||||
bool ok = true;
|
||||
while (ok && (source->next_diff_sub_ != source->diffs_.size())) {
|
||||
ok = source->MoveRight();
|
||||
if (source->current_lo_aprimeoffset_ != source->current_hi_aprimeoffset_) {
|
||||
return false;
|
||||
}
|
||||
dest->Delete(source->current_hi_aoffset_ - source->current_lo_aoffset_);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// static
|
||||
void OffsetMap::ComposeOffsetMap(
|
||||
OffsetMap* g, OffsetMap* f, OffsetMap* h) {
|
||||
h->Clear();
|
||||
f->Reset();
|
||||
g->Reset();
|
||||
|
||||
int lo = 0;
|
||||
for (;;) {
|
||||
// Consume delete operations in f. This moves A without moving
|
||||
// A' and A''.
|
||||
if (lo >= g->current_hi_aoffset_ && CopyInserts(g, h)) {
|
||||
if (lo >= f->current_hi_aprimeoffset_ && CopyDeletes(f, h)) {
|
||||
// fprintf(stderr,
|
||||
// "ComposeOffsetMap ERROR, f is longer than g.<br>\n");
|
||||
}
|
||||
|
||||
// FlushAll(), called by Reset(), MapForward() or MapBack(), has
|
||||
// added an extra COPY_OP to f and g, so this function has
|
||||
// composed an extra COPY_OP in h from those. To avoid
|
||||
// FlushAll() adds one more extra COPY_OP to h later, dispatch
|
||||
// Flush() right now.
|
||||
h->Flush();
|
||||
return;
|
||||
}
|
||||
|
||||
// Consume insert operations in g. This moves A'' without moving A
|
||||
// and A'.
|
||||
if (lo >= f->current_hi_aprimeoffset_) {
|
||||
if (!CopyDeletes(f, h)) {
|
||||
// fprintf(stderr,
|
||||
// "ComposeOffsetMap ERROR, g is longer than f.<br>\n");
|
||||
}
|
||||
}
|
||||
|
||||
// Compose one operation which moves A' from lo to hi.
|
||||
int hi = min(f->current_hi_aprimeoffset_, g->current_hi_aoffset_);
|
||||
if (f->current_lo_aoffset_ != f->current_hi_aoffset_ &&
|
||||
g->current_lo_aprimeoffset_ != g->current_hi_aprimeoffset_) {
|
||||
h->Copy(hi - lo);
|
||||
} else if (f->current_lo_aoffset_ != f->current_hi_aoffset_) {
|
||||
h->Delete(hi - lo);
|
||||
} else if (g->current_lo_aprimeoffset_ != g->current_hi_aprimeoffset_) {
|
||||
h->Insert(hi - lo);
|
||||
}
|
||||
|
||||
lo = hi;
|
||||
}
|
||||
}
|
||||
|
||||
// For testing only -- force a mapping
|
||||
void OffsetMap::StuffIt(const string& diffs,
|
||||
int max_aoffset, int max_aprimeoffset) {
|
||||
Clear();
|
||||
diffs_ = diffs;
|
||||
max_aoffset_ = max_aoffset;
|
||||
max_aprimeoffset_ = max_aprimeoffset;
|
||||
}
|
||||
|
||||
|
||||
} // namespace CLD2
|
||||
|
||||
175
internal/offsetmap.h
Normal file
175
internal/offsetmap.h
Normal file
@@ -0,0 +1,175 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
#ifndef UTIL_UTF8_OFFSETMAP_H_
|
||||
#define UTIL_UTF8_OFFSETMAP_H_
|
||||
|
||||
#include <string> // for string
|
||||
#include "integral_types.h" // for uint32
|
||||
|
||||
// ***************************** OffsetMap **************************
|
||||
//
|
||||
// An OffsetMap object is a container for a mapping from offsets in one text
|
||||
// buffer A' to offsets in another text buffer A. It is most useful when A' is
|
||||
// built from A via substitutions that occasionally do not preserve byte length.
|
||||
//
|
||||
// A series of operators are used to build the correspondence map, then
|
||||
// calls can be made to map an offset in A' to an offset in A, or vice versa.
|
||||
// The map starts with offset 0 in A corresponding to offset 0 in A'.
|
||||
// The mapping is then built sequentially, adding on byte ranges that are
|
||||
// identical in A and A', byte ranges that are inserted in A', and byte ranges
|
||||
// that are deleted from A. All bytes beyond those specified when building the
|
||||
// map are assumed to correspond, i.e. a Copy(infinity) is assumed at the
|
||||
// end of the map.
|
||||
//
|
||||
// The internal data structure records positions at which bytes are added or
|
||||
// deleted. Using the map is O(1) when increasing the A' or A offset
|
||||
// monotonically, and O(n) when accessing random offsets, where n is the
|
||||
// number of differences.
|
||||
//
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
class OffsetMap {
|
||||
public:
|
||||
// Constructor, destructor
|
||||
OffsetMap();
|
||||
~OffsetMap();
|
||||
|
||||
// Clear the map
|
||||
void Clear();
|
||||
|
||||
// Add to mapping from A to A', specifying how many next bytes correspond
|
||||
// in A and A'
|
||||
void Copy(int bytes);
|
||||
|
||||
// Add to mapping from A to A', specifying how many next bytes are
|
||||
// inserted in A' while not advancing in A at all
|
||||
void Insert(int bytes);
|
||||
|
||||
// Add to mapping from A to A', specifying how many next bytes are
|
||||
// deleted from A while not advancing in A' at all
|
||||
void Delete(int bytes);
|
||||
|
||||
// Print map to file, for debugging
|
||||
void Printmap(const char* filename);
|
||||
|
||||
// [Finish building map,] Re-position to offset 0
|
||||
// This call is optional; MapForward and MapBack finish building the map
|
||||
// if necessary
|
||||
void Reset();
|
||||
|
||||
// Map an offset in A' to the corresponding offset in A
|
||||
int MapBack(int aprimeoffset);
|
||||
|
||||
// Map an offset in A to the corresponding offset in A'
|
||||
int MapForward(int aoffset);
|
||||
|
||||
// h = ComposeOffsetMap(g, f), where f is a map from A to A', g is
|
||||
// from A' to A'' and h is from A to A''.
|
||||
//
|
||||
// Note that g->MoveForward(f->MoveForward(aoffset)) always equals
|
||||
// to h->MoveForward(aoffset), while
|
||||
// f->MoveBack(g->MoveBack(aprimeprimeoffset)) doesn't always equals
|
||||
// to h->MoveBack(aprimeprimeoffset). This happens when deletion in
|
||||
// f and insertion in g are at the same place. For example,
|
||||
//
|
||||
// A 1 2 3 4
|
||||
// ^ | ^ ^
|
||||
// | | / | f
|
||||
// v vv v
|
||||
// A' 1' 2' 3'
|
||||
// ^ ^^ ^
|
||||
// | | \ | g
|
||||
// v | v v
|
||||
// A'' 1'' 2'' 3'' 4''
|
||||
//
|
||||
// results in:
|
||||
//
|
||||
// A 1 2 3 4
|
||||
// ^ ^\ ^ ^
|
||||
// | | \ | | h
|
||||
// v | vv v
|
||||
// A'' 1'' 2'' 3'' 4''
|
||||
//
|
||||
// 2'' is mapped 3 in the former figure, while 2'' is mapped to 2 in
|
||||
// the latter figure.
|
||||
static void ComposeOffsetMap(OffsetMap* g, OffsetMap* f, OffsetMap* h);
|
||||
|
||||
// For debugging only; writes to stderr
|
||||
void DumpWindow();
|
||||
|
||||
// For testing only -- force a mapping
|
||||
void StuffIt(const std::string& diffs, int max_aoffset, int max_aprimeoffset);
|
||||
|
||||
private:
|
||||
enum MapOp {PREFIX_OP, COPY_OP, INSERT_OP, DELETE_OP};
|
||||
|
||||
void Flush();
|
||||
void FlushAll();
|
||||
void MaybeFlushAll();
|
||||
void Emit(MapOp op, int len);
|
||||
|
||||
void SetLeft();
|
||||
void SetRight();
|
||||
|
||||
// Back up over previous range, 1..5 bytes
|
||||
// Return subscript at the beginning of that. Pins at 0
|
||||
int Backup(int sub);
|
||||
|
||||
// Parse next range, 1..5 bytes
|
||||
// Return subscript just off the end of that
|
||||
int ParseNext(int sub, MapOp* op, int* length);
|
||||
|
||||
// Parse previous range, 1..5 bytes
|
||||
// Return current subscript
|
||||
int ParsePrevious(int sub, MapOp* op, int* length);
|
||||
|
||||
void PrintPosition(const char* str);
|
||||
|
||||
bool MoveRight(); // Returns true if OK
|
||||
bool MoveLeft(); // Returns true if OK
|
||||
void DumpString();
|
||||
|
||||
// Copies insert operations from source to dest. Returns true if no
|
||||
// other operations are found.
|
||||
static bool CopyInserts(OffsetMap* source, OffsetMap* dest);
|
||||
|
||||
// Copies delete operations from source to dest. Returns true if no other
|
||||
// operations are found.
|
||||
static bool CopyDeletes(OffsetMap* source, OffsetMap* dest);
|
||||
|
||||
std::string diffs_;
|
||||
MapOp pending_op_;
|
||||
uint32 pending_length_;
|
||||
|
||||
// Offsets in the ranges below correspond to each other, with A' = A + diff
|
||||
int next_diff_sub_;
|
||||
int current_lo_aoffset_;
|
||||
int current_hi_aoffset_;
|
||||
int current_lo_aprimeoffset_;
|
||||
int current_hi_aprimeoffset_;
|
||||
int current_diff_;
|
||||
int max_aoffset_;
|
||||
int max_aprimeoffset_;
|
||||
};
|
||||
|
||||
} // namespace CLD2
|
||||
|
||||
#endif // UTIL_UTF8_OFFSETMAP_H_
|
||||
|
||||
129
internal/port.h
Normal file
129
internal/port.h
Normal file
@@ -0,0 +1,129 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// These are weird things we need to do to get this compiling on
|
||||
// random systems [subset].
|
||||
|
||||
#ifndef BASE_PORT_H_
|
||||
#define BASE_PORT_H_
|
||||
|
||||
#include <string.h> // for memcpy()
|
||||
#include "integral_types.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Portable handling of unaligned loads, stores, and copies.
|
||||
// On some platforms, like ARM, the copy functions can be more efficient
|
||||
// then a load and a store.
|
||||
|
||||
#if defined(ARCH_PIII) || defined(ARCH_ATHLON) || defined(ARCH_K8) || defined(_ARCH_PPC)
|
||||
|
||||
// x86 and x86-64 can perform unaligned loads/stores directly;
|
||||
// modern PowerPC hardware can also do unaligned integer loads and stores;
|
||||
// but note: the FPU still sends unaligned loads and stores to a trap handler!
|
||||
|
||||
#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
|
||||
#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
|
||||
#define UNALIGNED_LOAD64(_p) (*reinterpret_cast<const uint64 *>(_p))
|
||||
|
||||
#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
|
||||
#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
|
||||
#define UNALIGNED_STORE64(_p, _val) (*reinterpret_cast<uint64 *>(_p) = (_val))
|
||||
|
||||
#elif defined(__arm__) && \
|
||||
!defined(__ARM_ARCH_5__) && \
|
||||
!defined(__ARM_ARCH_5T__) && \
|
||||
!defined(__ARM_ARCH_5TE__) && \
|
||||
!defined(__ARM_ARCH_5TEJ__) && \
|
||||
!defined(__ARM_ARCH_6__) && \
|
||||
!defined(__ARM_ARCH_6J__) && \
|
||||
!defined(__ARM_ARCH_6K__) && \
|
||||
!defined(__ARM_ARCH_6Z__) && \
|
||||
!defined(__ARM_ARCH_6ZK__) && \
|
||||
!defined(__ARM_ARCH_6T2__)
|
||||
|
||||
// ARMv7 and newer support native unaligned accesses, but only of 16-bit
|
||||
// and 32-bit values (not 64-bit); older versions either raise a fatal signal,
|
||||
// do an unaligned read and rotate the words around a bit, or do the reads very
|
||||
// slowly (trip through kernel mode). There's no simple #define that says just
|
||||
// “ARMv7 or higher”, so we have to filter away all ARMv5 and ARMv6
|
||||
// sub-architectures. Newer gcc (>= 4.6) set an __ARM_FEATURE_ALIGNED #define,
|
||||
// so in time, maybe we can move on to that.
|
||||
//
|
||||
// This is a mess, but there's not much we can do about it.
|
||||
|
||||
#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
|
||||
#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
|
||||
|
||||
#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
|
||||
#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
|
||||
|
||||
// TODO(sesse): NEON supports unaligned 64-bit loads and stores.
|
||||
// See if that would be more efficient on platforms supporting it,
|
||||
// at least for copies.
|
||||
|
||||
inline uint64 UNALIGNED_LOAD64(const void *p) {
|
||||
uint64 t;
|
||||
memcpy(&t, p, sizeof t);
|
||||
return t;
|
||||
}
|
||||
|
||||
inline void UNALIGNED_STORE64(void *p, uint64 v) {
|
||||
memcpy(p, &v, sizeof v);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define NEED_ALIGNED_LOADS
|
||||
|
||||
// These functions are provided for architectures that don't support
|
||||
// unaligned loads and stores.
|
||||
|
||||
inline uint16 UNALIGNED_LOAD16(const void *p) {
|
||||
uint16 t;
|
||||
memcpy(&t, p, sizeof t);
|
||||
return t;
|
||||
}
|
||||
|
||||
inline uint32 UNALIGNED_LOAD32(const void *p) {
|
||||
uint32 t;
|
||||
memcpy(&t, p, sizeof t);
|
||||
return t;
|
||||
}
|
||||
|
||||
inline uint64 UNALIGNED_LOAD64(const void *p) {
|
||||
uint64 t;
|
||||
memcpy(&t, p, sizeof t);
|
||||
return t;
|
||||
}
|
||||
|
||||
inline void UNALIGNED_STORE16(void *p, uint16 v) {
|
||||
memcpy(p, &v, sizeof v);
|
||||
}
|
||||
|
||||
inline void UNALIGNED_STORE32(void *p, uint32 v) {
|
||||
memcpy(p, &v, sizeof v);
|
||||
}
|
||||
|
||||
inline void UNALIGNED_STORE64(void *p, uint64 v) {
|
||||
memcpy(p, &v, sizeof v);
|
||||
}
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#endif // BASE_PORT_H_
|
||||
1299
internal/scoreonescriptspan.cc
Normal file
1299
internal/scoreonescriptspan.cc
Normal file
File diff suppressed because it is too large
Load Diff
290
internal/scoreonescriptspan.h
Normal file
290
internal/scoreonescriptspan.h
Normal file
@@ -0,0 +1,290 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
//
|
||||
// Terminology:
|
||||
// Incoming original text has HTML tags and entities removed, all but letters
|
||||
// removed, and letters lowercased. Strings of non-letters are mapped to a
|
||||
// single ASCII space.
|
||||
//
|
||||
// One scriptspan has a run of letters/spaces in a single script. This is the
|
||||
// fundamental text unit that is scored. There is an optional backmap from
|
||||
// scriptspan text to the original document text, so that the language ranges
|
||||
// reported in ResultChunkVector refer to byte ranges in the original text.
|
||||
//
|
||||
// Scripts come in two forms, the full Unicode scripts described by
|
||||
// http://www.unicode.org/Public/UNIDATA/Scripts.txt
|
||||
// and a modified list used exclusively in CLD2. The modified form maps all
|
||||
// the CJK scripts to one, Hani. The current version description is in
|
||||
// i18n/encodings/cld2/builddata/script_summary.txt
|
||||
// In addition, all non-letters are mapped to the Common script.
|
||||
//
|
||||
// ULScript describes this Unicode Letter script.
|
||||
//
|
||||
// Scoring uses text nil-grams, uni-grams, bi-grams, quad-grams, and octa-grams.
|
||||
// Nilgrams (no text lookup at all) are for script-based pseudo-languages and
|
||||
// for languages that are 1:1 with a given script. Unigrams and bigrams are
|
||||
// used to score the CJK languages, all in the Hani script. Quadgrams and
|
||||
// octagrams are used to score all other languages.
|
||||
//
|
||||
// RType is the Recognition Type per ulscript.
|
||||
//
|
||||
// The scoring tables map various grams to language-probability scores.
|
||||
// A given gram that hits in scoring table maps to an indirect subscript into
|
||||
// a list of packed languages and log probabilities.
|
||||
//
|
||||
// Languages are stored in two forms: 10-bit values in the Language enum, and
|
||||
// shorter 8-bit per-ulscript values in the scoring tables.
|
||||
//
|
||||
// Language refers to the full 10-bit range.
|
||||
// pslang refers to the per-ulscript shorter values.
|
||||
//
|
||||
// Log probabilities also come in two forms. The full range uses values 0..255
|
||||
// to represent minus log base 10th-root-of-2, covering 1 .. 1/2**25.5 or about
|
||||
// TODO BOGUS description, 24 vs 12
|
||||
// 1/47.5M. The second form quantizes these into multiples of 8 that can be
|
||||
// added together to represent probability products. The quantized form uses
|
||||
// values 24..0 with 0 now least likely instead of most likely, thus making
|
||||
// larger sums for more probable results. 24 maps to original 1/2**4.8 (~1/28)
|
||||
// and 0 maps to original 1/2**24.0 (~1/16M).
|
||||
//
|
||||
// qprob refers to quantized log probabilities.
|
||||
//
|
||||
// langprob is a uint32 holding three 1-byte pslangs and a 1-byte subscript to
|
||||
// a list of three qprobs. It always needs a companion ulscript
|
||||
//
|
||||
// A scriptspan is scored via one or more hitbuffers
|
||||
|
||||
|
||||
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__
|
||||
#define I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "integral_types.h" // for uint8 etc.
|
||||
|
||||
#include "cld2tablesummary.h"
|
||||
#include "compact_lang_det_impl.h" // for ResultChunkVector
|
||||
#include "getonescriptspan.h"
|
||||
#include "langspan.h"
|
||||
#include "tote.h"
|
||||
#include "utf8statetable.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Number of langprob slots in each LangBoosts (see PerScriptLangBoosts).
// Must be a power of two for wrap().
static const int kMaxBoosts = 4;

static const int kChunksizeQuads = 20;      // For non-CJK
static const int kChunksizeUnis = 50;       // For CJK

// Capacity of the hit arrays in a hit buffer
static const int kMaxScoringHits = 1000;

// One summary per chunk of kChunksizeQuads hits
static const int kMaxSummaries = kMaxScoringHits / kChunksizeQuads;
|
||||
|
||||
|
||||
// The first four tables are for CJK languages,
|
||||
// the next three for quadgram languages, and
|
||||
// the last for expected scores.
|
||||
typedef struct {
|
||||
const UTF8PropObj* unigram_obj; // 80K CJK characters
|
||||
const CLD2TableSummary* unigram_compat_obj; // 256 CJK lookup probabilities
|
||||
const CLD2TableSummary* deltabi_obj;
|
||||
const CLD2TableSummary* distinctbi_obj;
|
||||
|
||||
const CLD2TableSummary* quadgram_obj;
|
||||
const CLD2TableSummary* deltaocta_obj;
|
||||
const CLD2TableSummary* distinctocta_obj;
|
||||
|
||||
const short* kExpectedScore; // Expected base + delta + distinct score
|
||||
// per 1KB input
|
||||
// Subscripted by language and script4
|
||||
} ScoringTables;
|
||||
|
||||
// Context for boosting several languages
|
||||
typedef struct {
|
||||
int32 n;
|
||||
uint32 langprob[kMaxBoosts];
|
||||
int wrap(int32 n) {return n & (kMaxBoosts - 1);}
|
||||
} LangBoosts;
|
||||
|
||||
typedef struct {
|
||||
LangBoosts latn;
|
||||
LangBoosts othr;
|
||||
} PerScriptLangBoosts;
|
||||
|
||||
|
||||
|
||||
// ScoringContext carries state across scriptspans
|
||||
// ScoringContext also has read-only scoring tables mapping grams to qprobs
|
||||
typedef struct {
|
||||
FILE* debug_file; // Non-NULL if debug output wanted
|
||||
bool flags_cld2_score_as_quads;
|
||||
bool flags_cld2_html;
|
||||
bool flags_cld2_cr;
|
||||
bool flags_cld2_verbose;
|
||||
ULScript ulscript; // langprobs below are with respect to this script
|
||||
Language prior_chunk_lang; // Mostly for debug output
|
||||
PerScriptLangBoosts langprior_boost; // From http content-lang or meta lang=
|
||||
PerScriptLangBoosts distinct_boost; // From distinctive letter groups
|
||||
int oldest_distinct_boost; // Subscript in hitbuffer of oldest
|
||||
// distinct score to use
|
||||
const ScoringTables* scoringtables; // Probability lookup tables
|
||||
ScriptScanner* scanner; // For ResultChunkVector backmap
|
||||
|
||||
// Inits boosts
|
||||
void init() {
|
||||
memset(&langprior_boost, 0, sizeof(langprior_boost));
|
||||
memset(&distinct_boost, 0, sizeof(distinct_boost));
|
||||
};
|
||||
} ScoringContext;
|
||||
|
||||
|
||||
|
||||
// Begin private
|
||||
|
||||
// One hit from a scoring-table lookup. The indirect subscript is stored
// rather than the langprob itself so that a single hit can use a variable
// number of langprobs.
typedef struct {
  int offset;       // First byte of quad/octa etc. in scriptspan
  int indirect;     // Subscript of langprobs in scoring table
} ScoringHit;
|
||||
|
||||
// Kind of a hit; stored in LangprobHit::type. Values run 0..3.
typedef enum {
  UNIHIT = 0,
  QUADHIT,          // 1
  DELTAHIT,         // 2
  DISTINCTHIT       // 3
} LinearHitType;
|
||||
|
||||
// Holds one scoring-table lookup hit resolved into a langprob.
|
||||
typedef struct {
|
||||
uint16 offset; // First byte of quad/octa etc. in scriptspan
|
||||
uint16 type; // LinearHitType
|
||||
uint32 langprob; // langprob from scoring table
|
||||
} LangprobHit;
|
||||
|
||||
// Holds arrays of scoring-table lookup hits for (part of) a scriptspan
|
||||
typedef struct {
|
||||
ULScript ulscript; // langprobs below are with respect to this script
|
||||
int maxscoringhits; // determines size of arrays below
|
||||
int next_base; // First unused entry in each array
|
||||
int next_delta; // "
|
||||
int next_distinct; // "
|
||||
int next_linear; // "
|
||||
int next_chunk_start; // First unused chunk_start entry
|
||||
int lowest_offset; // First byte of text span used to fill hitbuffer
|
||||
// Dummy entry at the end of each giving offset of first unused text byte
|
||||
ScoringHit base[kMaxScoringHits + 1]; // Uni/quad hits
|
||||
ScoringHit delta[kMaxScoringHits + 1]; // delta-bi/delta-octa hits
|
||||
ScoringHit distinct[kMaxScoringHits + 1]; // distinct-word hits
|
||||
LangprobHit linear[4 * kMaxScoringHits + 1]; // Above three merge-sorted
|
||||
// (4: some bases => 2 linear)
|
||||
int chunk_start[kMaxSummaries + 1]; // First linear[] subscr of
|
||||
// each scored chunk
|
||||
int chunk_offset[kMaxSummaries + 1]; // First text subscr of
|
||||
// each scored chunk
|
||||
|
||||
void init() {
|
||||
ulscript = ULScript_Common;
|
||||
maxscoringhits = kMaxScoringHits;
|
||||
next_base = 0;
|
||||
next_delta = 0;
|
||||
next_distinct = 0;
|
||||
next_linear = 0;
|
||||
next_chunk_start = 0;
|
||||
lowest_offset = 0;
|
||||
base[0].offset = 0;
|
||||
base[0].indirect = 0;
|
||||
delta[0].offset = 0;
|
||||
delta[0].indirect = 0;
|
||||
distinct[0].offset = 0;
|
||||
distinct[0].indirect = 0;
|
||||
linear[0].offset = 0;
|
||||
linear[0].langprob = 0;
|
||||
chunk_start[0] = 0;
|
||||
chunk_offset[0] = 0;
|
||||
};
|
||||
} ScoringHitBuffer;
|
||||
|
||||
// TODO: Explain here why we need both ChunkSpan and ChunkSummary
// Start subscripts and lengths of one chunk within the three hitbuffer arrays.
struct ChunkSpan {
  // Subscripts of the first entry belonging to the chunk
  int chunk_base;       // ... in hitbuffer.base[]
  int chunk_delta;      // ... in hitbuffer.delta[]
  int chunk_distinct;   // ... in hitbuffer.distinct[]
  // Number of entries belonging to the chunk
  int base_len;         // ... in hitbuffer.base[]
  int delta_len;        // ... in hitbuffer.delta[]
  int distinct_len;     // ... in hitbuffer.distinct[]
};
|
||||
|
||||
|
||||
// Packed into 20 bytes for space
|
||||
typedef struct {
|
||||
uint16 offset; // Text offset within current scriptspan.text
|
||||
uint16 chunk_start; // Scoring subscr within hitbuffer->linear[]
|
||||
uint16 lang1; // Top lang, mapped to full Language
|
||||
uint16 lang2; // Second lang, mapped to full Language
|
||||
uint16 score1; // Top lang raw score
|
||||
uint16 score2; // Second lang raw score
|
||||
uint16 bytes; // Number of lower letters bytes in chunk
|
||||
uint16 grams; // Number of scored base quad- uni-grams in chunk
|
||||
uint16 ulscript; // ULScript of chunk
|
||||
uint8 reliability_delta; // Reliability 0..100, delta top:second scores
|
||||
uint8 reliability_score; // Reliability 0..100, top:expected score
|
||||
} ChunkSummary;
|
||||
|
||||
|
||||
// We buffer up ~50 chunk summaries, corresponding to chunks of 20 quads in a
|
||||
// 1000-quad hit buffer, so we can do boundary adjustment on them
|
||||
// when adjacent entries are different languages. After that, we add them
|
||||
// all into the document score
|
||||
//
|
||||
// About 50 * 20 = 1000 bytes. OK for stack alloc
|
||||
typedef struct {
|
||||
int n;
|
||||
ChunkSummary chunksummary[kMaxSummaries + 1];
|
||||
} SummaryBuffer;
|
||||
|
||||
// End private
|
||||
|
||||
|
||||
// Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating
|
||||
// scoringcontext
|
||||
void ScoreEntireScriptSpan(const LangSpan& scriptspan,
|
||||
ScoringContext* scoringcontext,
|
||||
DocTote* doc_tote,
|
||||
ResultChunkVector* vec);
|
||||
|
||||
// Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext
|
||||
void ScoreCJKScriptSpan(const LangSpan& scriptspan,
|
||||
ScoringContext* scoringcontext,
|
||||
DocTote* doc_tote,
|
||||
ResultChunkVector* vec);
|
||||
|
||||
// Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext
|
||||
void ScoreQuadScriptSpan(const LangSpan& scriptspan,
|
||||
ScoringContext* scoringcontext,
|
||||
DocTote* doc_tote,
|
||||
ResultChunkVector* vec);
|
||||
|
||||
// Score one scriptspan into doc_tote and vec, updating scoringcontext
|
||||
void ScoreOneScriptSpan(const LangSpan& scriptspan,
|
||||
ScoringContext* scoringcontext,
|
||||
DocTote* doc_tote,
|
||||
ResultChunkVector* vec);
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__
|
||||
|
||||
548
internal/scoreutf8text.cc
Normal file
548
internal/scoreutf8text.cc
Normal file
@@ -0,0 +1,548 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Little program to read sample UTF-8 text and score it
|
||||
// Giving precision, recall, F, and matrix
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
#include <math.h> // for sqrt
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <sys/time.h> // for gettimeofday
|
||||
#include <string>
|
||||
|
||||
#include "debug.h" // for uint8 etc
|
||||
#include "integral_types.h" // for uint8 etc
|
||||
#include "compact_lang_det_impl.h"
|
||||
#include "lang_script.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
|
||||
// Scaffolding
|
||||
typedef int32 Encoding;
|
||||
static const Encoding UNKNOWN_ENCODING = 0;
|
||||
|
||||
static const bool FLAGS_cld2_html = true;
|
||||
static const bool FLAGS_noext = false;
|
||||
static const bool FLAGS_echo_mismatch = true;
|
||||
static const int32 FLAGS_minsize = 0;
|
||||
|
||||
|
||||
/***
|
||||
accepts one or more input files
|
||||
loop:
|
||||
reads source line
|
||||
does cld on each source line
|
||||
records source lang-script, CLD lang-script, count+=1
|
||||
|
||||
at end, print row headers = CLD lang, script, per_M
|
||||
print column headers = in lang, script, per_M
|
||||
print matrix, recall, precision, F
|
||||
and overall RMS F.
|
||||
|
||||
sort by script, and within script, by per_M and near diagonal
|
||||
***/
|
||||
|
||||
|
||||
#define LF 0x0a     // Line feed byte
#define CR 0x0d     // Carriage return byte
const int kMaxBuffer = 5 * 1024;    // Size in bytes of the line buffer
|
||||
|
||||
// Reads one line from infile into buffer and strips one trailing LF and then
// one trailing CR.
//   infile:  open input stream
//   buffer:  destination, at least maxlen bytes
//   maxlen:  capacity of buffer including the terminating NUL
// Returns false when fgets() returns NULL (end of file or read error), true
// otherwise. fgets() stops when the buffer is full, so a longer line comes
// back in pieces over successive calls.
bool ReadLine(FILE* infile, char* buffer, size_t maxlen) {
  const char kLineFeed = 0x0a;
  const char kCarriageReturn = 0x0d;

  if (fgets(buffer, static_cast<int>(maxlen), infile) == NULL) {
    return false;
  }
  size_t len = strlen(buffer);

  // trim CR LF
  // len is checked before each access: an empty line ("\n") leaves len == 0
  // after the LF is removed, and a line starting with a NUL byte has len == 0
  // from the start; indexing buffer[len - 1] would then read before the
  // buffer.
  if ((len > 0) && (buffer[len - 1] == kLineFeed)) {buffer[--len] = '\0';}
  if ((len > 0) && (buffer[len - 1] == kCarriageReturn)) {buffer[--len] = '\0';}
  return true;
}
|
||||
|
||||
// Returns true if the line in buffer is to be ignored: it is empty, starts
// with '#', starts with a space, or starts with the five characters "BOGUS".
bool IsComment(char* buffer) {
  switch (buffer[0]) {
    case '\0':    // Empty line
    case '#':
    case ' ':     // Any leading space is comment
      return true;
    default:
      break;
  }
  // strncmp stops at the terminating NUL, so a line shorter than five
  // characters can never compare equal to "BOGUS".
  return strncmp(buffer, "BOGUS", 5) == 0;
}
|
||||
|
||||
|
||||
// Skips over xxxxx_ where _ is one or more spaces/tabs
//   src:  line being parsed
//   pos:  subscript inside the field to skip, or string::npos
// Returns the subscript of the first character of the following field, or
// string::npos (converted to int) if there are no more fields.
int SkipOneField(const std::string& src, int pos) {
  static const char kBlanks[] = " \t";

  if (pos == std::string::npos) {return pos;}

  // End of the current field: first blank at or after pos
  const int field_end = src.find_first_of(kBlanks, pos);
  if (field_end == std::string::npos) {return field_end;}

  // Start of the next field: first non-blank after that run of blanks.
  // A miss comes back as npos and is returned unchanged.
  const int next_field = src.find_first_not_of(kBlanks, field_end);
  return next_field;
}
|
||||
|
||||
// Return language and script from parsed line
|
||||
void GetStatedLangScript(const string& src, string* lang_script, string* tld) {
|
||||
*lang_script = "";
|
||||
*tld = "";
|
||||
int pos = 0;
|
||||
int pos2 = 0;
|
||||
if (src.substr(0,7) == "SAMPLE ") {
|
||||
// SAMPLE ll-Ssss
|
||||
pos = SkipOneField(src, pos);
|
||||
} else if (src.substr(0,5) == "SAMP ") {
|
||||
// SAMP ll-Ssss /tld2.tld/
|
||||
pos = SkipOneField(src, pos);
|
||||
pos2 = SkipOneField(src, pos);
|
||||
} else if (src.substr(0,5) == "Samp ") {
|
||||
// Samp ll-Ssss /tld2.tld/
|
||||
pos = SkipOneField(src, pos);
|
||||
pos2 = SkipOneField(src, pos);
|
||||
}
|
||||
if (pos == 0) {return;}
|
||||
if (pos == string::npos) {return;}
|
||||
|
||||
// Pos is at the first letter of language-script combination
|
||||
int end = src.find_first_of(" \t", pos); // find end of lang-script
|
||||
if (end == string::npos) {return;}
|
||||
*lang_script = src.substr(pos, end - pos);
|
||||
|
||||
// Pos2 is 0 or at the first letter of the tld string
|
||||
if (pos2 == 0) {return;}
|
||||
if (pos2 == string::npos) {return;}
|
||||
end = src.find_first_of(" \t", pos2);
|
||||
if (end == string::npos) {return;}
|
||||
*tld = src.substr(pos2, end - pos2);
|
||||
}
|
||||
|
||||
// Return position of start of text
|
||||
int GetTextBeginPos(const string& src) {
|
||||
int pos = 0;
|
||||
if (src.size() < 8) {return pos;}
|
||||
|
||||
if (src.substr(0,7) == "SAMPLE ") {
|
||||
// Skip SAMPLE ll-Ssss
|
||||
pos = SkipOneField(src, pos);
|
||||
pos = SkipOneField(src, pos);
|
||||
} else if (src.substr(0,5) == "SAMP ") {
|
||||
// Skip SAMP ll-Ssss /tld2.tld/
|
||||
pos = SkipOneField(src, pos);
|
||||
pos = SkipOneField(src, pos);
|
||||
pos = SkipOneField(src, pos);
|
||||
} else if (src.substr(0,5) == "Samp ") {
|
||||
// Skip Samp ll-Ssss /tld2.tld/
|
||||
pos = SkipOneField(src, pos);
|
||||
pos = SkipOneField(src, pos);
|
||||
pos = SkipOneField(src, pos);
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
|
||||
bool CarefulMatch(const char* in_langscript,
|
||||
Language in_lang, ULScript in_lscript,
|
||||
Language cld_lang, ULScript cld_lscript) {
|
||||
bool easy_match = ((in_lang == cld_lang) & (in_lscript == cld_lscript));
|
||||
if (easy_match) {return true;}
|
||||
|
||||
// Unrecognized list, matching un-Xxxx
|
||||
if ((cld_lang == UNKNOWN_LANGUAGE) && (in_lscript == cld_lscript)) {
|
||||
if (strcmp(in_langscript, "az-Arab") == 0) {return true;}
|
||||
if (strcmp(in_langscript, "az-Cyrl") == 0) {return true;}
|
||||
if (strcmp(in_langscript, "kk-Latn") == 0) {return true;}
|
||||
if (strcmp(in_langscript, "ku-Latn") == 0) {return true;}
|
||||
if (strcmp(in_langscript, "my-Latn") == 0) {return true;}
|
||||
if (strcmp(in_langscript, "ru-Latn") == 0) {return true;}
|
||||
if (strcmp(in_langscript, "tg-Arab") == 0) {return true;}
|
||||
if (strcmp(in_langscript, "ug-Latn") == 0) {return true;}
|
||||
if (strcmp(in_langscript, "za-Hani") == 0) {return true;}
|
||||
}
|
||||
|
||||
// bs/me => sr/hr
|
||||
if ((cld_lang == CROATIAN) && (cld_lscript == ULScript_Latin)) {
|
||||
if (strcmp(in_langscript, "bs-Latn") == 0) {return true;}
|
||||
if (strcmp(in_langscript, "sr-ME-Latn") == 0) {return true;}
|
||||
}
|
||||
if ((cld_lang == SERBIAN) && (cld_lscript == ULScript_Cyrillic)) {
|
||||
if (strcmp(in_langscript, "bs-Cyrl") == 0) {return true;}
|
||||
if (strcmp(in_langscript, "sr-ME-Cyrl") == 0) {return true;}
|
||||
}
|
||||
|
||||
// Twi => Akan
|
||||
if ((cld_lang == AKAN) && (cld_lscript == ULScript_Latin)) {
|
||||
if (strcmp(in_langscript, "tw-Latn") == 0) {return true;}
|
||||
}
|
||||
|
||||
// za-Hani
|
||||
if ((cld_lang == CHINESE) && (cld_lscript == ULScript_Hani)) {
|
||||
if (strcmp(in_langscript, "za-Hani") == 0) {return true;}
|
||||
}
|
||||
|
||||
// zzb, zze, zzh fake languages
|
||||
if (strcmp(in_langscript, "zzb-Latn") == 0) {return true;}
|
||||
if (strcmp(in_langscript, "zze-Latn") == 0) {return true;}
|
||||
if (strcmp(in_langscript, "zzh-Latn") == 0) {return true;}
|
||||
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
#if 0
// Disabled bookkeeping for the precision/recall matrix.
typedef hash_map<string, int32> StringIntMap;

static int32 next_in_map;
static int32 next_cld_map;
static int32 next_in_cld_map;
static StringIntMap in_map;         // xx-Fooo to small int
static StringIntMap cld_map;        // xx-Fooo to small int
static StringIntMap in_cld_map;     // xx-Fooo_xx-Barr to small int

static vector<int32> in_count;      // counts by in_map subscript
static vector<int32> cld_count;     // counts by cld_map subscript
static vector<int32> in_cld_count;  // counts by in_cld_map subscript

// Returns the small int assigned to s in *smap. A string not seen before is
// assigned *next_smap, which is then incremented.
int32 MapToSmallInt(const string& s, StringIntMap* smap, int* next_smap) {
  StringIntMap::iterator found = smap->find(s);
  if (found != smap->end()) {
    return found->second;
  }
  // New
  const int32 assigned = *next_smap;
  (*smap)[s] = assigned;
  *next_smap += 1;
  return assigned;
}
#endif
|
||||
|
||||
|
||||
// Resets the result bookkeeping. The whole body is compiled out, so this
// currently does nothing.
void InitResult() {
#if 0
  // Maps and their next-subscript counters
  in_map.clear();
  next_in_map = 0;
  cld_map.clear();
  next_cld_map = 0;
  in_cld_map.clear();
  next_in_cld_map = 0;

  // Count vectors
  in_count.clear();
  cld_count.clear();
  in_cld_count.clear();
#endif
}
|
||||
|
||||
void RecordCLDResult(const char* buffer, const char* in_langscript,
|
||||
Language in_lang, ULScript in_lscript,
|
||||
Language cld_lang, ULScript cld_lscript) {
|
||||
|
||||
bool match = CarefulMatch(in_langscript,
|
||||
in_lang, in_lscript, cld_lang, cld_lscript);
|
||||
if (FLAGS_echo_mismatch && !match) {
|
||||
fprintf(stderr,
|
||||
" =Mismatch: "
|
||||
"expected %s, actual %s<br>\n",
|
||||
LanguageCode(in_lang), LanguageCode(cld_lang));
|
||||
}
|
||||
#if 0
|
||||
printf("%s %s-%s %s\n", in_langscript,
|
||||
ExtLanguageCode(cld_lang), UnicodeLScriptCode(cld_lscript),
|
||||
match ? "" : "!=");
|
||||
|
||||
string cld_langscript = ExtLanguageCode(cld_lang);
|
||||
cld_langscript.append("-");
|
||||
cld_langscript.append( UnicodeLScriptCode(cld_lscript));
|
||||
|
||||
string in_cld_langscript = in_langscript;
|
||||
in_cld_langscript.append("_");
|
||||
in_cld_langscript.append(cld_langscript);
|
||||
|
||||
// Extend vectors if needed
|
||||
int32 in_int = MapToSmallInt(in_langscript, &in_map, &next_in_map);
|
||||
while (in_count.size() <= in_int) {in_count.push_back(0);}
|
||||
in_count[in_int] += 1;
|
||||
|
||||
int32 cld_int = MapToSmallInt(cld_langscript, &cld_map, &next_cld_map);
|
||||
while (cld_count.size() <= cld_int) {cld_count.push_back(0);}
|
||||
cld_count[cld_int] += 1;
|
||||
|
||||
int32 in_cld_int = MapToSmallInt(in_cld_langscript,
|
||||
&in_cld_map, &next_in_cld_map);
|
||||
while (in_cld_count.size() <= in_cld_int) {in_cld_count.push_back(0);}
|
||||
in_cld_count[in_cld_int] += 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Prints per-language precision and recall. The whole body is compiled out,
// so this currently does nothing.
// NOTE(review): the disabled code never releases its new[] arrays; revisit
// before re-enabling it.
void FinishResult() {
#if 0
  int32 in_n = in_map.size();
  int32 cld_n = cld_map.size();

  // Per-input-language totals, matches and names
  int32* in_total = new int32[in_n];
  memset(in_total, 0, in_n * sizeof(int32));
  int32* in_matches = new int32[in_n];
  memset(in_matches, 0, in_n * sizeof(int32));
  string* in_str = new string[in_n];
  for (StringIntMap::iterator it = in_map.begin(); it != in_map.end(); ++it) {
    in_str[it->second] = it->first;
  }

  // Per-detected-language totals, matches and names
  int32* cld_total = new int32[cld_n];
  memset(cld_total, 0, cld_n * sizeof(int32));
  int32* cld_matches = new int32[cld_n];
  memset(cld_matches, 0, cld_n * sizeof(int32));
  string* cld_str = new string[cld_n];
  for (StringIntMap::iterator it = cld_map.begin(); it != cld_map.end(); ++it) {
    cld_str[it->second] = it->first;
  }

  for (StringIntMap::iterator it = in_cld_map.begin();
       it != in_cld_map.end(); ++it) {
    string in_cld = it->first;
    int32 in_cld_int = it->second;
    //VERYTEMP
    //printf("%s[%d] = %d\n", in_cld.c_str(), in_cld_int, in_cld_count[in_cld_int]);

    // Decompose it all
    int under_pos = in_cld.find("_");
    string in_langscript = in_cld.substr(0, under_pos);
    string cld_langscript = in_cld.substr(under_pos + 1);
    int32 in_int = MapToSmallInt(in_langscript, &in_map, &next_in_map);
    int32 cld_int = MapToSmallInt(cld_langscript, &cld_map, &next_cld_map);

    Language in_lang = GetLanguageFromNumberOrName(in_langscript.c_str());
    ULScript in_lscript = GetLScriptFromNumberOrName(in_langscript.c_str());
    Language cld_lang = GetLanguageFromNumberOrName(cld_langscript.c_str());
    ULScript cld_lscript = GetLScriptFromNumberOrName(cld_langscript.c_str());

    bool match = CarefulMatch(in_langscript.c_str(),
                              in_lang, in_lscript, cld_lang, cld_lscript);

    //VERYTEMP
    //printf("%s-%s %s-%s #=%d %d %d %s\n",
    //       ExtLanguageCode(in_lang), UnicodeLScriptCode(in_lscript),
    //       ExtLanguageCode(cld_lang), UnicodeLScriptCode(cld_lscript),
    //       in_cld_count[in_cld_int], in_int, cld_int, match ? "match" : "!=");

    in_total[in_int] += in_cld_count[in_cld_int];
    cld_total[cld_int] += in_cld_count[in_cld_int];
    if (match) {
      in_matches[in_int] += in_cld_count[in_cld_int];
      cld_matches[cld_int] += in_cld_count[in_cld_int];
    }
  }

  int32 total = 0;
  int32 match_total = 0;
  for (int i = 0; i < cld_n; ++i) {
    printf("Precision: %s %d/%d = %6.4f\n",
           cld_str[i].c_str(), cld_matches[i], cld_total[i],
           cld_total[i] == 0 ? 0.0 : (cld_matches[i] * 1.0) / cld_total[i]);
    total += cld_total[i];
    match_total += cld_matches[i];
  }
  printf("Precision: %s %d/%d = %6.4f\n",
         "TOTAL", match_total, total,
         total == 0 ? 0.0 : (match_total * 1.0) / total);

  total = 0;
  match_total = 0;
  for (int i = 0; i < in_n; ++i) {
    printf("Recall: %s %d/%d = %6.4f\n",
           in_str[i].c_str(), in_matches[i], in_total[i],
           in_total[i] == 0 ? 0.0 : (in_matches[i] * 1.0) / in_total[i]);
    total += in_total[i];
    match_total += in_matches[i];
  }
  printf("Recall: %s %d/%d = %6.4f\n",
         "TOTAL", match_total, total,
         total == 0 ? 0.0 : (match_total * 1.0) / total);

#endif
}
|
||||
|
||||
// Returns true for any byte whose unsigned value is <= '9': digits, space,
// control characters and the punctuation that sorts below the digits.
bool SkipMe(char c) {
  return static_cast<unsigned char>(c) <= '9';
}

// Remove any trailing digits/spaces (possible mapreduce counts)
// Return length
// The text is cut in place at the first byte of the trailing run that
// SkipMe() accepts. An empty buffer, or one made up only of such bytes, is
// trimmed to length 0: the loop stops at the start of the buffer instead of
// reading buffer[-1] and continuing below it.
int Trim(char* buffer) {
  int buffer_len = strlen(buffer);
  while ((buffer_len > 0) && SkipMe(buffer[buffer_len - 1])) {--buffer_len;}
  buffer[buffer_len] = '\0';
  return buffer_len;
}
|
||||
|
||||
void LangDetLinesOfFile(int flags, bool get_vector, const char* fname) {
|
||||
FILE* fin = fopen(fname, "rb");
|
||||
if (fin == NULL) {
|
||||
fprintf(stderr, "Did not open %s\n", fname);
|
||||
return;
|
||||
}
|
||||
|
||||
// Expecting
|
||||
// Samp af-Latn /afr/ word tot skuldig bevind volgens die wet, in...
|
||||
char buffer[kMaxBuffer];
|
||||
while (ReadLine(fin, buffer, kMaxBuffer)) {
|
||||
if (IsComment(buffer)) {continue;}
|
||||
|
||||
int buffer_len = Trim(buffer);
|
||||
|
||||
string buffer_str(buffer, buffer_len);
|
||||
string lang_script;
|
||||
string tld;
|
||||
|
||||
// Get lang-script
|
||||
GetStatedLangScript(buffer_str, &lang_script, &tld);
|
||||
Language in_lang = GetLanguageFromName(lang_script.c_str());
|
||||
ULScript in_lscript = GetULScriptFromName(lang_script.c_str());
|
||||
|
||||
// Get Text; skip over any prefix fields
|
||||
int pos = GetTextBeginPos(buffer_str);
|
||||
if (pos == string::npos) {continue;}
|
||||
|
||||
const char* src = buffer_str.data() + pos;
|
||||
int src_len = buffer_str.size() - pos;
|
||||
|
||||
if (src_len < FLAGS_minsize) {continue;} // Skip if too short
|
||||
|
||||
// Detect language in one line of UTF-8
|
||||
bool is_plain_text = false;
|
||||
const char* tldhint = "";
|
||||
Encoding enchint = UNKNOWN_ENCODING;
|
||||
Language langhint = UNKNOWN_LANGUAGE;
|
||||
// Full-blown flag-bit and hints interface
|
||||
bool allow_extended_lang = true;
|
||||
// Caller initializes flags
|
||||
Language plus_one = UNKNOWN_LANGUAGE;
|
||||
|
||||
Language language3[3];
|
||||
int percent3[3];
|
||||
double normalized_score3[3];
|
||||
ResultChunkVector resultchunkvector;
|
||||
int text_bytes;
|
||||
bool is_reliable;
|
||||
|
||||
// Detected language biased summary (biased against English)
|
||||
Language summary_lang = UNKNOWN_LANGUAGE;
|
||||
|
||||
// Identify the expected value
|
||||
fprintf(stderr, "Samp %s ", lang_script.c_str());
|
||||
flags |= kCLDFlagQuiet;
|
||||
|
||||
CLDHints cldhints = {NULL, tldhint, enchint, langhint};
|
||||
|
||||
summary_lang = DetectLanguageSummaryV2(
|
||||
src, src_len,
|
||||
is_plain_text,
|
||||
&cldhints,
|
||||
allow_extended_lang,
|
||||
flags,
|
||||
plus_one,
|
||||
language3,
|
||||
percent3,
|
||||
normalized_score3,
|
||||
get_vector ? &resultchunkvector : NULL,
|
||||
&text_bytes,
|
||||
&is_reliable);
|
||||
|
||||
#if 0
|
||||
if (FLAGS_noext) {
|
||||
summary_lang = DetectLanguageSummary(
|
||||
src, src_len,
|
||||
is_plain_text,
|
||||
language3,
|
||||
percent3,
|
||||
&text_bytes,
|
||||
&is_reliable);
|
||||
} else {
|
||||
summary_lang = ExtDetectLanguageSummary(
|
||||
src, src_len,
|
||||
is_plain_text,
|
||||
language3,
|
||||
percent3,
|
||||
&text_bytes,
|
||||
&is_reliable);
|
||||
}
|
||||
#endif
|
||||
if (get_vector) {
|
||||
DumpResultChunkVector(stderr, src, &resultchunkvector);
|
||||
}
|
||||
|
||||
if (!is_reliable) {summary_lang = UNKNOWN_LANGUAGE;}
|
||||
|
||||
RecordCLDResult(buffer, lang_script.c_str(),
|
||||
in_lang, in_lscript,
|
||||
summary_lang, in_lscript);
|
||||
}
|
||||
|
||||
fclose(fin);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
int main (int argc, char *argv[])
|
||||
{
|
||||
int flags = 0;
|
||||
bool get_vector = false;
|
||||
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (strcmp(argv[i], "--scoreasquads") == 0) {flags |= kCLDFlagScoreAsQuads;}
|
||||
if (strcmp(argv[i], "--html") == 0) {flags |= kCLDFlagHtml;}
|
||||
if (strcmp(argv[i], "--cr") == 0) {flags |= kCLDFlagCr;}
|
||||
if (strcmp(argv[i], "--verbose") == 0) {flags |= kCLDFlagVerbose;}
|
||||
if (strcmp(argv[i], "--vector") == 0) {get_vector = true;}
|
||||
}
|
||||
|
||||
if (FLAGS_cld2_html) {
|
||||
// Begin HTML file
|
||||
fprintf(stderr, "<html><meta charset=\"UTF-8\"><body>\n");
|
||||
fprintf(stderr, "<style media=\"print\" type=\"text/css\"> "
|
||||
":root { -webkit-print-color-adjust: exact; } </style>\n");
|
||||
fprintf(stderr, "<span style=\"font-size: 7pt\">\n");
|
||||
}
|
||||
|
||||
|
||||
InitResult();
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (argv[i][0] != '-') {
|
||||
const char* fname = argv[i];
|
||||
fprintf(stderr, "file = %s<br><br>\n", fname ? fname : "stdin");
|
||||
LangDetLinesOfFile(flags, get_vector, fname);
|
||||
}
|
||||
}
|
||||
FinishResult();
|
||||
|
||||
if (FLAGS_cld2_html) {
|
||||
fprintf(stderr, "\n</span></body></html><br>");
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
return CLD2::main(argc, argv);
|
||||
}
|
||||
|
||||
78
internal/stringpiece.h
Normal file
78
internal/stringpiece.h
Normal file
@@ -0,0 +1,78 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// A StringPiece points to part or all of a string, double-quoted string
|
||||
// literal, or other string-like object. A StringPiece does *not* own the
|
||||
// string to which it points. A StringPiece is not null-terminated. [subset]
|
||||
//
|
||||
|
||||
#ifndef STRINGS_STRINGPIECE_H_
|
||||
#define STRINGS_STRINGPIECE_H_
|
||||
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
|
||||
|
||||
typedef int stringpiece_ssize_type;

// Pointer-plus-length view of character data that the StringPiece does not
// own. Copies of the view refer to the same bytes.
class StringPiece {
 public:
  // We provide non-explicit singleton constructors so users can pass
  // in a "const char*" or a "string" wherever a "StringPiece" is
  // expected.

  // Empty view: NULL pointer, length 0
  StringPiece() : ptr_(NULL), length_(0) {}

  // View of a NUL-terminated string; a NULL str gives length 0
  StringPiece(const char* str)  // NOLINT(runtime/explicit)
      : ptr_(str),
        length_(str == NULL
                    ? 0
                    : static_cast<stringpiece_ssize_type>(strlen(str))) {}

  // View of the bytes held by str
  StringPiece(const std::string& str)  // NOLINT(runtime/explicit)
      : ptr_(str.data()),
        length_(static_cast<stringpiece_ssize_type>(str.size())) {}

  // View of len bytes starting at offset
  StringPiece(const char* offset, stringpiece_ssize_type len)
      : ptr_(offset), length_(len) {}

  // Drops the first n bytes from the view. n is not range-checked.
  void remove_prefix(stringpiece_ssize_type n) {
    length_ -= n;
    ptr_ += n;
  }

  // Drops the last n bytes from the view. n is not range-checked.
  void remove_suffix(stringpiece_ssize_type n) { length_ -= n; }

  // data() may return a pointer to a buffer with embedded NULs, and the
  // returned buffer may or may not be null terminated.  Therefore it is
  // typically a mistake to pass data() to a routine that expects a NUL
  // terminated string.
  const char* data() const { return ptr_; }
  stringpiece_ssize_type size() const { return length_; }
  stringpiece_ssize_type length() const { return length_; }
  bool empty() const { return length_ == 0; }

 private:
  const char* ptr_;                 // First byte of the view, may be NULL
  stringpiece_ssize_type length_;   // Number of bytes in the view
};
|
||||
|
||||
class StringPiece;
|
||||
|
||||
#endif  // STRINGS_STRINGPIECE_H_
|
||||
265
internal/tote.cc
Normal file
265
internal/tote.cc
Normal file
@@ -0,0 +1,265 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
#include "tote.h"
|
||||
#include "lang_script.h" // For LanguageCode in Dump
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h> // For memset
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Take a set of <key, value> pairs and tote them up.
|
||||
// After explicitly sorting, retrieve top key, value pairs
|
||||
// Normal use is key=per-script language and value = probability score
|
||||
Tote::Tote() {
|
||||
in_use_mask_ = 0;
|
||||
byte_count_ = 0;
|
||||
score_count_ = 0;
|
||||
// No need to initialize values
|
||||
}
|
||||
|
||||
// Nothing to release.
Tote::~Tote() {}
|
||||
|
||||
// Returns the tote to its freshly constructed state: counters at zero and
// no group marked in use.
void Tote::Reinit() {
  score_count_ = 0;
  byte_count_ = 0;
  in_use_mask_ = 0;
  // No need to initialize values
}
|
||||
// Increment count of quadgrams/trigrams/unigrams scored
|
||||
void Tote::AddScoreCount() {
|
||||
++score_count_;
|
||||
}
|
||||
|
||||
|
||||
// Adds idelta to the score of key ikey. Keys are handled in groups of four
// (group = ikey >> 2); the first Add to a group clears gscore_[group] and
// marks the group in use.
// NOTE(review): presumably gscore_[g] overlays score_[4*g .. 4*g+3], so that
// one store clears four scores -- confirm in tote.h.
void Tote::Add(uint8 ikey, int idelta) {
  const int group = ikey >> 2;
  const uint64 group_bit = (1ULL << group);
  const bool group_in_use = (in_use_mask_ & group_bit) != 0;
  if (!group_in_use) {
    // Initialize this group
    gscore_[group] = 0;
    in_use_mask_ |= group_bit;
  }
  score_[ikey] += idelta;
}
|
||||
|
||||
|
||||
// Return current top three keys
|
||||
void Tote::CurrentTopThreeKeys(int* key3) const {
|
||||
key3[0] = -1;
|
||||
key3[1] = -1;
|
||||
key3[2] = -1;
|
||||
int score3[3] = {-1, -1, -1};
|
||||
uint64 tempmask = in_use_mask_;
|
||||
int base = 0;
|
||||
while (tempmask != 0) {
|
||||
if (tempmask & 1) {
|
||||
// Look at four in-use keys
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
int insert_me = score_[base + i];
|
||||
// Favor lower numbers on ties
|
||||
if (insert_me > score3[2]) {
|
||||
// Insert
|
||||
int insert_at = 2;
|
||||
if (insert_me > score3[1]) {
|
||||
score3[2] = score3[1];
|
||||
key3[2] = key3[1];
|
||||
insert_at = 1;
|
||||
if (insert_me > score3[0]) {
|
||||
score3[1] = score3[0];
|
||||
key3[1] = key3[0];
|
||||
insert_at = 0;
|
||||
}
|
||||
}
|
||||
score3[insert_at] = insert_me;
|
||||
key3[insert_at] = base + i;
|
||||
}
|
||||
}
|
||||
}
|
||||
tempmask >>= 1;
|
||||
base += 4;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Take a set of <key, value> pairs and tote them up.
|
||||
// After explicitly sorting, retrieve top key, value pairs
|
||||
// 0xFFFF in key signifies unused
|
||||
DocTote::DocTote() {
|
||||
// No need to initialize score_ or value_
|
||||
incr_count_ = 0;
|
||||
sorted_ = 0;
|
||||
memset(closepair_, 0, sizeof(closepair_));
|
||||
memset(key_, 0xFF, sizeof(key_));
|
||||
}
|
||||
|
||||
// Nothing to release.
DocTote::~DocTote() {}
|
||||
|
||||
// Empties the tote for reuse: unsorted, zero increment count, closepair_
// zeroed, every byte of key_ set to 0xFF, and runningscore_ reinitialized.
void DocTote::Reinit() {
  // No need to initialize score_ or value_
  sorted_ = 0;
  incr_count_ = 0;
  memset(key_, 0xFF, sizeof(key_));
  memset(closepair_, 0, sizeof(closepair_));
  runningscore_.Reinit();
}
|
||||
|
||||
// Weight reliability by ibytes
|
||||
// Also see three-way associative comments above for Tote
|
||||
void DocTote::Add(uint16 ikey, int ibytes,
|
||||
int score, int ireliability) {
|
||||
++incr_count_;
|
||||
|
||||
// Look for existing entry in top 2 positions of 3, times 8 columns
|
||||
int sub0 = ikey & 15;
|
||||
if (key_[sub0] == ikey) {
|
||||
value_[sub0] += ibytes;
|
||||
score_[sub0] += score;
|
||||
reliability_[sub0] += ireliability * ibytes;
|
||||
return;
|
||||
}
|
||||
// Look for existing entry in other of top 2 positions of 3, times 8 columns
|
||||
int sub1 = sub0 ^ 8;
|
||||
if (key_[sub1] == ikey) {
|
||||
value_[sub1] += ibytes;
|
||||
score_[sub1] += score;
|
||||
reliability_[sub1] += ireliability * ibytes;
|
||||
return;
|
||||
}
|
||||
// Look for existing entry in third position of 3, times 8 columns
|
||||
int sub2 = (ikey & 7) + 16;
|
||||
if (key_[sub2] == ikey) {
|
||||
value_[sub2] += ibytes;
|
||||
score_[sub2] += score;
|
||||
reliability_[sub2] += ireliability * ibytes;
|
||||
return;
|
||||
}
|
||||
|
||||
// Allocate new entry
|
||||
int alloc = -1;
|
||||
if (key_[sub0] == kUnusedKey) {
|
||||
alloc = sub0;
|
||||
} else if (key_[sub1] == kUnusedKey) {
|
||||
alloc = sub1;
|
||||
} else if (key_[sub2] == kUnusedKey) {
|
||||
alloc = sub2;
|
||||
} else {
|
||||
// All choices allocated, need to replace smallest one
|
||||
alloc = sub0;
|
||||
if (value_[sub1] < value_[alloc]) {alloc = sub1;}
|
||||
if (value_[sub2] < value_[alloc]) {alloc = sub2;}
|
||||
}
|
||||
key_[alloc] = ikey;
|
||||
value_[alloc] = ibytes;
|
||||
score_[alloc] = score;
|
||||
reliability_[alloc] = ireliability * ibytes;
|
||||
return;
|
||||
}
|
||||
|
||||
// Find subscript of a given packed language, or -1
|
||||
int DocTote::Find(uint16 ikey) {
|
||||
if (sorted_) {
|
||||
// Linear search if sorted
|
||||
for (int sub = 0; sub < kMaxSize_; ++sub) {
|
||||
if (key_[sub] == ikey) {return sub;}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Look for existing entry
|
||||
int sub0 = ikey & 15;
|
||||
if (key_[sub0] == ikey) {
|
||||
return sub0;
|
||||
}
|
||||
int sub1 = sub0 ^ 8;
|
||||
if (key_[sub1] == ikey) {
|
||||
return sub1;
|
||||
}
|
||||
int sub2 = (ikey & 7) + 16;
|
||||
if (key_[sub2] == ikey) {
|
||||
return sub2;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Return current top key
|
||||
int DocTote::CurrentTopKey() {
|
||||
int top_key = 0;
|
||||
int top_value = -1;
|
||||
for (int sub = 0; sub < kMaxSize_; ++sub) {
|
||||
if (key_[sub] == kUnusedKey) {continue;}
|
||||
if (top_value < value_[sub]) {
|
||||
top_value = value_[sub];
|
||||
top_key = key_[sub];
|
||||
}
|
||||
}
|
||||
return top_key;
|
||||
}
|
||||
|
||||
|
||||
// Sort first n entries by decreasing order of value
|
||||
// If key==0 other fields are not valid, treat value as -1
|
||||
void DocTote::Sort(int n) {
|
||||
// This is n**2, but n is small
|
||||
for (int sub = 0; sub < n; ++sub) {
|
||||
if (key_[sub] == kUnusedKey) {value_[sub] = -1;}
|
||||
|
||||
// Bubble sort key[sub] and entry[sub]
|
||||
for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
|
||||
if (key_[sub2] == kUnusedKey) {value_[sub2] = -1;}
|
||||
if (value_[sub] < value_[sub2]) {
|
||||
// swap
|
||||
uint16 tmpk = key_[sub];
|
||||
key_[sub] = key_[sub2];
|
||||
key_[sub2] = tmpk;
|
||||
|
||||
int tmpv = value_[sub];
|
||||
value_[sub] = value_[sub2];
|
||||
value_[sub2] = tmpv;
|
||||
|
||||
double tmps = score_[sub];
|
||||
score_[sub] = score_[sub2];
|
||||
score_[sub2] = tmps;
|
||||
|
||||
int tmpr = reliability_[sub];
|
||||
reliability_[sub] = reliability_[sub2];
|
||||
reliability_[sub2] = tmpr;
|
||||
}
|
||||
}
|
||||
}
|
||||
sorted_ = 1;
|
||||
}
|
||||
|
||||
void DocTote::Dump(FILE* f) {
|
||||
fprintf(f, "DocTote::Dump\n");
|
||||
for (int sub = 0; sub < kMaxSize_; ++sub) {
|
||||
if (key_[sub] != kUnusedKey) {
|
||||
Language lang = static_cast<Language>(key_[sub]);
|
||||
fprintf(f, "[%2d] %3s %6dB %5dp %4dR,\n", sub, LanguageCode(lang),
|
||||
value_[sub], score_[sub], reliability_[sub]);
|
||||
}
|
||||
}
|
||||
fprintf(f, " %d chunks scored<br>\n", incr_count_);
|
||||
}
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
112
internal/tote.h
Normal file
112
internal/tote.h
Normal file
@@ -0,0 +1,112 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_TOTE_H_
|
||||
#define I18N_ENCODINGS_CLD2_INTERNAL_TOTE_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#include "integral_types.h" // for uint8 etc
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
|
||||
// Take a set of <key, score> pairs and tote them up.
|
||||
// Key is an 8-bit per-script language
|
||||
// After explicitly sorting, retrieve top key, score pairs
|
||||
// Normal use is key=per-script language
|
||||
// The main data structure is an array of 256 uint16 counts. We normally
|
||||
// expect this to be initialized, added-to about 60 times, then the top three
|
||||
// items found. The reduce the initial and final time, we also keep a bit vector
|
||||
// of unused (and uninitialized) parts, each of 64 bits covering four keys.
|
||||
class Tote {
|
||||
public:
|
||||
Tote();
|
||||
~Tote();
|
||||
void Reinit();
|
||||
void AddScoreCount();
|
||||
void Add(uint8 ikey, int idelta);
|
||||
void AddBytes(int ibytes) {byte_count_ += ibytes;}
|
||||
void CurrentTopThreeKeys(int* key3) const;
|
||||
int GetScoreCount() const {return score_count_;}
|
||||
int GetByteCount() const {return byte_count_;}
|
||||
int GetScore(int i) const {return score_[i];}
|
||||
void SetScoreCount(uint16 v) {score_count_ = v;}
|
||||
void SetScore(int i, int v) {score_[i] = v;}
|
||||
|
||||
private:
|
||||
uint64 in_use_mask_; // 64 bits, one for each group of 4 scores.
|
||||
// 0 = not initialized,not used
|
||||
int byte_count_; // Bytes of text scored
|
||||
int score_count_; // Number of quadgrams/etc. scored
|
||||
union {
|
||||
uint64 gscore_[64]; // For alignment and clearing quickly
|
||||
uint16 score_[256]; // Probability score sum
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
|
||||
// Take a set of <key, score, reliability> triples and tote them up.
|
||||
// Key is a 16-bit full language
|
||||
// After explicitly sorting, retrieve top key, score, reliability triples
|
||||
class DocTote {
|
||||
public:
|
||||
DocTote();
|
||||
~DocTote();
|
||||
void Reinit();
|
||||
void Add(uint16 ikey, int ibytes, int score, int ireliability);
|
||||
int Find(uint16 ikey);
|
||||
void AddClosePair(int subscr, int val) {closepair_[subscr] += val;}
|
||||
int CurrentTopKey();
|
||||
Tote* RunningScore() {return &runningscore_;}
|
||||
void Sort(int n);
|
||||
void Dump(FILE* f);
|
||||
|
||||
int GetIncrCount() const {return incr_count_;}
|
||||
int GetClosePair(int subscr) const {return closepair_[subscr];}
|
||||
int MaxSize() const {return kMaxSize_;}
|
||||
uint16 Key(int i) const {return key_[i];}
|
||||
int Value(int i) const {return value_[i];} // byte count
|
||||
int Score(int i) const {return score_[i];} // sum lg prob
|
||||
int Reliability(int i) const {return reliability_[i];}
|
||||
void SetKey(int i, int v) {key_[i] = v;}
|
||||
void SetValue(int i, int v) {value_[i] = v;}
|
||||
void SetScore(int i, int v) {score_[i] = v;}
|
||||
void SetReliability(int i, int v) {reliability_[i] = v;}
|
||||
|
||||
static const uint16 kUnusedKey = 0xFFFF;
|
||||
|
||||
private:
|
||||
static const int kMaxSize_ = 24;
|
||||
static const int kMaxClosePairSize_ = 8;
|
||||
|
||||
int incr_count_; // Number of Add calls
|
||||
int sorted_; // Contents have been sorted, cannot Add
|
||||
Tote runningscore_; // Top lang scores across entire doc, for
|
||||
// helping resolve close pairs
|
||||
// Align at multiple of 8 bytes
|
||||
int closepair_[kMaxClosePairSize_];
|
||||
uint16 key_[kMaxSize_]; // Lang unassigned = 0xFFFF, valid = 1..1023
|
||||
int value_[kMaxSize_]; // Bytecount this lang
|
||||
int score_[kMaxSize_]; // Probability score sum
|
||||
int reliability_[kMaxSize_]; // Percentage 0..100
|
||||
};
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_CLD2_INTERNAL_TOTE_H_
|
||||
445
internal/unittest_data.h
Normal file
445
internal/unittest_data.h
Normal file
File diff suppressed because one or more lines are too long
1629
internal/utf8prop_lettermarkscriptnum.h
Normal file
1629
internal/utf8prop_lettermarkscriptnum.h
Normal file
File diff suppressed because it is too large
Load Diff
756
internal/utf8repl_lettermarklower.h
Normal file
756
internal/utf8repl_lettermarklower.h
Normal file
@@ -0,0 +1,756 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Created by utf8tablebuilder version 2.9
|
||||
//
|
||||
// Replaces all codes from file:
|
||||
// lettermarklower_6.2.0.txt
|
||||
// Accepts all other UTF-8 codes 0000..10FFFF
|
||||
// Space optimized
|
||||
//
|
||||
// ** ASSUMES INPUT IS STRUCTURALLY VALID UTF-8 **
|
||||
//
|
||||
// Table entries are absolute statetable subscripts
|
||||
|
||||
#ifndef UTF8REPL_LETTERMARKLOWER_H__
|
||||
#define UTF8REPL_LETTERMARKLOWER_H__
|
||||
|
||||
#include "integral_types.h"
|
||||
#include "utf8statetable.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Three-character shorthands for the kExit* exit codes. They appear as
// entries in the rows of the utf8repl_lettermarklower table below.
// NOTE(review): the kExit* names are not defined in this header; presumably
// they come from utf8statetable.h -- confirm.
#define X__ (kExitIllegalStructure)
|
||||
#define RJ_ (kExitReject)
|
||||
#define S1_ (kExitReplace1)
|
||||
#define S2_ (kExitReplace2)
|
||||
#define S3_ (kExitReplace3)
|
||||
#define S21 (kExitReplace21)
|
||||
#define S31 (kExitReplace31)
|
||||
#define S32 (kExitReplace32)
|
||||
#define T1_ (kExitReplaceOffset1)
|
||||
#define T2_ (kExitReplaceOffset2)
|
||||
#define S11 (kExitReplace1S0)
|
||||
#define SP_ (kExitSpecial)
|
||||
#define D__ (kExitDoAgain)
|
||||
#define RJA (kExitRejectAlt)
|
||||
|
||||
// Entire table has 111 state blocks of 64 entries each
|
||||
|
||||
// Parameters that accompany the generated utf8repl_lettermarklower table.
// NOTE(review): how each value is consumed is not visible in this header;
// presumably by the state-table code in utf8statetable.h -- confirm.
static const unsigned int utf8repl_lettermarklower_STATE0 = 0; // state[0]
|
||||
// 320 = 5 * 64 entries
static const unsigned int utf8repl_lettermarklower_STATE0_SIZE = 320; // =[5]
|
||||
// 7104 = 111 * 64 entries
static const unsigned int utf8repl_lettermarklower_TOTAL_SIZE = 7104;
|
||||
static const unsigned int utf8repl_lettermarklower_MAX_EXPAND_X4 = 12;
|
||||
// presumably log2 of the 64-entry block size -- confirm
static const unsigned int utf8repl_lettermarklower_SHIFT = 6;
|
||||
static const unsigned int utf8repl_lettermarklower_BYTES = 1;
|
||||
static const unsigned int utf8repl_lettermarklower_LOSUB = 0x5b5b5b5b;
|
||||
static const unsigned int utf8repl_lettermarklower_HIADD = 0x00000000;
|
||||
|
||||
static const uint8 utf8repl_lettermarklower[] = {
|
||||
// state[0] 0x000000 Byte 1
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0,S11,S11,S11,S11,S11,S11,S11, S11,S11,S11,S11,S11,S11,S11,S11,
|
||||
S11,S11,S11,S11,S11,S11,S11,S11, S11,S11,S11, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
X__,X__, 6, 11, 13, 16, 19, 22, 25, 28, 6, 6, 6, 31, 33, 36,
|
||||
39, 42, 44, 46, 48, 51, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
7, 54, 74, 8, 8, 8, 8, 8, 8, 8, 88, 8, 8, 8, 8,100,
|
||||
104, 9, 9, 9, 10,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[6 + 2] 0x000080 Byte 2 of 2
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
// state[7 + 2] 0x000000 Byte 2 of 3
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
|
||||
// state[8 + 2] 0x003000 Byte 2 of 3
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
|
||||
// state[9 + 2] 0x040000 Byte 2 of 4
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
|
||||
// state[10 + 2] 0x100000 Byte 2 of 4
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
|
||||
// state[11 + 2] 0x0000c0 Byte 2 of 2
|
||||
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
S1_,S1_,S1_,S1_,S1_,S1_,S1_, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
|
||||
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0x00, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[13 + 2] 0x000100 Byte 2 of 2
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S21, 0,S1_, 0,S1_, 0,S1_, 0, 0,S1_, 0,S1_, 0,S1_, 0,S2_,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0x69,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0x00,0xba,0x00,0xbc,0x00,0xbe,0x00,0x80,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xc5,
|
||||
|
||||
// state[16 + 2] 0x000140 Byte 2 of 2
|
||||
0,S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S2_,S1_, 0,S1_, 0,S1_, 0, 0,
|
||||
|
||||
0x00,0x82,0x00,0x84,0x00,0x86,0x00,0x88, 0x00,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xbf,0xba,0x00,0xbc,0x00,0xbe,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xc3,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[19 + 2] 0x000180 Byte 2 of 2
|
||||
0,S2_,S1_, 0,S1_, 0,S2_,S1_, 0,S2_,S2_,S1_, 0, 0,S2_,S2_,
|
||||
S2_,S1_, 0,S2_,S2_, 0,S2_,S2_, S1_, 0, 0, 0,S2_,S2_, 0,S2_,
|
||||
S1_, 0,S1_, 0,S1_, 0,S2_,S1_, 0,S2_, 0, 0,S1_, 0,S2_,S1_,
|
||||
0,S2_,S2_,S1_, 0,S1_, 0,S2_, S1_, 0, 0, 0,S1_, 0, 0, 0,
|
||||
|
||||
0x00,0x93,0x83,0x00,0x85,0x00,0x94,0x88, 0x00,0x96,0x97,0x8c,0x00,0x00,0x9d,0x99,
|
||||
0x9b,0x92,0x00,0xa0,0xa3,0x00,0xa9,0xa8, 0x99,0x00,0x00,0x00,0xaf,0xb2,0x00,0xb5,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0x80,0xa8, 0x00,0x83,0x00,0x00,0xad,0x00,0x88,0xb0,
|
||||
0x00,0x8a,0x8b,0xb4,0x00,0xb6,0x00,0x92, 0xb9,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
|
||||
|
||||
0x00,0xc9,0x00,0x00,0x00,0x00,0xc9,0x00, 0x00,0xc9,0xc9,0x00,0x00,0x00,0xc7,0xc9,
|
||||
0xc9,0x00,0x00,0xc9,0xc9,0x00,0xc9,0xc9, 0x00,0x00,0x00,0x00,0xc9,0xc9,0x00,0xc9,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0xca,0x00, 0x00,0xca,0x00,0x00,0x00,0x00,0xca,0x00,
|
||||
0x00,0xca,0xca,0x00,0x00,0x00,0x00,0xca, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[22 + 2] 0x0001c0 Byte 2 of 2
|
||||
0, 0, 0, 0,S1_,S1_, 0,S1_, S1_, 0,S1_,S1_, 0,S1_, 0,S1_,
|
||||
0,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
0,S1_,S1_, 0,S1_, 0,S2_,S2_, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x86,0x86,0x00,0x89, 0x89,0x00,0x8c,0x8c,0x00,0x8e,0x00,0x90,
|
||||
0x00,0x92,0x00,0x94,0x00,0x96,0x00,0x98, 0x00,0x9a,0x00,0x9c,0x00,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0x00,0xb3,0xb3,0x00,0xb5,0x00,0x95,0xbf, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0xc6,0xc6, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[25 + 2] 0x000200 Byte 2 of 2
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S2_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0,T1_,S1_, 0,S2_,T1_, 0,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0x9e,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0xb1,0x00,0xb3,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0xbc,0x00,0x9a,0x01,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0xc6,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0xc6,0x00,0x00,
|
||||
|
||||
// state[28 + 2] 0x000240 Byte 2 of 2
|
||||
0,S1_, 0,S2_,S2_,S2_,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x00,0x82,0x00,0x80,0x89,0x8c,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0xc6,0xca,0xca,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[31 + 2] 0x000340 Byte 2 of 2
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
S1_, 0,S1_, 0, 0, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0xb1,0x00,0xb3,0x00,0x00,0x00,0xb7,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[33 + 2] 0x000380 Byte 2 of 2
|
||||
0, 0, 0, 0, 0, 0,S1_, 0, S1_,S1_,S1_, 0,S2_, 0,S2_,S2_,
|
||||
0,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
S2_,S2_, 0,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0xac,0x00, 0xad,0xae,0xaf,0x00,0x8c,0x00,0x8d,0x8e,
|
||||
0x00,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
|
||||
0x80,0x81,0x00,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0xcf,0x00,0xcf,0xcf,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0xcf,0xcf,0x00,0xcf,0xcf,0xcf,0xcf,0xcf, 0xcf,0xcf,0xcf,0xcf,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[36 + 2] 0x0003c0 Byte 2 of 2
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,S1_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
0, 0, 0, 0,S2_, 0, 0,S1_, 0,S1_,S1_, 0, 0,S2_,S2_,S2_,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x97,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0x00,0x00,0x00,0x00,0xb8,0x00,0x00,0xb8, 0x00,0xb2,0xbb,0x00,0x00,0xbb,0xbc,0xbd,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0xce,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0xcd,0xcd,0xcd,
|
||||
|
||||
// state[39 + 2] 0x000400 Byte 2 of 2
|
||||
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
|
||||
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
|
||||
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
|
||||
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1, 0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1, 0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,0xd1,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[42 + 2] 0x000440 Byte 2 of 2
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
|
||||
|
||||
// state[44 + 2] 0x000480 Byte 2 of 2
|
||||
S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
|
||||
0x81,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
|
||||
|
||||
// state[46 + 2] 0x0004c0 Byte 2 of 2
|
||||
S1_,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
|
||||
0x8f,0x82,0x00,0x84,0x00,0x86,0x00,0x88, 0x00,0x8a,0x00,0x8c,0x00,0x8e,0x00,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
|
||||
|
||||
// state[48 + 2] 0x000500 Byte 2 of 2
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7, 0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0xd5,0xd5,0xd5,0xd5,0xd5,0xd5,0xd5, 0xd5,0xd5,0xd5,0xd5,0xd5,0xd5,0xd5,0xd5,
|
||||
|
||||
// state[51 + 2] 0x000540 Byte 2 of 2
|
||||
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
S2_,S2_,S2_,S2_,S2_,S2_,S2_, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
|
||||
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0xd6,0xd6,0xd6,0xd6,0xd6,0xd6,0xd6,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[54 + 2] 0x001000 Byte 2 of 3
|
||||
6, 6, 55, 57, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 59, 59, 61, 59, 64, 66, 68, 71,
|
||||
|
||||
// state[55 + 2] 0x001080 Byte 3 of 3
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
T1_,T1_,T1_,T1_,T1_,T1_,T1_,T1_, T1_,T1_,T1_,T1_,T1_,T1_,T1_,T1_,
|
||||
T1_,T1_,T1_,T1_,T1_,T1_,T1_,T1_, T1_,T1_,T1_,T1_,T1_,T1_,T1_,T1_,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09, 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x10,0x11,
|
||||
0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19, 0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,0x21,
|
||||
|
||||
// state[57 + 2] 0x0010c0 Byte 3 of 3
|
||||
T1_,T1_,T1_,T1_,T1_,T1_, 0,T1_, 0, 0, 0, 0, 0,T1_, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x22,0x23,0x24,0x25,0x26,0x27,0x00,0x28, 0x00,0x00,0x00,0x00,0x00,0x29,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[59 + 2] 0x001e00 Byte 3 of 3
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
|
||||
|
||||
// state[61 + 2] 0x001e80 Byte 3 of 3
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,S32, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0xb1,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0xc3,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[64 + 2] 0x001f00 Byte 3 of 3
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x90,0x91,0x92,0x93,0x94,0x95,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,
|
||||
|
||||
// state[66 + 2] 0x001f40 Byte 3 of 3
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0,S1_, 0,S1_, 0,S1_, 0,S1_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x80,0x81,0x82,0x83,0x84,0x85,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x91,0x00,0x93,0x00,0x95,0x00,0x97,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[68 + 2] 0x001f80 Byte 3 of 3
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S2_,S2_,S1_, 0, 0, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xb0,0xb1,0xb0,0xb1,0xb3,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0xbd,0xbd,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[71 + 2] 0x001fc0 Byte 3 of 3
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S2_,S2_,S2_,S2_,S1_, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S2_,S2_, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S1_,S1_,S2_,S2_,S1_, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, S2_,S2_,S2_,S2_,S1_, 0, 0, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xb2,0xb3,0xb4,0xb5,0x83,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x90,0x91,0xb6,0xb7,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xa0,0xa1,0xba,0xbb,0xa5,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xb8,0xb9,0xbc,0xbd,0xb3,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xbd,0xbd,0xbd,0xbd,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0xbd,0xbd,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0xbd,0xbd,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0xbd,0xbd,0xbd,0xbd,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[74 + 2] 0x002000 Byte 2 of 3
|
||||
6, 6, 6, 6, 75, 6, 78, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
80, 83, 59, 86, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
|
||||
// state[75 + 2] 0x002100 Byte 3 of 3
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0,S32, 0, 0, 0,S31,S32, 0, 0, 0, 0,
|
||||
0, 0,S2_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x89,0x00, 0x00,0x00,0x6b,0xa5,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x8e,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0xcf,0x00, 0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x85,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[78 + 2] 0x002180 Byte 3 of 3
|
||||
0, 0, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x00,0x00,0x00,0x84,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[80 + 2] 0x002c00 Byte 3 of 3
|
||||
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
|
||||
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7, 0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
|
||||
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
|
||||
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1, 0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,
|
||||
0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1, 0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0xb1,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[83 + 2] 0x002c40 Byte 3 of 3
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
S1_, 0,S32,T1_,S32, 0, 0,S1_, 0,S1_, 0,S1_, 0,S32,S32,S32,
|
||||
S32, 0,S1_, 0, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0,S32,S32,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0xa1,0x00,0xab,0x2a,0xbd,0x00,0x00,0xa8, 0x00,0xaa,0x00,0xac,0x00,0x91,0xb1,0x90,
|
||||
0x92,0x00,0xb3,0x00,0x00,0xb6,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0xbf,0x80,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0xc9,0x00,0xc9,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0xc9,0xc9,0xc9,
|
||||
0xc9,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0xc8,0xc9,
|
||||
|
||||
// state[86 + 2] 0x002cc0 Byte 3 of 3
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0,S1_, 0,S1_, 0, 0,
|
||||
0, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0xac,0x00,0xae,0x00,0x00,
|
||||
0x00,0x00,0xb3,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[88 + 2] 0x00a000 Byte 2 of 3
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 89, 91, 6, 93, 95, 97, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
|
||||
// state[89 + 2] 0x00a640 Byte 3 of 3
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[91 + 2] 0x00a680 Byte 3 of 3
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[93 + 2] 0x00a700 Byte 3 of 3
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
0, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0x00,0x00,0xb3,0x00,0xb5,0x00,0xb7,0x00, 0xb9,0x00,0xbb,0x00,0xbd,0x00,0xbf,0x00,
|
||||
|
||||
// state[95 + 2] 0x00a740 Byte 3 of 3
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S1_, 0,S1_, 0,S1_, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0,S1_, 0,S1_, 0,T1_,S1_, 0,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x89,0x00,0x8b,0x00,0x8d,0x00,0x8f,0x00,
|
||||
0x91,0x00,0x93,0x00,0x95,0x00,0x97,0x00, 0x99,0x00,0x9b,0x00,0x9d,0x00,0x9f,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xab,0x00,0xad,0x00,0xaf,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0xba,0x00,0xbc,0x00,0x2b,0xbf,0x00,
|
||||
|
||||
// state[97 + 2] 0x00a780 Byte 3 of 3
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, 0, 0, 0,S1_, 0,S32, 0, 0,
|
||||
S1_, 0,S1_, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
S1_, 0,S1_, 0,S1_, 0,S1_, 0, S1_, 0,S32, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0x81,0x00,0x83,0x00,0x85,0x00,0x87,0x00, 0x00,0x00,0x00,0x8c,0x00,0xa5,0x00,0x00,
|
||||
0x91,0x00,0x93,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0xa1,0x00,0xa3,0x00,0xa5,0x00,0xa7,0x00, 0xa9,0x00,0xa6,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0xc9,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0xc9,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[100 + 2] 0x00f000 Byte 2 of 3
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,101, 6, 6, 6,
|
||||
|
||||
// state[101 + 2] 0x00ff00 Byte 3 of 3
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
|
||||
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, S2_,S2_,S2_, 0, 0, 0, 0, 0,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
|
||||
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 0x98,0x99,0x9a,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd, 0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,
|
||||
0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd,0xbd, 0xbd,0xbd,0xbd,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
// state[104 + 2] 0x000000 Byte 2 of 4
|
||||
X__,X__,X__,X__,X__,X__,X__,X__, X__,X__,X__,X__,X__,X__,X__,X__,
|
||||
105, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
|
||||
// state[105 + 2] 0x010000 Byte 3 of 4
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
106, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
|
||||
// state[106 + 2] 0x010400 Byte 4 of 4
|
||||
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_,
|
||||
S1_,S1_,S1_,S1_,S1_,S1_,S1_,S1_, S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_,
|
||||
S2_,S2_,S2_,S2_,S2_,S2_,S2_,S2_, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf, 0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,
|
||||
0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf, 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
|
||||
0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x91,0x91,0x91,0x91,0x91,0x91,0x91,0x91,
|
||||
0x91,0x91,0x91,0x91,0x91,0x91,0x91,0x91, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
};
|
||||
|
||||
// Remap base[44] = (del, add, string_offset)
|
||||
// Replacement descriptors for the lettermarklower table. Each initializer is
// a RemapEntry {delete_bytes, add_bytes, bytes_offset}: 44 real entries
// followed by one all-zero entry (presumably an end marker -- TODO confirm
// against the table generator).
// Every entry adds 3 bytes, and bytes_offset advances by 3 from 0 to 129
// through utf8repl_lettermarklower_remap_string. The first two entries delete
// 2 source bytes; all the others delete 3.
static const RemapEntry utf8repl_lettermarklower_remap_base[] = {
|
||||
{2,3, 0}, {2,3, 3}, {3,3, 6}, {3,3, 9},
|
||||
{3,3, 12}, {3,3, 15}, {3,3, 18}, {3,3, 21},
|
||||
{3,3, 24}, {3,3, 27}, {3,3, 30}, {3,3, 33},
|
||||
{3,3, 36}, {3,3, 39}, {3,3, 42}, {3,3, 45},
|
||||
|
||||
{3,3, 48}, {3,3, 51}, {3,3, 54}, {3,3, 57},
|
||||
{3,3, 60}, {3,3, 63}, {3,3, 66}, {3,3, 69},
|
||||
{3,3, 72}, {3,3, 75}, {3,3, 78}, {3,3, 81},
|
||||
{3,3, 84}, {3,3, 87}, {3,3, 90}, {3,3, 93},
|
||||
|
||||
{3,3, 96}, {3,3, 99}, {3,3, 102}, {3,3, 105},
|
||||
{3,3, 108}, {3,3, 111}, {3,3, 114}, {3,3, 117},
|
||||
{3,3, 120}, {3,3, 123}, {3,3, 126}, {3,3, 129},
|
||||
{0,0,0} };  // all-zero final entry
|
||||
|
||||
// Remap string[132]
|
||||
// Replacement bytes for the lettermarklower table: 132 bytes holding 44
// three-byte sequences packed back to back (lead byte 0xe2 or 0xe1, each
// followed by two bytes in 0x80..0xbf), then a single trailing 0.
// The sequences are not aligned to the 16-byte rows below; a sequence may
// start on one row and finish on the next.
static const unsigned char utf8repl_lettermarklower_remap_string[] = {
|
||||
0xe2,0xb1,0xa5,0xe2,0xb1,0xa6,0xe2,0xb4, 0x80,0xe2,0xb4,0x81,0xe2,0xb4,0x82,0xe2,
|
||||
0xb4,0x83,0xe2,0xb4,0x84,0xe2,0xb4,0x85, 0xe2,0xb4,0x86,0xe2,0xb4,0x87,0xe2,0xb4,
|
||||
0x88,0xe2,0xb4,0x89,0xe2,0xb4,0x8a,0xe2, 0xb4,0x8b,0xe2,0xb4,0x8c,0xe2,0xb4,0x8d,
|
||||
0xe2,0xb4,0x8e,0xe2,0xb4,0x8f,0xe2,0xb4, 0x90,0xe2,0xb4,0x91,0xe2,0xb4,0x92,0xe2,
|
||||
|
||||
0xb4,0x93,0xe2,0xb4,0x94,0xe2,0xb4,0x95, 0xe2,0xb4,0x96,0xe2,0xb4,0x97,0xe2,0xb4,
|
||||
0x98,0xe2,0xb4,0x99,0xe2,0xb4,0x9a,0xe2, 0xb4,0x9b,0xe2,0xb4,0x9c,0xe2,0xb4,0x9d,
|
||||
0xe2,0xb4,0x9e,0xe2,0xb4,0x9f,0xe2,0xb4, 0xa0,0xe2,0xb4,0xa1,0xe2,0xb4,0xa2,0xe2,
|
||||
0xb4,0xa3,0xe2,0xb4,0xa4,0xe2,0xb4,0xa5, 0xe2,0xb4,0xa7,0xe2,0xb4,0xad,0xe1,0xb5,
|
||||
|
||||
0xbd,0xe1,0xb5,0xb9,0 };  // last two sequences, then the trailing 0
|
||||
|
||||
// Per-byte flag table indexed by byte value (16 entries per row).
// The flag is 1 for 0x41..0x5a (ASCII 'A'..'Z') and for every byte
// 0x80..0xff; it is 0 for all other bytes.
// NOTE(review): presumably lets the replace loop skip bytes that can never
// begin a replacement -- confirm against the UTF8GenericReplace implementation.
static const unsigned char utf8repl_lettermarklower_fast[256] = {
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x00..0x0f
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x10..0x1f
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x20..0x2f
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x30..0x3f
|
||||
|
||||
0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x40..0x4f: '@' is 0, 'A'..'O' are 1
|
||||
1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,  // 0x50..0x5f: 'P'..'Z' are 1
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x60..0x6f
|
||||
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x70..0x7f
|
||||
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x80..0x8f
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x90..0x9f
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xa0..0xaf
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xb0..0xbf
|
||||
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xc0..0xcf
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xd0..0xdf
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xe0..0xef
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xf0..0xff
|
||||
|
||||
};
|
||||
|
||||
// Bundles the lettermarklower state table, remap data and fast table into one
// UTF8ReplaceObj. The initializers are positional, so their order must match
// the field order of UTF8StateMachineObj; the field each one fills is noted
// on its line.
static const UTF8ReplaceObj utf8repl_lettermarklower_obj = {
|
||||
utf8repl_lettermarklower_STATE0,  // state0
|
||||
utf8repl_lettermarklower_STATE0_SIZE,  // state0_size
|
||||
utf8repl_lettermarklower_TOTAL_SIZE,  // total_size
|
||||
utf8repl_lettermarklower_MAX_EXPAND_X4,  // max_expand (note the _X4 constant name)
|
||||
utf8repl_lettermarklower_SHIFT,  // entry_shift
|
||||
utf8repl_lettermarklower_BYTES,  // bytes_per_entry
|
||||
utf8repl_lettermarklower_LOSUB,  // losub
|
||||
utf8repl_lettermarklower_HIADD,  // hiadd
|
||||
utf8repl_lettermarklower,  // state_table
|
||||
utf8repl_lettermarklower_remap_base,  // remap_base
|
||||
utf8repl_lettermarklower_remap_string,  // remap_string
|
||||
utf8repl_lettermarklower_fast  // fast_state
|
||||
};
|
||||
|
||||
|
||||
#undef X__
|
||||
#undef RJ_
|
||||
#undef S1_
|
||||
#undef S2_
|
||||
#undef S3_
|
||||
#undef S21
|
||||
#undef S31
|
||||
#undef S32
|
||||
#undef T1_
|
||||
#undef T2_
|
||||
#undef S11
|
||||
#undef SP_
|
||||
#undef D__
|
||||
#undef RJA
|
||||
|
||||
// Table has 7668 bytes, Hash = 07A2-C4E3
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
#endif // UTF8REPL_LETTERMARKLOWER_H__
|
||||
1453
internal/utf8scannot_lettermarkspecial.h
Normal file
1453
internal/utf8scannot_lettermarkspecial.h
Normal file
File diff suppressed because it is too large
Load Diff
1369
internal/utf8statetable.cc
Normal file
1369
internal/utf8statetable.cc
Normal file
File diff suppressed because it is too large
Load Diff
283
internal/utf8statetable.h
Normal file
283
internal/utf8statetable.h
Normal file
@@ -0,0 +1,283 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// State Table follower for scanning UTF-8 strings without converting to
|
||||
// 32- or 16-bit Unicode values.
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
#ifndef UTIL_UTF8_UTF8STATETABLE_H_
|
||||
#define UTIL_UTF8_UTF8STATETABLE_H_
|
||||
|
||||
#include <string>
|
||||
#include "integral_types.h" // for uint8, uint32, uint16
|
||||
#include "stringpiece.h"
|
||||
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
class OffsetMap;
|
||||
|
||||
|
||||
// These four-byte entries compactly encode how many bytes 0..255 to delete
|
||||
// in making a string replacement, how many bytes to add 0..255, and the offset
|
||||
// 0..64k-1 of the replacement string in remap_string.
|
||||
// One replacement descriptor: how many source bytes to drop, how many
// replacement bytes to insert, and where those replacement bytes live.
struct RemapEntry {
|
||||
uint8 delete_bytes;  // Number of bytes to delete from the input, 0..255
|
||||
uint8 add_bytes;  // Number of replacement bytes to add, 0..255
|
||||
uint16 bytes_offset;  // Offset 0..64k-1 of the replacement in remap_string
|
||||
};
|
||||
|
||||
// Exit type codes for state tables. All but the first get stuffed into
|
||||
// unsigned one-byte (uint8) entries. The first is only generated by executable code.
|
||||
// To distinguish from next-state entries, these must be contiguous and
|
||||
// all <= kExitNone
|
||||
// Exit codes for tables with one-byte entries. Only the first enumerator has
// an explicit value; the rest follow consecutively, so the codes occupy
// 239..255 and the order below must not be changed.
typedef enum {
|
||||
kExitDstSpaceFull = 239,
|
||||
kExitIllegalStructure, // 240
|
||||
kExitOK, // 241
|
||||
kExitReject, // 242
|
||||
kExitReplace1, // 243
|
||||
kExitReplace2, // 244
|
||||
kExitReplace3, // 245
|
||||
kExitReplace21, // 246
|
||||
kExitReplace31, // 247
|
||||
kExitReplace32, // 248
|
||||
kExitReplaceOffset1, // 249
|
||||
kExitReplaceOffset2, // 250
|
||||
kExitReplace1S0, // 251
|
||||
kExitSpecial, // 252
|
||||
kExitDoAgain, // 253
|
||||
kExitRejectAlt, // 254
|
||||
kExitNone // 255
|
||||
} ExitReason;
|
||||
|
||||
// Exit codes for tables with two-byte entries. Same enumerators and order as
// ExitReason, with a _2 suffix. Only the first has an explicit value; the
// rest follow consecutively, so the codes occupy 0x7fff..0x800f.
typedef enum {
|
||||
kExitDstSpaceFull_2 = 32767, // 0x7fff
|
||||
kExitIllegalStructure_2, // 32768 0x8000
|
||||
kExitOK_2, // 32769 0x8001
|
||||
kExitReject_2, // 32770 0x8002
|
||||
kExitReplace1_2, // 32771 0x8003
|
||||
kExitReplace2_2, // 32772 0x8004
|
||||
kExitReplace3_2, // 32773 0x8005
|
||||
kExitReplace21_2, // 32774 0x8006
|
||||
kExitReplace31_2, // 32775 0x8007
|
||||
kExitReplace32_2, // 32776 0x8008
|
||||
kExitReplaceOffset1_2, // 32777 0x8009
|
||||
kExitReplaceOffset2_2, // 32778 0x800a
|
||||
kExitReplace1S0_2, // 32779 0x800b
|
||||
kExitSpecial_2, // 32780 0x800c
|
||||
kExitDoAgain_2, // 32781 0x800d
|
||||
kExitRejectAlt_2, // 32782 0x800e
|
||||
kExitNone_2 // 32783 0x800f
|
||||
} ExitReason_2;
|
||||
|
||||
|
||||
// This struct represents one entire state table. The three initialized byte
|
||||
// areas are state_table, remap_base, and remap_string. state0 and state0_size
|
||||
// give the byte offset and length within state_table of the initial state --
|
||||
// table lookups are expected to start and end in this state, but for
|
||||
// truncated UTF-8 strings, may end in a different state. These allow a quick
|
||||
// test for that condition. entry_shift is 8 for tables subscripted by a full
|
||||
// byte value and 6 for space-optimized tables subscripted by only six
|
||||
// significant bits in UTF-8 continuation bytes.
|
||||
// State machine description for tables whose entries are one byte (uint8).
// Generated tables fill this struct with a positional initializer, so the
// field order below must not change.
typedef struct {
|
||||
const uint32 state0;  // Byte offset of the initial state within state_table
|
||||
const uint32 state0_size;  // Byte length of the initial state
|
||||
const uint32 total_size;  // Presumably total byte size of state_table -- TODO confirm
|
||||
const int max_expand;  // NOTE(review): units not documented here -- confirm
|
||||
const int entry_shift;  // 8: indexed by full byte value; 6: by six bits of continuation bytes
|
||||
const int bytes_per_entry;  // Presumably size of one state_table entry -- TODO confirm
|
||||
const uint32 losub;  // NOTE(review): meaning not documented here; see the scanner code
|
||||
const uint32 hiadd;  // NOTE(review): meaning not documented here; see the scanner code
|
||||
const uint8* state_table;  // Table entries, one byte each
|
||||
const RemapEntry* remap_base;  // Replacement descriptors (delete/add/offset)
|
||||
const uint8* remap_string;  // Replacement bytes addressed by RemapEntry::bytes_offset
|
||||
const uint8* fast_state;  // NOTE(review): per-byte table; purpose not documented here
|
||||
} UTF8StateMachineObj;
|
||||
|
||||
// Near-duplicate declaration for tables with two-byte entries
|
||||
// State machine description for tables whose entries are two bytes
// (unsigned short). Apart from the type of state_table, the fields and their
// order are the same as in UTF8StateMachineObj.
typedef struct {
|
||||
const uint32 state0;  // Offset of the initial state within state_table
|
||||
const uint32 state0_size;  // Length of the initial state
|
||||
const uint32 total_size;  // Presumably total size of state_table -- TODO confirm
|
||||
const int max_expand;  // NOTE(review): units not documented here -- confirm
|
||||
const int entry_shift;  // 8: indexed by full byte value; 6: by six bits of continuation bytes
|
||||
const int bytes_per_entry;  // Presumably size of one state_table entry -- TODO confirm
|
||||
const uint32 losub;  // NOTE(review): meaning not documented here; see the scanner code
|
||||
const uint32 hiadd;  // NOTE(review): meaning not documented here; see the scanner code
|
||||
const unsigned short* state_table;  // Table entries, two bytes each
|
||||
const RemapEntry* remap_base;  // Replacement descriptors (delete/add/offset)
|
||||
const uint8* remap_string;  // Replacement bytes addressed by RemapEntry::bytes_offset
|
||||
const uint8* fast_state;  // NOTE(review): per-byte table; purpose not documented here
|
||||
} UTF8StateMachineObj_2;
|
||||
|
||||
|
||||
typedef UTF8StateMachineObj UTF8PropObj;
|
||||
typedef UTF8StateMachineObj UTF8ScanObj;
|
||||
typedef UTF8StateMachineObj UTF8ReplaceObj;
|
||||
typedef UTF8StateMachineObj_2 UTF8PropObj_2;
|
||||
typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2;
|
||||
// NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2;
|
||||
|
||||
|
||||
// Look up property of one UTF-8 character and advance over it
|
||||
// Return 0 if input length is zero
|
||||
// Return 0 and advance one byte if input is ill-formed
|
||||
uint8 UTF8GenericProperty(const UTF8PropObj* st,
|
||||
const uint8** src,
|
||||
int* srclen);
|
||||
|
||||
// Look up property of one UTF-8 character (assumed to be valid).
|
||||
// (This is a faster version of UTF8GenericProperty.)
|
||||
bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src);
|
||||
|
||||
|
||||
// BigOneByte versions are needed for tables > 240 states, but most
|
||||
// won't need the TwoByte versions.
|
||||
|
||||
// Look up property of one UTF-8 character and advance over it
|
||||
// Return 0 if input length is zero
|
||||
// Return 0 and advance one byte if input is ill-formed
|
||||
uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
|
||||
const uint8** src,
|
||||
int* srclen);
|
||||
|
||||
|
||||
// TwoByte versions are needed for tables > 240 states that don't fit onto
|
||||
// BigOneByte -- rare ultimate fallback
|
||||
|
||||
// Look up property of one UTF-8 character (assumed to be valid).
|
||||
// (This is a faster version of UTF8GenericProperty.)
|
||||
bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src);
|
||||
|
||||
// Look up property of one UTF-8 character and advance over it
|
||||
// Return 0 if input length is zero
|
||||
// Return 0 and advance one byte if input is ill-formed
|
||||
uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,
|
||||
const uint8** src,
|
||||
int* srclen);
|
||||
|
||||
// Look up property of one UTF-8 character (assumed to be valid).
|
||||
// (This is a faster version of UTF8GenericProperty.)
|
||||
bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src);
|
||||
|
||||
// Scan a UTF-8 stringpiece based on a state table.
|
||||
// Always scan complete UTF-8 characters
|
||||
// Set number of bytes scanned. Return reason for exiting
|
||||
int UTF8GenericScan(const UTF8ScanObj* st,
|
||||
const StringPiece& str,
|
||||
int* bytes_consumed);
|
||||
|
||||
|
||||
|
||||
// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
|
||||
// and doing text replacements.
|
||||
// Always scan complete UTF-8 characters
|
||||
// Set number of bytes consumed from input, number filled to output.
|
||||
// Return reason for exiting
|
||||
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
|
||||
int UTF8GenericReplace(const UTF8ReplaceObj* st,
|
||||
const StringPiece& istr,
|
||||
StringPiece& ostr,
|
||||
bool is_plain_text,
|
||||
int* bytes_consumed,
|
||||
int* bytes_filled,
|
||||
int* chars_changed,
|
||||
OffsetMap* offsetmap);
|
||||
|
||||
// Older version without offsetmap
|
||||
int UTF8GenericReplace(const UTF8ReplaceObj* st,
|
||||
const StringPiece& istr,
|
||||
StringPiece& ostr,
|
||||
bool is_plain_text,
|
||||
int* bytes_consumed,
|
||||
int* bytes_filled,
|
||||
int* chars_changed);
|
||||
|
||||
// Older version without is_plain_text or offsetmap
|
||||
int UTF8GenericReplace(const UTF8ReplaceObj* st,
|
||||
const StringPiece& istr,
|
||||
StringPiece& ostr,
|
||||
int* bytes_consumed,
|
||||
int* bytes_filled,
|
||||
int* chars_changed);
|
||||
|
||||
|
||||
// TwoByte version is needed for tables > about 256 states, such
|
||||
// as the table for full Unicode 4.1 canonical + compatibility mapping
|
||||
|
||||
// Scan a UTF-8 stringpiece based on state table with two-byte entries,
|
||||
// copying to output stringpiece
|
||||
// and doing text replacements.
|
||||
// Always scan complete UTF-8 characters
|
||||
// Set number of bytes consumed from input, number filled to output.
|
||||
// Return reason for exiting
|
||||
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
|
||||
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
|
||||
const StringPiece& istr,
|
||||
StringPiece& ostr,
|
||||
bool is_plain_text,
|
||||
int* bytes_consumed,
|
||||
int* bytes_filled,
|
||||
int* chars_changed,
|
||||
OffsetMap* offsetmap);
|
||||
|
||||
// Older version without offsetmap
|
||||
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
|
||||
const StringPiece& istr,
|
||||
StringPiece& ostr,
|
||||
bool is_plain_text,
|
||||
int* bytes_consumed,
|
||||
int* bytes_filled,
|
||||
int* chars_changed);
|
||||
|
||||
// Older version without is_plain_text or offsetmap
|
||||
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
|
||||
const StringPiece& istr,
|
||||
StringPiece& ostr,
|
||||
int* bytes_consumed,
|
||||
int* bytes_filled,
|
||||
int* chars_changed);
|
||||
|
||||
|
||||
// Character length in bytes, indexed by the first byte of a UTF-8 character
// (32 entries per row). Bytes 0x80..0xbf map to 1 even though they are not
// lead bytes, and every byte 0xf0..0xff maps to 4, so a lookup always yields
// a length in 1..4.
static const unsigned char kUTF8LenTbl[256] = {
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x00..0x1f
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x20..0x3f
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x40..0x5f
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x60..0x7f
|
||||
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x80..0x9f
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xa0..0xbf
|
||||
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xc0..0xdf
|
||||
3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4  // 0xe0..0xef, 0xf0..0xff
|
||||
};
|
||||
|
||||
inline int UTF8OneCharLen(const char* in) {
|
||||
return kUTF8LenTbl[*reinterpret_cast<const uint8*>(in)];
|
||||
}
|
||||
|
||||
// Adjust a stringpiece to encompass complete UTF-8 characters.
|
||||
// The data pointer will be increased by 0..3 bytes to get to a character
|
||||
// boundary, and the length will then be decreased by 0..3 bytes
|
||||
// to encompass the last complete character.
|
||||
// This is useful especially when a UTF-8 string must be put into a fixed-
|
||||
// maximum-size buffer cleanly, such as a MySQL buffer.
|
||||
void UTF8TrimToChars(StringPiece* istr);
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
#endif // UTIL_UTF8_UTF8STATETABLE_H_
|
||||
300
public/compact_lang_det.h
Normal file
300
public/compact_lang_det.h
Normal file
@@ -0,0 +1,300 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
// NOTE:
|
||||
// Baybayin (ancient script of the Philippines) is detected as TAGALOG.
|
||||
// Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
|
||||
// HAITIAN_CREOLE is detected as such.
|
||||
// NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
|
||||
// PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
|
||||
// ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN.
|
||||
// BOSNIAN is not detected as such, but likely scores as Croatian or Serbian.
|
||||
// MONTENEGRIN is not detected as such, but likely scores as Serbian.
|
||||
// CROATIAN is detected in the Latin script
|
||||
// SERBIAN is detected in the Cyrillic and Latin scripts
|
||||
// Zhuang is detected in the Latin script only.
|
||||
//
|
||||
// The languages X_PIG_LATIN and X_KLINGON are detected in the
|
||||
// extended calls ExtDetectLanguageSummary().
|
||||
//
|
||||
// UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
|
||||
// is high enough. This happens with non-text input such as the bytes of a
|
||||
// JPEG, and also with text in languages outside the training set.
|
||||
//
|
||||
// The following languages are to be detected in multiple scripts:
|
||||
// AZERBAIJANI (Latin, Cyrillic*, Arabic*)
|
||||
// BURMESE (Latin, Myanmar)
|
||||
// HAUSA (Latin, Arabic)
|
||||
// KASHMIRI (Arabic, Devanagari)
|
||||
// KAZAKH (Latin, Cyrillic, Arabic)
|
||||
// KURDISH (Latin*, Arabic)
|
||||
// KYRGYZ (Cyrillic, Arabic)
|
||||
// LIMBU (Devanagari, Limbu)
|
||||
// MONGOLIAN (Cyrillic, Mongolian)
|
||||
// SANSKRIT (Latin, Devanagari)
|
||||
// SINDHI (Arabic, Devanagari)
|
||||
// TAGALOG (Latin, Tagalog)
|
||||
// TAJIK (Cyrillic, Arabic*)
|
||||
// TATAR (Latin, Cyrillic, Arabic)
|
||||
// TURKMEN (Latin, Cyrillic, Arabic)
|
||||
// UIGHUR (Latin, Cyrillic, Arabic)
|
||||
// UZBEK (Latin, Cyrillic, Arabic)
|
||||
//
|
||||
// * Due to a shortage of training text, AZERBAIJANI is not currently detected
|
||||
// in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in
|
||||
// Arabic script.
|
||||
//
|
||||
|
||||
#ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
|
||||
#define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
|
||||
|
||||
#include <vector>
|
||||
#include "../internal/lang_script.h" // For Language
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Scan interchange-valid UTF-8 bytes and detect most likely language,
|
||||
// or set of languages.
|
||||
//
|
||||
// Design goals:
|
||||
// Skip over big stretches of HTML tags
|
||||
// Able to return ranges of different languages
|
||||
// Relatively small tables and relatively fast processing
|
||||
// Thread safe
|
||||
//
|
||||
// For HTML documents, tags are skipped, along with <script> ... </script>
|
||||
// and <style> ... </style> sequences, and entities are expanded.
|
||||
//
|
||||
// We distinguish between bytes of the raw input buffer and bytes of non-tag
|
||||
// text letters. Since tags can be over 50% of the bytes of an HTML Page,
|
||||
// and are nearly all seven-bit ASCII English, we prefer to distinguish
|
||||
// language mixture fractions based on just the non-tag text.
|
||||
//
|
||||
// Inputs: text and text_length
|
||||
// Code skips HTML tags and expands HTML entities, unless
|
||||
// is_plain_text is true
|
||||
// Outputs:
|
||||
// language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
|
||||
// percent3 is an array of the text percentages 0..100 of the top 3 languages
|
||||
// text_bytes is the amount of non-tag/letters-only text found
|
||||
// is_reliable set true if the returned Language is some amount more
|
||||
// probable than the second-best Language. Calculation is a complex function
|
||||
// of the length of the text and the different-script runs of text.
|
||||
// Return value: the most likely Language for the majority of the input text
|
||||
// Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
|
||||
// defaults to ENGLISH.
|
||||
//
|
||||
// The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for
|
||||
// backwards compatibility with a different detector.
|
||||
//
|
||||
// The third version may return UNKNOWN_LANGUAGE, and also returns extended
|
||||
// language codes from lang_script.h
|
||||
//
|
||||
|
||||
|
||||
// Instead of individual arguments, pass in hints as an initialized struct
|
||||
// Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known.
|
||||
//
|
||||
// Pass in hints whenever possible; doing so improves detection accuracy. The
|
||||
// set of passed-in hints are all information that is external to the text
|
||||
// itself.
|
||||
//
|
||||
// The content_language_hint is intended to come from an HTTP header
|
||||
// Content-Language: field, the tld_hint from the hostname of a URL, the
|
||||
// encoding-hint from an encoding detector applied to the input
|
||||
// document, and the language hint from any other context you might have.
|
||||
// The lang= tags inside an HTML document will be picked up as hints
|
||||
// by code within the compact language detector.
|
||||
|
||||
typedef struct {
|
||||
const char* content_language_hint; // "mi,en" boosts Maori and English
|
||||
const char* tld_hint; // "id" boosts Indonesian
|
||||
int encoding_hint; // SJS boosts Japanese
|
||||
Language language_hint; // ITALIAN boosts it
|
||||
} CLDHints;
|
||||
|
||||
static const int kMaxResultChunkBytes = 65535;
|
||||
|
||||
// For returning a vector of per-language pieces of the input buffer
|
||||
// Unreliable and too-short are mapped to UNKNOWN_LANGUAGE
|
||||
typedef struct {
|
||||
int offset; // Starting byte offset in original buffer
|
||||
uint16 bytes; // Number of bytes in chunk
|
||||
uint16 lang1; // Top lang, as full Language. Apply
|
||||
// static_cast<Language>() to this short value.
|
||||
} ResultChunk;
|
||||
typedef std::vector<ResultChunk> ResultChunkVector;
|
||||
|
||||
|
||||
// Scan interchange-valid UTF-8 bytes and detect most likely language
|
||||
Language DetectLanguage(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
bool* is_reliable);
|
||||
|
||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
|
||||
// language3[0] is usually also the return value
|
||||
Language DetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable);
|
||||
|
||||
// Same as above, with hints supplied
|
||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
|
||||
// language3[0] is usually also the return value
|
||||
Language DetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
const char* tld_hint, // "id" boosts Indonesian
|
||||
int encoding_hint, // SJS boosts Japanese
|
||||
Language language_hint, // ITALIAN boosts it
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable);
|
||||
|
||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
|
||||
// languages.
|
||||
//
|
||||
// Extended languages are additional interface languages and Unicode
|
||||
// single-language scripts, from lang_script.h
|
||||
//
|
||||
// language3[0] is usually also the return value
|
||||
Language ExtDetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable);
|
||||
|
||||
// Same as above, with hints supplied
|
||||
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
|
||||
// languages.
|
||||
//
|
||||
// Extended languages are additional Google interface languages and Unicode
|
||||
// single-language scripts, from lang_script.h
|
||||
//
|
||||
// language3[0] is usually also the return value
|
||||
Language ExtDetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
const char* tld_hint, // "id" boosts Indonesian
|
||||
int encoding_hint, // SJS boosts Japanese
|
||||
Language language_hint, // ITALIAN boosts it
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable);
|
||||
|
||||
// Same as above, and also returns 3 internal language scores as a ratio to
|
||||
// normal score for real text in that language. Scores close to 1.0 indicate
|
||||
// normal text, while scores far away from 1.0 indicate badly-skewed text or
|
||||
// gibberish
|
||||
//
|
||||
Language ExtDetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
const char* tld_hint, // "id" boosts Indonesian
|
||||
int encoding_hint, // SJS boosts Japanese
|
||||
Language language_hint, // ITALIAN boosts it
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
double* normalized_score3,
|
||||
int* text_bytes,
|
||||
bool* is_reliable);
|
||||
|
||||
|
||||
// Use this one.
|
||||
// Hints are collected into a struct.
|
||||
// Flags are passed in (normally zero).
|
||||
//
|
||||
// Also returns 3 internal language scores as a ratio to
|
||||
// normal score for real text in that language. Scores close to 1.0 indicate
|
||||
// normal text, while scores far away from 1.0 indicate badly-skewed text or
|
||||
// gibberish
|
||||
//
|
||||
// Returns a vector of chunks in different languages, so that caller may
|
||||
// spell-check, translate, or otherwaise process different parts of the input
|
||||
// buffer in language-dependant ways.
|
||||
//
|
||||
Language ExtDetectLanguageSummary(
|
||||
const char* buffer,
|
||||
int buffer_length,
|
||||
bool is_plain_text,
|
||||
const CLDHints* cld_hints,
|
||||
int flags,
|
||||
Language* language3,
|
||||
int* percent3,
|
||||
double* normalized_score3,
|
||||
ResultChunkVector* resultchunkvector,
|
||||
int* text_bytes,
|
||||
bool* is_reliable);
|
||||
|
||||
// Return version text string.
// String is "code_version - data_build_date".
const char* DetectLanguageVersion();
|
||||
|
||||
|
||||
// Public use flags, debug output controls.
// Each value is a distinct single bit.
static const int kCLDFlagScoreAsQuads = 0x0100;  // Force Greek, etc. => quads
static const int kCLDFlagHtml =         0x0200;  // Debug HTML => stderr
static const int kCLDFlagCr =           0x0400;  // <cr> per chunk if HTML
static const int kCLDFlagVerbose =      0x0800;  // More debug HTML => stderr
static const int kCLDFlagQuiet =        0x1000;  // Less debug HTML => stderr
static const int kCLDFlagEcho =         0x2000;  // Echo input => stderr
|
||||
|
||||
|
||||
/***
|
||||
|
||||
Flag meanings:
|
||||
kCLDFlagScoreAsQuads
|
||||
Normally, several languages are detected solely by their Unicode script.
|
||||
Combined with appropriate lookup tables, this flag forces them instead
|
||||
to be detected via quadgrams. This can be a useful refinement when looking
|
||||
for meaningful text in these languages, instead of just character sets.
|
||||
The default tables do not support this use.
|
||||
kCLDFlagHtml
|
||||
For each detection call, write an HTML file to stderr, showing the text
|
||||
chunks and their detected languages.
|
||||
kCLDFlagCr
|
||||
In that HTML file, force a new line for each chunk.
|
||||
kCLDFlagVerbose
|
||||
In that HTML file, show every lookup entry.
|
||||
kCLDFlagQuiet
|
||||
In that HTML file, suppress most of the output detail.
|
||||
kCLDFlagEcho
|
||||
Echo every input buffer to stderr.
|
||||
***/
|
||||
|
||||
// Debug output: Print the resultchunkvector to file f
|
||||
void DumpResultChunkVector(FILE* f, const char* src,
|
||||
ResultChunkVector* resultchunkvector);
|
||||
|
||||
}; // End namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
|
||||
169
public/encodings.h
Normal file
169
public/encodings.h
Normal file
@@ -0,0 +1,169 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
#ifndef I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__
|
||||
#define I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__
|
||||
|
||||
namespace CLD2 {

// Identifiers for text encodings. Every enumerator carries an explicit value;
// NUM_ENCODINGS must stay last.
enum Encoding {
  ISO_8859_1           =  0,  // ASCII
  ISO_8859_2           =  1,  // Latin2
  ISO_8859_3           =  2,  // Latin3
  ISO_8859_4           =  3,  // Latin4
  ISO_8859_5           =  4,  // ISO-8859-5
  ISO_8859_6           =  5,  // Arabic
  ISO_8859_7           =  6,  // Greek
  ISO_8859_8           =  7,  // Hebrew
  ISO_8859_9           =  8,  // Latin5
  ISO_8859_10          =  9,  // Latin6
  JAPANESE_EUC_JP      = 10,  // EUC_JP
  JAPANESE_SHIFT_JIS   = 11,  // SJS
  JAPANESE_JIS         = 12,  // JIS
  CHINESE_BIG5         = 13,  // BIG5
  CHINESE_GB           = 14,  // GB
  CHINESE_EUC_CN       = 15,  // Misnamed. Should be EUC_TW. Was Basis Tech
                              // CNS11643EUC, before that EUC-CN(!)
  KOREAN_EUC_KR        = 16,  // KSC
  UNICODE              = 17,  // Unicode
  CHINESE_EUC_DEC      = 18,  // Misnamed. Should be EUC_TW. Was
                              // CNS11643EUC, before that EUC.
  CHINESE_CNS          = 19,  // Misnamed. Should be EUC_TW. Was
                              // CNS11643EUC, before that CNS.
  CHINESE_BIG5_CP950   = 20,  // BIG5_CP950
  JAPANESE_CP932       = 21,  // CP932
  UTF8                 = 22,
  UNKNOWN_ENCODING     = 23,
  ASCII_7BIT           = 24,  // ISO_8859_1 with all characters <= 127.
  RUSSIAN_KOI8_R       = 25,  // KOI8R
  RUSSIAN_CP1251       = 26,  // CP1251

  //----------------------------------------------------------
  MSFT_CP1252          = 27,  // CP1252 aka MSFT euro ascii
  RUSSIAN_KOI8_RU      = 28,  // CP21866 aka KOI8-U, used for Ukrainian.
                              // Misnamed, this is _not_ KOI8-RU but KOI8-U.
                              // KOI8-U is used much more often than KOI8-RU.
  MSFT_CP1250          = 29,  // CP1250 aka MSFT eastern european
  ISO_8859_15          = 30,  // aka ISO_8859_0 aka ISO_8859_1 euroized

  //----------------------------------------------------------
  MSFT_CP1254          = 31,  // used for Turkish
  MSFT_CP1257          = 32,  // used in Baltic countries

  //----------------------------------------------------------
  ISO_8859_11          = 33,  // aka TIS-620, used for Thai
  MSFT_CP874           = 34,  // used for Thai
  MSFT_CP1256          = 35,  // used for Arabic

  //----------------------------------------------------------
  MSFT_CP1255          = 36,  // Logical Hebrew Microsoft
  ISO_8859_8_I         = 37,  // Iso Hebrew Logical
  HEBREW_VISUAL        = 38,  // Iso Hebrew Visual

  //----------------------------------------------------------
  CZECH_CP852          = 39,
  CZECH_CSN_369103     = 40,  // aka ISO_IR_139 aka KOI8_CS
  MSFT_CP1253          = 41,  // used for Greek
  RUSSIAN_CP866        = 42,

  //----------------------------------------------------------
  // Handled by iconv in glibc
  ISO_8859_13          = 43,
  ISO_2022_KR          = 44,
  GBK                  = 45,
  GB18030              = 46,
  BIG5_HKSCS           = 47,
  ISO_2022_CN          = 48,

  //-----------------------------------------------------------
  // Following 4 encodings are deprecated (font encodings)
  TSCII                = 49,
  TAMIL_MONO           = 50,
  TAMIL_BI             = 51,
  JAGRAN               = 52,

  MACINTOSH_ROMAN      = 53,
  UTF7                 = 54,

  //-----------------------------------------------------------
  // Following 2 encodings are deprecated (font encodings)
  BHASKAR              = 55,  // Indic encoding - Devanagari
  HTCHANAKYA           = 56,  // Indic encoding - Devanagari

  //-----------------------------------------------------------
  UTF16BE              = 57,  // big-endian UTF-16
  UTF16LE              = 58,  // little-endian UTF-16
  UTF32BE              = 59,  // big-endian UTF-32
  UTF32LE              = 60,  // little-endian UTF-32

  //-----------------------------------------------------------
  // An encoding that means "This is not text, but it may have some
  // simple ASCII text embedded". Intended input conversion
  // is to keep strings of >=4 seven-bit ASCII characters
  BINARYENC            = 61,

  //-----------------------------------------------------------
  // Some Web pages allow a mixture of HZ-GB and GB-2312 by using
  // ~{ ... ~} for 2-byte pairs, and the browsers support this.
  HZ_GB_2312           = 62,

  //-----------------------------------------------------------
  // Some external vendors make the common input error of
  // converting MSFT_CP1252 to UTF8 *twice*.
  UTF8UTF8             = 63,

  //-----------------------------------------------------------
  // Following 6 encodings are deprecated (font encodings)
  TAM_ELANGO           = 64,  // Elango - Tamil
  TAM_LTTMBARANI       = 65,  // Barani - Tamil
  TAM_SHREE            = 66,  // Shree - Tamil
  TAM_TBOOMIS          = 67,  // TBoomis - Tamil
  TAM_TMNEWS           = 68,  // TMNews - Tamil
  TAM_WEBTAMIL         = 69,  // Webtamil - Tamil

  //-----------------------------------------------------------
  // Shift_JIS variants used by Japanese cell phone carriers.
  KDDI_SHIFT_JIS       = 70,
  DOCOMO_SHIFT_JIS     = 71,
  SOFTBANK_SHIFT_JIS   = 72,
  // ISO-2022-JP variants used by KDDI and SoftBank.
  KDDI_ISO_2022_JP     = 73,
  SOFTBANK_ISO_2022_JP = 74,

  //-----------------------------------------------------------
  NUM_ENCODINGS        = 75,  // Always keep this at the end. It is not a
                              // valid Encoding enum, it is only used to
                              // indicate the total number of Encodings.
};

}  // End namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__
|
||||
|
||||
|
||||
Reference in New Issue
Block a user