From 9f083bb52192fd7f8e3a4b9e3d398908bff1e012 Mon Sep 17 00:00:00 2001 From: Simon Rozman Date: Tue, 14 Mar 2017 14:14:39 +0100 Subject: [PATCH] Character-Language table extended to support multi-UTF-16 characters --- ZRColaCompile/dbsource.cpp | 2 +- ZRColaCompile/dbsource.h | 2 +- ZRColaCompile/main.cpp | 6 +- lib/libZRCola/include/zrcola/language.h | 70 ++++-------------------- lib/libZRCola/src/language.cpp | 37 +++---------- output/data/ZRCola.zrcdb | Bin 2332334 -> 2333196 bytes 6 files changed, 26 insertions(+), 91 deletions(-) diff --git a/ZRColaCompile/dbsource.cpp b/ZRColaCompile/dbsource.cpp index c31edc0..d57929c 100644 --- a/ZRColaCompile/dbsource.cpp +++ b/ZRColaCompile/dbsource.cpp @@ -797,7 +797,7 @@ bool ZRCola::DBSource::GetLanguageCharacter(const com_obj& rs, ZRC { com_obj f; wxVERIFY(SUCCEEDED(flds->get_Item(variant(L"znak"), &f))); - wxCHECK(GetUnicodeCharacter(f, lc.chr), false); + wxCHECK(GetUnicodeString(f, lc.chr), false); } { diff --git a/ZRColaCompile/dbsource.h b/ZRColaCompile/dbsource.h index 6de0a9b..ba39bd3 100644 --- a/ZRColaCompile/dbsource.h +++ b/ZRColaCompile/dbsource.h @@ -134,7 +134,7 @@ namespace ZRCola { /// class langchar { public: - wchar_t chr; ///> Character + std::wstring chr; ///> Character ZRCola::langid_t lang; ///< Language ID }; diff --git a/ZRColaCompile/main.cpp b/ZRColaCompile/main.cpp index 0034e8a..5bc5974 100644 --- a/ZRColaCompile/main.cpp +++ b/ZRColaCompile/main.cpp @@ -383,9 +383,13 @@ int _tmain(int argc, _TCHAR *argv[]) if (src.GetLanguageCharacter(rs, lc)) { // Add language characters to index and data. unsigned __int32 idx = db.data.size(); - db.data.push_back(lc.chr); for (wstring::size_type i = 0; i < sizeof(ZRCola::langid_t)/sizeof(unsigned __int16); i++) db.data.push_back(((const unsigned __int16*)lc.lang.data)[i]); + wstring::size_type n = lc.chr.length(); + wxASSERT_MSG(n <= 0xffff, wxT("character string too long")); + db.data.push_back((unsigned __int16)n); + for (wstring::size_type i = 0; i < n; i++) + db.data.push_back(lc.chr[i]); db.idxChr.push_back(idx); #ifdef ZRCOLA_LANGCHAR_LANG_IDX db.idxLng.push_back(idx); diff --git a/lib/libZRCola/include/zrcola/language.h b/lib/libZRCola/include/zrcola/language.h index b3036b9..1bfe4e1 100644 --- a/lib/libZRCola/include/zrcola/language.h +++ b/lib/libZRCola/include/zrcola/language.h @@ -45,8 +45,9 @@ namespace ZRCola { /// Character data /// struct langchar { - wchar_t chr; ///> Character langid_t lang; ///< Language ID + unsigned __int16 chr_len; ///< \c chr length (in UTF-16 characters) + wchar_t chr[]; ///< Character }; #pragma pack(pop) @@ -76,27 +77,8 @@ namespace ZRCola { /// virtual int compare(_In_ const langchar &a, _In_ const langchar &b) const { - if (a.chr < b.chr) return -1; - else if (a.chr > b.chr) return 1; - - return 0; - } - - /// - /// Compares two characters by ID (for sorting) - /// - /// \param[in] a Pointer to first element - /// \param[in] b Pointer to second element - /// - /// \returns - /// - <0 when a < b - /// - =0 when a == b - /// - >0 when a > b - /// - virtual int compare_sort(_In_ const langchar &a, _In_ const langchar &b) const - { - if (a.chr < b.chr) return -1; - else if (a.chr > b.chr) return 1; + int r = ZRCola::CompareString(a.chr, a.chr + a.chr_len, b.chr, b.chr + b.chr_len); + if (r != 0) return r; if (a.lang < b.lang) return -1; else if (a.lang > b.lang) return 1; @@ -133,33 +115,14 @@ namespace ZRCola { /// virtual int compare(_In_ const langchar &a, _In_ const langchar &b) const { - int r = memcmp(a.lang, b.lang, sizeof(langid_t)); + if (a.lang < b.lang) return -1; + else if (a.lang > b.lang) return 1; + + int r = ZRCola::CompareString(a.chr, a.chr + a.chr_len, b.chr, b.chr + b.chr_len); if (r != 0) return r; return 0; } - - /// - /// Compares two languages by ID (for sorting) - /// - /// \param[in] a Pointer to first element - /// \param[in] b Pointer to second element - /// - /// \returns - /// - <0 when a < b - /// - =0 when a == b - /// - >0 when a > b - /// - virtual int compare_sort(_In_ const langchar &a, _In_ const langchar &b) const - { - int r = memcmp(a.lang, b.lang, sizeof(langid_t)); - if (r != 0) return r; - - if (a.chr < b.chr) return -1; - else if (a.chr > b.chr) return 1; - - return 0; - } } idxLng; ///< Character language index #endif @@ -190,19 +153,8 @@ namespace ZRCola { /// /// Tests presence of character in the given language /// - /// \param[in] chr Character (UTF-16) - /// \param[in] lang Language - /// - /// \returns - /// - \c true when character is used in language - /// - \c false otherwise - bool IsLocalCharacter(_In_ wchar_t chr, _In_ langid_t lang) const; - - /// - /// Tests presence of character in the given language - /// - /// \param[in] chr Pointer to UTF-16 character start - /// \param[in] chr_end Pointer to UTF-16 character end + /// \param[in] chr Pointer to character + /// \param[in] chr_end Pointer to character end /// \param[in] lang Language /// /// \returns @@ -227,7 +179,7 @@ namespace ZRCola { /// struct language { langid_t id; ///< Language ID - unsigned __int16 name_len; ///< \c name length (in characters) + unsigned __int16 name_len; ///< \c name length (in UTF-16 characters) wchar_t name[]; ///< Language name }; #pragma pack(pop) diff --git a/lib/libZRCola/src/language.cpp b/lib/libZRCola/src/language.cpp index f523a90..841c627 100644 --- a/lib/libZRCola/src/language.cpp +++ b/lib/libZRCola/src/language.cpp @@ -71,35 +71,14 @@ void ZRCola::LangConvert(_In_ LANGID lang_win, _Inout_ ZRCola::langid_t &lang) #endif -bool ZRCola::langchar_db::IsLocalCharacter(_In_ wchar_t chr, _In_ ZRCola::langid_t lang) const -{ - for (size_t l = 0, r = idxChr.size(); l < r; ) { - // Test the character in the middle of the search area. - size_t m = (l + r) / 2; - const langchar &lc = idxChr[m]; - - // Do the bisection test on character. - if (chr < lc.chr) r = m; - else if (lc.chr < chr ) l = m + 1; - else { - // Do the bisection test on language. - if (lang < lc.lang) r = m; - else if (lang > lc.lang) l = m + 1; - else { - // Match found. - return true; - } - } - } - - return false; -} - - bool ZRCola::langchar_db::IsLocalCharacter(_In_ const wchar_t *chr, _In_ const wchar_t *chr_end, _In_ ZRCola::langid_t lang) const { - // TODO: Implement properly! - UNREFERENCED_PARAMETER(chr_end); - assert(chr < chr_end); - return IsLocalCharacter(*chr, lang); + size_t n = chr_end - chr; + assert(n <= 0xffff); + std::unique_ptr lc((ZRCola::langchar_db::langchar*)new char[sizeof(ZRCola::langchar_db::langchar) + sizeof(wchar_t)*n]); + lc->lang = lang; + lc->chr_len = (unsigned __int16)n; + memcpy(lc->chr, chr, sizeof(wchar_t)*n); + ZRCola::langchar_db::indexChar::size_type start; + return idxChr.find(*lc, start); } diff --git a/output/data/ZRCola.zrcdb b/output/data/ZRCola.zrcdb index 71bdd87cfeaae626d34fd5e0f99543659c90c51c..0ebd89b9cc4ff680b1176c3d232a02e17f35e3a4 100644 GIT binary patch delta 5309 zcmXxnUrF(Iq8y(G7nL zQQ*)SI^}}NaKVf;!vz;ya6wxxXv+l`NKnC`rAi&5o9RkWLD5oW9HMKd-}9cInSA!y z_uunA=RIe4@y~zy)b1Jk)b=0z#eU28+3)7uz!b`FKr306{Qh zg^+_>=)=t4Jo(2G9wV*rB~ z!Z1cKiZP610+X1=4CXM81uPPS#RG<=7s74LKh@c)(#L(>U(uy{8q6^*VK`;8yj{yu~2*Vh`D8?|32~1)d zGnm6X7O;po5?ICx*02s|gO^PtVWn^aNJ9WYWFmwdd3kq73DzKqabB zjT(dzK|P{~rEvbuylF)nI?;u0^q?1g=*IvCF@#}^U=(8*#{?!ZjTy{g9t&7R90@FA z1#4JO;r?&%W)n$RTQ~uvA%Gw<5kd}fk%xQ~pb$kUMhQw$hH_M(5>=>14Z?_^9#O;` zUYgO0Hguv3-RMCt`p}O73}Ohw7{MsUFpddKVj44;!#ozSh&U2h#tPQptn;#gO(bFY z_#bHqAc#zakb_+0As+=OL=lQnf>M;B92KZU6{=B#Fe0c&)W`jg@unHAXhSEu(2XAS zq7VHTz#xV&j1i1t4C9!Cx zAQySaM*#{^gkqGS6lEw!1u9X6YSbW%2_7{DNgFpLq5 zVhrP$z$B(IgE`D&0gH$ufn}^9V~v+}Y+w_~GxwhD%H9_8bFs)pp8rfTbN8EjD|ugq zYSbW%2X2ZuFq{%-jFlmF}0B1-?1<;|b68e1A1)6r};nzD{I z9ijcXCVXy1>RM<|TXjv%CVCr8TTLIk1x#C!TDO36)^StdIrX1Y|9LA~>#jSm@$)L^ zf(p7|HGSlK$b<`)R{#U%%4alTMup6%uvyDHfmzL))k(}+-bu`A{;W=dXinn!{ENza zQS&ZZM;&v;Unt=hROA*!+nmUpCeCT&OWN>~HoT~@?6&X%R2JQ?opHXvM!Wy zbD{sR9IwKE>I$!DfM^Q1q7ANS{HhAMszR=+kgLpd@2wl%XXUD{ocZPg7nGN1@-ArJ zf}_e8bg!@JUSCt)MZKZjEzYmEBOH z8`|iG3W}?sxaP%`Fs=k~&5vv2ue8BeTK{jYODKOrg(STBj&)O&-c;h7ns8Hzh^Eq; zO0=XzOS-p9n!lubyrhDbbo5_q{A(4otP5H;`T2{7H|FECtfRf9qrIj6Tk5~9kJD|9 z-&R3)RL~tgymwRp{pKNC(fEoASy5rD?g_Y`|5Z&~)k&=CBv!RxRVP6-C-Hp#T_wJ& zd3Tlgp7PyOLA1?7x2Ab(x>fht&vCyYC{_k%?2Bq zzoGdXy62B|&mSxQW8K=vmg9Z%J=Q(m)V z%JWnw^;88VRZ!B=#H11?l_04FNp1X08$9D7Jjsbzwij*N`%y`;y>GY_wYR9f#r96h zXGcGDe+zuJcd|b9J6kodRRde?BPW`e=(oM^dcR%Y=x*rOxZiGydhMphnCbtx&ixVc zYh9|=rD|QO)}?A4(X30g%{s?QwHu;tWiKecs6+uJ3fSJg4JcuNMDEY58{I;PCSkx! zNZY%BbgfU<`gE;N*LpXewD|n<#w18rg69jjY2r2|+OGC?Z4gv{P>F*kq0PV4ynI2; z%h0?G&CAd{Vq?tx{AcK1zNGlFZpkZ(uPTOglp(dV)XvfdI~22R@1cH8@pYYSuHsJZ zvy14sN4Q%9-&K4=6W&z(o;Lix;t!O#P~$&T{E<#*kK&Kj|F&X@`hTMMQ(M3Pdvzsy zbqn_D>D;IOed^z*{{8CTum1h&FH?V+`pdNO&lG>I6M9Dlykk4wQ(mr)a@+e$rc&{M z`m2<%O6_;mepd;r)vi|iJ+OZXh!|Fe*hbpX% z!nXIfVOS4WSOvP#y|qO1aEG;FSnF%G-lBiHiKtztcAaj;Z@p*9v3O&?(T?fiI;I836;J3^#1uc$yhg<)9qA`3@DsI9seMY% z#%Vp=r!{`s^z->|(?FXB+H^$iDxh5_)UFe1*Oj=@Jt3kwp>}Q9uJs*S-=Xy#THm4d zZanFJ{&`~(bSOcGjx?0weW!;~yzlrdwX@XTq4o~7v(?T{X*d=kujM;Z=l+V?zQz{! zE8X^a{abu3$J}4lRN9U^F`$8TEy(gUoH#{)mKJ7dL6*k%Xnc>x_h_7cvvH;79nkmz z_5V`yex-Jeui*`|;MYocR&mPb{pkEL;l+O?{;>I03bw$9t?(liY4{diz>5eV9ow)S oL1f@1yo^kI8?WG1gph?D$VLvngV*pna{Pa~RA=3evC$OW;Qi0?5ZhSb<`!!lPJ?C-D@X z!Sh&$^>_uZ;SIcnckwx)u zpW<_Ti4yFJ?$j;pZv2!tRM-4KQd^h6&-BMt))k83du!*L@<;uefX z0>OM+!AEmmKQne>?8+){d$PRlnTc#b#lrK}Sy_(%?Ii^9m zwl3GE<=V7dW#!tmLS;m@tI(-b=sYTPx%)J`UyTo_%>iYItUsWADwQX41C{DhsVx|C`ic(o3^T3b}>c&l|Ohjhq?bi9XEdswwKYFwkn zHL9xV%m@8Wrd6ZHN7Rx?%Ol#ZR?TYFtX8wNDz4S~TCF*%t&i&BzR~(Rtt7Hfo!ZoE zO}*OJYlr&IJa3G{KPAjFYfu@HvIZS;gBCUDs2kMmxX$Ld@+Z{ngf8xc+7o&8PU;L! z>I_aQb4r<0di72ze_FkW^m0z?bvUgCXEZ@%;*8ofswI&p*QnV>-AAMDqp@?1dGi}} z8)vnK$PQ;Udrmu?Q~sQKo!9Jn%{DpOx=EWhY15{kCv;EEDkHL8vreU1=h3Xoy`b6e z)cB&>TvUe0`it79MR_7O(4sCa>e9ljpN!uB7BwUCAX-(>s*Q+j)T)hIwWdwS-lk)3 z(?)GNURG_p+PABHyQZm{W*utQq1g@< zcW8Zw)?C%rS9Nig?e`T}wxY{6pD&kfK3pLhhiL4!&F9N)`1_l^(71Xr;$CD~YW1Xk~Xh%b&-; ze@zTiRhX%A_`%$9n2N(F%Vcbdi4=#a*sB7s+IY3ztMy*3H?hDXvfiup;aVB4*$9mz zG>+6hk;+7B2Paa+M2e#{9i{0gO-E^#$ZSs?Y%j&WI<0<+(VfwpP>jYgI)hm46Q?-9 zHlOx^ii30p@rr{TZ8t=5s8Yiehbwcv;!WCggyKkTF}nj8}yD^~}#R576GpyG1f{c<%f)VNUN6?!Nu z^p>npex))im077wkxrsWZHjbbMY^#fU6zRj{12bGF{enI7OS9G1;r{TRza}}Of0a7 z6clTHw99;6qFv_m5~Fd9#<3d5Y8L1)-KiOVVjw?gg(%~3-1(XO?bi=p z#TYG$acAWeS}|H0qcsW2Cn%qwe1h_mRW(JkQ