diff --git a/ZRColaCompile/dbsource.cpp b/ZRColaCompile/dbsource.cpp index cf295e9..c31edc0 100644 --- a/ZRColaCompile/dbsource.cpp +++ b/ZRColaCompile/dbsource.cpp @@ -621,7 +621,7 @@ bool ZRCola::DBSource::GetTranslation(const com_obj& rs, ZRCola::D { com_obj f; wxVERIFY(SUCCEEDED(flds->get_Item(variant(L"znak"), &f))); - wxCHECK(GetUnicodeCharacter(f, t.chr), false); + wxCHECK(GetUnicodeString(f, t.chr), false); } { diff --git a/ZRColaCompile/dbsource.h b/ZRColaCompile/dbsource.h index c7c90ed..6de0a9b 100644 --- a/ZRColaCompile/dbsource.h +++ b/ZRColaCompile/dbsource.h @@ -79,7 +79,7 @@ namespace ZRCola { /// class translation { public: - wchar_t chr; ///< Composed character + std::wstring chr; ///< Composed character charseq decomp; ///< Decomposed sequence }; diff --git a/ZRColaCompile/main.cpp b/ZRColaCompile/main.cpp index 44b6aa2..0034e8a 100644 --- a/ZRColaCompile/main.cpp +++ b/ZRColaCompile/main.cpp @@ -24,10 +24,10 @@ using namespace stdex; using namespace winstd; -typedef map > translation_db; +typedef map > translation_db; -static set decompose(_In_ const translation_db &db, _In_z_ const wchar_t *str, _Inout_ set &path) +static set decompose(_In_ const translation_db &db, _In_z_ const wchar_t *str, _Inout_ set &path) { set res; @@ -37,10 +37,11 @@ static set decompose(_In_ const translation_db &db, _In_z_ const wchar_ if (rem.empty()) return res; - auto const t = db.find(*str); + translation_db::key_type _str(1, *str); + auto const t = db.find(_str); if (t != db.end()) { // Current characted decomposed. Iterate all possible decompositions and combine them with the remainder. - auto p = path.insert(*str); + auto p = path.insert(_str); if (!p.second) { // Path already contains this character: Cycle detected! return res; @@ -55,14 +56,14 @@ static set decompose(_In_ const translation_db &db, _In_z_ const wchar_ } else { // Cycle detected. Do not continue decomposition. for (auto r = rem.cbegin(), r_end = rem.cend(); r != r_end; ++r) - res.insert(wstring(1, *str) + *r); + res.insert(_str + *r); } } path.erase(p.first); } else { // Current character is non-decomposable. Combine it with the remainder(s). for (auto r = rem.cbegin(), r_end = rem.cend(); r != r_end; ++r) - res.insert(wstring(1, *str) + *r); + res.insert(_str + *r); } } else { // Empty string results in empty decomposition. @@ -180,7 +181,7 @@ int _tmain(int argc, _TCHAR *argv[]) translation_db db_temp2; for (auto t1 = db_temp1.cbegin(), t1_end = db_temp1.cend(); t1 != t1_end; ++t1) { for (auto d1 = t1->second.cbegin(), d1_end = t1->second.cend(); d1 != d1_end; ++d1) { - set path; + set path; path.insert(t1->first); auto str = decompose(db_temp1, d1->str.c_str(), path); assert(!str.empty()); @@ -204,20 +205,24 @@ int _tmain(int argc, _TCHAR *argv[]) // Preallocate memory. db.idxComp .reserve(count); db.idxDecomp.reserve(count); - db.data .reserve(count*4); + db.data .reserve(count*5); // Parse translations and build index and data. for (auto t = db_temp2.cbegin(), t_end = db_temp2.cend(); t != t_end; ++t) { // Add translation to index and data. for (auto d = t->second.cbegin(), d_end = t->second.cend(); d != d_end; ++d) { unsigned __int32 idx = db.data.size(); - db.data.push_back(t->first); wxASSERT_MSG((int)0xffff8000 <= d->rank && d->rank <= (int)0x00007fff, wxT("transformation rank out of bounds")); db.data.push_back((unsigned __int16)d->rank); - wstring::size_type n = d->str.length(); - wxASSERT_MSG(n <= 0xffff, wxT("transformation string too long")); - db.data.push_back((unsigned __int16)n); - for (wstring::size_type i = 0; i < n; i++) + wstring::size_type n_com = t->first.length(); + wxASSERT_MSG(n_com <= 0xffff, wxT("composition string too long")); + db.data.push_back((unsigned __int16)n_com); + wstring::size_type n_dec = d->str.length(); + wxASSERT_MSG(n_com + n_dec <= 0xffff, wxT("decomposition string too long")); + db.data.push_back((unsigned __int16)(n_com + n_dec)); + for (wstring::size_type i = 0; i < n_com; i++) + db.data.push_back(t->first[i]); + for (wstring::size_type i = 0; i < n_dec; i++) db.data.push_back(d->str[i]); db.idxComp .push_back(idx); db.idxDecomp.push_back(idx); diff --git a/lib/libZRCola/include/zrcola/language.h b/lib/libZRCola/include/zrcola/language.h index 5a15576..b3036b9 100644 --- a/lib/libZRCola/include/zrcola/language.h +++ b/lib/libZRCola/include/zrcola/language.h @@ -197,6 +197,18 @@ namespace ZRCola { /// - \c true when character is used in language /// - \c false otherwise bool IsLocalCharacter(_In_ wchar_t chr, _In_ langid_t lang) const; + + /// + /// Tests presence of character in the given language + /// + /// \param[in] chr Pointer to UTF-16 character start + /// \param[in] chr_end Pointer to UTF-16 character end + /// \param[in] lang Language + /// + /// \returns + /// - \c true when character is used in language + /// - \c false otherwise + bool IsLocalCharacter(_In_ const wchar_t *chr, _In_ const wchar_t *chr_end, _In_ langid_t lang) const; }; diff --git a/lib/libZRCola/include/zrcola/translate.h b/lib/libZRCola/include/zrcola/translate.h index d7174b4..8d43c3a 100644 --- a/lib/libZRCola/include/zrcola/translate.h +++ b/lib/libZRCola/include/zrcola/translate.h @@ -46,18 +46,22 @@ namespace ZRCola { /// Translation data /// struct translation { - wchar_t chr; ///< Composed character - unsigned __int16 rank; ///< Decomposition rank - unsigned __int16 str_len; ///< \c str length (in characters) - wchar_t str[]; ///< Decomposed string + unsigned __int16 rank; ///< Decomposition rank + static unsigned __int16 com_start; ///< Composed character start in \c data + union { + unsigned __int16 com_end; ///< Composed character end in \c data + unsigned __int16 dec_start; ///< Decomposed character start in \c data + }; + unsigned __int16 dec_end; ///< Decomposed string end in \c data + wchar_t data[]; ///< Decomposed string and composed character /// /// Binary compares two strings /// - /// \param[in] str_a First string - /// \param[in] count_a Number of characters in string \p str_a - /// \param[in] str_b Second string - /// \param[in] count_b Number of characters in string \p str_b + /// \param[in] str_a First string + /// \param[in] str_a_end First string end + /// \param[in] str_b Second string + /// \param[in] str_b_end Second string end /// /// \returns /// - <0 when str_a < str_b @@ -66,16 +70,16 @@ namespace ZRCola { /// /// \note /// The function does not treat \\0 characters as terminators for performance reasons. - /// Therefore \p count_a and \p count_b must represent exact string lengths. + /// Therefore \p str_a_end and \p str_b_end must represent exact string ends. /// - static inline int CompareString(const wchar_t *str_a, unsigned __int16 count_a, const wchar_t *str_b, unsigned __int16 count_b) + static inline int CompareString(const wchar_t *str_a, const wchar_t *str_a_end, const wchar_t *str_b, const wchar_t *str_b_end) { - for (unsigned __int16 i = 0; ; i++) { - if (i >= count_a && i >= count_b) return 0; - else if (i >= count_a && i < count_b) return -1; - else if (i < count_a && i >= count_b) return +1; - else if (str_a[i] < str_b[i]) return -1; - else if (str_a[i] > str_b[i]) return +1; + for (; ; str_a++, str_b++) { + if (str_a >= str_a_end && str_b >= str_b_end) return 0; + else if (str_a >= str_a_end && str_b < str_b_end) return -1; + else if (str_a < str_a_end && str_b >= str_b_end) return +1; + else if (*str_a < *str_b) return -1; + else if (*str_a > *str_b) return +1; } } }; @@ -107,7 +111,7 @@ namespace ZRCola { /// virtual int compare(_In_ const translation &a, _In_ const translation &b) const { - int r = translation::CompareString(a.str, a.str_len, b.str, b.str_len); + int r = translation::CompareString(a.data + a.dec_start, a.data + a.dec_end, b.data + b.dec_start, b.data + b.dec_end); if (r != 0) return r; return 0; @@ -126,11 +130,11 @@ namespace ZRCola { /// virtual int compare_sort(_In_ const translation &a, _In_ const translation &b) const { - int r = translation::CompareString(a.str, a.str_len, b.str, b.str_len); + int r = translation::CompareString(a.data + a.dec_start, a.data + a.dec_end, b.data + b.dec_start, b.data + b.dec_end); if (r != 0) return r; - if (a.chr < b.chr) return -1; - else if (a.chr > b.chr) return +1; + r = translation::CompareString(a.data + a.com_start, a.data + a.com_end, b.data + b.com_start, b.data + b.com_end); + if (r != 0) return r; return 0; } @@ -163,8 +167,8 @@ namespace ZRCola { /// virtual int compare(_In_ const translation &a, _In_ const translation &b) const { - if (a.chr < b.chr) return -1; - else if (a.chr > b.chr) return +1; + int r = translation::CompareString(a.data + a.com_start, a.data + a.com_end, b.data + b.com_start, b.data + b.com_end); + if (r != 0) return r; return 0; } @@ -182,13 +186,13 @@ namespace ZRCola { /// virtual int compare_sort(_In_ const translation &a, _In_ const translation &b) const { - if (a.chr < b.chr) return -1; - else if (a.chr > b.chr) return +1; + int r = translation::CompareString(a.data + a.com_start, a.data + a.com_end, b.data + b.com_start, b.data + b.com_end); + if (r != 0) return r; if (a.rank < b.rank) return -1; else if (a.rank > b.rank) return +1; - int r = translation::CompareString(a.str, a.str_len, b.str, b.str_len); + r = translation::CompareString(a.data + a.dec_start, a.data + a.dec_end, b.data + b.dec_start, b.data + b.dec_end); if (r != 0) return r; return 0; diff --git a/lib/libZRCola/src/language.cpp b/lib/libZRCola/src/language.cpp index c5fda39..f523a90 100644 --- a/lib/libZRCola/src/language.cpp +++ b/lib/libZRCola/src/language.cpp @@ -94,3 +94,12 @@ bool ZRCola::langchar_db::IsLocalCharacter(_In_ wchar_t chr, _In_ ZRCola::langid return false; } + + +bool ZRCola::langchar_db::IsLocalCharacter(_In_ const wchar_t *chr, _In_ const wchar_t *chr_end, _In_ ZRCola::langid_t lang) const +{ + // TODO: Implement properly! + UNREFERENCED_PARAMETER(chr_end); + assert(chr < chr_end); + return IsLocalCharacter(*chr, lang); +} diff --git a/lib/libZRCola/src/translate.cpp b/lib/libZRCola/src/translate.cpp index 9e7bcc9..bd2b6ab 100644 --- a/lib/libZRCola/src/translate.cpp +++ b/lib/libZRCola/src/translate.cpp @@ -19,6 +19,8 @@ #include "stdafx.h" +unsigned __int16 ZRCola::translation_db::translation::com_start = 0; + void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input, _In_ size_t inputMax, _Out_ std::wstring &output, _Out_opt_ std::vector* map) const { @@ -27,8 +29,7 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input // Trim inputMax to actual length. inputMax = inputMax != (size_t)-1 ? wcsnlen(input, inputMax) : wcslen(input); - // Clear the output string and preallocate at least inputMax chars. - // Since composing is usually reducing the number of chars, memory reallocation is not expected later. + // Clear the output. output.clear(); output.reserve(inputMax); if (map) @@ -49,7 +50,8 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input // All compositions that get short on characters are lexically ordered before. // Thus the j-th character is considered 0. const translation &trans = idxComp[m]; - wchar_t s = j < trans.str_len ? trans.str[j] : 0; + size_t jj = trans.dec_start + j; + wchar_t s = jj < trans.dec_end ? trans.data[jj] : 0; // Do the bisection test. if (c < s) r = m; @@ -61,7 +63,8 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input for (size_t rr = m; l < rr;) { size_t m = (l + rr) / 2; const translation &trans = idxComp[m]; - wchar_t s = j < trans.str_len ? trans.str[j] : 0; + size_t jj = trans.dec_start + j; + wchar_t s = jj < trans.dec_end ? trans.data[jj] : 0; if (c <= s) rr = m; else l = m + 1; } @@ -69,12 +72,13 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input for (size_t ll = m + 1; ll < r;) { size_t m = (ll + r) / 2; const translation &trans = idxComp[m]; - wchar_t s = j < trans.str_len ? trans.str[j] : 0; + size_t jj = trans.dec_start + j; + wchar_t s = jj < trans.dec_end ? trans.data[jj] : 0; if (s <= c) ll = m + 1; else r = m; } const translation &trans = idxComp[l]; - if (j + 1 == trans.str_len) { + if (trans.dec_start + j + 1 == trans.dec_end) { // The first composition of the run was a match (thus far). Save it. l_match = l; } @@ -87,9 +91,9 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input if (l_match < compositionsCount) { // The saved composition was an exact match. const translation &trans = idxComp[l_match]; - output += trans.chr; - i += trans.str_len; - if (trans.str_len > 1 && map) { + output.append(trans.data + trans.com_start, trans.data + trans.com_end); + i += trans.dec_end - trans.dec_start; + if (trans.dec_end - trans.dec_start != trans.com_end - trans.com_start && map) { // Mapping changed. map->push_back(ZRCola::mapping(i, output.length())); } @@ -109,59 +113,85 @@ void ZRCola::translation_db::Decompose(_In_z_count_(inputMax) const wchar_t* inp // Trim inputMax to actual length. inputMax = inputMax != (size_t)-1 ? wcsnlen(input, inputMax) : wcslen(input); - // Clear the output string and preallocate at least 2*inputMax chars. - // Since decomposition expands the string, let's keep our fingers crossed to avoid reallocation later. + // Clear the output. output.clear(); - output.reserve(inputMax * 2); + output.reserve(inputMax); if (map) map->clear(); auto decompositionsCount = idxDecomp.size(); for (size_t i = 0; i < inputMax;) { - // Find whether the character can be decomposed. - wchar_t c = input[i]; - - for (size_t l = 0, r = decompositionsCount;; ) { - if (l < r) { + // Find the longest matching decomposition at i-th character. + size_t l_match = (size_t)-1; + for (size_t l = 0, r = decompositionsCount, ii = i, j = 0; ii < inputMax && l < r; ii++, j++) { + wchar_t c = input[ii]; + while (l < r) { + // Test the decomposition in the middle of the search area. size_t m = (l + r) / 2; + + // Get the j-th character of the decomposition. + // All decompositions that get short on characters are lexically ordered before. + // Thus the j-th character is considered 0. const translation &trans = idxDecomp[m]; - wchar_t decompSrc = trans.chr; - if (c < decompSrc) r = m; - else if (decompSrc < c) l = m + 1; + size_t jj = trans.com_start + j; + wchar_t s = jj < trans.com_end ? trans.data[jj] : 0; + + // Do the bisection test. + if (c < s) r = m; + else if (s < c) l = m + 1; else { // Character found. - // Narrow the search area on the left to start at the first decomposition in the run (first by rank). + // Narrow the search area on the left to start at the first decomposition in the run. for (size_t rr = m; l < rr;) { size_t m = (l + rr) / 2; const translation &trans = idxDecomp[m]; - wchar_t decompSrc = trans.chr; - if (c <= decompSrc) rr = m; else l = m + 1; + size_t jj = trans.com_start + j; + wchar_t s = jj < trans.com_end ? trans.data[jj] : 0; + if (c <= s) rr = m; else l = m + 1; + } + + // Narrow the search area on the right to end at the first decomposition not in the run. + for (size_t ll = m + 1; ll < r;) { + size_t m = (ll + r) / 2; + const translation &trans = idxDecomp[m]; + size_t jj = trans.com_start + j; + wchar_t s = jj < trans.com_end ? trans.data[jj] : 0; + if (s <= c) ll = m + 1; else r = m; } const translation &trans = idxDecomp[l]; - if (trans.str_len && trans.str[0] != L'#' && (!lc_db || !lc_db->IsLocalCharacter(c, lang))) { - // Append decomposed sequence. - output.append(trans.str, trans.str_len); - i++; - if (map) { - // Mapping changed. - map->push_back(ZRCola::mapping(i, output.length())); - } - } else { - // Character is inhibited to decompose. - output += c; - i++; + if (trans.com_start + j + 1 == trans.com_end) { + // The first decomposition of the run was a match (thus far). Save it. + l_match = l; } + break; } - } else { - // Character not found. - output += c; - i++; - break; } } + + if (l_match < decompositionsCount) { + // The saved decomposition was an exact match. + const translation &trans = idxDecomp[l_match]; + if (trans.dec_start < trans.dec_end && trans.data[trans.dec_start] != L'#' && (!lc_db || !lc_db->IsLocalCharacter(trans.data + trans.com_start, trans.data + trans.com_end, lang))) { + // Append decomposed sequence. + output.append(trans.data + trans.dec_start, trans.data + trans.dec_end); + i += trans.com_end - trans.com_start; + if (trans.dec_end - trans.dec_start != trans.com_end - trans.com_start && map) { + // Mapping changed. + map->push_back(ZRCola::mapping(i, output.length())); + } + } else { + // Character is inhibited to decompose. + output.append(trans.data + trans.com_start, trans.data + trans.com_end); + i += trans.com_end - trans.com_start; + } + } else { + // The match was not found. + output += input[i]; + i++; + } } } diff --git a/output/data/ZRCola.zrcdb b/output/data/ZRCola.zrcdb index 53ac07a..71bdd87 100644 Binary files a/output/data/ZRCola.zrcdb and b/output/data/ZRCola.zrcdb differ