Composed characters extended to support multiple UTF-16 characters

This commit is contained in:
Simon Rozman 2017-03-14 11:53:42 +01:00
parent 87814981db
commit bd0fdba435
8 changed files with 140 additions and 80 deletions

View File

@ -621,7 +621,7 @@ bool ZRCola::DBSource::GetTranslation(const com_obj<ADORecordset>& rs, ZRCola::D
{ {
com_obj<ADOField> f; com_obj<ADOField> f;
wxVERIFY(SUCCEEDED(flds->get_Item(variant(L"znak"), &f))); wxVERIFY(SUCCEEDED(flds->get_Item(variant(L"znak"), &f)));
wxCHECK(GetUnicodeCharacter(f, t.chr), false); wxCHECK(GetUnicodeString(f, t.chr), false);
} }
{ {

View File

@ -79,7 +79,7 @@ namespace ZRCola {
/// ///
class translation { class translation {
public: public:
wchar_t chr; ///< Composed character std::wstring chr; ///< Composed character
charseq decomp; ///< Decomposed sequence charseq decomp; ///< Decomposed sequence
}; };

View File

@ -24,10 +24,10 @@ using namespace stdex;
using namespace winstd; using namespace winstd;
typedef map<wchar_t, set<ZRCola::DBSource::charseq, ZRCola::DBSource::charseq::less_rank_str> > translation_db; typedef map<wstring, set<ZRCola::DBSource::charseq, ZRCola::DBSource::charseq::less_rank_str> > translation_db;
static set<wstring> decompose(_In_ const translation_db &db, _In_z_ const wchar_t *str, _Inout_ set<wchar_t> &path) static set<wstring> decompose(_In_ const translation_db &db, _In_z_ const wchar_t *str, _Inout_ set<translation_db::key_type> &path)
{ {
set<wstring> res; set<wstring> res;
@ -37,10 +37,11 @@ static set<wstring> decompose(_In_ const translation_db &db, _In_z_ const wchar_
if (rem.empty()) if (rem.empty())
return res; return res;
auto const t = db.find(*str); translation_db::key_type _str(1, *str);
auto const t = db.find(_str);
if (t != db.end()) { if (t != db.end()) {
// Current character decomposed. Iterate all possible decompositions and combine them with the remainder. // Current character decomposed. Iterate all possible decompositions and combine them with the remainder.
auto p = path.insert(*str); auto p = path.insert(_str);
if (!p.second) { if (!p.second) {
// Path already contains this character: Cycle detected! // Path already contains this character: Cycle detected!
return res; return res;
@ -55,14 +56,14 @@ static set<wstring> decompose(_In_ const translation_db &db, _In_z_ const wchar_
} else { } else {
// Cycle detected. Do not continue decomposition. // Cycle detected. Do not continue decomposition.
for (auto r = rem.cbegin(), r_end = rem.cend(); r != r_end; ++r) for (auto r = rem.cbegin(), r_end = rem.cend(); r != r_end; ++r)
res.insert(wstring(1, *str) + *r); res.insert(_str + *r);
} }
} }
path.erase(p.first); path.erase(p.first);
} else { } else {
// Current character is non-decomposable. Combine it with the remainder(s). // Current character is non-decomposable. Combine it with the remainder(s).
for (auto r = rem.cbegin(), r_end = rem.cend(); r != r_end; ++r) for (auto r = rem.cbegin(), r_end = rem.cend(); r != r_end; ++r)
res.insert(wstring(1, *str) + *r); res.insert(_str + *r);
} }
} else { } else {
// Empty string results in empty decomposition. // Empty string results in empty decomposition.
@ -180,7 +181,7 @@ int _tmain(int argc, _TCHAR *argv[])
translation_db db_temp2; translation_db db_temp2;
for (auto t1 = db_temp1.cbegin(), t1_end = db_temp1.cend(); t1 != t1_end; ++t1) { for (auto t1 = db_temp1.cbegin(), t1_end = db_temp1.cend(); t1 != t1_end; ++t1) {
for (auto d1 = t1->second.cbegin(), d1_end = t1->second.cend(); d1 != d1_end; ++d1) { for (auto d1 = t1->second.cbegin(), d1_end = t1->second.cend(); d1 != d1_end; ++d1) {
set<wchar_t> path; set<translation_db::key_type> path;
path.insert(t1->first); path.insert(t1->first);
auto str = decompose(db_temp1, d1->str.c_str(), path); auto str = decompose(db_temp1, d1->str.c_str(), path);
assert(!str.empty()); assert(!str.empty());
@ -204,20 +205,24 @@ int _tmain(int argc, _TCHAR *argv[])
// Preallocate memory. // Preallocate memory.
db.idxComp .reserve(count); db.idxComp .reserve(count);
db.idxDecomp.reserve(count); db.idxDecomp.reserve(count);
db.data .reserve(count*4); db.data .reserve(count*5);
// Parse translations and build index and data. // Parse translations and build index and data.
for (auto t = db_temp2.cbegin(), t_end = db_temp2.cend(); t != t_end; ++t) { for (auto t = db_temp2.cbegin(), t_end = db_temp2.cend(); t != t_end; ++t) {
// Add translation to index and data. // Add translation to index and data.
for (auto d = t->second.cbegin(), d_end = t->second.cend(); d != d_end; ++d) { for (auto d = t->second.cbegin(), d_end = t->second.cend(); d != d_end; ++d) {
unsigned __int32 idx = db.data.size(); unsigned __int32 idx = db.data.size();
db.data.push_back(t->first);
wxASSERT_MSG((int)0xffff8000 <= d->rank && d->rank <= (int)0x00007fff, wxT("transformation rank out of bounds")); wxASSERT_MSG((int)0xffff8000 <= d->rank && d->rank <= (int)0x00007fff, wxT("transformation rank out of bounds"));
db.data.push_back((unsigned __int16)d->rank); db.data.push_back((unsigned __int16)d->rank);
wstring::size_type n = d->str.length(); wstring::size_type n_com = t->first.length();
wxASSERT_MSG(n <= 0xffff, wxT("transformation string too long")); wxASSERT_MSG(n_com <= 0xffff, wxT("composition string too long"));
db.data.push_back((unsigned __int16)n); db.data.push_back((unsigned __int16)n_com);
for (wstring::size_type i = 0; i < n; i++) wstring::size_type n_dec = d->str.length();
wxASSERT_MSG(n_com + n_dec <= 0xffff, wxT("decomposition string too long"));
db.data.push_back((unsigned __int16)(n_com + n_dec));
for (wstring::size_type i = 0; i < n_com; i++)
db.data.push_back(t->first[i]);
for (wstring::size_type i = 0; i < n_dec; i++)
db.data.push_back(d->str[i]); db.data.push_back(d->str[i]);
db.idxComp .push_back(idx); db.idxComp .push_back(idx);
db.idxDecomp.push_back(idx); db.idxDecomp.push_back(idx);

View File

@ -197,6 +197,18 @@ namespace ZRCola {
/// - \c true when character is used in language /// - \c true when character is used in language
/// - \c false otherwise /// - \c false otherwise
bool IsLocalCharacter(_In_ wchar_t chr, _In_ langid_t lang) const; bool IsLocalCharacter(_In_ wchar_t chr, _In_ langid_t lang) const;
///
/// Tests presence of character in the given language
///
/// \param[in] chr Pointer to UTF-16 character start
/// \param[in] chr_end Pointer to UTF-16 character end
/// \param[in] lang Language
///
/// \returns
/// - \c true when character is used in language
/// - \c false otherwise
bool IsLocalCharacter(_In_ const wchar_t *chr, _In_ const wchar_t *chr_end, _In_ langid_t lang) const;
}; };

View File

@ -46,18 +46,22 @@ namespace ZRCola {
/// Translation data /// Translation data
/// ///
struct translation { struct translation {
wchar_t chr; ///< Composed character unsigned __int16 rank; ///< Decomposition rank
unsigned __int16 rank; ///< Decomposition rank static unsigned __int16 com_start; ///< Composed character start in \c data
unsigned __int16 str_len; ///< \c str length (in characters) union {
wchar_t str[]; ///< Decomposed string unsigned __int16 com_end; ///< Composed character end in \c data
unsigned __int16 dec_start; ///< Decomposed character start in \c data
};
unsigned __int16 dec_end; ///< Decomposed string end in \c data
wchar_t data[]; ///< Decomposed string and composed character
/// ///
/// Binary compares two strings /// Binary compares two strings
/// ///
/// \param[in] str_a First string /// \param[in] str_a First string
/// \param[in] count_a Number of characters in string \p str_a /// \param[in] str_a_end First string end
/// \param[in] str_b Second string /// \param[in] str_b Second string
/// \param[in] count_b Number of characters in string \p str_b /// \param[in] str_b_end Second string end
/// ///
/// \returns /// \returns
/// - <0 when str_a < str_b /// - <0 when str_a < str_b
@ -66,16 +70,16 @@ namespace ZRCola {
/// ///
/// \note /// \note
/// The function does not treat \\0 characters as terminators for performance reasons. /// The function does not treat \\0 characters as terminators for performance reasons.
/// Therefore \p count_a and \p count_b must represent exact string lengths. /// Therefore \p str_a_end and \p str_b_end must represent exact string ends.
/// ///
static inline int CompareString(const wchar_t *str_a, unsigned __int16 count_a, const wchar_t *str_b, unsigned __int16 count_b) static inline int CompareString(const wchar_t *str_a, const wchar_t *str_a_end, const wchar_t *str_b, const wchar_t *str_b_end)
{ {
for (unsigned __int16 i = 0; ; i++) { for (; ; str_a++, str_b++) {
if (i >= count_a && i >= count_b) return 0; if (str_a >= str_a_end && str_b >= str_b_end) return 0;
else if (i >= count_a && i < count_b) return -1; else if (str_a >= str_a_end && str_b < str_b_end) return -1;
else if (i < count_a && i >= count_b) return +1; else if (str_a < str_a_end && str_b >= str_b_end) return +1;
else if (str_a[i] < str_b[i]) return -1; else if (*str_a < *str_b) return -1;
else if (str_a[i] > str_b[i]) return +1; else if (*str_a > *str_b) return +1;
} }
} }
}; };
@ -107,7 +111,7 @@ namespace ZRCola {
/// ///
virtual int compare(_In_ const translation &a, _In_ const translation &b) const virtual int compare(_In_ const translation &a, _In_ const translation &b) const
{ {
int r = translation::CompareString(a.str, a.str_len, b.str, b.str_len); int r = translation::CompareString(a.data + a.dec_start, a.data + a.dec_end, b.data + b.dec_start, b.data + b.dec_end);
if (r != 0) return r; if (r != 0) return r;
return 0; return 0;
@ -126,11 +130,11 @@ namespace ZRCola {
/// ///
virtual int compare_sort(_In_ const translation &a, _In_ const translation &b) const virtual int compare_sort(_In_ const translation &a, _In_ const translation &b) const
{ {
int r = translation::CompareString(a.str, a.str_len, b.str, b.str_len); int r = translation::CompareString(a.data + a.dec_start, a.data + a.dec_end, b.data + b.dec_start, b.data + b.dec_end);
if (r != 0) return r; if (r != 0) return r;
if (a.chr < b.chr) return -1; r = translation::CompareString(a.data + a.com_start, a.data + a.com_end, b.data + b.com_start, b.data + b.com_end);
else if (a.chr > b.chr) return +1; if (r != 0) return r;
return 0; return 0;
} }
@ -163,8 +167,8 @@ namespace ZRCola {
/// ///
virtual int compare(_In_ const translation &a, _In_ const translation &b) const virtual int compare(_In_ const translation &a, _In_ const translation &b) const
{ {
if (a.chr < b.chr) return -1; int r = translation::CompareString(a.data + a.com_start, a.data + a.com_end, b.data + b.com_start, b.data + b.com_end);
else if (a.chr > b.chr) return +1; if (r != 0) return r;
return 0; return 0;
} }
@ -182,13 +186,13 @@ namespace ZRCola {
/// ///
virtual int compare_sort(_In_ const translation &a, _In_ const translation &b) const virtual int compare_sort(_In_ const translation &a, _In_ const translation &b) const
{ {
if (a.chr < b.chr) return -1; int r = translation::CompareString(a.data + a.com_start, a.data + a.com_end, b.data + b.com_start, b.data + b.com_end);
else if (a.chr > b.chr) return +1; if (r != 0) return r;
if (a.rank < b.rank) return -1; if (a.rank < b.rank) return -1;
else if (a.rank > b.rank) return +1; else if (a.rank > b.rank) return +1;
int r = translation::CompareString(a.str, a.str_len, b.str, b.str_len); r = translation::CompareString(a.data + a.dec_start, a.data + a.dec_end, b.data + b.dec_start, b.data + b.dec_end);
if (r != 0) return r; if (r != 0) return r;
return 0; return 0;

View File

@ -94,3 +94,12 @@ bool ZRCola::langchar_db::IsLocalCharacter(_In_ wchar_t chr, _In_ ZRCola::langid
return false; return false;
} }
///
/// Tests presence of a character, given as a [chr, chr_end) range of UTF-16 code units, in the given language
///
/// \param[in] chr      Pointer to UTF-16 character start
/// \param[in] chr_end  Pointer to UTF-16 character end
/// \param[in] lang     Language
///
/// \returns
/// The result of the single-\c wchar_t IsLocalCharacter() overload called with \c *chr and \p lang.
///
/// \note
/// Placeholder implementation: only the first code unit at \p chr is examined.
/// Any further code units up to \p chr_end are ignored.
///
bool ZRCola::langchar_db::IsLocalCharacter(_In_ const wchar_t *chr, _In_ const wchar_t *chr_end, _In_ ZRCola::langid_t lang) const
{
// TODO: Implement properly!
// Until then, a multi-unit character is treated as local whenever its first code unit is.

// chr_end is otherwise referenced only inside assert() below; mark it as intentionally unused.
// NOTE(review): presumably this silences an unused-parameter warning in builds where assert() expands to nothing - confirm against the build settings.
UNREFERENCED_PARAMETER(chr_end);

// The range must contain at least one code unit, since *chr is dereferenced below.
assert(chr < chr_end);

// Delegate to the single-character overload using the first code unit only.
return IsLocalCharacter(*chr, lang);
}

View File

@ -19,6 +19,8 @@
#include "stdafx.h" #include "stdafx.h"
unsigned __int16 ZRCola::translation_db::translation::com_start = 0;
void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input, _In_ size_t inputMax, _Out_ std::wstring &output, _Out_opt_ std::vector<mapping>* map) const void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input, _In_ size_t inputMax, _Out_ std::wstring &output, _Out_opt_ std::vector<mapping>* map) const
{ {
@ -27,8 +29,7 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
// Trim inputMax to actual length. // Trim inputMax to actual length.
inputMax = inputMax != (size_t)-1 ? wcsnlen(input, inputMax) : wcslen(input); inputMax = inputMax != (size_t)-1 ? wcsnlen(input, inputMax) : wcslen(input);
// Clear the output string and preallocate at least inputMax chars. // Clear the output.
// Since composing is usually reducing the number of chars, memory reallocation is not expected later.
output.clear(); output.clear();
output.reserve(inputMax); output.reserve(inputMax);
if (map) if (map)
@ -49,7 +50,8 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
// All compositions that get short on characters are lexically ordered before. // All compositions that get short on characters are lexically ordered before.
// Thus the j-th character is considered 0. // Thus the j-th character is considered 0.
const translation &trans = idxComp[m]; const translation &trans = idxComp[m];
wchar_t s = j < trans.str_len ? trans.str[j] : 0; size_t jj = trans.dec_start + j;
wchar_t s = jj < trans.dec_end ? trans.data[jj] : 0;
// Do the bisection test. // Do the bisection test.
if (c < s) r = m; if (c < s) r = m;
@ -61,7 +63,8 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
for (size_t rr = m; l < rr;) { for (size_t rr = m; l < rr;) {
size_t m = (l + rr) / 2; size_t m = (l + rr) / 2;
const translation &trans = idxComp[m]; const translation &trans = idxComp[m];
wchar_t s = j < trans.str_len ? trans.str[j] : 0; size_t jj = trans.dec_start + j;
wchar_t s = jj < trans.dec_end ? trans.data[jj] : 0;
if (c <= s) rr = m; else l = m + 1; if (c <= s) rr = m; else l = m + 1;
} }
@ -69,12 +72,13 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
for (size_t ll = m + 1; ll < r;) { for (size_t ll = m + 1; ll < r;) {
size_t m = (ll + r) / 2; size_t m = (ll + r) / 2;
const translation &trans = idxComp[m]; const translation &trans = idxComp[m];
wchar_t s = j < trans.str_len ? trans.str[j] : 0; size_t jj = trans.dec_start + j;
wchar_t s = jj < trans.dec_end ? trans.data[jj] : 0;
if (s <= c) ll = m + 1; else r = m; if (s <= c) ll = m + 1; else r = m;
} }
const translation &trans = idxComp[l]; const translation &trans = idxComp[l];
if (j + 1 == trans.str_len) { if (trans.dec_start + j + 1 == trans.dec_end) {
// The first composition of the run was a match (thus far). Save it. // The first composition of the run was a match (thus far). Save it.
l_match = l; l_match = l;
} }
@ -87,9 +91,9 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
if (l_match < compositionsCount) { if (l_match < compositionsCount) {
// The saved composition was an exact match. // The saved composition was an exact match.
const translation &trans = idxComp[l_match]; const translation &trans = idxComp[l_match];
output += trans.chr; output.append(trans.data + trans.com_start, trans.data + trans.com_end);
i += trans.str_len; i += trans.dec_end - trans.dec_start;
if (trans.str_len > 1 && map) { if (trans.dec_end - trans.dec_start != trans.com_end - trans.com_start && map) {
// Mapping changed. // Mapping changed.
map->push_back(ZRCola::mapping(i, output.length())); map->push_back(ZRCola::mapping(i, output.length()));
} }
@ -109,59 +113,85 @@ void ZRCola::translation_db::Decompose(_In_z_count_(inputMax) const wchar_t* inp
// Trim inputMax to actual length. // Trim inputMax to actual length.
inputMax = inputMax != (size_t)-1 ? wcsnlen(input, inputMax) : wcslen(input); inputMax = inputMax != (size_t)-1 ? wcsnlen(input, inputMax) : wcslen(input);
// Clear the output string and preallocate at least 2*inputMax chars. // Clear the output.
// Since decomposition expands the string, let's keep our fingers crossed to avoid reallocation later.
output.clear(); output.clear();
output.reserve(inputMax * 2); output.reserve(inputMax);
if (map) if (map)
map->clear(); map->clear();
auto decompositionsCount = idxDecomp.size(); auto decompositionsCount = idxDecomp.size();
for (size_t i = 0; i < inputMax;) { for (size_t i = 0; i < inputMax;) {
// Find whether the character can be decomposed. // Find the longest matching decomposition at i-th character.
wchar_t c = input[i]; size_t l_match = (size_t)-1;
for (size_t l = 0, r = decompositionsCount, ii = i, j = 0; ii < inputMax && l < r; ii++, j++) {
for (size_t l = 0, r = decompositionsCount;; ) { wchar_t c = input[ii];
if (l < r) { while (l < r) {
// Test the decomposition in the middle of the search area.
size_t m = (l + r) / 2; size_t m = (l + r) / 2;
// Get the j-th character of the decomposition.
// All decompositions that get short on characters are lexically ordered before.
// Thus the j-th character is considered 0.
const translation &trans = idxDecomp[m]; const translation &trans = idxDecomp[m];
wchar_t decompSrc = trans.chr; size_t jj = trans.com_start + j;
if (c < decompSrc) r = m; wchar_t s = jj < trans.com_end ? trans.data[jj] : 0;
else if (decompSrc < c) l = m + 1;
// Do the bisection test.
if (c < s) r = m;
else if (s < c) l = m + 1;
else { else {
// Character found. // Character found.
// Narrow the search area on the left to start at the first decomposition in the run (first by rank). // Narrow the search area on the left to start at the first decomposition in the run.
for (size_t rr = m; l < rr;) { for (size_t rr = m; l < rr;) {
size_t m = (l + rr) / 2; size_t m = (l + rr) / 2;
const translation &trans = idxDecomp[m]; const translation &trans = idxDecomp[m];
wchar_t decompSrc = trans.chr; size_t jj = trans.com_start + j;
if (c <= decompSrc) rr = m; else l = m + 1; wchar_t s = jj < trans.com_end ? trans.data[jj] : 0;
if (c <= s) rr = m; else l = m + 1;
}
// Narrow the search area on the right to end at the first decomposition not in the run.
for (size_t ll = m + 1; ll < r;) {
size_t m = (ll + r) / 2;
const translation &trans = idxDecomp[m];
size_t jj = trans.com_start + j;
wchar_t s = jj < trans.com_end ? trans.data[jj] : 0;
if (s <= c) ll = m + 1; else r = m;
} }
const translation &trans = idxDecomp[l]; const translation &trans = idxDecomp[l];
if (trans.str_len && trans.str[0] != L'#' && (!lc_db || !lc_db->IsLocalCharacter(c, lang))) { if (trans.com_start + j + 1 == trans.com_end) {
// Append decomposed sequence. // The first decomposition of the run was a match (thus far). Save it.
output.append(trans.str, trans.str_len); l_match = l;
i++;
if (map) {
// Mapping changed.
map->push_back(ZRCola::mapping(i, output.length()));
}
} else {
// Character is inhibited to decompose.
output += c;
i++;
} }
break; break;
} }
} else {
// Character not found.
output += c;
i++;
break;
} }
} }
if (l_match < decompositionsCount) {
// The saved decomposition was an exact match.
const translation &trans = idxDecomp[l_match];
if (trans.dec_start < trans.dec_end && trans.data[trans.dec_start] != L'#' && (!lc_db || !lc_db->IsLocalCharacter(trans.data + trans.com_start, trans.data + trans.com_end, lang))) {
// Append decomposed sequence.
output.append(trans.data + trans.dec_start, trans.data + trans.dec_end);
i += trans.com_end - trans.com_start;
if (trans.dec_end - trans.dec_start != trans.com_end - trans.com_start && map) {
// Mapping changed.
map->push_back(ZRCola::mapping(i, output.length()));
}
} else {
// Character is inhibited to decompose.
output.append(trans.data + trans.com_start, trans.data + trans.com_end);
i += trans.com_end - trans.com_start;
}
} else {
// The match was not found.
output += input[i];
i++;
}
} }
} }

Binary file not shown.