Composed characters extended to support multiple UTF-16 characters

This commit is contained in:
Simon Rozman 2017-03-14 11:53:42 +01:00
parent 87814981db
commit bd0fdba435
8 changed files with 140 additions and 80 deletions

View File

@ -621,7 +621,7 @@ bool ZRCola::DBSource::GetTranslation(const com_obj<ADORecordset>& rs, ZRCola::D
{
com_obj<ADOField> f;
wxVERIFY(SUCCEEDED(flds->get_Item(variant(L"znak"), &f)));
wxCHECK(GetUnicodeCharacter(f, t.chr), false);
wxCHECK(GetUnicodeString(f, t.chr), false);
}
{

View File

@ -79,7 +79,7 @@ namespace ZRCola {
///
class translation {
public:
wchar_t chr; ///< Composed character
std::wstring chr; ///< Composed character
charseq decomp; ///< Decomposed sequence
};

View File

@ -24,10 +24,10 @@ using namespace stdex;
using namespace winstd;
typedef map<wchar_t, set<ZRCola::DBSource::charseq, ZRCola::DBSource::charseq::less_rank_str> > translation_db;
typedef map<wstring, set<ZRCola::DBSource::charseq, ZRCola::DBSource::charseq::less_rank_str> > translation_db;
static set<wstring> decompose(_In_ const translation_db &db, _In_z_ const wchar_t *str, _Inout_ set<wchar_t> &path)
static set<wstring> decompose(_In_ const translation_db &db, _In_z_ const wchar_t *str, _Inout_ set<translation_db::key_type> &path)
{
set<wstring> res;
@ -37,10 +37,11 @@ static set<wstring> decompose(_In_ const translation_db &db, _In_z_ const wchar_
if (rem.empty())
return res;
auto const t = db.find(*str);
translation_db::key_type _str(1, *str);
auto const t = db.find(_str);
if (t != db.end()) {
// Current character decomposed. Iterate all possible decompositions and combine them with the remainder.
auto p = path.insert(*str);
auto p = path.insert(_str);
if (!p.second) {
// Path already contains this character: Cycle detected!
return res;
@ -55,14 +56,14 @@ static set<wstring> decompose(_In_ const translation_db &db, _In_z_ const wchar_
} else {
// Cycle detected. Do not continue decomposition.
for (auto r = rem.cbegin(), r_end = rem.cend(); r != r_end; ++r)
res.insert(wstring(1, *str) + *r);
res.insert(_str + *r);
}
}
path.erase(p.first);
} else {
// Current character is non-decomposable. Combine it with the remainder(s).
for (auto r = rem.cbegin(), r_end = rem.cend(); r != r_end; ++r)
res.insert(wstring(1, *str) + *r);
res.insert(_str + *r);
}
} else {
// Empty string results in empty decomposition.
@ -180,7 +181,7 @@ int _tmain(int argc, _TCHAR *argv[])
translation_db db_temp2;
for (auto t1 = db_temp1.cbegin(), t1_end = db_temp1.cend(); t1 != t1_end; ++t1) {
for (auto d1 = t1->second.cbegin(), d1_end = t1->second.cend(); d1 != d1_end; ++d1) {
set<wchar_t> path;
set<translation_db::key_type> path;
path.insert(t1->first);
auto str = decompose(db_temp1, d1->str.c_str(), path);
assert(!str.empty());
@ -204,20 +205,24 @@ int _tmain(int argc, _TCHAR *argv[])
// Preallocate memory.
db.idxComp .reserve(count);
db.idxDecomp.reserve(count);
db.data .reserve(count*4);
db.data .reserve(count*5);
// Parse translations and build index and data.
for (auto t = db_temp2.cbegin(), t_end = db_temp2.cend(); t != t_end; ++t) {
// Add translation to index and data.
for (auto d = t->second.cbegin(), d_end = t->second.cend(); d != d_end; ++d) {
unsigned __int32 idx = db.data.size();
db.data.push_back(t->first);
wxASSERT_MSG((int)0xffff8000 <= d->rank && d->rank <= (int)0x00007fff, wxT("transformation rank out of bounds"));
db.data.push_back((unsigned __int16)d->rank);
wstring::size_type n = d->str.length();
wxASSERT_MSG(n <= 0xffff, wxT("transformation string too long"));
db.data.push_back((unsigned __int16)n);
for (wstring::size_type i = 0; i < n; i++)
wstring::size_type n_com = t->first.length();
wxASSERT_MSG(n_com <= 0xffff, wxT("composition string too long"));
db.data.push_back((unsigned __int16)n_com);
wstring::size_type n_dec = d->str.length();
wxASSERT_MSG(n_com + n_dec <= 0xffff, wxT("decomposition string too long"));
db.data.push_back((unsigned __int16)(n_com + n_dec));
for (wstring::size_type i = 0; i < n_com; i++)
db.data.push_back(t->first[i]);
for (wstring::size_type i = 0; i < n_dec; i++)
db.data.push_back(d->str[i]);
db.idxComp .push_back(idx);
db.idxDecomp.push_back(idx);

View File

@ -197,6 +197,18 @@ namespace ZRCola {
/// - \c true when character is used in language
/// - \c false otherwise
bool IsLocalCharacter(_In_ wchar_t chr, _In_ langid_t lang) const;
///
/// Tests presence of character in the given language
///
/// \param[in] chr Pointer to UTF-16 character start
/// \param[in] chr_end Pointer to UTF-16 character end
/// \param[in] lang Language
///
/// \returns
/// - \c true when character is used in language
/// - \c false otherwise
bool IsLocalCharacter(_In_ const wchar_t *chr, _In_ const wchar_t *chr_end, _In_ langid_t lang) const;
};

View File

@ -46,18 +46,22 @@ namespace ZRCola {
/// Translation data
///
struct translation {
wchar_t chr; ///< Composed character
unsigned __int16 rank; ///< Decomposition rank
unsigned __int16 str_len; ///< \c str length (in characters)
wchar_t str[]; ///< Decomposed string
unsigned __int16 rank; ///< Decomposition rank
static unsigned __int16 com_start; ///< Composed character start in \c data
union {
unsigned __int16 com_end; ///< Composed character end in \c data
unsigned __int16 dec_start; ///< Decomposed character start in \c data
};
unsigned __int16 dec_end; ///< Decomposed string end in \c data
wchar_t data[]; ///< Decomposed string and composed character
///
/// Binary compares two strings
///
/// \param[in] str_a First string
/// \param[in] count_a Number of characters in string \p str_a
/// \param[in] str_b Second string
/// \param[in] count_b Number of characters in string \p str_b
/// \param[in] str_a First string
/// \param[in] str_a_end First string end
/// \param[in] str_b Second string
/// \param[in] str_b_end Second string end
///
/// \returns
/// - <0 when str_a < str_b
@ -66,16 +70,16 @@ namespace ZRCola {
///
/// \note
/// The function does not treat \\0 characters as terminators for performance reasons.
/// Therefore \p count_a and \p count_b must represent exact string lengths.
/// Therefore \p str_a_end and \p str_b_end must represent exact string ends.
///
static inline int CompareString(const wchar_t *str_a, unsigned __int16 count_a, const wchar_t *str_b, unsigned __int16 count_b)
static inline int CompareString(const wchar_t *str_a, const wchar_t *str_a_end, const wchar_t *str_b, const wchar_t *str_b_end)
{
for (unsigned __int16 i = 0; ; i++) {
if (i >= count_a && i >= count_b) return 0;
else if (i >= count_a && i < count_b) return -1;
else if (i < count_a && i >= count_b) return +1;
else if (str_a[i] < str_b[i]) return -1;
else if (str_a[i] > str_b[i]) return +1;
for (; ; str_a++, str_b++) {
if (str_a >= str_a_end && str_b >= str_b_end) return 0;
else if (str_a >= str_a_end && str_b < str_b_end) return -1;
else if (str_a < str_a_end && str_b >= str_b_end) return +1;
else if (*str_a < *str_b) return -1;
else if (*str_a > *str_b) return +1;
}
}
};
@ -107,7 +111,7 @@ namespace ZRCola {
///
virtual int compare(_In_ const translation &a, _In_ const translation &b) const
{
int r = translation::CompareString(a.str, a.str_len, b.str, b.str_len);
int r = translation::CompareString(a.data + a.dec_start, a.data + a.dec_end, b.data + b.dec_start, b.data + b.dec_end);
if (r != 0) return r;
return 0;
@ -126,11 +130,11 @@ namespace ZRCola {
///
virtual int compare_sort(_In_ const translation &a, _In_ const translation &b) const
{
int r = translation::CompareString(a.str, a.str_len, b.str, b.str_len);
int r = translation::CompareString(a.data + a.dec_start, a.data + a.dec_end, b.data + b.dec_start, b.data + b.dec_end);
if (r != 0) return r;
if (a.chr < b.chr) return -1;
else if (a.chr > b.chr) return +1;
r = translation::CompareString(a.data + a.com_start, a.data + a.com_end, b.data + b.com_start, b.data + b.com_end);
if (r != 0) return r;
return 0;
}
@ -163,8 +167,8 @@ namespace ZRCola {
///
virtual int compare(_In_ const translation &a, _In_ const translation &b) const
{
if (a.chr < b.chr) return -1;
else if (a.chr > b.chr) return +1;
int r = translation::CompareString(a.data + a.com_start, a.data + a.com_end, b.data + b.com_start, b.data + b.com_end);
if (r != 0) return r;
return 0;
}
@ -182,13 +186,13 @@ namespace ZRCola {
///
virtual int compare_sort(_In_ const translation &a, _In_ const translation &b) const
{
if (a.chr < b.chr) return -1;
else if (a.chr > b.chr) return +1;
int r = translation::CompareString(a.data + a.com_start, a.data + a.com_end, b.data + b.com_start, b.data + b.com_end);
if (r != 0) return r;
if (a.rank < b.rank) return -1;
else if (a.rank > b.rank) return +1;
int r = translation::CompareString(a.str, a.str_len, b.str, b.str_len);
r = translation::CompareString(a.data + a.dec_start, a.data + a.dec_end, b.data + b.dec_start, b.data + b.dec_end);
if (r != 0) return r;
return 0;

View File

@ -94,3 +94,12 @@ bool ZRCola::langchar_db::IsLocalCharacter(_In_ wchar_t chr, _In_ ZRCola::langid
return false;
}
bool ZRCola::langchar_db::IsLocalCharacter(_In_ const wchar_t *chr, _In_ const wchar_t *chr_end, _In_ ZRCola::langid_t lang) const
{
    // The range [chr, chr_end) must be non-empty, since its first element is read below.
    assert(chr < chr_end);

    // chr_end is used by the assertion only; mark it as referenced for builds where assert() expands to nothing.
    UNREFERENCED_PARAMETER(chr_end);

    // TODO: Implement properly!
    // For now, only the first wchar_t of the range is tested, by delegating to the single-character overload.
    const wchar_t first = *chr;
    return IsLocalCharacter(first, lang);
}

View File

@ -19,6 +19,8 @@
#include "stdafx.h"
// Out-of-class definition of the static member translation::com_start.
// Being static, it is one value shared by every translation record; it is initialized to 0.
unsigned __int16 ZRCola::translation_db::translation::com_start = 0;
void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input, _In_ size_t inputMax, _Out_ std::wstring &output, _Out_opt_ std::vector<mapping>* map) const
{
@ -27,8 +29,7 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
// Trim inputMax to actual length.
inputMax = inputMax != (size_t)-1 ? wcsnlen(input, inputMax) : wcslen(input);
// Clear the output string and preallocate at least inputMax chars.
// Since composing is usually reducing the number of chars, memory reallocation is not expected later.
// Clear the output.
output.clear();
output.reserve(inputMax);
if (map)
@ -49,7 +50,8 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
// All compositions that get short on characters are lexically ordered before.
// Thus the j-th character is considered 0.
const translation &trans = idxComp[m];
wchar_t s = j < trans.str_len ? trans.str[j] : 0;
size_t jj = trans.dec_start + j;
wchar_t s = jj < trans.dec_end ? trans.data[jj] : 0;
// Do the bisection test.
if (c < s) r = m;
@ -61,7 +63,8 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
for (size_t rr = m; l < rr;) {
size_t m = (l + rr) / 2;
const translation &trans = idxComp[m];
wchar_t s = j < trans.str_len ? trans.str[j] : 0;
size_t jj = trans.dec_start + j;
wchar_t s = jj < trans.dec_end ? trans.data[jj] : 0;
if (c <= s) rr = m; else l = m + 1;
}
@ -69,12 +72,13 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
for (size_t ll = m + 1; ll < r;) {
size_t m = (ll + r) / 2;
const translation &trans = idxComp[m];
wchar_t s = j < trans.str_len ? trans.str[j] : 0;
size_t jj = trans.dec_start + j;
wchar_t s = jj < trans.dec_end ? trans.data[jj] : 0;
if (s <= c) ll = m + 1; else r = m;
}
const translation &trans = idxComp[l];
if (j + 1 == trans.str_len) {
if (trans.dec_start + j + 1 == trans.dec_end) {
// The first composition of the run was a match (thus far). Save it.
l_match = l;
}
@ -87,9 +91,9 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
if (l_match < compositionsCount) {
// The saved composition was an exact match.
const translation &trans = idxComp[l_match];
output += trans.chr;
i += trans.str_len;
if (trans.str_len > 1 && map) {
output.append(trans.data + trans.com_start, trans.data + trans.com_end);
i += trans.dec_end - trans.dec_start;
if (trans.dec_end - trans.dec_start != trans.com_end - trans.com_start && map) {
// Mapping changed.
map->push_back(ZRCola::mapping(i, output.length()));
}
@ -109,59 +113,85 @@ void ZRCola::translation_db::Decompose(_In_z_count_(inputMax) const wchar_t* inp
// Trim inputMax to actual length.
inputMax = inputMax != (size_t)-1 ? wcsnlen(input, inputMax) : wcslen(input);
// Clear the output string and preallocate at least 2*inputMax chars.
// Since decomposition expands the string, let's keep our fingers crossed to avoid reallocation later.
// Clear the output.
output.clear();
output.reserve(inputMax * 2);
output.reserve(inputMax);
if (map)
map->clear();
auto decompositionsCount = idxDecomp.size();
for (size_t i = 0; i < inputMax;) {
// Find whether the character can be decomposed.
wchar_t c = input[i];
for (size_t l = 0, r = decompositionsCount;; ) {
if (l < r) {
// Find the longest matching decomposition at i-th character.
size_t l_match = (size_t)-1;
for (size_t l = 0, r = decompositionsCount, ii = i, j = 0; ii < inputMax && l < r; ii++, j++) {
wchar_t c = input[ii];
while (l < r) {
// Test the decomposition in the middle of the search area.
size_t m = (l + r) / 2;
// Get the j-th character of the decomposition.
// All decompositions that get short on characters are lexically ordered before.
// Thus the j-th character is considered 0.
const translation &trans = idxDecomp[m];
wchar_t decompSrc = trans.chr;
if (c < decompSrc) r = m;
else if (decompSrc < c) l = m + 1;
size_t jj = trans.com_start + j;
wchar_t s = jj < trans.com_end ? trans.data[jj] : 0;
// Do the bisection test.
if (c < s) r = m;
else if (s < c) l = m + 1;
else {
// Character found.
// Narrow the search area on the left to start at the first decomposition in the run (first by rank).
// Narrow the search area on the left to start at the first decomposition in the run.
for (size_t rr = m; l < rr;) {
size_t m = (l + rr) / 2;
const translation &trans = idxDecomp[m];
wchar_t decompSrc = trans.chr;
if (c <= decompSrc) rr = m; else l = m + 1;
size_t jj = trans.com_start + j;
wchar_t s = jj < trans.com_end ? trans.data[jj] : 0;
if (c <= s) rr = m; else l = m + 1;
}
// Narrow the search area on the right to end at the first decomposition not in the run.
for (size_t ll = m + 1; ll < r;) {
size_t m = (ll + r) / 2;
const translation &trans = idxDecomp[m];
size_t jj = trans.com_start + j;
wchar_t s = jj < trans.com_end ? trans.data[jj] : 0;
if (s <= c) ll = m + 1; else r = m;
}
const translation &trans = idxDecomp[l];
if (trans.str_len && trans.str[0] != L'#' && (!lc_db || !lc_db->IsLocalCharacter(c, lang))) {
// Append decomposed sequence.
output.append(trans.str, trans.str_len);
i++;
if (map) {
// Mapping changed.
map->push_back(ZRCola::mapping(i, output.length()));
}
} else {
// Character is inhibited to decompose.
output += c;
i++;
if (trans.com_start + j + 1 == trans.com_end) {
// The first decomposition of the run was a match (thus far). Save it.
l_match = l;
}
break;
}
} else {
// Character not found.
output += c;
i++;
break;
}
}
if (l_match < decompositionsCount) {
// The saved decomposition was an exact match.
const translation &trans = idxDecomp[l_match];
if (trans.dec_start < trans.dec_end && trans.data[trans.dec_start] != L'#' && (!lc_db || !lc_db->IsLocalCharacter(trans.data + trans.com_start, trans.data + trans.com_end, lang))) {
// Append decomposed sequence.
output.append(trans.data + trans.dec_start, trans.data + trans.dec_end);
i += trans.com_end - trans.com_start;
if (trans.dec_end - trans.dec_start != trans.com_end - trans.com_start && map) {
// Mapping changed.
map->push_back(ZRCola::mapping(i, output.length()));
}
} else {
// Character is inhibited to decompose.
output.append(trans.data + trans.com_start, trans.data + trans.com_end);
i += trans.com_end - trans.com_start;
}
} else {
// The match was not found.
output += input[i];
i++;
}
}
}

Binary file not shown.