Composed characters extended to support multiple UTF-16 characters
This commit is contained in:
parent
87814981db
commit
bd0fdba435
@ -621,7 +621,7 @@ bool ZRCola::DBSource::GetTranslation(const com_obj<ADORecordset>& rs, ZRCola::D
|
||||
{
|
||||
com_obj<ADOField> f;
|
||||
wxVERIFY(SUCCEEDED(flds->get_Item(variant(L"znak"), &f)));
|
||||
wxCHECK(GetUnicodeCharacter(f, t.chr), false);
|
||||
wxCHECK(GetUnicodeString(f, t.chr), false);
|
||||
}
|
||||
|
||||
{
|
||||
|
@ -79,7 +79,7 @@ namespace ZRCola {
|
||||
///
|
||||
class translation {
|
||||
public:
|
||||
wchar_t chr; ///< Composed character
|
||||
std::wstring chr; ///< Composed character
|
||||
charseq decomp; ///< Decomposed sequence
|
||||
};
|
||||
|
||||
|
@ -24,10 +24,10 @@ using namespace stdex;
|
||||
using namespace winstd;
|
||||
|
||||
|
||||
typedef map<wchar_t, set<ZRCola::DBSource::charseq, ZRCola::DBSource::charseq::less_rank_str> > translation_db;
|
||||
typedef map<wstring, set<ZRCola::DBSource::charseq, ZRCola::DBSource::charseq::less_rank_str> > translation_db;
|
||||
|
||||
|
||||
static set<wstring> decompose(_In_ const translation_db &db, _In_z_ const wchar_t *str, _Inout_ set<wchar_t> &path)
|
||||
static set<wstring> decompose(_In_ const translation_db &db, _In_z_ const wchar_t *str, _Inout_ set<translation_db::key_type> &path)
|
||||
{
|
||||
set<wstring> res;
|
||||
|
||||
@ -37,10 +37,11 @@ static set<wstring> decompose(_In_ const translation_db &db, _In_z_ const wchar_
|
||||
if (rem.empty())
|
||||
return res;
|
||||
|
||||
auto const t = db.find(*str);
|
||||
translation_db::key_type _str(1, *str);
|
||||
auto const t = db.find(_str);
|
||||
if (t != db.end()) {
|
||||
// Current character decomposed. Iterate all possible decompositions and combine them with the remainder.
|
||||
auto p = path.insert(*str);
|
||||
auto p = path.insert(_str);
|
||||
if (!p.second) {
|
||||
// Path already contains this character: Cycle detected!
|
||||
return res;
|
||||
@ -55,14 +56,14 @@ static set<wstring> decompose(_In_ const translation_db &db, _In_z_ const wchar_
|
||||
} else {
|
||||
// Cycle detected. Do not continue decomposition.
|
||||
for (auto r = rem.cbegin(), r_end = rem.cend(); r != r_end; ++r)
|
||||
res.insert(wstring(1, *str) + *r);
|
||||
res.insert(_str + *r);
|
||||
}
|
||||
}
|
||||
path.erase(p.first);
|
||||
} else {
|
||||
// Current character is non-decomposable. Combine it with the remainder(s).
|
||||
for (auto r = rem.cbegin(), r_end = rem.cend(); r != r_end; ++r)
|
||||
res.insert(wstring(1, *str) + *r);
|
||||
res.insert(_str + *r);
|
||||
}
|
||||
} else {
|
||||
// Empty string results in empty decomposition.
|
||||
@ -180,7 +181,7 @@ int _tmain(int argc, _TCHAR *argv[])
|
||||
translation_db db_temp2;
|
||||
for (auto t1 = db_temp1.cbegin(), t1_end = db_temp1.cend(); t1 != t1_end; ++t1) {
|
||||
for (auto d1 = t1->second.cbegin(), d1_end = t1->second.cend(); d1 != d1_end; ++d1) {
|
||||
set<wchar_t> path;
|
||||
set<translation_db::key_type> path;
|
||||
path.insert(t1->first);
|
||||
auto str = decompose(db_temp1, d1->str.c_str(), path);
|
||||
assert(!str.empty());
|
||||
@ -204,20 +205,24 @@ int _tmain(int argc, _TCHAR *argv[])
|
||||
// Preallocate memory.
|
||||
db.idxComp .reserve(count);
|
||||
db.idxDecomp.reserve(count);
|
||||
db.data .reserve(count*4);
|
||||
db.data .reserve(count*5);
|
||||
|
||||
// Parse translations and build index and data.
|
||||
for (auto t = db_temp2.cbegin(), t_end = db_temp2.cend(); t != t_end; ++t) {
|
||||
// Add translation to index and data.
|
||||
for (auto d = t->second.cbegin(), d_end = t->second.cend(); d != d_end; ++d) {
|
||||
unsigned __int32 idx = db.data.size();
|
||||
db.data.push_back(t->first);
|
||||
wxASSERT_MSG((int)0xffff8000 <= d->rank && d->rank <= (int)0x00007fff, wxT("transformation rank out of bounds"));
|
||||
db.data.push_back((unsigned __int16)d->rank);
|
||||
wstring::size_type n = d->str.length();
|
||||
wxASSERT_MSG(n <= 0xffff, wxT("transformation string too long"));
|
||||
db.data.push_back((unsigned __int16)n);
|
||||
for (wstring::size_type i = 0; i < n; i++)
|
||||
wstring::size_type n_com = t->first.length();
|
||||
wxASSERT_MSG(n_com <= 0xffff, wxT("composition string too long"));
|
||||
db.data.push_back((unsigned __int16)n_com);
|
||||
wstring::size_type n_dec = d->str.length();
|
||||
wxASSERT_MSG(n_com + n_dec <= 0xffff, wxT("decomposition string too long"));
|
||||
db.data.push_back((unsigned __int16)(n_com + n_dec));
|
||||
for (wstring::size_type i = 0; i < n_com; i++)
|
||||
db.data.push_back(t->first[i]);
|
||||
for (wstring::size_type i = 0; i < n_dec; i++)
|
||||
db.data.push_back(d->str[i]);
|
||||
db.idxComp .push_back(idx);
|
||||
db.idxDecomp.push_back(idx);
|
||||
|
@ -197,6 +197,18 @@ namespace ZRCola {
|
||||
/// - \c true when character is used in language
|
||||
/// - \c false otherwise
|
||||
bool IsLocalCharacter(_In_ wchar_t chr, _In_ langid_t lang) const;
|
||||
|
||||
///
|
||||
/// Tests presence of character in the given language
|
||||
///
|
||||
/// \param[in] chr Pointer to UTF-16 character start
|
||||
/// \param[in] chr_end Pointer to UTF-16 character end
|
||||
/// \param[in] lang Language
|
||||
///
|
||||
/// \returns
|
||||
/// - \c true when character is used in language
|
||||
/// - \c false otherwise
|
||||
bool IsLocalCharacter(_In_ const wchar_t *chr, _In_ const wchar_t *chr_end, _In_ langid_t lang) const;
|
||||
};
|
||||
|
||||
|
||||
|
@ -46,18 +46,22 @@ namespace ZRCola {
|
||||
/// Translation data
|
||||
///
|
||||
struct translation {
|
||||
wchar_t chr; ///< Composed character
|
||||
unsigned __int16 rank; ///< Decomposition rank
|
||||
unsigned __int16 str_len; ///< \c str length (in characters)
|
||||
wchar_t str[]; ///< Decomposed string
|
||||
static unsigned __int16 com_start; ///< Composed character start in \c data
|
||||
union {
|
||||
unsigned __int16 com_end; ///< Composed character end in \c data
|
||||
unsigned __int16 dec_start; ///< Decomposed character start in \c data
|
||||
};
|
||||
unsigned __int16 dec_end; ///< Decomposed string end in \c data
|
||||
wchar_t data[]; ///< Decomposed string and composed character
|
||||
|
||||
///
|
||||
/// Binary compares two strings
|
||||
///
|
||||
/// \param[in] str_a First string
|
||||
/// \param[in] count_a Number of characters in string \p str_a
|
||||
/// \param[in] str_a_end First string end
|
||||
/// \param[in] str_b Second string
|
||||
/// \param[in] count_b Number of characters in string \p str_b
|
||||
/// \param[in] str_b_end Second string end
|
||||
///
|
||||
/// \returns
|
||||
/// - <0 when str_a < str_b
|
||||
@ -66,16 +70,16 @@ namespace ZRCola {
|
||||
///
|
||||
/// \note
|
||||
/// The function does not treat \\0 characters as terminators for performance reasons.
|
||||
/// Therefore \p count_a and \p count_b must represent exact string lengths.
|
||||
/// Therefore \p str_a_end and \p str_b_end must represent exact string ends.
|
||||
///
|
||||
static inline int CompareString(const wchar_t *str_a, unsigned __int16 count_a, const wchar_t *str_b, unsigned __int16 count_b)
|
||||
static inline int CompareString(const wchar_t *str_a, const wchar_t *str_a_end, const wchar_t *str_b, const wchar_t *str_b_end)
|
||||
{
|
||||
for (unsigned __int16 i = 0; ; i++) {
|
||||
if (i >= count_a && i >= count_b) return 0;
|
||||
else if (i >= count_a && i < count_b) return -1;
|
||||
else if (i < count_a && i >= count_b) return +1;
|
||||
else if (str_a[i] < str_b[i]) return -1;
|
||||
else if (str_a[i] > str_b[i]) return +1;
|
||||
for (; ; str_a++, str_b++) {
|
||||
if (str_a >= str_a_end && str_b >= str_b_end) return 0;
|
||||
else if (str_a >= str_a_end && str_b < str_b_end) return -1;
|
||||
else if (str_a < str_a_end && str_b >= str_b_end) return +1;
|
||||
else if (*str_a < *str_b) return -1;
|
||||
else if (*str_a > *str_b) return +1;
|
||||
}
|
||||
}
|
||||
};
|
||||
@ -107,7 +111,7 @@ namespace ZRCola {
|
||||
///
|
||||
virtual int compare(_In_ const translation &a, _In_ const translation &b) const
|
||||
{
|
||||
int r = translation::CompareString(a.str, a.str_len, b.str, b.str_len);
|
||||
int r = translation::CompareString(a.data + a.dec_start, a.data + a.dec_end, b.data + b.dec_start, b.data + b.dec_end);
|
||||
if (r != 0) return r;
|
||||
|
||||
return 0;
|
||||
@ -126,11 +130,11 @@ namespace ZRCola {
|
||||
///
|
||||
virtual int compare_sort(_In_ const translation &a, _In_ const translation &b) const
|
||||
{
|
||||
int r = translation::CompareString(a.str, a.str_len, b.str, b.str_len);
|
||||
int r = translation::CompareString(a.data + a.dec_start, a.data + a.dec_end, b.data + b.dec_start, b.data + b.dec_end);
|
||||
if (r != 0) return r;
|
||||
|
||||
if (a.chr < b.chr) return -1;
|
||||
else if (a.chr > b.chr) return +1;
|
||||
r = translation::CompareString(a.data + a.com_start, a.data + a.com_end, b.data + b.com_start, b.data + b.com_end);
|
||||
if (r != 0) return r;
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -163,8 +167,8 @@ namespace ZRCola {
|
||||
///
|
||||
virtual int compare(_In_ const translation &a, _In_ const translation &b) const
|
||||
{
|
||||
if (a.chr < b.chr) return -1;
|
||||
else if (a.chr > b.chr) return +1;
|
||||
int r = translation::CompareString(a.data + a.com_start, a.data + a.com_end, b.data + b.com_start, b.data + b.com_end);
|
||||
if (r != 0) return r;
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -182,13 +186,13 @@ namespace ZRCola {
|
||||
///
|
||||
virtual int compare_sort(_In_ const translation &a, _In_ const translation &b) const
|
||||
{
|
||||
if (a.chr < b.chr) return -1;
|
||||
else if (a.chr > b.chr) return +1;
|
||||
int r = translation::CompareString(a.data + a.com_start, a.data + a.com_end, b.data + b.com_start, b.data + b.com_end);
|
||||
if (r != 0) return r;
|
||||
|
||||
if (a.rank < b.rank) return -1;
|
||||
else if (a.rank > b.rank) return +1;
|
||||
|
||||
int r = translation::CompareString(a.str, a.str_len, b.str, b.str_len);
|
||||
r = translation::CompareString(a.data + a.dec_start, a.data + a.dec_end, b.data + b.dec_start, b.data + b.dec_end);
|
||||
if (r != 0) return r;
|
||||
|
||||
return 0;
|
||||
|
@ -94,3 +94,12 @@ bool ZRCola::langchar_db::IsLocalCharacter(_In_ wchar_t chr, _In_ ZRCola::langid
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool ZRCola::langchar_db::IsLocalCharacter(_In_ const wchar_t *chr, _In_ const wchar_t *chr_end, _In_ ZRCola::langid_t lang) const
|
||||
{
|
||||
// TODO: Implement properly!
|
||||
UNREFERENCED_PARAMETER(chr_end);
|
||||
assert(chr < chr_end);
|
||||
return IsLocalCharacter(*chr, lang);
|
||||
}
|
||||
|
@ -19,6 +19,8 @@
|
||||
|
||||
#include "stdafx.h"
|
||||
|
||||
unsigned __int16 ZRCola::translation_db::translation::com_start = 0;
|
||||
|
||||
|
||||
void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input, _In_ size_t inputMax, _Out_ std::wstring &output, _Out_opt_ std::vector<mapping>* map) const
|
||||
{
|
||||
@ -27,8 +29,7 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
|
||||
// Trim inputMax to actual length.
|
||||
inputMax = inputMax != (size_t)-1 ? wcsnlen(input, inputMax) : wcslen(input);
|
||||
|
||||
// Clear the output string and preallocate at least inputMax chars.
|
||||
// Since composing is usually reducing the number of chars, memory reallocation is not expected later.
|
||||
// Clear the output.
|
||||
output.clear();
|
||||
output.reserve(inputMax);
|
||||
if (map)
|
||||
@ -49,7 +50,8 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
|
||||
// All compositions that get short on characters are lexically ordered before.
|
||||
// Thus the j-th character is considered 0.
|
||||
const translation &trans = idxComp[m];
|
||||
wchar_t s = j < trans.str_len ? trans.str[j] : 0;
|
||||
size_t jj = trans.dec_start + j;
|
||||
wchar_t s = jj < trans.dec_end ? trans.data[jj] : 0;
|
||||
|
||||
// Do the bisection test.
|
||||
if (c < s) r = m;
|
||||
@ -61,7 +63,8 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
|
||||
for (size_t rr = m; l < rr;) {
|
||||
size_t m = (l + rr) / 2;
|
||||
const translation &trans = idxComp[m];
|
||||
wchar_t s = j < trans.str_len ? trans.str[j] : 0;
|
||||
size_t jj = trans.dec_start + j;
|
||||
wchar_t s = jj < trans.dec_end ? trans.data[jj] : 0;
|
||||
if (c <= s) rr = m; else l = m + 1;
|
||||
}
|
||||
|
||||
@ -69,12 +72,13 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
|
||||
for (size_t ll = m + 1; ll < r;) {
|
||||
size_t m = (ll + r) / 2;
|
||||
const translation &trans = idxComp[m];
|
||||
wchar_t s = j < trans.str_len ? trans.str[j] : 0;
|
||||
size_t jj = trans.dec_start + j;
|
||||
wchar_t s = jj < trans.dec_end ? trans.data[jj] : 0;
|
||||
if (s <= c) ll = m + 1; else r = m;
|
||||
}
|
||||
|
||||
const translation &trans = idxComp[l];
|
||||
if (j + 1 == trans.str_len) {
|
||||
if (trans.dec_start + j + 1 == trans.dec_end) {
|
||||
// The first composition of the run was a match (thus far). Save it.
|
||||
l_match = l;
|
||||
}
|
||||
@ -87,9 +91,9 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
|
||||
if (l_match < compositionsCount) {
|
||||
// The saved composition was an exact match.
|
||||
const translation &trans = idxComp[l_match];
|
||||
output += trans.chr;
|
||||
i += trans.str_len;
|
||||
if (trans.str_len > 1 && map) {
|
||||
output.append(trans.data + trans.com_start, trans.data + trans.com_end);
|
||||
i += trans.dec_end - trans.dec_start;
|
||||
if (trans.dec_end - trans.dec_start != trans.com_end - trans.com_start && map) {
|
||||
// Mapping changed.
|
||||
map->push_back(ZRCola::mapping(i, output.length()));
|
||||
}
|
||||
@ -109,59 +113,85 @@ void ZRCola::translation_db::Decompose(_In_z_count_(inputMax) const wchar_t* inp
|
||||
// Trim inputMax to actual length.
|
||||
inputMax = inputMax != (size_t)-1 ? wcsnlen(input, inputMax) : wcslen(input);
|
||||
|
||||
// Clear the output string and preallocate at least 2*inputMax chars.
|
||||
// Since decomposition expands the string, let's keep our fingers crossed to avoid reallocation later.
|
||||
// Clear the output.
|
||||
output.clear();
|
||||
output.reserve(inputMax * 2);
|
||||
output.reserve(inputMax);
|
||||
if (map)
|
||||
map->clear();
|
||||
|
||||
auto decompositionsCount = idxDecomp.size();
|
||||
|
||||
for (size_t i = 0; i < inputMax;) {
|
||||
// Find whether the character can be decomposed.
|
||||
wchar_t c = input[i];
|
||||
|
||||
for (size_t l = 0, r = decompositionsCount;; ) {
|
||||
if (l < r) {
|
||||
// Find the longest matching decomposition at i-th character.
|
||||
size_t l_match = (size_t)-1;
|
||||
for (size_t l = 0, r = decompositionsCount, ii = i, j = 0; ii < inputMax && l < r; ii++, j++) {
|
||||
wchar_t c = input[ii];
|
||||
while (l < r) {
|
||||
// Test the decomposition in the middle of the search area.
|
||||
size_t m = (l + r) / 2;
|
||||
|
||||
// Get the j-th character of the decomposition.
|
||||
// All decompositions that get short on characters are lexically ordered before.
|
||||
// Thus the j-th character is considered 0.
|
||||
const translation &trans = idxDecomp[m];
|
||||
wchar_t decompSrc = trans.chr;
|
||||
if (c < decompSrc) r = m;
|
||||
else if (decompSrc < c) l = m + 1;
|
||||
size_t jj = trans.com_start + j;
|
||||
wchar_t s = jj < trans.com_end ? trans.data[jj] : 0;
|
||||
|
||||
// Do the bisection test.
|
||||
if (c < s) r = m;
|
||||
else if (s < c) l = m + 1;
|
||||
else {
|
||||
// Character found.
|
||||
|
||||
// Narrow the search area on the left to start at the first decomposition in the run (first by rank).
|
||||
// Narrow the search area on the left to start at the first decomposition in the run.
|
||||
for (size_t rr = m; l < rr;) {
|
||||
size_t m = (l + rr) / 2;
|
||||
const translation &trans = idxDecomp[m];
|
||||
wchar_t decompSrc = trans.chr;
|
||||
if (c <= decompSrc) rr = m; else l = m + 1;
|
||||
size_t jj = trans.com_start + j;
|
||||
wchar_t s = jj < trans.com_end ? trans.data[jj] : 0;
|
||||
if (c <= s) rr = m; else l = m + 1;
|
||||
}
|
||||
|
||||
// Narrow the search area on the right to end at the first decomposition not in the run.
|
||||
for (size_t ll = m + 1; ll < r;) {
|
||||
size_t m = (ll + r) / 2;
|
||||
const translation &trans = idxDecomp[m];
|
||||
size_t jj = trans.com_start + j;
|
||||
wchar_t s = jj < trans.com_end ? trans.data[jj] : 0;
|
||||
if (s <= c) ll = m + 1; else r = m;
|
||||
}
|
||||
|
||||
const translation &trans = idxDecomp[l];
|
||||
if (trans.str_len && trans.str[0] != L'#' && (!lc_db || !lc_db->IsLocalCharacter(c, lang))) {
|
||||
if (trans.com_start + j + 1 == trans.com_end) {
|
||||
// The first decomposition of the run was a match (thus far). Save it.
|
||||
l_match = l;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (l_match < decompositionsCount) {
|
||||
// The saved decomposition was an exact match.
|
||||
const translation &trans = idxDecomp[l_match];
|
||||
if (trans.dec_start < trans.dec_end && trans.data[trans.dec_start] != L'#' && (!lc_db || !lc_db->IsLocalCharacter(trans.data + trans.com_start, trans.data + trans.com_end, lang))) {
|
||||
// Append decomposed sequence.
|
||||
output.append(trans.str, trans.str_len);
|
||||
i++;
|
||||
if (map) {
|
||||
output.append(trans.data + trans.dec_start, trans.data + trans.dec_end);
|
||||
i += trans.com_end - trans.com_start;
|
||||
if (trans.dec_end - trans.dec_start != trans.com_end - trans.com_start && map) {
|
||||
// Mapping changed.
|
||||
map->push_back(ZRCola::mapping(i, output.length()));
|
||||
}
|
||||
} else {
|
||||
// Character is inhibited to decompose.
|
||||
output += c;
|
||||
i++;
|
||||
}
|
||||
break;
|
||||
output.append(trans.data + trans.com_start, trans.data + trans.com_end);
|
||||
i += trans.com_end - trans.com_start;
|
||||
}
|
||||
} else {
|
||||
// Character not found.
|
||||
output += c;
|
||||
// The match was not found.
|
||||
output += input[i];
|
||||
i++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user