Composed characters extended to support multiple UTF-16 characters
This commit is contained in:
parent
87814981db
commit
bd0fdba435
@ -621,7 +621,7 @@ bool ZRCola::DBSource::GetTranslation(const com_obj<ADORecordset>& rs, ZRCola::D
|
|||||||
{
|
{
|
||||||
com_obj<ADOField> f;
|
com_obj<ADOField> f;
|
||||||
wxVERIFY(SUCCEEDED(flds->get_Item(variant(L"znak"), &f)));
|
wxVERIFY(SUCCEEDED(flds->get_Item(variant(L"znak"), &f)));
|
||||||
wxCHECK(GetUnicodeCharacter(f, t.chr), false);
|
wxCHECK(GetUnicodeString(f, t.chr), false);
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
|
@ -79,7 +79,7 @@ namespace ZRCola {
|
|||||||
///
|
///
|
||||||
class translation {
|
class translation {
|
||||||
public:
|
public:
|
||||||
wchar_t chr; ///< Composed character
|
std::wstring chr; ///< Composed character
|
||||||
charseq decomp; ///< Decomposed sequence
|
charseq decomp; ///< Decomposed sequence
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -24,10 +24,10 @@ using namespace stdex;
|
|||||||
using namespace winstd;
|
using namespace winstd;
|
||||||
|
|
||||||
|
|
||||||
typedef map<wchar_t, set<ZRCola::DBSource::charseq, ZRCola::DBSource::charseq::less_rank_str> > translation_db;
|
typedef map<wstring, set<ZRCola::DBSource::charseq, ZRCola::DBSource::charseq::less_rank_str> > translation_db;
|
||||||
|
|
||||||
|
|
||||||
static set<wstring> decompose(_In_ const translation_db &db, _In_z_ const wchar_t *str, _Inout_ set<wchar_t> &path)
|
static set<wstring> decompose(_In_ const translation_db &db, _In_z_ const wchar_t *str, _Inout_ set<translation_db::key_type> &path)
|
||||||
{
|
{
|
||||||
set<wstring> res;
|
set<wstring> res;
|
||||||
|
|
||||||
@ -37,10 +37,11 @@ static set<wstring> decompose(_In_ const translation_db &db, _In_z_ const wchar_
|
|||||||
if (rem.empty())
|
if (rem.empty())
|
||||||
return res;
|
return res;
|
||||||
|
|
||||||
auto const t = db.find(*str);
|
translation_db::key_type _str(1, *str);
|
||||||
|
auto const t = db.find(_str);
|
||||||
if (t != db.end()) {
|
if (t != db.end()) {
|
||||||
// Current character is decomposable. Iterate all possible decompositions and combine them with the remainder.
|
// Current character is decomposable. Iterate all possible decompositions and combine them with the remainder.
|
||||||
auto p = path.insert(*str);
|
auto p = path.insert(_str);
|
||||||
if (!p.second) {
|
if (!p.second) {
|
||||||
// Path already contains this character: Cycle detected!
|
// Path already contains this character: Cycle detected!
|
||||||
return res;
|
return res;
|
||||||
@ -55,14 +56,14 @@ static set<wstring> decompose(_In_ const translation_db &db, _In_z_ const wchar_
|
|||||||
} else {
|
} else {
|
||||||
// Cycle detected. Do not continue decomposition.
|
// Cycle detected. Do not continue decomposition.
|
||||||
for (auto r = rem.cbegin(), r_end = rem.cend(); r != r_end; ++r)
|
for (auto r = rem.cbegin(), r_end = rem.cend(); r != r_end; ++r)
|
||||||
res.insert(wstring(1, *str) + *r);
|
res.insert(_str + *r);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
path.erase(p.first);
|
path.erase(p.first);
|
||||||
} else {
|
} else {
|
||||||
// Current character is non-decomposable. Combine it with the remainder(s).
|
// Current character is non-decomposable. Combine it with the remainder(s).
|
||||||
for (auto r = rem.cbegin(), r_end = rem.cend(); r != r_end; ++r)
|
for (auto r = rem.cbegin(), r_end = rem.cend(); r != r_end; ++r)
|
||||||
res.insert(wstring(1, *str) + *r);
|
res.insert(_str + *r);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Empty string results in empty decomposition.
|
// Empty string results in empty decomposition.
|
||||||
@ -180,7 +181,7 @@ int _tmain(int argc, _TCHAR *argv[])
|
|||||||
translation_db db_temp2;
|
translation_db db_temp2;
|
||||||
for (auto t1 = db_temp1.cbegin(), t1_end = db_temp1.cend(); t1 != t1_end; ++t1) {
|
for (auto t1 = db_temp1.cbegin(), t1_end = db_temp1.cend(); t1 != t1_end; ++t1) {
|
||||||
for (auto d1 = t1->second.cbegin(), d1_end = t1->second.cend(); d1 != d1_end; ++d1) {
|
for (auto d1 = t1->second.cbegin(), d1_end = t1->second.cend(); d1 != d1_end; ++d1) {
|
||||||
set<wchar_t> path;
|
set<translation_db::key_type> path;
|
||||||
path.insert(t1->first);
|
path.insert(t1->first);
|
||||||
auto str = decompose(db_temp1, d1->str.c_str(), path);
|
auto str = decompose(db_temp1, d1->str.c_str(), path);
|
||||||
assert(!str.empty());
|
assert(!str.empty());
|
||||||
@ -204,20 +205,24 @@ int _tmain(int argc, _TCHAR *argv[])
|
|||||||
// Preallocate memory.
|
// Preallocate memory.
|
||||||
db.idxComp .reserve(count);
|
db.idxComp .reserve(count);
|
||||||
db.idxDecomp.reserve(count);
|
db.idxDecomp.reserve(count);
|
||||||
db.data .reserve(count*4);
|
db.data .reserve(count*5);
|
||||||
|
|
||||||
// Parse translations and build index and data.
|
// Parse translations and build index and data.
|
||||||
for (auto t = db_temp2.cbegin(), t_end = db_temp2.cend(); t != t_end; ++t) {
|
for (auto t = db_temp2.cbegin(), t_end = db_temp2.cend(); t != t_end; ++t) {
|
||||||
// Add translation to index and data.
|
// Add translation to index and data.
|
||||||
for (auto d = t->second.cbegin(), d_end = t->second.cend(); d != d_end; ++d) {
|
for (auto d = t->second.cbegin(), d_end = t->second.cend(); d != d_end; ++d) {
|
||||||
unsigned __int32 idx = db.data.size();
|
unsigned __int32 idx = db.data.size();
|
||||||
db.data.push_back(t->first);
|
|
||||||
wxASSERT_MSG((int)0xffff8000 <= d->rank && d->rank <= (int)0x00007fff, wxT("transformation rank out of bounds"));
|
wxASSERT_MSG((int)0xffff8000 <= d->rank && d->rank <= (int)0x00007fff, wxT("transformation rank out of bounds"));
|
||||||
db.data.push_back((unsigned __int16)d->rank);
|
db.data.push_back((unsigned __int16)d->rank);
|
||||||
wstring::size_type n = d->str.length();
|
wstring::size_type n_com = t->first.length();
|
||||||
wxASSERT_MSG(n <= 0xffff, wxT("transformation string too long"));
|
wxASSERT_MSG(n_com <= 0xffff, wxT("composition string too long"));
|
||||||
db.data.push_back((unsigned __int16)n);
|
db.data.push_back((unsigned __int16)n_com);
|
||||||
for (wstring::size_type i = 0; i < n; i++)
|
wstring::size_type n_dec = d->str.length();
|
||||||
|
wxASSERT_MSG(n_com + n_dec <= 0xffff, wxT("decomposition string too long"));
|
||||||
|
db.data.push_back((unsigned __int16)(n_com + n_dec));
|
||||||
|
for (wstring::size_type i = 0; i < n_com; i++)
|
||||||
|
db.data.push_back(t->first[i]);
|
||||||
|
for (wstring::size_type i = 0; i < n_dec; i++)
|
||||||
db.data.push_back(d->str[i]);
|
db.data.push_back(d->str[i]);
|
||||||
db.idxComp .push_back(idx);
|
db.idxComp .push_back(idx);
|
||||||
db.idxDecomp.push_back(idx);
|
db.idxDecomp.push_back(idx);
|
||||||
|
@ -197,6 +197,18 @@ namespace ZRCola {
|
|||||||
/// - \c true when character is used in language
|
/// - \c true when character is used in language
|
||||||
/// - \c false otherwise
|
/// - \c false otherwise
|
||||||
bool IsLocalCharacter(_In_ wchar_t chr, _In_ langid_t lang) const;
|
bool IsLocalCharacter(_In_ wchar_t chr, _In_ langid_t lang) const;
|
||||||
|
|
||||||
|
///
|
||||||
|
/// Tests presence of character in the given language
|
||||||
|
///
|
||||||
|
/// \param[in] chr Pointer to UTF-16 character start
|
||||||
|
/// \param[in] chr_end Pointer to UTF-16 character end
|
||||||
|
/// \param[in] lang Language
|
||||||
|
///
|
||||||
|
/// \returns
|
||||||
|
/// - \c true when character is used in language
|
||||||
|
/// - \c false otherwise
|
||||||
|
bool IsLocalCharacter(_In_ const wchar_t *chr, _In_ const wchar_t *chr_end, _In_ langid_t lang) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -46,18 +46,22 @@ namespace ZRCola {
|
|||||||
/// Translation data
|
/// Translation data
|
||||||
///
|
///
|
||||||
struct translation {
|
struct translation {
|
||||||
wchar_t chr; ///< Composed character
|
unsigned __int16 rank; ///< Decomposition rank
|
||||||
unsigned __int16 rank; ///< Decomposition rank
|
static unsigned __int16 com_start; ///< Composed character start in \c data
|
||||||
unsigned __int16 str_len; ///< \c str length (in characters)
|
union {
|
||||||
wchar_t str[]; ///< Decomposed string
|
unsigned __int16 com_end; ///< Composed character end in \c data
|
||||||
|
unsigned __int16 dec_start; ///< Decomposed character start in \c data
|
||||||
|
};
|
||||||
|
unsigned __int16 dec_end; ///< Decomposed string end in \c data
|
||||||
|
wchar_t data[]; ///< Decomposed string and composed character
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Binary compares two strings
|
/// Binary compares two strings
|
||||||
///
|
///
|
||||||
/// \param[in] str_a First string
|
/// \param[in] str_a First string
|
||||||
/// \param[in] count_a Number of characters in string \p str_a
|
/// \param[in] str_a_end First string end
|
||||||
/// \param[in] str_b Second string
|
/// \param[in] str_b Second string
|
||||||
/// \param[in] count_b Number of characters in string \p str_b
|
/// \param[in] str_b_end Second string end
|
||||||
///
|
///
|
||||||
/// \returns
|
/// \returns
|
||||||
/// - <0 when str_a < str_b
|
/// - <0 when str_a < str_b
|
||||||
@ -66,16 +70,16 @@ namespace ZRCola {
|
|||||||
///
|
///
|
||||||
/// \note
|
/// \note
|
||||||
/// The function does not treat \\0 characters as terminators for performance reasons.
|
/// The function does not treat \\0 characters as terminators for performance reasons.
|
||||||
/// Therefore \p count_a and \p count_b must represent exact string lengths.
|
/// Therefore \p str_a_end and \p str_b_end must represent exact string ends.
|
||||||
///
|
///
|
||||||
static inline int CompareString(const wchar_t *str_a, const wchar_t *str_a_end, const wchar_t *str_b, const wchar_t *str_b_end)
{
    // Both strings are half-open ranges [str, str_end). \0 is NOT a terminator;
    // only the end pointers delimit the strings.

    // Walk the common prefix: the first differing code unit decides the order.
    for (; str_a < str_a_end && str_b < str_b_end; ++str_a, ++str_b) {
        if (*str_a < *str_b) return -1;
        if (*str_a > *str_b) return +1;
    }

    // At least one range is exhausted and the shared prefix is equal:
    // the shorter string orders before the longer one.
    if (str_b < str_b_end) return -1;   // a ended first
    if (str_a < str_a_end) return +1;   // b ended first
    return 0;                           // both ended: strings are identical
}
|
||||||
};
|
};
|
||||||
@ -107,7 +111,7 @@ namespace ZRCola {
|
|||||||
///
|
///
|
||||||
virtual int compare(_In_ const translation &a, _In_ const translation &b) const
|
virtual int compare(_In_ const translation &a, _In_ const translation &b) const
|
||||||
{
|
{
|
||||||
int r = translation::CompareString(a.str, a.str_len, b.str, b.str_len);
|
int r = translation::CompareString(a.data + a.dec_start, a.data + a.dec_end, b.data + b.dec_start, b.data + b.dec_end);
|
||||||
if (r != 0) return r;
|
if (r != 0) return r;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
@ -126,11 +130,11 @@ namespace ZRCola {
|
|||||||
///
|
///
|
||||||
virtual int compare_sort(_In_ const translation &a, _In_ const translation &b) const
{
    // Primary key: the decomposed string, stored in data[dec_start, dec_end).
    const int r = translation::CompareString(a.data + a.dec_start, a.data + a.dec_end, b.data + b.dec_start, b.data + b.dec_end);
    if (r != 0) return r;

    // Tie-break: the composed character, stored in data[com_start, com_end).
    // Its comparison result (including 0 for equal) is the final answer.
    return translation::CompareString(a.data + a.com_start, a.data + a.com_end, b.data + b.com_start, b.data + b.com_end);
}
|
||||||
@ -163,8 +167,8 @@ namespace ZRCola {
|
|||||||
///
|
///
|
||||||
virtual int compare(_In_ const translation &a, _In_ const translation &b) const
{
    // Records are ordered solely by their composed character, i.e. the
    // code units stored in data[com_start, com_end); rank and decomposed
    // string are deliberately ignored here.
    return translation::CompareString(a.data + a.com_start, a.data + a.com_end, b.data + b.com_start, b.data + b.com_end);
}
|
||||||
@ -182,13 +186,13 @@ namespace ZRCola {
|
|||||||
///
|
///
|
||||||
virtual int compare_sort(_In_ const translation &a, _In_ const translation &b) const
|
virtual int compare_sort(_In_ const translation &a, _In_ const translation &b) const
|
||||||
{
|
{
|
||||||
if (a.chr < b.chr) return -1;
|
int r = translation::CompareString(a.data + a.com_start, a.data + a.com_end, b.data + b.com_start, b.data + b.com_end);
|
||||||
else if (a.chr > b.chr) return +1;
|
if (r != 0) return r;
|
||||||
|
|
||||||
if (a.rank < b.rank) return -1;
|
if (a.rank < b.rank) return -1;
|
||||||
else if (a.rank > b.rank) return +1;
|
else if (a.rank > b.rank) return +1;
|
||||||
|
|
||||||
int r = translation::CompareString(a.str, a.str_len, b.str, b.str_len);
|
r = translation::CompareString(a.data + a.dec_start, a.data + a.dec_end, b.data + b.dec_start, b.data + b.dec_end);
|
||||||
if (r != 0) return r;
|
if (r != 0) return r;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -94,3 +94,12 @@ bool ZRCola::langchar_db::IsLocalCharacter(_In_ wchar_t chr, _In_ ZRCola::langid
|
|||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool ZRCola::langchar_db::IsLocalCharacter(_In_ const wchar_t *chr, _In_ const wchar_t *chr_end, _In_ ZRCola::langid_t lang) const
{
    // The character is passed as the half-open range [chr, chr_end).
    // An empty range is a caller error: flag it in debug builds, and in
    // release builds (where assert compiles out) report "not local" instead
    // of dereferencing a pointer that is past the end of the range.
    assert(chr < chr_end);
    if (chr >= chr_end)
        return false;

    // TODO: Implement properly!
    // For now only the first UTF-16 code unit of the range is tested;
    // any following code units are ignored.
    return IsLocalCharacter(*chr, lang);
}
|
||||||
|
@ -19,6 +19,8 @@
|
|||||||
|
|
||||||
#include "stdafx.h"
|
#include "stdafx.h"
|
||||||
|
|
||||||
|
// Out-of-class definition of the static member translation::com_start.
// It is a single value shared by all translation records and is fixed at 0:
// the composed character is addressed as data[com_start, com_end), so it
// always begins at the first element of the record's data array.
unsigned __int16 ZRCola::translation_db::translation::com_start = 0;
|
||||||
|
|
||||||
|
|
||||||
void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input, _In_ size_t inputMax, _Out_ std::wstring &output, _Out_opt_ std::vector<mapping>* map) const
|
void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input, _In_ size_t inputMax, _Out_ std::wstring &output, _Out_opt_ std::vector<mapping>* map) const
|
||||||
{
|
{
|
||||||
@ -27,8 +29,7 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
|
|||||||
// Trim inputMax to actual length.
|
// Trim inputMax to actual length.
|
||||||
inputMax = inputMax != (size_t)-1 ? wcsnlen(input, inputMax) : wcslen(input);
|
inputMax = inputMax != (size_t)-1 ? wcsnlen(input, inputMax) : wcslen(input);
|
||||||
|
|
||||||
// Clear the output string and preallocate at least inputMax chars.
|
// Clear the output.
|
||||||
// Since composing is usually reducing the number of chars, memory reallocation is not expected later.
|
|
||||||
output.clear();
|
output.clear();
|
||||||
output.reserve(inputMax);
|
output.reserve(inputMax);
|
||||||
if (map)
|
if (map)
|
||||||
@ -49,7 +50,8 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
|
|||||||
// All compositions that get short on characters are lexically ordered before.
|
// All compositions that get short on characters are lexically ordered before.
|
||||||
// Thus the j-th character is considered 0.
|
// Thus the j-th character is considered 0.
|
||||||
const translation &trans = idxComp[m];
|
const translation &trans = idxComp[m];
|
||||||
wchar_t s = j < trans.str_len ? trans.str[j] : 0;
|
size_t jj = trans.dec_start + j;
|
||||||
|
wchar_t s = jj < trans.dec_end ? trans.data[jj] : 0;
|
||||||
|
|
||||||
// Do the bisection test.
|
// Do the bisection test.
|
||||||
if (c < s) r = m;
|
if (c < s) r = m;
|
||||||
@ -61,7 +63,8 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
|
|||||||
for (size_t rr = m; l < rr;) {
|
for (size_t rr = m; l < rr;) {
|
||||||
size_t m = (l + rr) / 2;
|
size_t m = (l + rr) / 2;
|
||||||
const translation &trans = idxComp[m];
|
const translation &trans = idxComp[m];
|
||||||
wchar_t s = j < trans.str_len ? trans.str[j] : 0;
|
size_t jj = trans.dec_start + j;
|
||||||
|
wchar_t s = jj < trans.dec_end ? trans.data[jj] : 0;
|
||||||
if (c <= s) rr = m; else l = m + 1;
|
if (c <= s) rr = m; else l = m + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -69,12 +72,13 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
|
|||||||
for (size_t ll = m + 1; ll < r;) {
|
for (size_t ll = m + 1; ll < r;) {
|
||||||
size_t m = (ll + r) / 2;
|
size_t m = (ll + r) / 2;
|
||||||
const translation &trans = idxComp[m];
|
const translation &trans = idxComp[m];
|
||||||
wchar_t s = j < trans.str_len ? trans.str[j] : 0;
|
size_t jj = trans.dec_start + j;
|
||||||
|
wchar_t s = jj < trans.dec_end ? trans.data[jj] : 0;
|
||||||
if (s <= c) ll = m + 1; else r = m;
|
if (s <= c) ll = m + 1; else r = m;
|
||||||
}
|
}
|
||||||
|
|
||||||
const translation &trans = idxComp[l];
|
const translation &trans = idxComp[l];
|
||||||
if (j + 1 == trans.str_len) {
|
if (trans.dec_start + j + 1 == trans.dec_end) {
|
||||||
// The first composition of the run was a match (thus far). Save it.
|
// The first composition of the run was a match (thus far). Save it.
|
||||||
l_match = l;
|
l_match = l;
|
||||||
}
|
}
|
||||||
@ -87,9 +91,9 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
|
|||||||
if (l_match < compositionsCount) {
|
if (l_match < compositionsCount) {
|
||||||
// The saved composition was an exact match.
|
// The saved composition was an exact match.
|
||||||
const translation &trans = idxComp[l_match];
|
const translation &trans = idxComp[l_match];
|
||||||
output += trans.chr;
|
output.append(trans.data + trans.com_start, trans.data + trans.com_end);
|
||||||
i += trans.str_len;
|
i += trans.dec_end - trans.dec_start;
|
||||||
if (trans.str_len > 1 && map) {
|
if (trans.dec_end - trans.dec_start != trans.com_end - trans.com_start && map) {
|
||||||
// Mapping changed.
|
// Mapping changed.
|
||||||
map->push_back(ZRCola::mapping(i, output.length()));
|
map->push_back(ZRCola::mapping(i, output.length()));
|
||||||
}
|
}
|
||||||
@ -109,59 +113,85 @@ void ZRCola::translation_db::Decompose(_In_z_count_(inputMax) const wchar_t* inp
|
|||||||
// Trim inputMax to actual length.
|
// Trim inputMax to actual length.
|
||||||
inputMax = inputMax != (size_t)-1 ? wcsnlen(input, inputMax) : wcslen(input);
|
inputMax = inputMax != (size_t)-1 ? wcsnlen(input, inputMax) : wcslen(input);
|
||||||
|
|
||||||
// Clear the output string and preallocate at least 2*inputMax chars.
|
// Clear the output.
|
||||||
// Since decomposition expands the string, let's keep our fingers crossed to avoid reallocation later.
|
|
||||||
output.clear();
|
output.clear();
|
||||||
output.reserve(inputMax * 2);
|
output.reserve(inputMax);
|
||||||
if (map)
|
if (map)
|
||||||
map->clear();
|
map->clear();
|
||||||
|
|
||||||
auto decompositionsCount = idxDecomp.size();
|
auto decompositionsCount = idxDecomp.size();
|
||||||
|
|
||||||
for (size_t i = 0; i < inputMax;) {
|
for (size_t i = 0; i < inputMax;) {
|
||||||
// Find whether the character can be decomposed.
|
// Find the longest matching decomposition at i-th character.
|
||||||
wchar_t c = input[i];
|
size_t l_match = (size_t)-1;
|
||||||
|
for (size_t l = 0, r = decompositionsCount, ii = i, j = 0; ii < inputMax && l < r; ii++, j++) {
|
||||||
for (size_t l = 0, r = decompositionsCount;; ) {
|
wchar_t c = input[ii];
|
||||||
if (l < r) {
|
while (l < r) {
|
||||||
|
// Test the decomposition in the middle of the search area.
|
||||||
size_t m = (l + r) / 2;
|
size_t m = (l + r) / 2;
|
||||||
|
|
||||||
|
// Get the j-th character of the decomposition.
|
||||||
|
// All decompositions that get short on characters are lexically ordered before.
|
||||||
|
// Thus the j-th character is considered 0.
|
||||||
const translation &trans = idxDecomp[m];
|
const translation &trans = idxDecomp[m];
|
||||||
wchar_t decompSrc = trans.chr;
|
size_t jj = trans.com_start + j;
|
||||||
if (c < decompSrc) r = m;
|
wchar_t s = jj < trans.com_end ? trans.data[jj] : 0;
|
||||||
else if (decompSrc < c) l = m + 1;
|
|
||||||
|
// Do the bisection test.
|
||||||
|
if (c < s) r = m;
|
||||||
|
else if (s < c) l = m + 1;
|
||||||
else {
|
else {
|
||||||
// Character found.
|
// Character found.
|
||||||
|
|
||||||
// Narrow the search area on the left to start at the first decomposition in the run (first by rank).
|
// Narrow the search area on the left to start at the first decomposition in the run.
|
||||||
for (size_t rr = m; l < rr;) {
|
for (size_t rr = m; l < rr;) {
|
||||||
size_t m = (l + rr) / 2;
|
size_t m = (l + rr) / 2;
|
||||||
const translation &trans = idxDecomp[m];
|
const translation &trans = idxDecomp[m];
|
||||||
wchar_t decompSrc = trans.chr;
|
size_t jj = trans.com_start + j;
|
||||||
if (c <= decompSrc) rr = m; else l = m + 1;
|
wchar_t s = jj < trans.com_end ? trans.data[jj] : 0;
|
||||||
|
if (c <= s) rr = m; else l = m + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Narrow the search area on the right to end at the first decomposition not in the run.
|
||||||
|
for (size_t ll = m + 1; ll < r;) {
|
||||||
|
size_t m = (ll + r) / 2;
|
||||||
|
const translation &trans = idxDecomp[m];
|
||||||
|
size_t jj = trans.com_start + j;
|
||||||
|
wchar_t s = jj < trans.com_end ? trans.data[jj] : 0;
|
||||||
|
if (s <= c) ll = m + 1; else r = m;
|
||||||
}
|
}
|
||||||
|
|
||||||
const translation &trans = idxDecomp[l];
|
const translation &trans = idxDecomp[l];
|
||||||
if (trans.str_len && trans.str[0] != L'#' && (!lc_db || !lc_db->IsLocalCharacter(c, lang))) {
|
if (trans.com_start + j + 1 == trans.com_end) {
|
||||||
// Append decomposed sequence.
|
// The first decomposition of the run was a match (thus far). Save it.
|
||||||
output.append(trans.str, trans.str_len);
|
l_match = l;
|
||||||
i++;
|
|
||||||
if (map) {
|
|
||||||
// Mapping changed.
|
|
||||||
map->push_back(ZRCola::mapping(i, output.length()));
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Character is inhibited to decompose.
|
|
||||||
output += c;
|
|
||||||
i++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
// Character not found.
|
|
||||||
output += c;
|
|
||||||
i++;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (l_match < decompositionsCount) {
|
||||||
|
// The saved decomposition was an exact match.
|
||||||
|
const translation &trans = idxDecomp[l_match];
|
||||||
|
if (trans.dec_start < trans.dec_end && trans.data[trans.dec_start] != L'#' && (!lc_db || !lc_db->IsLocalCharacter(trans.data + trans.com_start, trans.data + trans.com_end, lang))) {
|
||||||
|
// Append decomposed sequence.
|
||||||
|
output.append(trans.data + trans.dec_start, trans.data + trans.dec_end);
|
||||||
|
i += trans.com_end - trans.com_start;
|
||||||
|
if (trans.dec_end - trans.dec_start != trans.com_end - trans.com_start && map) {
|
||||||
|
// Mapping changed.
|
||||||
|
map->push_back(ZRCola::mapping(i, output.length()));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Decomposition is inhibited for this character: copy the composed sequence unchanged.
|
||||||
|
output.append(trans.data + trans.com_start, trans.data + trans.com_end);
|
||||||
|
i += trans.com_end - trans.com_start;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// The match was not found.
|
||||||
|
output += input[i];
|
||||||
|
i++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user