Indexes simplified to save space and add flexibility
This commit is contained in:
parent
4e5811803a
commit
0501a5c7ca
@ -35,7 +35,7 @@ inline std::ostream& operator <<(std::ostream& stream, const ZRCola::translation
|
|||||||
unsigned __int32 count;
|
unsigned __int32 count;
|
||||||
|
|
||||||
// Write index count.
|
// Write index count.
|
||||||
std::vector<ZRCola::translation_db::index>::size_type trans_count = t_db.idxComp.size();
|
std::vector<unsigned __int32>::size_type trans_count = t_db.idxComp.size();
|
||||||
#if defined(_WIN64) || defined(__x86_64__) || defined(__ppc64__)
|
#if defined(_WIN64) || defined(__x86_64__) || defined(__ppc64__)
|
||||||
// 4G check
|
// 4G check
|
||||||
if (trans_count > 0xffffffff) {
|
if (trans_count > 0xffffffff) {
|
||||||
@ -49,11 +49,11 @@ inline std::ostream& operator <<(std::ostream& stream, const ZRCola::translation
|
|||||||
|
|
||||||
// Write composition index.
|
// Write composition index.
|
||||||
if (stream.fail()) return stream;
|
if (stream.fail()) return stream;
|
||||||
stream.write((const char*)t_db.idxComp.data(), sizeof(ZRCola::translation_db::index)*count);
|
stream.write((const char*)t_db.idxComp.data(), sizeof(unsigned __int32)*count);
|
||||||
|
|
||||||
// Write decomposition index.
|
// Write decomposition index.
|
||||||
if (stream.fail()) return stream;
|
if (stream.fail()) return stream;
|
||||||
stream.write((const char*)t_db.idxDecomp.data(), sizeof(ZRCola::translation_db::index)*count);
|
stream.write((const char*)t_db.idxDecomp.data(), sizeof(unsigned __int32)*count);
|
||||||
|
|
||||||
// Write data count.
|
// Write data count.
|
||||||
std::vector<wchar_t>::size_type data_count = t_db.data.size();
|
std::vector<wchar_t>::size_type data_count = t_db.data.size();
|
||||||
@ -121,21 +121,15 @@ static inline int CompareBinary(const wchar_t *str_a, size_t count_a, const wcha
|
|||||||
///
|
///
|
||||||
static int __cdecl CompareCompositionIndex(void *data, const void *a, const void *b)
|
static int __cdecl CompareCompositionIndex(void *data, const void *a, const void *b)
|
||||||
{
|
{
|
||||||
const wchar_t
|
const ZRCola::translation_db::translation
|
||||||
*chr_a = (const wchar_t*)data + ((const ZRCola::translation_db::index*)a)->start,
|
&trans_a = (const ZRCola::translation_db::translation&)((const wchar_t*)data)[*(const unsigned __int32*)a],
|
||||||
*chr_b = (const wchar_t*)data + ((const ZRCola::translation_db::index*)b)->start;
|
&trans_b = (const ZRCola::translation_db::translation&)((const wchar_t*)data)[*(const unsigned __int32*)b];
|
||||||
const wchar_t
|
|
||||||
*str_a = chr_a + 1,
|
|
||||||
*str_b = chr_b + 1;
|
|
||||||
size_t
|
|
||||||
count_a = (const wchar_t*)data + ((const ZRCola::translation_db::index*)a)->end - str_a,
|
|
||||||
count_b = (const wchar_t*)data + ((const ZRCola::translation_db::index*)b)->end - str_b;
|
|
||||||
|
|
||||||
int r = CompareBinary(str_a, count_a, str_b, count_b);
|
int r = CompareBinary(trans_a.str, trans_a.str_len, trans_b.str, trans_b.str_len);
|
||||||
if (r != 0) return r;
|
if (r != 0) return r;
|
||||||
|
|
||||||
if (*chr_a < *chr_b) return -1;
|
if (trans_a.chr < trans_b.chr) return -1;
|
||||||
else if (*chr_a > *chr_b) return +1;
|
else if (trans_a.chr > trans_b.chr) return +1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -155,21 +149,14 @@ static int __cdecl CompareCompositionIndex(void *data, const void *a, const void
|
|||||||
///
|
///
|
||||||
static int __cdecl CompareDecompositionIndex(void *data, const void *a, const void *b)
|
static int __cdecl CompareDecompositionIndex(void *data, const void *a, const void *b)
|
||||||
{
|
{
|
||||||
const wchar_t
|
const ZRCola::translation_db::translation
|
||||||
*chr_a = (const wchar_t*)data + ((const ZRCola::translation_db::index*)a)->start,
|
&trans_a = (const ZRCola::translation_db::translation&)((const wchar_t*)data)[*(const unsigned __int32*)a],
|
||||||
*chr_b = (const wchar_t*)data + ((const ZRCola::translation_db::index*)b)->start;
|
&trans_b = (const ZRCola::translation_db::translation&)((const wchar_t*)data)[*(const unsigned __int32*)b];
|
||||||
|
|
||||||
if (*chr_a < *chr_b) return -1;
|
if (trans_a.chr < trans_b.chr) return -1;
|
||||||
else if (*chr_a > *chr_b) return +1;
|
else if (trans_a.chr > trans_b.chr) return +1;
|
||||||
|
|
||||||
const wchar_t
|
return CompareBinary(trans_a.str, trans_a.str_len, trans_b.str, trans_b.str_len);
|
||||||
*str_a = chr_a + 1,
|
|
||||||
*str_b = chr_b + 1;
|
|
||||||
size_t
|
|
||||||
count_a = (const wchar_t*)data + ((const ZRCola::translation_db::index*)a)->end - str_a,
|
|
||||||
count_b = (const wchar_t*)data + ((const ZRCola::translation_db::index*)b)->end - str_b;
|
|
||||||
|
|
||||||
return CompareBinary(str_a, count_a, str_b, count_b);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -267,12 +254,14 @@ int _tmain(int argc, _TCHAR *argv[])
|
|||||||
// Read translation from the database.
|
// Read translation from the database.
|
||||||
if (src.GetTranslation(rs, trans)) {
|
if (src.GetTranslation(rs, trans)) {
|
||||||
// Add translation to index and data.
|
// Add translation to index and data.
|
||||||
ZRCola::translation_db::index ti;
|
unsigned __int32 ti;
|
||||||
ti.start = t_db.data.size();
|
ti = t_db.data.size();
|
||||||
t_db.data.push_back(trans.chr);
|
t_db.data.push_back(trans.chr);
|
||||||
for (std::wstring::size_type i = 0, n = trans.str.length(); i < n; i++)
|
std::wstring::size_type n = trans.str.length();
|
||||||
|
wxASSERT_MSG(n <= 0xffff, wxT("transformation string too long"));
|
||||||
|
t_db.data.push_back((wchar_t)n);
|
||||||
|
for (std::wstring::size_type i = 0; i < n; i++)
|
||||||
t_db.data.push_back(trans.str[i]);
|
t_db.data.push_back(trans.str[i]);
|
||||||
ti.end = t_db.data.size();
|
|
||||||
t_db.idxComp .push_back(ti);
|
t_db.idxComp .push_back(ti);
|
||||||
t_db.idxDecomp.push_back(ti);
|
t_db.idxDecomp.push_back(ti);
|
||||||
} else
|
} else
|
||||||
@ -282,8 +271,8 @@ int _tmain(int argc, _TCHAR *argv[])
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Sort indices.
|
// Sort indices.
|
||||||
qsort_s(t_db.idxComp .data(), trans_count, sizeof(ZRCola::translation_db::index), CompareCompositionIndex , t_db.data.data());
|
qsort_s(t_db.idxComp .data(), trans_count, sizeof(unsigned __int32), CompareCompositionIndex , t_db.data.data());
|
||||||
qsort_s(t_db.idxDecomp.data(), trans_count, sizeof(ZRCola::translation_db::index), CompareDecompositionIndex, t_db.data.data());
|
qsort_s(t_db.idxDecomp.data(), trans_count, sizeof(unsigned __int32), CompareDecompositionIndex, t_db.data.data());
|
||||||
|
|
||||||
// Write translations to file.
|
// Write translations to file.
|
||||||
dst << ZRCola::translation_rec(t_db);
|
dst << ZRCola::translation_rec(t_db);
|
||||||
|
@ -36,22 +36,6 @@ namespace ZRCola {
|
|||||||
///
|
///
|
||||||
class ZRCOLA_API translation_db {
|
class ZRCOLA_API translation_db {
|
||||||
public:
|
public:
|
||||||
#pragma pack(push)
|
|
||||||
#pragma pack(4)
|
|
||||||
///
|
|
||||||
/// Translation index
|
|
||||||
///
|
|
||||||
struct index {
|
|
||||||
unsigned __int32 start; ///< Composed character offset
|
|
||||||
unsigned __int32 end; ///< Decomposed string end offset
|
|
||||||
|
|
||||||
///
|
|
||||||
/// Returns translation string length
|
|
||||||
///
|
|
||||||
inline unsigned __int32 GetStrLength() const { return end - (start + 1); }
|
|
||||||
};
|
|
||||||
#pragma pack(pop)
|
|
||||||
|
|
||||||
#pragma pack(push)
|
#pragma pack(push)
|
||||||
#pragma pack(2)
|
#pragma pack(2)
|
||||||
#pragma warning(push)
|
#pragma warning(push)
|
||||||
@ -60,15 +44,16 @@ namespace ZRCola {
|
|||||||
/// Translation data
|
/// Translation data
|
||||||
///
|
///
|
||||||
struct translation {
|
struct translation {
|
||||||
wchar_t chr; ///< Composed character
|
wchar_t chr; ///< Composed character
|
||||||
wchar_t str[]; ///< Decomposed string
|
unsigned __int16 str_len; ///< \c str length (in characters)
|
||||||
|
wchar_t str[]; ///< Decomposed string
|
||||||
};
|
};
|
||||||
#pragma warning(pop)
|
#pragma warning(pop)
|
||||||
#pragma pack(pop)
|
#pragma pack(pop)
|
||||||
|
|
||||||
std::vector<index> idxComp; ///< Composition index
|
std::vector<unsigned __int32> idxComp; ///< Composition index
|
||||||
std::vector<index> idxDecomp; ///< Decomposition index
|
std::vector<unsigned __int32> idxDecomp; ///< Decomposition index
|
||||||
std::vector<wchar_t> data; ///< Transformation data
|
std::vector<wchar_t> data; ///< Transformation data
|
||||||
|
|
||||||
public:
|
public:
|
||||||
///
|
///
|
||||||
@ -118,12 +103,12 @@ inline std::istream& operator >>(_In_ std::istream& stream, _Out_ ZRCola::transl
|
|||||||
|
|
||||||
// Read composition index.
|
// Read composition index.
|
||||||
t_db.idxComp.resize(count);
|
t_db.idxComp.resize(count);
|
||||||
stream.read((char*)t_db.idxComp.data(), sizeof(ZRCola::translation_db::index)*count);
|
stream.read((char*)t_db.idxComp.data(), sizeof(unsigned __int32)*count);
|
||||||
if (!stream.good()) return stream;
|
if (!stream.good()) return stream;
|
||||||
|
|
||||||
// Read decomposition index.
|
// Read decomposition index.
|
||||||
t_db.idxDecomp.resize(count);
|
t_db.idxDecomp.resize(count);
|
||||||
stream.read((char*)t_db.idxDecomp.data(), sizeof(ZRCola::translation_db::index)*count);
|
stream.read((char*)t_db.idxDecomp.data(), sizeof(unsigned __int32)*count);
|
||||||
if (!stream.good()) return stream;
|
if (!stream.good()) return stream;
|
||||||
|
|
||||||
// Read data count.
|
// Read data count.
|
||||||
|
@ -34,7 +34,7 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
|
|||||||
if (map)
|
if (map)
|
||||||
map->clear();
|
map->clear();
|
||||||
|
|
||||||
std::vector<index>::size_type compositionsCount = idxComp.size();
|
std::vector<unsigned __int32>::size_type compositionsCount = idxComp.size();
|
||||||
|
|
||||||
for (size_t i = 0; i < inputMax;) {
|
for (size_t i = 0; i < inputMax;) {
|
||||||
// Start with the full search area at i-th character.
|
// Start with the full search area at i-th character.
|
||||||
@ -49,7 +49,8 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
|
|||||||
// Get the j-th character of the composition.
|
// Get the j-th character of the composition.
|
||||||
// All compositions that get short on characters are lexically ordered before.
|
// All compositions that get short on characters are lexically ordered before.
|
||||||
// Thus the j-th character is considered 0.
|
// Thus the j-th character is considered 0.
|
||||||
wchar_t s = j < idxComp[m].GetStrLength() ? ((translation*)&data[idxComp[m].start])->str[j] : 0;
|
const translation &trans = (const translation&)data[idxComp[m]];
|
||||||
|
wchar_t s = j < trans.str_len ? trans.str[j] : 0;
|
||||||
|
|
||||||
// Do the bisection test.
|
// Do the bisection test.
|
||||||
if (c < s) r = m;
|
if (c < s) r = m;
|
||||||
@ -60,14 +61,16 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
|
|||||||
// Narrow the search area on the left to start at the first composition in the run.
|
// Narrow the search area on the left to start at the first composition in the run.
|
||||||
for (size_t rr = m; l < rr;) {
|
for (size_t rr = m; l < rr;) {
|
||||||
size_t m = (l + rr) / 2;
|
size_t m = (l + rr) / 2;
|
||||||
wchar_t s = j < idxComp[m].GetStrLength() ? ((translation*)&data[idxComp[m].start])->str[j] : 0;
|
const translation &trans = (const translation&)data[idxComp[m]];
|
||||||
|
wchar_t s = j < trans.str_len ? trans.str[j] : 0;
|
||||||
if (c <= s) rr = m; else l = m + 1;
|
if (c <= s) rr = m; else l = m + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Narrow the search area on the right to end at the first composition not in the run.
|
// Narrow the search area on the right to end at the first composition not in the run.
|
||||||
for (size_t ll = m + 1; ll < r;) {
|
for (size_t ll = m + 1; ll < r;) {
|
||||||
size_t m = (ll + r) / 2;
|
size_t m = (ll + r) / 2;
|
||||||
wchar_t s = j < idxComp[m].GetStrLength() ? ((translation*)&data[idxComp[m].start])->str[j] : 0;
|
const translation &trans = (const translation&)data[idxComp[m]];
|
||||||
|
wchar_t s = j < trans.str_len ? trans.str[j] : 0;
|
||||||
if (s <= c) ll = m + 1; else r = m;
|
if (s <= c) ll = m + 1; else r = m;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -77,9 +80,10 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
|
|||||||
|
|
||||||
if (l >= r) {
|
if (l >= r) {
|
||||||
// The search area is empty.
|
// The search area is empty.
|
||||||
if (j && l_prev < compositionsCount && j == idxComp[l_prev].GetStrLength()) {
|
const translation &trans = (const translation&)data[idxComp[l_prev]];
|
||||||
|
if (j && l_prev < compositionsCount && j == trans.str_len) {
|
||||||
// The first composition of the previous run was a match.
|
// The first composition of the previous run was a match.
|
||||||
output += ((translation*)&data[idxComp[l_prev].start])->chr;
|
output += trans.chr;
|
||||||
i = ii;
|
i = ii;
|
||||||
if (j > 1 && map) {
|
if (j > 1 && map) {
|
||||||
// Mapping changed.
|
// Mapping changed.
|
||||||
@ -95,9 +99,10 @@ void ZRCola::translation_db::Compose(_In_z_count_(inputMax) const wchar_t* input
|
|||||||
} else {
|
} else {
|
||||||
// End of input reached.
|
// End of input reached.
|
||||||
|
|
||||||
if (l < compositionsCount && j == idxComp[l].GetStrLength()) {
|
const translation &trans = (const translation&)data[idxComp[l]];
|
||||||
|
if (l < compositionsCount && j == trans.str_len) {
|
||||||
// The first composition of the previous run was a match.
|
// The first composition of the previous run was a match.
|
||||||
output += ((translation*)&data[idxComp[l].start])->chr;
|
output += trans.chr;
|
||||||
i = ii;
|
i = ii;
|
||||||
if (j > 1 && map) {
|
if (j > 1 && map) {
|
||||||
// Mapping changed.
|
// Mapping changed.
|
||||||
@ -129,7 +134,7 @@ void ZRCOLA_API ZRCola::translation_db::Decompose(_In_z_count_(inputMax) const w
|
|||||||
if (map)
|
if (map)
|
||||||
map->clear();
|
map->clear();
|
||||||
|
|
||||||
std::vector<index>::size_type decompositionsCount = idxDecomp.size();
|
std::vector<unsigned __int32>::size_type decompositionsCount = idxDecomp.size();
|
||||||
|
|
||||||
for (size_t i = 0; i < inputMax;) {
|
for (size_t i = 0; i < inputMax;) {
|
||||||
// Find whether the character can be decomposed.
|
// Find whether the character can be decomposed.
|
||||||
@ -138,12 +143,13 @@ void ZRCOLA_API ZRCola::translation_db::Decompose(_In_z_count_(inputMax) const w
|
|||||||
for (size_t l = 0, r = decompositionsCount;; ) {
|
for (size_t l = 0, r = decompositionsCount;; ) {
|
||||||
if (l < r) {
|
if (l < r) {
|
||||||
size_t m = (l + r) / 2;
|
size_t m = (l + r) / 2;
|
||||||
wchar_t decompSrc = ((translation*)&data[idxDecomp[m].start])->chr;
|
const translation &trans = (const translation&)data[idxDecomp[m]];
|
||||||
|
wchar_t decompSrc = trans.chr;
|
||||||
if (c < decompSrc) r = m;
|
if (c < decompSrc) r = m;
|
||||||
else if (decompSrc < c) l = m + 1;
|
else if (decompSrc < c) l = m + 1;
|
||||||
else {
|
else {
|
||||||
// Character found.
|
// Character found.
|
||||||
output.append(((translation*)&data[idxDecomp[m].start])->str, idxDecomp[m].GetStrLength());
|
output.append(trans.str, trans.str_len);
|
||||||
i++;
|
i++;
|
||||||
if (map) {
|
if (map) {
|
||||||
// Mapping changed.
|
// Mapping changed.
|
||||||
|
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user