ZRCola/lib/libZRCola/src/translate.cpp

184 lines
7.2 KiB
C++

/*
Copyright 2015-2018 Amebis
This file is part of ZRCola.
ZRCola is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
ZRCola is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with ZRCola. If not, see <http://www.gnu.org/licenses/>.
*/
#include "stdafx.h"
///
/// Translates text using the translations of one translation set.
///
/// At every input position the longest source sequence found in the set is
/// replaced by its destination sequence; characters with no matching
/// translation are copied to the output unchanged.
///
/// \param[in]  set       Translation set to search
/// \param[in]  input     Text to translate (may be NULL only when \p inputMax is 0)
/// \param[in]  inputMax  Maximum number of characters of \p input to read; (size_t)-1 reads up to the zero terminator
/// \param[out] output    Receives the translated text; any previous content is discarded
/// \param[out] map       When not NULL, it is cleared and then receives one (input offset, output offset) pair
///                       after every applied translation whose source and destination lengths differ
///
void ZRCola::translation_db::Translate(_In_ transetid_t set, _In_z_count_(inputMax) const wchar_t* input, _In_ size_t inputMax, _Out_ std::wstring &output, _Out_opt_ std::vector<mapping>* map) const
{
    assert(input || inputMax == 0);

    // Clamp inputMax to the real length of the text.
    if (inputMax != static_cast<size_t>(-1))
        inputMax = wcsnlen(input, inputMax);
    else
        inputMax = wcslen(input);

    // Start from a clean slate.
    output.clear();
    output.reserve(inputMax);
    if (map)
        map->clear();

    // Bracket the part of the source index that belongs to the requested set.
    indexSrc::size_type set_begin, set_end;
    idxSrc.find(translation(set), set_begin);
    idxSrc.find(translation(set + 1), set_end);

    size_t pos = 0;
    while (pos < inputMax) {
        // Look for the longest translation whose source starts at pos.
        // [lo, hi) is the range of translations still agreeing with the input;
        // it shrinks with every additional character (depth) examined.
        size_t best = static_cast<size_t>(-1);
        size_t lo = set_begin, hi = set_end;
        for (size_t depth = 0; pos + depth < inputMax && lo < hi; depth++) {
            const wchar_t ch = input[pos + depth];

            while (lo < hi) {
                // Probe the translation in the middle of the candidate range.
                const size_t mid = (lo + hi) / 2;

                // Translations that ran out of characters are ordered first,
                // as their depth-th character is considered 0.
                const wchar_t probe = idxSrc[mid].src_at(depth);

                if (ch < probe) {
                    hi = mid;
                    continue;
                }
                if (probe < ch) {
                    lo = mid + 1;
                    continue;
                }

                // Hit: mid lies somewhere inside the run of translations having ch at depth.
                // Move lo to the first translation of that run...
                size_t run_first = mid;
                while (lo < run_first) {
                    const size_t k = (lo + run_first) / 2;
                    if (idxSrc[k].src_at(depth) < ch)
                        lo = k + 1;
                    else
                        run_first = k;
                }

                // ...and hi to the first translation past the run.
                size_t run_past = mid + 1;
                while (run_past < hi) {
                    const size_t k = (run_past + hi) / 2;
                    if (ch < idxSrc[k].src_at(depth))
                        hi = k;
                    else
                        run_past = k + 1;
                }

                // The shortest candidate leads the run. When it ends exactly here,
                // it is a complete match; remember it (a longer one may still replace it).
                if (idxSrc[lo].src_len() == depth + 1)
                    best = lo;

                break;
            }
        }

        if (best >= set_end) {
            // Nothing matched: pass the character through.
            output += input[pos];
            pos++;
            continue;
        }

        // Emit the destination of the remembered translation and skip its source.
        const translation &hit = idxSrc[best];
        output.append(hit.dst(), hit.dst_end());
        pos += hit.src_len();
        if (map && hit.src_len() != hit.dst_len()) {
            // Input and output offsets drifted apart: record the new pair.
            map->push_back(ZRCola::mapping(pos, output.length()));
        }
    }
}
///
/// Inverse-translates text using the translations of one translation set.
///
/// At every input position the longest destination sequence found in the set is
/// replaced by its source sequence, unless the inverse translation is inhibited.
/// It is inhibited when the source sequence is empty, when it starts with L'#', or
/// when \p lc_db is given and its IsLocalCharacter() returns true for the destination
/// sequence and \p lang. Inhibited sequences and characters with no matching
/// translation are copied to the output unchanged.
///
/// \param[in]  set       Translation set to search
/// \param[in]  input     Text to inverse-translate (may be NULL only when \p inputMax is 0)
/// \param[in]  inputMax  Maximum number of characters of \p input to read; (size_t)-1 reads up to the zero terminator
/// \param[in]  lc_db     Language character database consulted for inhibition; NULL skips this check
/// \param[in]  lang      Language passed to \p lc_db
/// \param[out] output    Receives the resulting text; any previous content is discarded
/// \param[out] map       When not NULL, it is cleared and then receives one (input offset, output offset) pair
///                       after every applied inverse translation whose destination and source lengths differ
///
void ZRCola::translation_db::TranslateInv(_In_ transetid_t set, _In_z_count_(inputMax) const wchar_t* input, _In_ size_t inputMax, _In_ const langchar_db *lc_db, _In_ langid_t lang, _Out_ std::wstring &output, _Out_opt_ std::vector<mapping>* map) const
{
    assert(input || inputMax == 0);

    // Clamp inputMax to the real length of the text.
    if (inputMax != static_cast<size_t>(-1))
        inputMax = wcsnlen(input, inputMax);
    else
        inputMax = wcslen(input);

    // Start from a clean slate.
    output.clear();
    output.reserve(inputMax);
    if (map)
        map->clear();

    // Bracket the part of the destination index that belongs to the requested set.
    indexDst::size_type set_begin, set_end;
    idxDst.find(translation(set), set_begin);
    idxDst.find(translation(set + 1), set_end);

    size_t pos = 0;
    while (pos < inputMax) {
        // Look for the longest inverse translation whose destination starts at pos.
        // [lo, hi) is the range of inverse translations still agreeing with the input;
        // it shrinks with every additional character (depth) examined.
        size_t best = static_cast<size_t>(-1);
        size_t lo = set_begin, hi = set_end;
        for (size_t depth = 0; pos + depth < inputMax && lo < hi; depth++) {
            const wchar_t ch = input[pos + depth];

            while (lo < hi) {
                // Probe the inverse translation in the middle of the candidate range.
                const size_t mid = (lo + hi) / 2;

                // Inverse translations that ran out of characters are ordered first,
                // as their depth-th character is considered 0.
                const wchar_t probe = idxDst[mid].dst_at(depth);

                if (ch < probe) {
                    hi = mid;
                    continue;
                }
                if (probe < ch) {
                    lo = mid + 1;
                    continue;
                }

                // Hit: mid lies somewhere inside the run of inverse translations having ch at depth.
                // Move lo to the first inverse translation of that run...
                size_t run_first = mid;
                while (lo < run_first) {
                    const size_t k = (lo + run_first) / 2;
                    if (idxDst[k].dst_at(depth) < ch)
                        lo = k + 1;
                    else
                        run_first = k;
                }

                // ...and hi to the first inverse translation past the run.
                size_t run_past = mid + 1;
                while (run_past < hi) {
                    const size_t k = (run_past + hi) / 2;
                    if (ch < idxDst[k].dst_at(depth))
                        hi = k;
                    else
                        run_past = k + 1;
                }

                // The shortest candidate leads the run. When it ends exactly here,
                // it is a complete match; remember it (a longer one may still replace it).
                if (idxDst[lo].dst_len() == depth + 1)
                    best = lo;

                break;
            }
        }

        if (best >= set_end) {
            // Nothing matched: pass the character through.
            output += input[pos];
            pos++;
            continue;
        }

        const translation &hit = idxDst[best];

        // Decide whether this sequence must be kept as it is.
        const bool inhibited =
            !hit.src_len() ||
            hit.src()[0] == L'#' ||
            (lc_db && lc_db->IsLocalCharacter(hit.dst(), hit.dst_end(), lang));

        // Either way, the matched destination sequence is consumed from the input.
        pos += hit.dst_len();

        if (inhibited) {
            // Inverse translation is inhibited: copy the matched sequence unchanged.
            output.append(hit.dst(), hit.dst_end());
        } else {
            // Emit the source sequence in place of the destination one.
            output.append(hit.src(), hit.src_end());
            if (map && hit.dst_len() != hit.src_len()) {
                // Input and output offsets drifted apart: record the new pair.
                map->push_back(ZRCola::mapping(pos, output.length()));
            }
        }
    }
}