Build Unicode Combining Characters evaluation CSV with ZRColaCompile

Signed-off-by: Simon Rozman <simon@rozman.si>
This commit is contained in:
Simon Rozman 2021-04-02 15:34:52 +02:00
parent bc4b6b30d1
commit c4f2726256
3 changed files with 64 additions and 175 deletions

View File

@ -131,8 +131,12 @@ namespace ZRCola {
charseq src; ///< Source sequence
std::string norm; ///< Normalization footprint
charseq dst; ///< Destination sequence
double score; ///< Score
inline translation() : set((int)ZRCOLA_TRANSEQID_DEFAULT) {}
inline translation() :
set((int)ZRCOLA_TRANSEQID_DEFAULT),
score(0)
{}
};

View File

@ -269,6 +269,15 @@ static double compare_bitmaps(
}
///
/// Formats a wide string as a list of hexadecimal character codes
///
/// Every element of \p str is printed as an uppercase hexadecimal number padded to at least four digits.
/// Consecutive elements are separated by "+".
///
/// \param[in] str  String to format
///
/// \returns Formatted list; empty when \p str is empty
///
static string make_unicode(_In_ const wstring &str)
{
    string result;
    bool is_first = true;
    for (const wchar_t chr : str) {
        // Only elements after the first one get the "+" separator.
        result += string_printf(is_first ? "%04X" : "+%04X", chr);
        is_first = false;
    }
    return result;
}
///
/// Main function
///
@ -344,6 +353,9 @@ int _tmain(int argc, _TCHAR *argv[])
bool build_pot = parser.GetParamCount() > 2;
set<wstring> pot;
bool build_csv = parser.GetParamCount() > 3;
vector<ZRCola::DBSource::translation> csv;
// Open file ID.
streamoff dst_start = idrec::open<ZRCola::recordid_t, ZRCola::recordsize_t>(dst, ZRCOLA_DB_ID);
@ -621,18 +633,18 @@ int _tmain(int argc, _TCHAR *argv[])
// Add results to a temporary database.
auto hit = trans.find(comp_orig);
if (hit != trans.end()) {
if (score_pre <= FONT_MATCH_THRESHOLD) {
if (build_csv || score_pre <= FONT_MATCH_THRESHOLD) {
if (hit->second.find(comp_pre) == hit->second.end())
hit->second.insert(make_pair(comp_pre, make_pair(score_pre, 1)));
} if (score_comb <= FONT_MATCH_THRESHOLD && comp_pre != comp_comb) {
} if ((build_csv || score_comb <= FONT_MATCH_THRESHOLD) && comp_pre != comp_comb) {
if (hit->second.find(comp_comb) == hit->second.end())
hit->second.insert(make_pair(comp_comb, make_pair(score_comb, 100)));
}
} else {
map<wstring, pair<double, int>> v;
if (score_pre <= FONT_MATCH_THRESHOLD)
if (build_csv || score_pre <= FONT_MATCH_THRESHOLD)
v.insert(make_pair(comp_pre, make_pair(score_pre, 1)));
if (score_comb <= FONT_MATCH_THRESHOLD && comp_pre != comp_comb)
if ((build_csv || score_comb <= FONT_MATCH_THRESHOLD) && comp_pre != comp_comb)
v.insert(make_pair(comp_comb, make_pair(score_comb, 100)));
if (!v.empty())
trans.insert(make_pair(comp_orig, std::move(v)));
@ -643,10 +655,12 @@ int _tmain(int argc, _TCHAR *argv[])
}
// Preallocate memory.
size_t reserve = db_trans.idxSrc.size() + trans.size();
size_t reserve = db_trans.idxSrc.size() + trans.size()*2;
db_trans.idxSrc.reserve(reserve);
db_trans.idxDst.reserve(reserve);
db_trans.data .reserve(reserve*5);
if (build_csv)
csv.reserve(trans.size()*2);
ZRCola::DBSource::translation t;
t.set = (int)ZRCOLA_TRANSEQID_UNICODE;
@ -664,7 +678,10 @@ int _tmain(int argc, _TCHAR *argv[])
t.src.str = i->first;
t.src.rank = j->second.second + (j->second.second >= 100 ? rank_comb++ : rank_pre++);
t.dst.str = j->second.first;
t.score = j->first;
db_trans << t;
if (build_csv)
csv.push_back(t);
}
}
} else {
@ -1228,6 +1245,43 @@ int _tmain(int argc, _TCHAR *argv[])
}
}
if (!has_errors && build_csv) {
const wxString& filenameCsv = parser.GetParam(3);
fstream dst_csv((LPCTSTR)filenameCsv, ios_base::out | ios_base::trunc);
if (dst_csv.good()) {
dst_csv
<< "\xef\xbb\xbf" // UTF-8 BOM
<< "\"znak\";"
<< "\"znakZRCola\";"
<< "\"znakRank\";"
<< "\"komb\";"
<< "\"kombZRCola\";"
<< "\"kombRank\";"
<< "\"razlika\"" << endl;
wstring_convert<codecvt_utf8<wchar_t>> conv;
for (auto i = csv.cbegin(), i_end = csv.cend(); i != i_end; ++i) {
dst_csv
<< "\"" << make_unicode(i->src.str) << "\";"
<< "\"" << conv.to_bytes(i->src.str) << "\";"
<< i->src.rank << ";"
<< "\"" << make_unicode(i->dst.str) << "\";"
<< "\"" << conv.to_bytes(i->dst.str) << "\";"
<< i->dst.rank << ";"
<< i->score << endl;
}
if (dst_csv.fail()) {
_ftprintf(stderr, wxT("%s: error ZCC0013: Writing to CSV report failed.\n"), (LPCTSTR)filenameOut.c_str());
has_errors = true;
}
dst_csv.close();
} else {
_ftprintf(stderr, wxT("%s: error ZCC0012: Error opening CSV report.\n"), filenameOut.fn_str());
has_errors = true;
}
}
if (has_errors) {
dst.close();
wxRemoveFile(filenameOut);

View File

@ -1,169 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
Copyright © 2021 Amebis
This file is part of ZRCola.
ZRCola is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
ZRCola is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with ZRCola. If not, see <http://www.gnu.org/licenses/>.
-->
<package>
<job id="Analyse">
<runtime>
<description>Unicode Combining Character Analysis - Amebis, Copyright © 2021</description>
<unnamed name="&lt;ZRCola.mdb&gt;" required="true" helpstring="ZRCola database"/>
<unnamed name="&lt;results.csv&gt;" required="true" helpstring="Output file with analysis results"/>
</runtime>
<reference object="ADODB.Connection"/>
<reference object="ADODB.Command"/>
<reference object="ADODB.Recordset"/>
<reference object="Scripting.FileSystemObject"/>
<script language="JScript"><![CDATA[
// At least two unnamed arguments are needed (they are read below as the database path and the output path).
// With fewer, show the usage text and quit with exit code 1.
if (WScript.Arguments.Unnamed.Length < 2) {
WScript.Arguments.ShowUsage();
WScript.Quit(1);
}
// Precompiled separator used by fromUni(): a "+" with optional surrounding whitespace.
var parseUnicode_stat = {
    "re_separator" : new RegExp("\\s*\\+\\s*", "g")
};

// Converts a "+"-separated list of hexadecimal code points (e.g. "0041+0301") to a string.
//
// str     - list of hexadecimal code points; whitespace around "+" is allowed
// returns - the decoded string
//
// Code points above U+FFFF are written as UTF-16 surrogate pairs, because
// String.fromCharCode() keeps only the low 16 bits of each argument.
// Tokens that do not parse as a hexadecimal number are skipped instead of
// being turned into U+0000.
function fromUni(str)
{
    var result = "";
    var a = str.split(parseUnicode_stat.re_separator);
    // Index loop instead of for-in: for-in also visits inherited enumerable
    // properties and does not guarantee that elements come in index order.
    for (var i = 0; i < a.length; i++) {
        var code = parseInt(a[i], 16);
        if (isNaN(code))
            continue;
        if (code > 0xffff) {
            // Split a supplementary-plane code point to high and low surrogate.
            code -= 0x10000;
            result += String.fromCharCode(0xd800 + (code >> 10), 0xdc00 + (code & 0x3ff));
        } else
            result += String.fromCharCode(code);
    }
    return result;
}
// Formats a string as a "+"-separated list of its character codes (as returned by charCodeAt()),
// each written in uppercase hexadecimal and left-padded with zeros to four digits.
//
// str     - string to format
// returns - the formatted list; empty for an empty string
function toUni(str)
{
    var units = [];
    for (var pos = 0; pos < str.length; pos++) {
        var hex = str.charCodeAt(pos).toString(16).toUpperCase();
        while (hex.length < 4)
            hex = "0" + hex;
        units.push(hex);
    }
    return units.join("+");
}
// Precompiled pattern used by escapeRegExp(): matches any single regular-expression metacharacter.
var escapeRegExp_stat = {};
escapeRegExp_stat["re_specialChar"] = new RegExp("[.*+?^${}()|[\\]\\\\]", "g");

// Returns str with a backslash inserted before every regular-expression
// metacharacter, so the result can be used as a pattern matching str literally.
function escapeRegExp(str)
{
    var specialChars = escapeRegExp_stat.re_specialChar;
    var escaped = str.replace(specialChars, "\\$&");
    return escaped;
}
// Precompiled pattern used by escapeCSV(): matches every double quote.
var escapeCSV_stat = {};
escapeCSV_stat["re_quote"] = new RegExp("\"", "g");

// Returns str with each double quote doubled, which makes the text safe to
// place between the surrounding quotes of a CSV field.
function escapeCSV(str)
{
    var quotes = escapeCSV_stat.re_quote;
    var escaped = str.replace(quotes, "\"\"");
    return escaped;
}
// Open ZRCola database.
var dbPath = WScript.Arguments.Unnamed(0);
var outputPath = WScript.Arguments.Unnamed(1);
var db = WScript.CreateObject("ADODB.Connection");
db.Open("Driver={Microsoft Access Driver (*.mdb)};Dbq=" + dbPath + ";Uid=;Pwd=;");
try {
    // Create the output file.
    var
        fso = WScript.CreateObject("Scripting.FileSystemObject"),
        f = fso.CreateTextFile(outputPath, true, true);
    try {
        // Build a dictionary of all compositions and known combining characters.
        // The query order is kept in both arrays, and the replacement passes below apply the entries in that order.
        var zrcola = [], combining = [];
        var rs = WScript.CreateObject("ADODB.Recordset");
        rs.CursorLocation = adUseClient;
        rs.Open("SELECT [komb], [znak] FROM [VRS_ReplChar] ORDER BY [rang_komb], LEN([komb]) DESC", db, adOpenDynamic, adLockOptimistic, adCmdText);
        try {
            for (; !rs.EOF; rs.MoveNext()) {
                var
                    decomposed = fromUni(rs("komb").Value),
                    composed = fromUni(rs("znak").Value);
                zrcola.push({
                    "decomposed" : decomposed,
                    "re_decomposed" : new RegExp(escapeRegExp(decomposed), "g"),
                    "composed" : composed
                });
                // Decompositions starting with U+203F are also registered, without that leading character, in the combining list.
                if (decomposed.charCodeAt(0) == 0x203f)
                    combining.push({
                        "decomposed" : decomposed.substring(1),
                        "re_decomposed" : new RegExp(escapeRegExp(decomposed.substring(1)), "g"),
                        "composed" : composed
                    });
            }
        } finally {
            rs.Close();
        }
        // Write the header row.
        f.WriteLine(
            "\"compOrig\"" + "\t" + "\"compOrigZRCOLA\"" + "\t" +
            "\"decompOrig\"" + "\t" + "\"decompOrigZRCOLA\"" + "\t" +
            "\"comp\"" + "\t" + "\"compZRCOLA\"" + "\t" +
            "\"decomp\"" + "\t" + "\"decompZRCOLA\"" + "\t");
        // Traverse all characters in PUA and their decompositions and try to replace as much decompositions as possible with combining characters.
        rs.Open("SELECT [VRS_CharList].[znak] AS [znak], [VRS_ReplChar].[komb] AS [komb] " +
            "FROM [VRS_CharList] RIGHT JOIN [VRS_ReplChar] ON [VRS_CharList].[znak]=[VRS_ReplChar].[znak] "+
            "WHERE [VRS_CharList].[znak]>='E000' AND [VRS_CharList].[znak]<='F8FF' "+
            "ORDER BY [VRS_CharList].[znak]", db, adOpenDynamic, adLockOptimistic, adCmdText);
        try {
            for (; !rs.EOF; rs.MoveNext()) {
                var
                    compOrig = fromUni(rs("znak").Value),
                    decompOrig = fromUni(rs("komb").Value),
                    decomp = decompOrig;
                // Index loops instead of for-in: each replacement works on the result of the previous one,
                // so the entries must be applied strictly in array order, which for-in does not guarantee.
                for (var i = 0; i < combining.length; i++)
                    decomp = decomp.replace(combining[i].re_decomposed, combining[i].composed);
                var comp = decomp;
                for (var j = 0; j < zrcola.length; j++)
                    comp = comp.replace(zrcola[j].re_decomposed, zrcola[j].composed);
                f.WriteLine(
                    "\"" + escapeCSV(toUni(compOrig  )) + "\"\t\"" + escapeCSV(compOrig  ) + "\"\t"+
                    "\"" + escapeCSV(toUni(decompOrig)) + "\"\t\"" + escapeCSV(decompOrig) + "\"\t"+
                    "\"" + escapeCSV(toUni(comp      )) + "\"\t\"" + escapeCSV(comp      ) + "\"\t"+
                    "\"" + escapeCSV(toUni(decomp    )) + "\"\t\"" + escapeCSV(decomp    ) + "\"\t");
            }
        } finally {
            rs.Close();
        }
    } finally {
        f.Close();
    }
} catch (err) {
    // In case of error, delete the output file.
    try { fso.DeleteFile(outputPath); } catch (err2) {}
    throw err;
} finally {
    db.Close();
}
WScript.Quit(0);
]]></script>
</job>
</package>