Build Unicode Combining Characters evaluation CSV with ZRColaCompile
Signed-off-by: Simon Rozman <simon@rozman.si>
This commit is contained in:
parent
bc4b6b30d1
commit
c4f2726256
@ -131,8 +131,12 @@ namespace ZRCola {
|
||||
charseq src; ///< Source sequence
|
||||
std::string norm; ///< Normalization footprint
|
||||
charseq dst; ///< Destination sequence
|
||||
double score; ///< Score
|
||||
|
||||
inline translation() : set((int)ZRCOLA_TRANSEQID_DEFAULT) {}
|
||||
inline translation() :
|
||||
set((int)ZRCOLA_TRANSEQID_DEFAULT),
|
||||
score(0)
|
||||
{}
|
||||
};
|
||||
|
||||
|
||||
|
@ -269,6 +269,15 @@ static double compare_bitmaps(
|
||||
}
|
||||
|
||||
|
||||
///
/// Formats a wide string as "+"-separated hexadecimal numbers
///
/// Every element of \p str is printed as an upper-case hexadecimal number, zero-padded to at least four digits,
/// and the numbers are joined with "+" (e.g. L"A\x301" gives "0041+0301").
///
/// \param[in] str  Wide string to format
///
/// \returns Formatted string; empty when \p str is empty
///
static string make_unicode(_In_ const wstring &str)
{
    string out;
    const size_t n = str.length();

    // Each element contributes at least four digits, and all but the first a separator.
    out.reserve(n * 5);

    for (size_t i = 0; i < n; i++) {
        // %X consumes an unsigned int: convert explicitly instead of relying on how wchar_t is promoted through the variadic call.
        out += string_printf(i ? "+%04X" : "%04X", static_cast<unsigned int>(str[i]));
    }

    return out;
}
|
||||
|
||||
|
||||
///
|
||||
/// Main function
|
||||
///
|
||||
@ -344,6 +353,9 @@ int _tmain(int argc, _TCHAR *argv[])
|
||||
bool build_pot = parser.GetParamCount() > 2;
|
||||
set<wstring> pot;
|
||||
|
||||
bool build_csv = parser.GetParamCount() > 3;
|
||||
vector<ZRCola::DBSource::translation> csv;
|
||||
|
||||
// Open file ID.
|
||||
streamoff dst_start = idrec::open<ZRCola::recordid_t, ZRCola::recordsize_t>(dst, ZRCOLA_DB_ID);
|
||||
|
||||
@ -621,18 +633,18 @@ int _tmain(int argc, _TCHAR *argv[])
|
||||
// Add results to a temporary database.
|
||||
auto hit = trans.find(comp_orig);
|
||||
if (hit != trans.end()) {
|
||||
if (score_pre <= FONT_MATCH_THRESHOLD) {
|
||||
if (build_csv || score_pre <= FONT_MATCH_THRESHOLD) {
|
||||
if (hit->second.find(comp_pre) == hit->second.end())
|
||||
hit->second.insert(make_pair(comp_pre, make_pair(score_pre, 1)));
|
||||
} if (score_comb <= FONT_MATCH_THRESHOLD && comp_pre != comp_comb) {
|
||||
} if ((build_csv || score_comb <= FONT_MATCH_THRESHOLD) && comp_pre != comp_comb) {
|
||||
if (hit->second.find(comp_comb) == hit->second.end())
|
||||
hit->second.insert(make_pair(comp_comb, make_pair(score_comb, 100)));
|
||||
}
|
||||
} else {
|
||||
map<wstring, pair<double, int>> v;
|
||||
if (score_pre <= FONT_MATCH_THRESHOLD)
|
||||
if (build_csv || score_pre <= FONT_MATCH_THRESHOLD)
|
||||
v.insert(make_pair(comp_pre, make_pair(score_pre, 1)));
|
||||
if (score_comb <= FONT_MATCH_THRESHOLD && comp_pre != comp_comb)
|
||||
if ((build_csv || score_comb <= FONT_MATCH_THRESHOLD) && comp_pre != comp_comb)
|
||||
v.insert(make_pair(comp_comb, make_pair(score_comb, 100)));
|
||||
if (!v.empty())
|
||||
trans.insert(make_pair(comp_orig, std::move(v)));
|
||||
@ -643,10 +655,12 @@ int _tmain(int argc, _TCHAR *argv[])
|
||||
}
|
||||
|
||||
// Preallocate memory.
|
||||
size_t reserve = db_trans.idxSrc.size() + trans.size();
|
||||
size_t reserve = db_trans.idxSrc.size() + trans.size()*2;
|
||||
db_trans.idxSrc.reserve(reserve);
|
||||
db_trans.idxDst.reserve(reserve);
|
||||
db_trans.data .reserve(reserve*5);
|
||||
if (build_csv)
|
||||
csv.reserve(trans.size()*2);
|
||||
|
||||
ZRCola::DBSource::translation t;
|
||||
t.set = (int)ZRCOLA_TRANSEQID_UNICODE;
|
||||
@ -664,7 +678,10 @@ int _tmain(int argc, _TCHAR *argv[])
|
||||
t.src.str = i->first;
|
||||
t.src.rank = j->second.second + (j->second.second >= 100 ? rank_comb++ : rank_pre++);
|
||||
t.dst.str = j->second.first;
|
||||
t.score = j->first;
|
||||
db_trans << t;
|
||||
if (build_csv)
|
||||
csv.push_back(t);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -1228,6 +1245,43 @@ int _tmain(int argc, _TCHAR *argv[])
|
||||
}
|
||||
}
|
||||
|
||||
if (!has_errors && build_csv) {
    // Write the collected translations to a semicolon-separated CSV report.
    const wxString& filenameCsv = parser.GetParam(3);
    fstream dst_csv((LPCTSTR)filenameCsv, ios_base::out | ios_base::trunc);
    if (dst_csv.good()) {
        // Header row, preceded by the UTF-8 byte order mark.
        dst_csv
            << "\xef\xbb\xbf" // UTF-8 BOM
            << "\"znak\";"
            << "\"znakZRCola\";"
            << "\"znakRank\";"
            << "\"komb\";"
            << "\"kombZRCola\";"
            << "\"kombRank\";"
            << "\"razlika\"" << "\n";

        // NOTE(review): codecvt_utf8<wchar_t> converts element by element; where wchar_t is 16 bits wide,
        // surrogate pairs may not come out as well-formed UTF-8 - confirm no such sequences reach this report.
        wstring_convert<codecvt_utf8<wchar_t>> conv;

        // Converts a sequence to UTF-8 and doubles embedded quotes, so a sequence containing U+0022
        // cannot terminate the quoted CSV field early.
        auto csv_text = [&conv](const wstring &seq) -> string {
            string text = conv.to_bytes(seq);
            for (string::size_type pos = 0; (pos = text.find('"', pos)) != string::npos; pos += 2)
                text.insert(pos, 1, '"');
            return text;
        };

        for (auto i = csv.cbegin(), i_end = csv.cend(); i != i_end; ++i) {
            dst_csv
                << "\"" << make_unicode(i->src.str) << "\";"
                << "\"" << csv_text(i->src.str) << "\";"
                << i->src.rank << ";"
                << "\"" << make_unicode(i->dst.str) << "\";"
                << "\"" << csv_text(i->dst.str) << "\";"
                << i->dst.rank << ";"
                << i->score << "\n";
        }

        // Rows are written without flushing; flush once here so that a failed write is detected by the check below.
        dst_csv.flush();
        if (dst_csv.fail()) {
            // Report the CSV file itself: it is the file that could not be written.
            _ftprintf(stderr, wxT("%s: error ZCC0013: Writing to CSV report failed.\n"), (LPCTSTR)filenameCsv.c_str());
            has_errors = true;
        }

        dst_csv.close();
    } else {
        // Report the CSV file itself: it is the file that could not be opened.
        _ftprintf(stderr, wxT("%s: error ZCC0012: Error opening CSV report.\n"), filenameCsv.fn_str());
        has_errors = true;
    }
}
|
||||
|
||||
if (has_errors) {
|
||||
dst.close();
|
||||
wxRemoveFile(filenameOut);
|
||||
|
@ -1,169 +0,0 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<!--
|
||||
Copyright © 2021 Amebis
|
||||
|
||||
This file is part of ZRCola.
|
||||
|
||||
ZRCola is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
ZRCola is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with ZRCola. If not, see <http://www.gnu.org/licenses/>.
|
||||
-->
|
||||
<package>
|
||||
<job id="Analyse">
|
||||
<runtime>
|
||||
<description>Unicode Combining Character Analysis - Amebis, Copyright © 2021</description>
|
||||
<unnamed name="&lt;ZRCola.mdb&gt;" required="true" helpstring="ZRCola database"/>
|
||||
<unnamed name="&lt;results.csv&gt;" required="true" helpstring="Output file with analysis results"/>
|
||||
</runtime>
|
||||
<reference object="ADODB.Connection"/>
|
||||
<reference object="ADODB.Command"/>
|
||||
<reference object="ADODB.Recordset"/>
|
||||
<reference object="Scripting.FileSystemObject"/>
|
||||
<script language="JScript"><![CDATA[
|
||||
// Two unnamed arguments are read later on (index 0: database path, index 1: output path).
// With fewer than two, print the usage text and quit with exit code 1.
if (2 > WScript.Arguments.Unnamed.Length) {
    WScript.Arguments.ShowUsage();
    WScript.Quit(1);
}
|
||||
|
||||
// Precompiled separator used by fromUni(): a "+" with optional surrounding whitespace.
var parseUnicode_stat = {
    "re_separator" : new RegExp("\\s*\\+\\s*", "g")
};

// Converts a "+"-separated list of hexadecimal codes (e.g. "0041+0301") to a string,
// one character per code.
function fromUni(str)
{
    var result = "";
    var a = str.split(parseUnicode_stat.re_separator);
    // Walk the array by index: the characters must be appended in list order,
    // which for...in enumeration does not guarantee.
    for (var i = 0, n = a.length; i < n; i++)
        result += String.fromCharCode(parseInt(a[i], 16));
    return result;
}
|
||||
|
||||
// Converts a string to a "+"-separated list of upper-case hexadecimal character codes,
// each zero-padded to at least four digits (e.g. "A" gives "0041").
function toUni(str)
{
    var parts = [];
    for (var pos = 0, len = str.length; pos < len; pos++) {
        var hex = str.charCodeAt(pos).toString(16).toUpperCase();
        // Left-pad with zeros up to four digits.
        while (hex.length < 4)
            hex = "0" + hex;
        parts.push(hex);
    }
    return parts.join("+");
}
|
||||
|
||||
// Precompiled pattern used by escapeRegExp(): every character with a special meaning in a regular expression.
var escapeRegExp_stat = {
    "re_specialChar" : new RegExp("[.*+?^${}()|[\\]\\\\]", "g")
};

// Returns str with each regular expression special character prefixed by a backslash,
// so the result can be compiled into a RegExp that matches str literally.
function escapeRegExp(str)
{
    var specialChars = escapeRegExp_stat.re_specialChar;
    var escaped = str.replace(specialChars, "\\$&");
    return escaped;
}
|
||||
|
||||
// Precompiled pattern used by escapeCSV(): the double quote character.
var escapeCSV_stat = {
    "re_quote" : new RegExp("\"", "g")
};

// Returns str with every double quote doubled, ready to be placed inside a quoted field.
function escapeCSV(str)
{
    var quotes = escapeCSV_stat.re_quote;
    var escaped = str.replace(quotes, "\"\"");
    return escaped;
}
|
||||
|
||||
// Open ZRCola database.
var dbPath = WScript.Arguments.Unnamed(0);
var outputPath = WScript.Arguments.Unnamed(1);
var db = WScript.CreateObject("ADODB.Connection");
db.Open("Driver={Microsoft Access Driver (*.mdb)};Dbq=" + dbPath + ";Uid=;Pwd=;");
try {
    // Create the output file (overwriting any existing file, Unicode text).
    var
        fso = WScript.CreateObject("Scripting.FileSystemObject"),
        f = fso.CreateTextFile(outputPath, true, true);
    try {
        // Build a dictionary of all compositions and known combining characters.
        // Both arrays keep the order of the query below; the replacement passes further down depend on it.
        var zrcola = [], combining = [];
        var rs = WScript.CreateObject("ADODB.Recordset");
        rs.CursorLocation = adUseClient;
        rs.Open("SELECT [komb], [znak] FROM [VRS_ReplChar] ORDER BY [rang_komb], LEN([komb]) DESC", db, adOpenDynamic, adLockOptimistic, adCmdText);
        try {
            for (; !rs.EOF; rs.MoveNext()) {
                var
                    decomposed = fromUni(rs("komb").Value),
                    composed = fromUni(rs("znak").Value);

                zrcola.push({
                    "decomposed" : decomposed,
                    "re_decomposed" : new RegExp(escapeRegExp(decomposed), "g"),
                    "composed" : composed
                });

                // A decomposition starting with U+203F is also registered without that leading character
                // in the list of combining characters.
                if (decomposed.charCodeAt(0) == 0x203f)
                    combining.push({
                        "decomposed" : decomposed.substring(1),
                        "re_decomposed" : new RegExp(escapeRegExp(decomposed.substring(1)), "g"),
                        "composed" : composed
                    });
            }
        } finally {
            rs.Close();
        }

        // Header row.
        f.WriteLine(
            "\"compOrig\"" + "\t" + "\"compOrigZRCOLA\"" + "\t" +
            "\"decompOrig\"" + "\t" + "\"decompOrigZRCOLA\"" + "\t" +
            "\"comp\"" + "\t" + "\"compZRCOLA\"" + "\t" +
            "\"decomp\"" + "\t" + "\"decompZRCOLA\"" + "\t");

        // Traverse all characters in PUA and their decompositions and try to replace as much decompositions as possible with combining characters.
        rs.Open("SELECT [VRS_CharList].[znak] AS [znak], [VRS_ReplChar].[komb] AS [komb] " +
            "FROM [VRS_CharList] RIGHT JOIN [VRS_ReplChar] ON [VRS_CharList].[znak]=[VRS_ReplChar].[znak] "+
            "WHERE [VRS_CharList].[znak]>='E000' AND [VRS_CharList].[znak]<='F8FF' "+
            "ORDER BY [VRS_CharList].[znak]", db, adOpenDynamic, adLockOptimistic, adCmdText);
        try {
            for (; !rs.EOF; rs.MoveNext()) {
                var
                    compOrig = fromUni(rs("znak").Value),
                    decompOrig = fromUni(rs("komb").Value),
                    decomp = decompOrig;

                // Apply the replacements by index, i.e. strictly in the order they were collected:
                // each replacement works on the result of the previous one, and for...in does not guarantee that order.
                for (var i = 0, n = combining.length; i < n; i++)
                    decomp = decomp.replace(combining[i].re_decomposed, combining[i].composed);
                var comp = decomp;
                for (var j = 0, m = zrcola.length; j < m; j++)
                    comp = comp.replace(zrcola[j].re_decomposed, zrcola[j].composed);

                // One row per character: each value as a hexadecimal code list followed by the text itself.
                f.WriteLine(
                    "\"" + escapeCSV(toUni(compOrig  )) + "\"\t\"" + escapeCSV(compOrig  ) + "\"\t"+
                    "\"" + escapeCSV(toUni(decompOrig)) + "\"\t\"" + escapeCSV(decompOrig) + "\"\t"+
                    "\"" + escapeCSV(toUni(comp      )) + "\"\t\"" + escapeCSV(comp      ) + "\"\t"+
                    "\"" + escapeCSV(toUni(decomp    )) + "\"\t\"" + escapeCSV(decomp    ) + "\"\t");
            }
        } finally {
            rs.Close();
        }
    } finally {
        f.Close();
    }
} catch (err) {
    // In case of error, delete the output file.
    try { fso.DeleteFile(outputPath); } catch (err2) {}

    throw err;
} finally {
    db.Close();
}

WScript.Quit(0);
|
||||
]]></script>
|
||||
</job>
|
||||
</package>
|
Loading…
x
Reference in New Issue
Block a user