Build Unicode Combining Characters evaluation CSV with ZRColaCompile

Signed-off-by: Simon Rozman <simon@rozman.si>
2021-04-02 15:34:52 +02:00 · 2021-04-02 15:34:52 +02:00 · c4f2726256
commit c4f2726256
parent bc4b6b30d1
3 changed files with 64 additions and 175 deletions
--- a/ZRColaCompile/dbsource.h
+++ b/ZRColaCompile/dbsource.h
@ -131,8 +131,12 @@ namespace ZRCola {
            charseq src;        ///< Source sequence
            std::string norm;   ///< Normalization footprint
            charseq dst;        ///< Destination sequence
+            double score;       ///< Score

-            inline translation() : set((int)ZRCOLA_TRANSEQID_DEFAULT) {}
+            inline translation() :
+                set((int)ZRCOLA_TRANSEQID_DEFAULT),
+                score(0)
+            {}
        };


--- a/ZRColaCompile/main.cpp
+++ b/ZRColaCompile/main.cpp
@ -269,6 +269,15 @@ static double compare_bitmaps(
 }


+///
+/// Renders a wide string as a "+"-separated list of its element values
+///
+/// Every element of \p str is passed through string_printf with "%04X"
+/// ("+%04X" for all but the first), so - assuming string_printf follows
+/// printf conventions - L"\xE000\x0301" would yield "E000+0301".
+/// Elements are formatted one by one as stored; nothing is decoded or combined.
+///
+/// \param[in] str  Wide string to render
+///
+/// \returns Concatenated formatted elements; empty when \p str is empty
+///
+static string make_unicode(_In_ const wstring &str)
+{
+    string result;
+    const char *format = "%04X"; // First element carries no separator.
+    for (auto ch = str.cbegin(), ch_end = str.cend(); ch != ch_end; ++ch) {
+        result += string_printf(format, *ch);
+        format = "+%04X"; // Every following element is prefixed with "+".
+    }
+    return result;
+}
+
+
 ///
 /// Main function
 ///
@ -344,6 +353,9 @@ int _tmain(int argc, _TCHAR *argv[])
    bool build_pot = parser.GetParamCount() > 2;
    set<wstring> pot;

+    bool build_csv = parser.GetParamCount() > 3;
+    vector<ZRCola::DBSource::translation> csv;
+
    // Open file ID.
    streamoff dst_start = idrec::open<ZRCola::recordid_t, ZRCola::recordsize_t>(dst, ZRCOLA_DB_ID);

@ -621,18 +633,18 @@ int _tmain(int argc, _TCHAR *argv[])
                            // Add results to a temporary database.
                            auto hit = trans.find(comp_orig);
                            if (hit != trans.end()) {
-                                if (score_pre <= FONT_MATCH_THRESHOLD) {
+                                if (build_csv || score_pre <= FONT_MATCH_THRESHOLD) {
                                    if (hit->second.find(comp_pre) == hit->second.end())
                                        hit->second.insert(make_pair(comp_pre, make_pair(score_pre, 1)));
-                                } if (score_comb <= FONT_MATCH_THRESHOLD && comp_pre != comp_comb) {
+                                } if ((build_csv || score_comb <= FONT_MATCH_THRESHOLD) && comp_pre != comp_comb) {
                                    if (hit->second.find(comp_comb) == hit->second.end())
                                        hit->second.insert(make_pair(comp_comb, make_pair(score_comb, 100)));
                                }
                            } else {
                                map<wstring, pair<double, int>> v;
-                                if (score_pre <= FONT_MATCH_THRESHOLD)
+                                if (build_csv || score_pre <= FONT_MATCH_THRESHOLD)
                                    v.insert(make_pair(comp_pre, make_pair(score_pre, 1)));
-                                if (score_comb <= FONT_MATCH_THRESHOLD && comp_pre != comp_comb)
+                                if ((build_csv || score_comb <= FONT_MATCH_THRESHOLD) && comp_pre != comp_comb)
                                    v.insert(make_pair(comp_comb, make_pair(score_comb, 100)));
                                if (!v.empty())
                                    trans.insert(make_pair(comp_orig, std::move(v)));
@ -643,10 +655,12 @@ int _tmain(int argc, _TCHAR *argv[])
                }

                // Preallocate memory.
-                size_t reserve = db_trans.idxSrc.size() + trans.size();
+                size_t reserve = db_trans.idxSrc.size() + trans.size()*2;
                db_trans.idxSrc.reserve(reserve);
                db_trans.idxDst.reserve(reserve);
                db_trans.data  .reserve(reserve*5);
+                if (build_csv)
+                    csv.reserve(trans.size()*2);

                ZRCola::DBSource::translation t;
                t.set = (int)ZRCOLA_TRANSEQID_UNICODE;
@ -664,7 +678,10 @@ int _tmain(int argc, _TCHAR *argv[])
                        t.src.str  = i->first;
                        t.src.rank = j->second.second + (j->second.second >= 100 ? rank_comb++ : rank_pre++);
                        t.dst.str  = j->second.first;
+                        t.score    = j->first;
                        db_trans << t;
+                        if (build_csv)
+                            csv.push_back(t);
                    }
                }
            } else {
@ -1228,6 +1245,43 @@ int _tmain(int argc, _TCHAR *argv[])
        }
    }

+    if (!has_errors && build_csv) {
+        const wxString& filenameCsv = parser.GetParam(3);
+        fstream dst_csv((LPCTSTR)filenameCsv, ios_base::out | ios_base::trunc);
+        if (dst_csv.good()) {
+            dst_csv
+                << "\xef\xbb\xbf" // UTF-8 BOM
+                << "\"znak\";"
+                << "\"znakZRCola\";"
+                << "\"znakRank\";"
+                << "\"komb\";"
+                << "\"kombZRCola\";"
+                << "\"kombRank\";"
+                << "\"razlika\"" << endl;
+            wstring_convert<codecvt_utf8<wchar_t>> conv;
+            for (auto i = csv.cbegin(), i_end = csv.cend(); i != i_end; ++i) {
+                dst_csv
+                    << "\"" << make_unicode(i->src.str) << "\";"
+                    << "\"" << conv.to_bytes(i->src.str) << "\";"
+                    << i->src.rank << ";"
+                    << "\"" << make_unicode(i->dst.str) << "\";"
+                    << "\"" << conv.to_bytes(i->dst.str) << "\";"
+                    << i->dst.rank << ";"
+                    << i->score << endl;
+            }
+
+            if (dst_csv.fail()) {
+                _ftprintf(stderr, wxT("%s: error ZCC0013: Writing to CSV report failed.\n"), (LPCTSTR)filenameOut.c_str());
+                has_errors = true;
+            }
+
+            dst_csv.close();
+        } else {
+            _ftprintf(stderr, wxT("%s: error ZCC0012: Error opening CSV report.\n"), filenameOut.fn_str());
+            has_errors = true;
+        }
+    }
+
    if (has_errors) {
        dst.close();
        wxRemoveFile(filenameOut);
--- a/bin/Combining.wsf
+++ b/bin/Combining.wsf
@ -1,169 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<!--
-    Copyright © 2021 Amebis
-
-    This file is part of ZRCola.
-
-    ZRCola is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    ZRCola is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with ZRCola. If not, see <http://www.gnu.org/licenses/>.
-->
-<package>
-	<job id="Analyse">
-		<runtime>
-			<description>Unicode Combining Character Analysis - Amebis, Copyright © 2021</description>
-			<unnamed name="&lt;ZRCola.mdb&gt;"  required="true" helpstring="ZRCola database"/>
-			<unnamed name="&lt;results.csv&gt;" required="true" helpstring="Output file with analysis results"/>
-		</runtime>
-		<reference object="ADODB.Connection"/>
-		<reference object="ADODB.Command"/>
-		<reference object="ADODB.Recordset"/>
-		<reference object="Scripting.FileSystemObject"/>
-		<script language="JScript"><![CDATA[
-			if (WScript.Arguments.Unnamed.Length < 2) {
-				WScript.Arguments.ShowUsage();
-				WScript.Quit(1);
-			}
-
-			var parseUnicode_stat = {
-				"re_separator" : new RegExp("\\s*\\+\\s*", "g")
-			};
-
-			function fromUni(str)
-			{
-				var result = "";
-				var a = str.split(parseUnicode_stat.re_separator);
-				for (var i in a)
-					result += String.fromCharCode(parseInt(a[i], 16));
-				return result;
-			}
-
-			function toUni(str)
-			{
-				var i, n = str.length, result = "";
-				for (i = 0; i < n; i++) {
-					if (i) result += "+";
-					var val = str.charCodeAt(i);
-					if (val < 0x10) result += "000";
-					else if (val < 0x100) result += "00";
-					else if (val < 0x1000) result += "0";
-					result += val.toString(16).toUpperCase();
-				}
-				return result;
-			}
-
-			var escapeRegExp_stat = {
-				"re_specialChar" : new RegExp("[.*+?^${}()|[\\]\\\\]", "g")
-			};
-
-			function escapeRegExp(str)
-			{
-				return str.replace(escapeRegExp_stat.re_specialChar, "\\$&");
-			}
-
-			var escapeCSV_stat = {
-				"re_quote" : new RegExp("\"", "g")
-			};
-
-			function escapeCSV(str)
-			{
-				return str.replace(escapeCSV_stat.re_quote, "\"\"");
-			}
-
-			// Open ZRCola database.
-			var dbPath = WScript.Arguments.Unnamed(0);
-			var outputPath = WScript.Arguments.Unnamed(1);
-			var db = WScript.CreateObject("ADODB.Connection");
-			db.Open("Driver={Microsoft Access Driver (*.mdb)};Dbq=" + dbPath + ";Uid=;Pwd=;");
-			try {
-				// Open Unicode Data file.
-				var
-					fso = WScript.CreateObject("Scripting.FileSystemObject"),
-					f = fso.CreateTextFile(outputPath, true, true);
-				try {
-					// Build a dictionary of all compositions and known combining characters.
-					var zrcola = [], combining = [];
-					var rs = WScript.CreateObject("ADODB.Recordset");
-					rs.CursorLocation = adUseClient;
-					rs.Open("SELECT [komb], [znak] FROM [VRS_ReplChar] ORDER BY [rang_komb], LEN([komb]) DESC", db, adOpenDynamic, adLockOptimistic, adCmdText);
-					try {
-						for (; !rs.EOF; rs.MoveNext()) {
-							var
-								decomposed = fromUni(rs("komb").Value),
-								composed   = fromUni(rs("znak").Value);
-
-							zrcola.push({
-								"decomposed"    : decomposed,
-								"re_decomposed" : new RegExp(escapeRegExp(decomposed), "g"),
-								"composed"      : composed
-							});
-
-							if (decomposed.charCodeAt(0) == 0x203f)
-								combining.push({
-									"decomposed"    : decomposed.substring(1),
-									"re_decomposed" : new RegExp(escapeRegExp(decomposed.substring(1)), "g"),
-									"composed"      : composed
-								});
-						}
-					} finally {
-						rs.Close();
-					}
-
-					f.WriteLine(
-						"\"compOrig\""   + "\t" + "\"compOrigZRCOLA\""   + "\t" +
-						"\"decompOrig\"" + "\t" + "\"decompOrigZRCOLA\"" + "\t" +
-						"\"comp\""       + "\t" + "\"compZRCOLA\""       + "\t" +
-						"\"decomp\""     + "\t" + "\"decompZRCOLA\""     + "\t");
-
-					// Traverse all characters in PUA and their decompositions and try to replace as much decompositions as possible with combining characters.
-					rs.Open("SELECT [VRS_CharList].[znak] AS [znak], [VRS_ReplChar].[komb] AS [komb] " +
-						"FROM [VRS_CharList] RIGHT JOIN [VRS_ReplChar] ON [VRS_CharList].[znak]=[VRS_ReplChar].[znak] "+
-						"WHERE [VRS_CharList].[znak]>='E000' AND [VRS_CharList].[znak]<='F8FF' "+
-						"ORDER BY [VRS_CharList].[znak]", db, adOpenDynamic, adLockOptimistic, adCmdText);
-					try {
-						for (; !rs.EOF; rs.MoveNext()) {
-							var
-								compOrig   = fromUni(rs("znak").Value),
-								decompOrig = fromUni(rs("komb").Value),
-								decomp     = decompOrig;
-
-							for (var i in combining)
-								decomp = decomp.replace(combining[i].re_decomposed, combining[i].composed);
-							var comp = decomp;
-							for (var i in zrcola)
-								comp = comp.replace(zrcola[i].re_decomposed, zrcola[i].composed);
-
-							f.WriteLine(
-								"\"" + escapeCSV(toUni(compOrig  )) + "\"\t\"" + escapeCSV(compOrig  ) + "\"\t"+
-								"\"" + escapeCSV(toUni(decompOrig)) + "\"\t\"" + escapeCSV(decompOrig) + "\"\t"+
-								"\"" + escapeCSV(toUni(comp      )) + "\"\t\"" + escapeCSV(comp      ) + "\"\t"+
-								"\"" + escapeCSV(toUni(decomp    )) + "\"\t\"" + escapeCSV(decomp    ) + "\"\t");
-						}
-					} finally {
-						rs.Close();
-					}
-				} finally {
-					f.Close();
-				}
-			} catch (err) {
-				// In case of error, delete the output file.
-				try { fso.DeleteFile(outputPath); } catch (err2) {}
-
-				throw err;
-			} finally {
-				db.Close();
-			}
-
-			WScript.Quit(0);
-		]]></script>
-	</job>
-</package>