ZRCola/bin/Combining.wsf

<?xml version="1.0" encoding="utf-8"?>
<!--
    Copyright © 2021 Amebis

    This file is part of ZRCola.

    ZRCola is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    ZRCola is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with ZRCola. If not, see <http://www.gnu.org/licenses/>.
-->
<package>
	<job id="Analyse">
		<runtime>
			<description>Unicode Combining Character Analysis - Amebis, Copyright © 2021</description>
			<unnamed name="&lt;ZRCola.mdb&gt;"  required="true" helpstring="ZRCola database"/>
			<unnamed name="&lt;results.csv&gt;" required="true" helpstring="Output file with analysis results"/>
		</runtime>
		<reference object="ADODB.Connection"/>
		<reference object="ADODB.Command"/>
		<reference object="ADODB.Recordset"/>
		<reference object="Scripting.FileSystemObject"/>
		<script language="JScript"><![CDATA[
			// Both positional arguments (database path and output path) are mandatory:
			// print the usage text built from the <runtime> section and exit with code 1 otherwise.
			var unnamedArgs = WScript.Arguments.Unnamed;
			if (unnamedArgs.Length < 2)
			{
				WScript.Arguments.ShowUsage();
				WScript.Quit(1);
			}

			// Pre-compiled pattern shared by fromUni(): a "+" sign with optional surrounding whitespace.
			var parseUnicode_stat = {
				"re_separator" : new RegExp("\\s*\\+\\s*", "g")
			};

			// Converts a "+"-separated list of hexadecimal code points (e.g. "0041+0301")
			// to the string made of those characters.
			//
			// str      String with hexadecimal code points separated by "+" (whitespace around "+" is ignored)
			// returns  The decoded string; code points above U+FFFF are emitted as UTF-16 surrogate pairs.
			//
			// Tokens that are empty, not hexadecimal, or outside 0..10FFFF are skipped
			// (the plain String.fromCharCode(NaN) would have produced a U+0000 character instead).
			function fromUni(str)
			{
				var
					tokens = str.split(parseUnicode_stat.re_separator),
					count  = tokens.length,
					result = "",
					i, code;

				// Indexed loop: for...in over an array also visits inherited enumerable
				// properties and does not guarantee index order.
				for (i = 0; i < count; i++) {
					code = parseInt(tokens[i], 16);
					if (isNaN(code) || code < 0 || code > 0x10ffff)
						continue;

					if (code > 0xffff) {
						// String.fromCharCode() truncates its arguments to 16 bits,
						// so supplementary-plane code points must be split manually.
						code -= 0x10000;
						result += String.fromCharCode(0xd800 + (code >> 10), 0xdc00 + (code & 0x3ff));
					} else
						result += String.fromCharCode(code);
				}
				return result;
			}

			// Formats every UTF-16 code unit of str as an upper-case hexadecimal number,
			// left-padded with zeros to four digits, and joins the numbers with "+".
			// An empty string yields an empty string.
			function toUni(str)
			{
				var parts = [];
				for (var pos = 0; pos < str.length; pos++) {
					var hex = str.charCodeAt(pos).toString(16).toUpperCase();
					while (hex.length < 4)
						hex = "0" + hex;
					parts.push(hex);
				}
				return parts.join("+");
			}

			// Pre-compiled pattern shared by escapeRegExp(): every character that has
			// a special meaning in a regular expression.
			var escapeRegExp_stat = {};
			escapeRegExp_stat.re_specialChar = new RegExp("[.*+?^${}()|[\\]\\\\]", "g");

			// Returns str with each regular-expression metacharacter prefixed by a backslash,
			// so the result can be compiled into a pattern that matches str literally.
			function escapeRegExp(str)
			{
				var specialChars = escapeRegExp_stat.re_specialChar;
				return str.replace(specialChars, "\\$&");
			}

			// Pre-compiled pattern shared by escapeCSV(): the double-quote character.
			var escapeCSV_stat = {};
			escapeCSV_stat.re_quote = new RegExp("\"", "g");

			// Returns str with every double quote doubled, ready to be placed
			// between the enclosing quotes of a quoted field.
			function escapeCSV(str)
			{
				var quotes = escapeCSV_stat.re_quote;
				return str.replace(quotes, "\"\"");
			}

			// Open ZRCola database.
			// The first unnamed argument is the database path, the second the path of the report to write.
			var dbPath = WScript.Arguments.Unnamed(0);
			var outputPath = WScript.Arguments.Unnamed(1);
			var db = WScript.CreateObject("ADODB.Connection");
			db.Open("Driver={Microsoft Access Driver (*.mdb)};Dbq=" + dbPath + ";Uid=;Pwd=;");
			try {
				// Create the output file (overwriting an existing one, Unicode text).
				var
					fso = WScript.CreateObject("Scripting.FileSystemObject"),
					f = fso.CreateTextFile(outputPath, true, true);
				try {
					// Build a dictionary of all compositions and known combining characters.
					// Both lists keep the row order of the query, which is the order the replacements are applied in below.
					var zrcola = [], combining = [];
					var i, n;
					var rs = WScript.CreateObject("ADODB.Recordset");
					rs.CursorLocation = adUseClient;
					rs.Open("SELECT [komb], [znak] FROM [VRS_ReplChar] ORDER BY [rang_komb], LEN([komb]) DESC", db, adOpenDynamic, adLockOptimistic, adCmdText);
					try {
						for (; !rs.EOF; rs.MoveNext()) {
							var
								decomposed = fromUni(rs("komb").Value),
								composed   = fromUni(rs("znak").Value);

							// An empty decomposition would compile to a pattern matching at every position
							// and make replace() insert the composed character between all characters.
							if (!decomposed.length)
								continue;

							zrcola.push({
								"decomposed"    : decomposed,
								"re_decomposed" : new RegExp(escapeRegExp(decomposed), "g"),
								"composed"      : composed
							});

							// Decompositions starting with U+203F are additionally registered as combining
							// characters, with that leading character stripped. The length check keeps a
							// lone U+203F from producing an empty (match-everywhere) pattern.
							if (decomposed.length > 1 && decomposed.charCodeAt(0) == 0x203f) {
								var stripped = decomposed.substring(1);
								combining.push({
									"decomposed"    : stripped,
									"re_decomposed" : new RegExp(escapeRegExp(stripped), "g"),
									"composed"      : composed
								});
							}
						}
					} finally {
						rs.Close();
					}

					// Header row; columns are tab-separated and every row ends with a trailing tab.
					f.WriteLine(
						"\"compOrig\""   + "\t" + "\"compOrigZRCOLA\""   + "\t" +
						"\"decompOrig\"" + "\t" + "\"decompOrigZRCOLA\"" + "\t" +
						"\"comp\""       + "\t" + "\"compZRCOLA\""       + "\t" +
						"\"decomp\""     + "\t" + "\"decompZRCOLA\""     + "\t");

					// Traverse all characters in PUA and their decompositions and try to replace as many decompositions as possible with combining characters.
					rs.Open("SELECT [VRS_CharList].[znak] AS [znak], [VRS_ReplChar].[komb] AS [komb] " +
						"FROM [VRS_CharList] RIGHT JOIN [VRS_ReplChar] ON [VRS_CharList].[znak]=[VRS_ReplChar].[znak] "+
						"WHERE [VRS_CharList].[znak]>='E000' AND [VRS_CharList].[znak]<='F8FF' "+
						"ORDER BY [VRS_CharList].[znak]", db, adOpenDynamic, adLockOptimistic, adCmdText);
					try {
						for (; !rs.EOF; rs.MoveNext()) {
							var
								compOrig   = fromUni(rs("znak").Value),
								decompOrig = fromUni(rs("komb").Value),
								decomp     = decompOrig;

							// Indexed loops (not for...in) so the replacements run strictly in query order.
							for (i = 0, n = combining.length; i < n; i++)
								decomp = decomp.replace(combining[i].re_decomposed, combining[i].composed);

							// Then compose the result as far as possible using the full composition list.
							var comp = decomp;
							for (i = 0, n = zrcola.length; i < n; i++)
								comp = comp.replace(zrcola[i].re_decomposed, zrcola[i].composed);

							// Each value is written twice: as code points (toUni) and as raw characters.
							f.WriteLine(
								"\"" + escapeCSV(toUni(compOrig  )) + "\"\t\"" + escapeCSV(compOrig  ) + "\"\t"+
								"\"" + escapeCSV(toUni(decompOrig)) + "\"\t\"" + escapeCSV(decompOrig) + "\"\t"+
								"\"" + escapeCSV(toUni(comp      )) + "\"\t\"" + escapeCSV(comp      ) + "\"\t"+
								"\"" + escapeCSV(toUni(decomp    )) + "\"\t\"" + escapeCSV(decomp    ) + "\"\t");
						}
					} finally {
						rs.Close();
					}
				} finally {
					f.Close();
				}
			} catch (err) {
				// In case of error, delete the output file.
				// Best effort only: a failure to delete must not hide the original error.
				try { fso.DeleteFile(outputPath); } catch (err2) {}

				throw err;
			} finally {
				db.Close();
			}

			WScript.Quit(0);
		]]></script>
	</job>
</package>