Add support for Unicode character blocks

Signed-off-by: Simon Rozman <simon@rozman.si>
2025-11-14 08:18:36 +01:00
parent 9c4caf506b
commit 1ba71443f5
9 changed files with 2974 additions and 2611 deletions
--- a/ZRCola/zrcolaapp.cpp
+++ b/ZRCola/zrcolaapp.cpp
@@ -132,6 +132,12 @@ bool ZRColaApp::OnInit()
                            wxFAIL_MSG(wxT("Error reading character category data from ZRCola.zrcdb."));
                            m_cc_db.clear();
                        }
+                    } else if (id == ZRCola::chrblk_rec::id()) {
+                        dat >> ZRCola::chrblk_rec(m_cb_db);
+                        if (!dat.good()) {
+                            wxFAIL_MSG(wxT("Error reading character block data from ZRCola.zrcdb."));
+                            m_cb_db.clear();
+                        }
                    } else if (id == ZRCola::chrtag_rec::id()) {
                        dat >> ZRCola::chrtag_rec(m_ct_db);
                        if (!dat.good()) {
--- a/ZRCola/zrcolaapp.h
+++ b/ZRCola/zrcolaapp.h
@@ -70,7 +70,8 @@ public:
    ZRCola::language_db m_lang_db;  ///< Language database
    ZRCola::keyseq_db m_ks_db;      ///< Key sequence database
    ZRCola::character_db m_chr_db;  ///< Character database
-    ZRCola::chrcat_db m_cc_db;      ///< Characted category database
+    ZRCola::chrcat_db m_cc_db;      ///< Character category database
+    ZRCola::chrblk_db m_cb_db;      ///< Character block database
    ZRCola::chrtag_db m_ct_db;      ///< Character tag database
    ZRCola::tagname_db m_tn_db;     ///< Tag name database
    ZRCola::highlight_db m_h_db;    ///< Highlight database
--- a/ZRColaCompile/dbsource.cpp
+++ b/ZRColaCompile/dbsource.cpp
--- a/ZRColaCompile/dbsource.h
+++ b/ZRColaCompile/dbsource.h
--- a/ZRColaCompile/main.cpp
+++ b/ZRColaCompile/main.cpp
@@ -756,7 +756,32 @@ int _tmain(int argc, _TCHAR *argv[])
                ZRCola::DBSource::character_desc_idx idxChrDsc, idxChrDscSub;
                ZRCola::DBSource::character_bank chrs;

-                // Phase 1: Parse characters and build indexes.
+                // Phase 1: Get character blocks.
+                com_obj<ADORecordset> rs2;
+                if (src.SelectCharacterBlocks(rs2)) {
+                    size_t count2 = src.GetRecordsetCount(rs2);
+                    if (count2 < 0xffffffff) { // 4G check (-1 is reserved for error condition)
+                        // Parse character blocks and build index and data.
+                        for (; !ZRCola::DBSource::IsEOF(rs2); rs2->MoveNext()) {
+                            // Read character block from the database.
+                            ZRCola::DBSource::chrblk cb;
+                            if (src.GetCharacterBlock(rs2, cb))
+                                chrs.idxChrBlk[cb.first] = std::move(cb.second);
+                            else
+                                has_errors = true;
+                        }
+                    }
+                    else {
+                        _ftprintf(stderr, wxT("%s: error ZCC0029: Error getting character block count from database or too many character blocks.\n"), (LPCTSTR)filenameIn.c_str());
+                        has_errors = true;
+                    }
+                }
+                else {
+                    _ftprintf(stderr, wxT("%s: error ZCC0028: Error getting character blocks from database. Please make sure the file is ZRCola.zrc compatible.\n"), (LPCTSTR)filenameIn.c_str());
+                    has_errors = true;
+                }
+
+                // Phase 2: Parse characters and build indexes.
                for (; !ZRCola::DBSource::IsEOF(rs); rs->MoveNext()) {
                    // Read character from the database.
                    ZRCola::DBSource::character chr;
@@ -766,33 +791,64 @@ int _tmain(int argc, _TCHAR *argv[])
                        has_errors = true;
                }

-                // Phase 2: Build related character lists.
+                // Phase 3: Build related character lists.
                chrs.build_related();

-                ZRCola::character_db db;
+                {
+                    ZRCola::character_db db;

-                // Preallocate memory.
-                db.idxChr.reserve(count);
-                db.data  .reserve(count*4);
+                    // Preallocate memory.
+                    db.idxChr.reserve(count);
+                    db.data.reserve(count * 4);

-                // Phase 3: Parse characters and build index and data.
-                for (auto chr = chrs.cbegin(), chr_end = chrs.cend(); chr != chr_end; ++chr) {
-                    // Add character to index and data.
-                    db << *chr;
+                    // Phase 4: Parse characters and build index and data.
+                    for (auto chr = chrs.cbegin(), chr_end = chrs.cend(); chr != chr_end; ++chr) {
+                        // Add character to index and data.
+                        db << *chr;

-                    // Add description (and keywords) to index.
-                    idxChrDsc   .add_keywords(chr->second.terms, chr->first, 0);
-                    idxChrDscSub.add_keywords(chr->second.terms, chr->first, 3);
+                        // Add description (and keywords) to index.
+                        idxChrDsc.add_keywords(chr->second.terms, chr->first, 0);
+                        idxChrDscSub.add_keywords(chr->second.terms, chr->first, 3);

-                    // Mark category used.
-                    categories_used.insert(chr->second.cat);
+                        // Mark category used.
+                        categories_used.insert(chr->second.cat);
+                    }
+
+                    // Write characters to file.
+                    db.idxChr.sort();
+                    idxChrDsc.save(db.idxDsc);
+                    idxChrDscSub.save(db.idxDscSub);
+                    dst << ZRCola::character_rec(db);
                }

-                // Write characters to file.
-                db.idxChr.sort();
-                idxChrDsc   .save(db.idxDsc   );
-                idxChrDscSub.save(db.idxDscSub);
-                dst << ZRCola::character_rec(db);
+                {
+                    ZRCola::chrblk_db db;
+
+                    // Preallocate memory.
+                    db.idxChrId.reserve(chrs.idxChrBlk.size());
+                    db.idxRank.reserve(chrs.idxChrBlk.size());
+                    db.data.reserve(chrs.idxChrBlk.size() * 16);
+
+                    // Phase 5: Parse character blocks and build index and data.
+                    for (auto& cb : chrs.idxChrBlk)
+                    {
+                        if (!cb.second.used) {
+                            // Skip unused character blocks.
+                            continue;
+                        }
+
+                        if (build_pot)
+                            pot.insert(cb.second.name);
+
+                        // Add character block to index and data.
+                        db << cb;
+                    }
+
+                    // Write character blocks to file.
+                    db.idxChrId.sort();
+                    db.idxRank.sort();
+                    dst << ZRCola::chrblk_rec(db);
+                }
            } else {
                _ftprintf(stderr, wxT("%s: error ZCC0017: Error getting character count from database or too many characters.\n"), (LPCTSTR)filenameIn.c_str());
                has_errors = true;
--- a/lib/libZRCola/include/zrcola/character.h
+++ b/lib/libZRCola/include/zrcola/character.h
@@ -171,6 +171,12 @@ namespace ZRCola {
    }


+    ///
+    /// Identifier of a Unicode character block (16-bit unsigned integer)
+    ///
+    using chrblkid_t = uint16_t;
+
+
    ///
    /// Character Database
    ///
@@ -184,6 +190,7 @@ namespace ZRCola {
        struct character {
        public:
            chrcatid_t cat;     ///> Character category ID
+            chrblkid_t blk;     ///< Character block ID

        protected:
            uint16_t chr_to;    ///< Character end in \c data
@@ -202,6 +209,7 @@ namespace ZRCola {
            /// \param[in] chr       Character
            /// \param[in] chr_len   Number of UTF-16 characters in \p chr
            /// \param[in] cat       Category
+            /// \param[in] blk       Unicode block
            /// \param[in] desc      Description
            /// \param[in] desc_len  Number of UTF-16 characters in \p desc
            /// \param[in] rel       Related characters list (zero delimited)
@@ -211,12 +219,14 @@ namespace ZRCola {
                _In_opt_z_count_(chr_len)  const char_t     *chr      = NULL,
                _In_opt_                         size_t      chr_len  = 0,
                _In_opt_                         chrcatid_t  cat      = chrcatid_t(),
+                _In_opt_                         chrblkid_t  blk      = 0,
                _In_opt_z_count_(desc_len) const char_t     *desc     = NULL,
                _In_opt_                         size_t      desc_len = 0,
                _In_opt_z_count_(rel_len)  const char_t     *rel      = NULL,
                _In_opt_                         size_t      rel_len  = 0)
            {
                this->cat = cat;
+                this->blk = blk;
                this->chr_to = static_cast<uint16_t>(chr_len);
                if (chr && chr_len) memcpy(this->data, chr, sizeof(char_t)*chr_len);
                this->desc_to = static_cast<uint16_t>(this->chr_to + desc_len);
@@ -330,6 +340,25 @@ namespace ZRCola {
            return idxChr.find(*c, start) ? idxChr[start].cat : chrcatid_t();
        }

+        ///
+        /// Looks up the Unicode block ID recorded for a character
+        ///
+        /// \param[in] chr  Character to search for in \c idxChr
+        /// \param[in] len  Number of UTF-16 characters in \p chr (at most 0xffff)
+        ///
+        /// \returns
+        /// - Block ID (\c blk) of the matching record if character found
+        /// - 0 otherwise
+        ///
+        chrblkid_t GetCharBlk(_In_z_count_(len) const char_t *chr, _In_ const size_t len) const
+        {
+            assert(len <= 0xffff);
+            std::unique_ptr<char[]> buf(new char[sizeof(character) + sizeof(char_t)*len]); // Search key storage; owned as char[] so it is released with delete[], matching new[].
+            character *key = new (buf.get()) character(chr, len); // Variable-length search key built in place; only chr/len are set, the rest stay at their defaults.
+            indexChr::size_type start;
+            return idxChr.find(*key, start) ? idxChr[start].blk : 0;
+        }
+
        ///
        /// Writes character database to a stream
        ///
@@ -650,6 +679,11 @@ namespace ZRCola {
    /// Character category database
    ///
    using chrcat_db = chrclass_db<chrcatid_t>;
+
+    ///
+    /// Character block database (\c chrclass_db instantiated with \c chrblkid_t)
+    ///
+    using chrblk_db = chrclass_db<chrblkid_t>;
 };

 #pragma warning(pop)
--- a/lib/libZRCola/include/zrcola/idrec.h
+++ b/lib/libZRCola/include/zrcola/idrec.h
@@ -16,6 +16,7 @@
 namespace ZRCola {
    typedef stdex::idrec::record<character_db, recordid_t, 0x524843 /*"CHR"*/, recordsize_t, ZRCOLA_RECORD_ALIGN> character_rec;
    typedef stdex::idrec::record<chrcat_db, recordid_t, 0x544343 /*"CCT"*/, recordsize_t, ZRCOLA_RECORD_ALIGN> chrcat_rec;
+    typedef stdex::idrec::record<chrblk_db, recordid_t, 0x4c4243 /*"CBL"*/, recordsize_t, ZRCOLA_RECORD_ALIGN> chrblk_rec;
    typedef stdex::idrec::record<highlight_db, recordid_t, 0x484748 /*"HGH"*/, recordsize_t, ZRCOLA_RECORD_ALIGN> highlight_rec;
    typedef stdex::idrec::record<langchar_db, recordid_t, 0x432d4c /*"L-C"*/, recordsize_t, ZRCOLA_RECORD_ALIGN> langchar_rec;
    typedef stdex::idrec::record<language_db, recordid_t, 0x474e4c /*"LNG"*/, recordsize_t, ZRCOLA_RECORD_ALIGN> language_rec;
--- a/output/data/ZRCola.zrcdb
+++ b/output/data/ZRCola.zrcdb
--- a/output/locale/ZRCola-zrcdb.pot
+++ b/output/locale/ZRCola-zrcdb.pot
@@ -10,6 +10,9 @@ msgstr ""
 msgid "Albanian"
 msgstr ""

+msgid "Alphabetic Presentation Forms"
+msgstr ""
+
 msgid "Apostrophes 1"
 msgstr ""

@@ -19,24 +22,42 @@ msgstr ""
 msgid "Arabic"
 msgstr ""

+msgid "Armenian"
+msgstr ""
+
 msgid "Arrows"
 msgstr ""

 msgid "Belarusian"
 msgstr ""

+msgid "Block Elements"
+msgstr ""
+
 msgid "Bosnian – Cyrillic"
 msgstr ""

 msgid "Bosnian – Latinic"
 msgstr ""

+msgid "Box Drawing"
+msgstr ""
+
+msgid "C0 Controls and Basic Latin (Basic Latin)"
+msgstr ""
+
+msgid "C1 Controls and Latin-1 Supplement (Latin-1 Supplement)"
+msgstr ""
+
 msgid "CAPITAL Case"
 msgstr ""

 msgid "CAPITAL Case » small Case"
 msgstr ""

+msgid "CJK Symbols and Punctuation"
+msgstr ""
+
 msgid "Combine"
 msgstr ""

@@ -49,6 +70,18 @@ msgstr ""
 msgid "Combine Over"
 msgstr ""

+msgid "Combining Diacritical Marks"
+msgstr ""
+
+msgid "Combining Diacritical Marks Supplement"
+msgstr ""
+
+msgid "Combining Diacritical Marks for Symbols"
+msgstr ""
+
+msgid "Combining Half Marks"
+msgstr ""
+
 msgid "Combining Marks"
 msgstr ""

@@ -58,6 +91,9 @@ msgstr ""
 msgid "Currencies"
 msgstr ""

+msgid "Currency Symbols"
+msgstr ""
+
 msgid "Cyrillic"
 msgstr ""

@@ -118,6 +154,12 @@ msgstr ""
 msgid "Cyrillic BdC"
 msgstr ""

+msgid "Cyrillic Extended-B"
+msgstr ""
+
+msgid "Cyrillic Supplement"
+msgstr ""
+
 msgid "Cyrillic » Latin (GOST2000)"
 msgstr ""

@@ -130,9 +172,15 @@ msgstr ""
 msgid "Diacritics"
 msgstr ""

+msgid "Dingbats"
+msgstr ""
+
 msgid "Encircled Characters"
 msgstr ""

+msgid "Enclosed Alphanumerics"
+msgstr ""
+
 msgid "English"
 msgstr ""

@@ -148,6 +196,12 @@ msgstr ""
 msgid "Friulian"
 msgstr ""

+msgid "General Punctuation"
+msgstr ""
+
+msgid "Geometric Shapes"
+msgstr ""
+
 msgid "Geometrical Shapes"
 msgstr ""

@@ -169,12 +223,24 @@ msgstr ""
 msgid "Greek (Old)"
 msgstr ""

+msgid "Greek Extended"
+msgstr ""
+
+msgid "Greek and Coptic"
+msgstr ""
+
+msgid "Gujarati"
+msgstr ""
+
 msgid "Hebrew"
 msgstr ""

 msgid "Hungarian"
 msgstr ""

+msgid "IPA Extensions"
+msgstr ""
+
 msgid "Irish Gaelic"
 msgstr ""

@@ -295,6 +361,21 @@ msgstr ""
 msgid "Latin BdC"
 msgstr ""

+msgid "Latin Extended Additional"
+msgstr ""
+
+msgid "Latin Extended-A"
+msgstr ""
+
+msgid "Latin Extended-B"
+msgstr ""
+
+msgid "Latin Extended-C"
+msgstr ""
+
+msgid "Latin Extended-D"
+msgstr ""
+
 msgid "Latin » Cyrillic (Belarusian)"
 msgstr ""

@@ -433,6 +514,9 @@ msgstr ""
 msgid "Letter, Uppercase"
 msgstr ""

+msgid "Letterlike Symbols"
+msgstr ""
+
 msgid "Ligatures"
 msgstr ""

@@ -457,9 +541,27 @@ msgstr ""
 msgid "Mathematical And Physical Symbols"
 msgstr ""

+msgid "Mathematical Operators"
+msgstr ""
+
 msgid "Metric"
 msgstr ""

+msgid "Miscellaneous Mathematical Symbols-A"
+msgstr ""
+
+msgid "Miscellaneous Mathematical Symbols-B"
+msgstr ""
+
+msgid "Miscellaneous Symbols"
+msgstr ""
+
+msgid "Miscellaneous Symbols and Arrows"
+msgstr ""
+
+msgid "Miscellaneous Technical"
+msgstr ""
+
 msgid "Modified"
 msgstr ""

@@ -505,6 +607,9 @@ msgstr ""
 msgid "Number 9"
 msgstr ""

+msgid "Number Forms"
+msgstr ""
+
 msgid "Number, Decimal Digit"
 msgstr ""

@@ -532,12 +637,21 @@ msgstr ""
 msgid "Parentheses"
 msgstr ""

+msgid "Phonetic Extensions"
+msgstr ""
+
+msgid "Phonetic Extensions Supplement"
+msgstr ""
+
 msgid "Polish"
 msgstr ""

 msgid "Portuguese"
 msgstr ""

+msgid "Private Use Area"
+msgstr ""
+
 msgid "Punctuation"
 msgstr ""

@@ -607,12 +721,18 @@ msgstr ""
 msgid "Spaces"
 msgstr ""

+msgid "Spacing Modifier Letters"
+msgstr ""
+
 msgid "Spanish"
 msgstr ""

 msgid "Special Characters"
 msgstr ""

+msgid "Specials"
+msgstr ""
+
 msgid "Strokes"
 msgstr ""

@@ -622,6 +742,18 @@ msgstr ""
 msgid "Superscript Sharacters"
 msgstr ""

+msgid "Superscripts and Subscripts"
+msgstr ""
+
+msgid "Supplemental Arrows-A"
+msgstr ""
+
+msgid "Supplemental Arrows-B"
+msgstr ""
+
+msgid "Supplemental Punctuation"
+msgstr ""
+
 msgid "Surrounded"
 msgstr ""

@@ -676,6 +808,9 @@ msgstr ""
 msgid "Symbol, Other"
 msgstr ""

+msgid "Syriac"
+msgstr ""
+
 msgid "Technical Characters"
 msgstr ""

@@ -694,6 +829,9 @@ msgstr ""
 msgid "Ukrainian"
 msgstr ""

+msgid "Unified Canadian Aboriginal Syllabics"
+msgstr ""
+
 msgid "Units"
 msgstr ""