Add support for Unicode character blocks

Signed-off-by: Simon Rozman <simon@rozman.si>
This commit is contained in:
Simon Rozman
2025-11-14 08:18:36 +01:00
parent 9c4caf506b
commit 1ba71443f5
9 changed files with 2974 additions and 2611 deletions

View File

@@ -132,6 +132,12 @@ bool ZRColaApp::OnInit()
wxFAIL_MSG(wxT("Error reading character category data from ZRCola.zrcdb."));
m_cc_db.clear();
}
} else if (id == ZRCola::chrblk_rec::id()) {
dat >> ZRCola::chrblk_rec(m_cb_db);
if (!dat.good()) {
wxFAIL_MSG(wxT("Error reading character block data from ZRCola.zrcdb."));
m_cb_db.clear();
}
} else if (id == ZRCola::chrtag_rec::id()) {
dat >> ZRCola::chrtag_rec(m_ct_db);
if (!dat.good()) {

View File

@@ -70,7 +70,8 @@ public:
ZRCola::language_db m_lang_db; ///< Language database
ZRCola::keyseq_db m_ks_db; ///< Key sequence database
ZRCola::character_db m_chr_db; ///< Character database
ZRCola::chrcat_db m_cc_db; ///< Characted category database
ZRCola::chrcat_db m_cc_db; ///< Character category database
ZRCola::chrblk_db m_cb_db; ///< Character block database
ZRCola::chrtag_db m_ct_db; ///< Character tag database
ZRCola::tagname_db m_tn_db; ///< Tag name database
ZRCola::highlight_db m_h_db; ///< Highlight database

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -756,7 +756,32 @@ int _tmain(int argc, _TCHAR *argv[])
ZRCola::DBSource::character_desc_idx idxChrDsc, idxChrDscSub;
ZRCola::DBSource::character_bank chrs;
// Phase 1: Parse characters and build indexes.
// Phase 1: Get character blocks.
com_obj<ADORecordset> rs2;
if (src.SelectCharacterBlocks(rs2)) {
size_t count2 = src.GetRecordsetCount(rs2);
if (count2 < 0xffffffff) { // 4G check (-1 is reserved for error condition)
// Parse character blocks and build index and data.
for (; !ZRCola::DBSource::IsEOF(rs2); rs2->MoveNext()) {
// Read character block from the database.
ZRCola::DBSource::chrblk cb;
if (src.GetCharacterBlock(rs2, cb))
chrs.idxChrBlk[cb.first] = std::move(cb.second);
else
has_errors = true;
}
}
else {
_ftprintf(stderr, wxT("%s: error ZCC0029: Error getting character block count from database or too many character blocks.\n"), (LPCTSTR)filenameIn.c_str());
has_errors = true;
}
}
else {
_ftprintf(stderr, wxT("%s: error ZCC0028: Error getting character blocks from database. Please make sure the file is ZRCola.zrc compatible.\n"), (LPCTSTR)filenameIn.c_str());
has_errors = true;
}
// Phase 2: Parse characters and build indexes.
for (; !ZRCola::DBSource::IsEOF(rs); rs->MoveNext()) {
// Read character from the database.
ZRCola::DBSource::character chr;
@@ -766,33 +791,64 @@ int _tmain(int argc, _TCHAR *argv[])
has_errors = true;
}
// Phase 2: Build related character lists.
// Phase 3: Build related character lists.
chrs.build_related();
ZRCola::character_db db;
{
ZRCola::character_db db;
// Preallocate memory.
db.idxChr.reserve(count);
db.data .reserve(count*4);
// Preallocate memory.
db.idxChr.reserve(count);
db.data.reserve(count * 4);
// Phase 3: Parse characters and build index and data.
for (auto chr = chrs.cbegin(), chr_end = chrs.cend(); chr != chr_end; ++chr) {
// Add character to index and data.
db << *chr;
// Phase 4: Parse characters and build index and data.
for (auto chr = chrs.cbegin(), chr_end = chrs.cend(); chr != chr_end; ++chr) {
// Add character to index and data.
db << *chr;
// Add description (and keywords) to index.
idxChrDsc .add_keywords(chr->second.terms, chr->first, 0);
idxChrDscSub.add_keywords(chr->second.terms, chr->first, 3);
// Add description (and keywords) to index.
idxChrDsc.add_keywords(chr->second.terms, chr->first, 0);
idxChrDscSub.add_keywords(chr->second.terms, chr->first, 3);
// Mark category used.
categories_used.insert(chr->second.cat);
// Mark category used.
categories_used.insert(chr->second.cat);
}
// Write characters to file.
db.idxChr.sort();
idxChrDsc.save(db.idxDsc);
idxChrDscSub.save(db.idxDscSub);
dst << ZRCola::character_rec(db);
}
// Write characters to file.
db.idxChr.sort();
idxChrDsc .save(db.idxDsc );
idxChrDscSub.save(db.idxDscSub);
dst << ZRCola::character_rec(db);
{
ZRCola::chrblk_db db;
// Preallocate memory.
db.idxChrId.reserve(chrs.idxChrBlk.size());
db.idxRank.reserve(chrs.idxChrBlk.size());
db.data.reserve(chrs.idxChrBlk.size() * 16);
// Phase 5: Parse character blocks and build index and data.
for (auto& cb : chrs.idxChrBlk)
{
if (!cb.second.used) {
// Skip unused character blocks.
continue;
}
if (build_pot)
pot.insert(cb.second.name);
// Add character block to index and data.
db << cb;
}
// Write character blocks to file.
db.idxChrId.sort();
db.idxRank.sort();
dst << ZRCola::chrblk_rec(db);
}
} else {
_ftprintf(stderr, wxT("%s: error ZCC0017: Error getting character count from database or too many characters.\n"), (LPCTSTR)filenameIn.c_str());
has_errors = true;

View File

@@ -171,6 +171,12 @@ namespace ZRCola {
}
///
/// Character block ID
///
typedef uint16_t chrblkid_t;
///
/// Character Database
///
@@ -184,6 +190,7 @@ namespace ZRCola {
struct character {
public:
chrcatid_t cat; ///< Character category ID
chrblkid_t blk; ///< Character block ID
protected:
uint16_t chr_to; ///< Character end in \c data
@@ -202,6 +209,7 @@ namespace ZRCola {
/// \param[in] chr Character
/// \param[in] chr_len Number of UTF-16 characters in \p chr
/// \param[in] cat Category
/// \param[in] blk Unicode block
/// \param[in] desc Description
/// \param[in] desc_len Number of UTF-16 characters in \p desc
/// \param[in] rel Related characters list (zero delimited)
@@ -211,12 +219,14 @@ namespace ZRCola {
_In_opt_z_count_(chr_len) const char_t *chr = NULL,
_In_opt_ size_t chr_len = 0,
_In_opt_ chrcatid_t cat = chrcatid_t(),
_In_opt_ chrblkid_t blk = 0,
_In_opt_z_count_(desc_len) const char_t *desc = NULL,
_In_opt_ size_t desc_len = 0,
_In_opt_z_count_(rel_len) const char_t *rel = NULL,
_In_opt_ size_t rel_len = 0)
{
this->cat = cat;
this->blk = blk;
this->chr_to = static_cast<uint16_t>(chr_len);
if (chr && chr_len) memcpy(this->data, chr, sizeof(char_t)*chr_len);
this->desc_to = static_cast<uint16_t>(this->chr_to + desc_len);
@@ -330,6 +340,25 @@ namespace ZRCola {
return idxChr.find(*c, start) ? idxChr[start].cat : chrcatid_t();
}
///
/// Get character block
///
/// Builds a temporary variable-length \c character record from \p chr to use as a search key,
/// then looks it up in \c idxChr.
///
/// \param[in] chr  Character
/// \param[in] len  Number of UTF-16 characters in \p chr (must not exceed 0xffff)
///
/// \returns
/// - Character block if character found
/// - 0 otherwise
///
chrblkid_t GetCharBlk(_In_z_count_(len) const char_t *chr, _In_ const size_t len) const
{
    assert(len <= 0xffff);

    // The key is a character header followed by len UTF-16 units. Own the raw storage as
    // char[] so it is released with delete[], matching the new[] that allocated it
    // (releasing it through a unique_ptr<character> would pair new[] with scalar delete).
    // NOTE(review): the key's destructor is not invoked; this assumes character is
    // trivially destructible - confirm.
    std::unique_ptr<char[]> storage(new char[sizeof(character) + sizeof(char_t)*len]);
    character *key = new (storage.get()) character(chr, len);

    indexChr::size_type start;
    return idxChr.find(*key, start) ? idxChr[start].blk : 0;
}
///
/// Writes character database to a stream
///
@@ -650,6 +679,11 @@ namespace ZRCola {
/// Character category database
///
using chrcat_db = chrclass_db<chrcatid_t>;
///
/// Character block database
///
using chrblk_db = chrclass_db<chrblkid_t>;
};
#pragma warning(pop)

View File

@@ -16,6 +16,7 @@
namespace ZRCola {
typedef stdex::idrec::record<character_db, recordid_t, 0x524843 /*"CHR"*/, recordsize_t, ZRCOLA_RECORD_ALIGN> character_rec;
typedef stdex::idrec::record<chrcat_db, recordid_t, 0x544343 /*"CCT"*/, recordsize_t, ZRCOLA_RECORD_ALIGN> chrcat_rec;
typedef stdex::idrec::record<chrblk_db, recordid_t, 0x4c4243 /*"CBL"*/, recordsize_t, ZRCOLA_RECORD_ALIGN> chrblk_rec;
typedef stdex::idrec::record<highlight_db, recordid_t, 0x484748 /*"HGH"*/, recordsize_t, ZRCOLA_RECORD_ALIGN> highlight_rec;
typedef stdex::idrec::record<langchar_db, recordid_t, 0x432d4c /*"L-C"*/, recordsize_t, ZRCOLA_RECORD_ALIGN> langchar_rec;
typedef stdex::idrec::record<language_db, recordid_t, 0x474e4c /*"LNG"*/, recordsize_t, ZRCOLA_RECORD_ALIGN> language_rec;

Binary file not shown.

View File

@@ -10,6 +10,9 @@ msgstr ""
msgid "Albanian"
msgstr ""
msgid "Alphabetic Presentation Forms"
msgstr ""
msgid "Apostrophes 1"
msgstr ""
@@ -19,24 +22,42 @@ msgstr ""
msgid "Arabic"
msgstr ""
msgid "Armenian"
msgstr ""
msgid "Arrows"
msgstr ""
msgid "Belarusian"
msgstr ""
msgid "Block Elements"
msgstr ""
msgid "Bosnian Cyrillic"
msgstr ""
msgid "Bosnian Latinic"
msgstr ""
msgid "Box Drawing"
msgstr ""
msgid "C0 Controls and Basic Latin (Basic Latin)"
msgstr ""
msgid "C1 Controls and Latin-1 Supplement (Latin-1 Supplement)"
msgstr ""
msgid "CAPITAL Case"
msgstr ""
msgid "CAPITAL Case » small Case"
msgstr ""
msgid "CJK Symbols and Punctuation"
msgstr ""
msgid "Combine"
msgstr ""
@@ -49,6 +70,18 @@ msgstr ""
msgid "Combine Over"
msgstr ""
msgid "Combining Diacritical Marks"
msgstr ""
msgid "Combining Diacritical Marks Supplement"
msgstr ""
msgid "Combining Diacritical Marks for Symbols"
msgstr ""
msgid "Combining Half Marks"
msgstr ""
msgid "Combining Marks"
msgstr ""
@@ -58,6 +91,9 @@ msgstr ""
msgid "Currencies"
msgstr ""
msgid "Currency Symbols"
msgstr ""
msgid "Cyrillic"
msgstr ""
@@ -118,6 +154,12 @@ msgstr ""
msgid "Cyrillic BdC"
msgstr ""
msgid "Cyrillic Extended-B"
msgstr ""
msgid "Cyrillic Supplement"
msgstr ""
msgid "Cyrillic » Latin (GOST2000)"
msgstr ""
@@ -130,9 +172,15 @@ msgstr ""
msgid "Diacritics"
msgstr ""
msgid "Dingbats"
msgstr ""
msgid "Encircled Characters"
msgstr ""
msgid "Enclosed Alphanumerics"
msgstr ""
msgid "English"
msgstr ""
@@ -148,6 +196,12 @@ msgstr ""
msgid "Friulian"
msgstr ""
msgid "General Punctuation"
msgstr ""
msgid "Geometric Shapes"
msgstr ""
msgid "Geometrical Shapes"
msgstr ""
@@ -169,12 +223,24 @@ msgstr ""
msgid "Greek (Old)"
msgstr ""
msgid "Greek Extended"
msgstr ""
msgid "Greek and Coptic"
msgstr ""
msgid "Gujarati"
msgstr ""
msgid "Hebrew"
msgstr ""
msgid "Hungarian"
msgstr ""
msgid "IPA Extensions"
msgstr ""
msgid "Irish Gaelic"
msgstr ""
@@ -295,6 +361,21 @@ msgstr ""
msgid "Latin BdC"
msgstr ""
msgid "Latin Extended Additional"
msgstr ""
msgid "Latin Extended-A"
msgstr ""
msgid "Latin Extended-B"
msgstr ""
msgid "Latin Extended-C"
msgstr ""
msgid "Latin Extended-D"
msgstr ""
msgid "Latin » Cyrillic (Belarusian)"
msgstr ""
@@ -433,6 +514,9 @@ msgstr ""
msgid "Letter, Uppercase"
msgstr ""
msgid "Letterlike Symbols"
msgstr ""
msgid "Ligatures"
msgstr ""
@@ -457,9 +541,27 @@ msgstr ""
msgid "Mathematical And Physical Symbols"
msgstr ""
msgid "Mathematical Operators"
msgstr ""
msgid "Metric"
msgstr ""
msgid "Miscellaneous Mathematical Symbols-A"
msgstr ""
msgid "Miscellaneous Mathematical Symbols-B"
msgstr ""
msgid "Miscellaneous Symbols"
msgstr ""
msgid "Miscellaneous Symbols and Arrows"
msgstr ""
msgid "Miscellaneous Technical"
msgstr ""
msgid "Modified"
msgstr ""
@@ -505,6 +607,9 @@ msgstr ""
msgid "Number 9"
msgstr ""
msgid "Number Forms"
msgstr ""
msgid "Number, Decimal Digit"
msgstr ""
@@ -532,12 +637,21 @@ msgstr ""
msgid "Parentheses"
msgstr ""
msgid "Phonetic Extensions"
msgstr ""
msgid "Phonetic Extensions Supplement"
msgstr ""
msgid "Polish"
msgstr ""
msgid "Portuguese"
msgstr ""
msgid "Private Use Area"
msgstr ""
msgid "Punctuation"
msgstr ""
@@ -607,12 +721,18 @@ msgstr ""
msgid "Spaces"
msgstr ""
msgid "Spacing Modifier Letters"
msgstr ""
msgid "Spanish"
msgstr ""
msgid "Special Characters"
msgstr ""
msgid "Specials"
msgstr ""
msgid "Strokes"
msgstr ""
@@ -622,6 +742,18 @@ msgstr ""
msgid "Superscript Sharacters"
msgstr ""
msgid "Superscripts and Subscripts"
msgstr ""
msgid "Supplemental Arrows-A"
msgstr ""
msgid "Supplemental Arrows-B"
msgstr ""
msgid "Supplemental Punctuation"
msgstr ""
msgid "Surrounded"
msgstr ""
@@ -676,6 +808,9 @@ msgstr ""
msgid "Symbol, Other"
msgstr ""
msgid "Syriac"
msgstr ""
msgid "Technical Characters"
msgstr ""
@@ -694,6 +829,9 @@ msgstr ""
msgid "Ukrainian"
msgstr ""
msgid "Unified Canadian Aboriginal Syllabics"
msgstr ""
msgid "Units"
msgstr ""