diff --git a/ZRColaCompile/dbsource.cpp b/ZRColaCompile/dbsource.cpp index e47244e..339c63c 100644 --- a/ZRColaCompile/dbsource.cpp +++ b/ZRColaCompile/dbsource.cpp @@ -22,7 +22,7 @@ using namespace std; -bool ZRCola::DBSource::character_desc_idx::add_keywords(const wchar_t *str, wchar_t chr, size_t sub) +void ZRCola::DBSource::character_desc_idx::parse_keywords(const wchar_t *str, list &terms) { wxASSERT_MSG(str, wxT("string is NULL")); @@ -30,7 +30,7 @@ bool ZRCola::DBSource::character_desc_idx::add_keywords(const wchar_t *str, wcha // Skip white space. for (;;) { if (*str == 0) - return true; + return; else if (!iswspace(*str)) break; else @@ -62,23 +62,29 @@ bool ZRCola::DBSource::character_desc_idx::add_keywords(const wchar_t *str, wcha if (!term.empty()) { transform(term.begin(), term.end(), term.begin(), towlower); - if (sub) { - wstring::size_type j_end = term.size(); - if (j_end >= sub) { - // Insert all keyword substrings "sub" or more characters long. - for (wstring::size_type i = 0, i_end = j_end - sub; i <= i_end; ++i) { - for (wstring::size_type j = i + sub; j <= j_end; ++j) - add_keyword(term.substr(i, j - i), chr); - } - } - } else { - // Insert exact keyword only. - add_keyword(term, chr); - } + terms.push_back(term); } } +} - return true; + +void ZRCola::DBSource::character_desc_idx::add_keywords(const std::list &terms, wchar_t chr, size_t sub) +{ + for (list::const_iterator term = terms.cbegin(), term_end = terms.cend(); term != term_end; ++term) { + if (sub) { + wstring::size_type j_end = term->size(); + if (j_end >= sub) { + // Insert all keyword substrings "sub" or more characters long. + for (wstring::size_type i = 0, i_end = j_end - sub; i <= i_end; ++i) { + for (wstring::size_type j = i + sub; j <= j_end; ++j) + add_keyword(term->substr(i, j - i), chr); + } + } + } else { + // Insert exact keyword only. 
+ add_keyword(*term, chr); + } + } } @@ -772,12 +778,15 @@ bool ZRCola::DBSource::GetCharacter(const ATL::CComPtr& rs, charac wxVERIFY(SUCCEEDED(flds->get_Item(ATL::CComVariant(L"opis_en"), &f))); wxCHECK(GetValue(f, chr.desc), false); } + ZRCola::DBSource::character_desc_idx::parse_keywords(chr.desc.c_str(), chr.terms); + wstring keywords; { ATL::CComPtr f; wxVERIFY(SUCCEEDED(flds->get_Item(ATL::CComVariant(L"klj_bes_en"), &f))); - wxCHECK(GetValue(f, chr.keywords), false); + wxCHECK(GetValue(f, keywords), false); } + ZRCola::DBSource::character_desc_idx::parse_keywords(keywords.c_str(), chr.terms); { ATL::CComPtr f; diff --git a/ZRColaCompile/dbsource.h b/ZRColaCompile/dbsource.h index 8771837..bb66b0f 100644 --- a/ZRColaCompile/dbsource.h +++ b/ZRColaCompile/dbsource.h @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -119,11 +120,42 @@ namespace ZRCola { /// class character { public: - wchar_t chr; ///< Character - ZRCola::chrcatid_t cat; ///> Category ID - std::wstring desc; ///< Character description - std::wstring keywords; ///< Additional keywords - std::wstring rel; ///< Related characters + inline character() + { + chr = 0; + cat.data[0] = 0; + cat.data[1] = 0; + } + + inline character(_In_ const character &othr) : + chr (othr.chr), + cat (othr.cat), + desc (othr.desc), + terms(othr.terms), + rel (othr.rel) + { + } + + inline bool operator==(_In_ const character &othr) const + { + return + chr == othr.chr && + cat == othr.cat && + desc == othr.desc && + terms == othr.terms && + rel == othr.rel; + } + + inline bool operator!=(_In_ const character &othr) const + { + return !operator==(othr); + } + + wchar_t chr; ///< Character + ZRCola::chrcatid_t cat; ///< Category ID + std::wstring desc; ///< Character description + std::list terms; ///< Search terms + std::wstring rel; ///< Related characters }; @@ -153,7 +185,14 @@ namespace ZRCola { class character_desc_idx : public std::map, character_desc_idx_less> { public: - bool 
add_keywords(const wchar_t *str, wchar_t chr, size_t sub = 0); + static void parse_keywords(const wchar_t *str, std::list &terms); + void add_keywords(const std::list &terms, wchar_t chr, size_t sub = 0); + inline void add_keywords(const wchar_t *str, wchar_t chr, size_t sub = 0) + { + std::list terms; + parse_keywords(str, terms); + add_keywords(terms, chr, sub); + } void save(ZRCola::textindex &idx) const; diff --git a/ZRColaCompile/main.cpp b/ZRColaCompile/main.cpp index c0a3387..4ae53d8 100644 --- a/ZRColaCompile/main.cpp +++ b/ZRColaCompile/main.cpp @@ -393,47 +393,70 @@ int _tmain(int argc, _TCHAR *argv[]) if (src.SelectCharacters(rs)) { size_t count = src.GetRecordsetCount(rs); if (count < 0xffffffff) { // 4G check (-1 is reserved for error condition) - ZRCola::DBSource::character chr; - ZRCola::character_db db; ZRCola::DBSource::character_desc_idx idxChrDsc, idxChrDscSub; + vector > chrs; + chrs.resize(0x10000); + + // Phase 1: Parse characters and build indexes. + while (!ZRCola::DBSource::IsEOF(rs)) { + // Read character from the database. + unique_ptr c(new ZRCola::DBSource::character); + if (src.GetCharacter(rs, *c)) { + const ZRCola::DBSource::character &chr = *c.get(); + chrs[chr.chr].swap(c); + + // Add description (and keywords) to index. + idxChrDsc .add_keywords(chr.terms, chr.chr, 0); + idxChrDscSub.add_keywords(chr.terms, chr.chr, 3); + } else + has_errors = true; + + wxVERIFY(SUCCEEDED(rs->MoveNext())); + } + + // Phase 2: Build related character lists. + for (size_t i = 0, i_end = chrs.size(); i < i_end; i++) { + if (!chrs[i]) continue; // Skip unused slots; test the owning pointer before dereferencing it. + ZRCola::DBSource::character &chr = *chrs[i]; + + // Remove all nonexistent or inactive related characters. + for (wstring::size_type i = chr.rel.length(); i--;) { + if (!chrs[chr.rel[i]]) + chr.rel.erase(i, 1); + } + } + + ZRCola::character_db db; + // Preallocate memory. db.idxChr.reserve(count); db.data .reserve(count*4); - // Parse characters and build index and data. 
- while (!ZRCola::DBSource::IsEOF(rs)) { - // Read character from the database. - if (src.GetCharacter(rs, chr)) { - // Add character to index and data. - unsigned __int32 idx = db.data.size(); - db.data.push_back((unsigned __int16)chr.chr); - for (wstring::size_type i = 0; i < sizeof(ZRCola::chrcatid_t)/sizeof(unsigned __int16); i++) - db.data.push_back(((const unsigned __int16*)chr.cat.data)[i]); - wstring::size_type n_desc = chr.desc.length(); - wxASSERT_MSG(n_desc <= 0xffff, wxT("character description too long")); - db.data.push_back((unsigned __int16)n_desc); - wstring::size_type n_rel = chr.rel.length(); - wxASSERT_MSG(n_rel <= 0xffff, wxT("too many related characters")); - db.data.push_back((unsigned __int16)n_rel); - for (wstring::size_type i = 0; i < n_desc; i++) - db.data.push_back(chr.desc[i]); - for (wstring::size_type i = 0; i < n_rel; i++) - db.data.push_back(chr.rel[i]); - db.idxChr.push_back(idx); + // Phase 3: Parse characters and build index and data. + for (size_t i = 0, i_end = chrs.size(); i < i_end; i++) { + if (!chrs[i]) continue; // Skip unused slots; test the owning pointer before dereferencing it. + const ZRCola::DBSource::character &chr = *chrs[i]; - // Add description (and keywords) to index. - idxChrDsc .add_keywords(chr.desc .c_str(), chr.chr, 0); - idxChrDsc .add_keywords(chr.keywords.c_str(), chr.chr, 0); - idxChrDscSub.add_keywords(chr.desc .c_str(), chr.chr, 3); - idxChrDscSub.add_keywords(chr.keywords.c_str(), chr.chr, 3); + // Add character to index and data. 
+ unsigned __int32 idx = db.data.size(); + db.data.push_back((unsigned __int16)chr.chr); + for (wstring::size_type i = 0; i < sizeof(ZRCola::chrcatid_t)/sizeof(unsigned __int16); i++) + db.data.push_back(((const unsigned __int16*)chr.cat.data)[i]); + wstring::size_type n_desc = chr.desc.length(); + wxASSERT_MSG(n_desc <= 0xffff, wxT("character description too long")); + db.data.push_back((unsigned __int16)n_desc); + wstring::size_type n_rel = chr.rel.length(); + wxASSERT_MSG(n_rel <= 0xffff, wxT("too many related characters")); + db.data.push_back((unsigned __int16)n_rel); + for (wstring::size_type i = 0; i < n_desc; i++) + db.data.push_back(chr.desc[i]); + for (wstring::size_type i = 0; i < n_rel; i++) + db.data.push_back(chr.rel[i]); + db.idxChr.push_back(idx); - // Mark category used. - categories_used.insert(chr.cat); - } else - has_errors = true; - - wxVERIFY(SUCCEEDED(rs->MoveNext())); + // Mark category used. + categories_used.insert(chr.cat); } // Sort indices. diff --git a/ZRColaCompile/stdafx.h b/ZRColaCompile/stdafx.h index 9d54a3a..58d3486 100644 --- a/ZRColaCompile/stdafx.h +++ b/ZRColaCompile/stdafx.h @@ -52,4 +52,6 @@ #include #include #include +#include #include +#include diff --git a/output/data/ZRCola.zrcdb b/output/data/ZRCola.zrcdb index 538d2a7..477e5d3 100644 Binary files a/output/data/ZRCola.zrcdb and b/output/data/ZRCola.zrcdb differ