diff --git a/ZRColaCompile/dbsource.cpp b/ZRColaCompile/dbsource.cpp index 3fc473c..17ab538 100644 --- a/ZRColaCompile/dbsource.cpp +++ b/ZRColaCompile/dbsource.cpp @@ -35,6 +35,17 @@ ZRCola::DBSource::character_bank::character_bank() : vector al(m_heap); + basic_string, heap_allocator > rel(al); + set, heap_allocator > matching(less(), al); for (size_type i = m_from; i < m_to; i++) { ZRCola::DBSource::character &chr = *(m_cb->at(i).get()); @@ -89,19 +110,34 @@ unsigned int ZRCola::DBSource::character_bank::build_related_worker::process() // Add all characters that share enought keywords. for (size_type j = 0, j_end = m_cb->size(); j < j_end; j++) { - if (i == j) continue; + if (i == j || rel.find((wchar_t)j) != wstring::npos) + continue; const ZRCola::DBSource::character &chr2 = *(m_cb->at(j).get()); - if (&chr2 == NULL) continue; + if (&chr2 == NULL) + continue; - std::list::size_type matching = 0; - for (std::list::const_iterator term = chr.terms.cbegin(), term_end = chr.terms.cend(); term != term_end; ++term) - for (std::list::const_iterator term2 = chr2.terms.cbegin(), term2_end = chr2.terms.cend(); term2 != term2_end; ++term2) + set::size_type comparisons = 0; + matching.clear(); + for (set::const_iterator term = chr.terms.cbegin(), term_end = chr.terms.cend(); term != term_end; ++term) { + // Test for ignored word(s). + if (m_cb->m_ignore.find(*term) != m_cb->m_ignore.cend()) + continue; + for (set::const_iterator term2 = chr2.terms.cbegin(), term2_end = chr2.terms.cend(); term2 != term2_end; ++term2) { + // Test for ignored word(s). + if (m_cb->m_ignore.find(*term2) != m_cb->m_ignore.cend()) + continue; + comparisons++; if (*term == *term2) - matching++; + matching.insert(*term); + } + } - // If 7/8 terms match, assume related. - if (matching*8 > std::min::size_type>(chr.terms.size(), chr2.terms.size())*7) - rel += chr2.chr; + if (comparisons) { + // If 1/2 terms match, assume related. + set::size_type hits = matching.size(); + if (hits*hits*2 >= comparisons) + rel += chr2.chr; + } } chr.rel.assign(rel.c_str(), rel.length()); @@ -121,7 +157,7 @@ unsigned int __stdcall ZRCola::DBSource::character_bank::build_related_worker::p // ZRCola::DBSource::character_desc_idx ////////////////////////////////////////////////////////////////////////// -void ZRCola::DBSource::character_desc_idx::parse_keywords(const wchar_t *str, list &terms) +void ZRCola::DBSource::character_desc_idx::parse_keywords(const wchar_t *str, set &terms) { wxASSERT_MSG(str, wxT("string is NULL")); @@ -161,15 +197,15 @@ void ZRCola::DBSource::character_desc_idx::parse_keywords(const wchar_t *str, li if (!term.empty()) { transform(term.begin(), term.end(), term.begin(), towlower); - terms.push_back(term); + terms.insert(term); } } } -void ZRCola::DBSource::character_desc_idx::add_keywords(const std::list &terms, wchar_t chr, size_t sub) +void ZRCola::DBSource::character_desc_idx::add_keywords(const set &terms, wchar_t chr, size_t sub) { - for (list::const_iterator term = terms.cbegin(), term_end = terms.cend(); term != term_end; ++term) { + for (set::const_iterator term = terms.cbegin(), term_end = terms.cend(); term != term_end; ++term) { if (sub) { wstring::size_type j_end = term->size(); if (j_end >= sub) { diff --git a/ZRColaCompile/dbsource.h b/ZRColaCompile/dbsource.h index 1eb658e..e913ae6 100644 --- a/ZRColaCompile/dbsource.h +++ b/ZRColaCompile/dbsource.h @@ -24,9 +24,9 @@ #include #include -#include #include #include +#include #include #include @@ -155,7 +155,7 @@ namespace ZRCola { wchar_t chr; ///< Character ZRCola::chrcatid_t cat; ///< Category ID std::wstring desc; ///< Character description - std::list terms; ///< Search terms + std::set terms; ///< Search terms std::wstring rel; ///< Related characters }; @@ -176,7 +176,8 @@ namespace ZRCola { typedef std::unique_ptr > thread_type; public: - build_related_worker(_In_ character_bank *cb, _In_ size_type from, _In_ size_type to); + build_related_worker(_In_ const character_bank *cb, _In_ size_type from, _In_ size_type to); + virtual ~build_related_worker(); inline void join() { @@ -197,9 +198,13 @@ namespace ZRCola { static unsigned int __stdcall process(_In_ void *param); protected: - character_bank *m_cb; + const character_bank *m_cb; size_type m_from, m_to; + HANDLE m_heap; }; + + protected: + std::set m_ignore; }; @@ -229,11 +234,11 @@ namespace ZRCola { class character_desc_idx : public std::map, character_desc_idx_less> { public: - static void parse_keywords(const wchar_t *str, std::list &terms); - void add_keywords(const std::list &terms, wchar_t chr, size_t sub = 0); + static void parse_keywords(const wchar_t *str, std::set &terms); + void add_keywords(const std::set &terms, wchar_t chr, size_t sub = 0); inline void add_keywords(const wchar_t *str, wchar_t chr, size_t sub = 0) { - std::list terms; + std::set terms; parse_keywords(str, terms); add_keywords(terms, chr, sub); } diff --git a/lib/stdex b/lib/stdex index 0ed2d0a..b091bc9 160000 --- a/lib/stdex +++ b/lib/stdex @@ -1 +1 @@ -Subproject commit 0ed2d0a53dd3f8ee72fce0ba324aaeb86bf4f4d9 +Subproject commit b091bc90bc7445fc117d5818ffd9b5314721fc4c diff --git a/output/data/ZRCola.zrcdb b/output/data/ZRCola.zrcdb index c474f8e..a9be16e 100644 Binary files a/output/data/ZRCola.zrcdb and b/output/data/ZRCola.zrcdb differ