Related character detection is smarter and more accurate now

This commit is contained in:
Simon Rozman 2016-05-30 12:11:16 +02:00
parent beabeb17aa
commit d7968f539c
4 changed files with 64 additions and 23 deletions

View File

@ -35,6 +35,17 @@ ZRCola::DBSource::character_bank::character_bank() : vector<unique_ptr<ZRCola::D
void ZRCola::DBSource::character_bank::build_related()
{
// Initialize ignore list.
m_ignore.insert(L"letter");
m_ignore.insert(L"modifier");
m_ignore.insert(L"symbol");
m_ignore.insert(L"accent");
m_ignore.insert(L"with");
m_ignore.insert(L"and");
m_ignore.insert(L"capital");
m_ignore.insert(L"small");
m_ignore.insert(L"combining");
SYSTEM_INFO si;
GetSystemInfo(&si);
@ -59,8 +70,9 @@ void ZRCola::DBSource::character_bank::build_related()
}
ZRCola::DBSource::character_bank::build_related_worker::build_related_worker(_In_ character_bank *cb, _In_ size_type from, _In_ size_type to) :
ZRCola::DBSource::character_bank::build_related_worker::build_related_worker(_In_ const character_bank *cb, _In_ size_type from, _In_ size_type to) :
thread_type((HANDLE)_beginthreadex(NULL, 0, process, this, CREATE_SUSPENDED, NULL)),
m_heap(HeapCreate(0, 0, 0)),
m_cb(cb),
m_from(from),
m_to(to)
@ -70,9 +82,18 @@ ZRCola::DBSource::character_bank::build_related_worker::build_related_worker(_In
}
// Releases the private heap that the constructor created with HeapCreate().
// process() binds its scratch string and set to allocators backed by this heap.
ZRCola::DBSource::character_bank::build_related_worker::~build_related_worker()
{
// Sanity check that the heap handle is non-NULL.
// NOTE(review): assert() is compiled out when NDEBUG is defined, so a failed
// HeapCreate() would pass a NULL handle to HeapDestroy() - confirm this is acceptable.
assert(m_heap);
HeapDestroy(m_heap);
}
unsigned int ZRCola::DBSource::character_bank::build_related_worker::process()
{
wstring rel;
heap_allocator<wchar_t> al(m_heap);
basic_string<wchar_t, char_traits<wchar_t>, heap_allocator<wchar_t> > rel(al);
set<wstring, less<wstring>, heap_allocator<wstring> > matching(less<wstring>(), al);
for (size_type i = m_from; i < m_to; i++) {
ZRCola::DBSource::character &chr = *(m_cb->at(i).get());
@ -89,19 +110,34 @@ unsigned int ZRCola::DBSource::character_bank::build_related_worker::process()
// Add all characters that share enough keywords.
for (size_type j = 0, j_end = m_cb->size(); j < j_end; j++) {
if (i == j) continue;
if (i == j || rel.find((wchar_t)j) != wstring::npos)
continue;
const ZRCola::DBSource::character &chr2 = *(m_cb->at(j).get());
if (&chr2 == NULL) continue;
if (&chr2 == NULL)
continue;
std::list<std::wstring>::size_type matching = 0;
for (std::list<std::wstring>::const_iterator term = chr.terms.cbegin(), term_end = chr.terms.cend(); term != term_end; ++term)
for (std::list<std::wstring>::const_iterator term2 = chr2.terms.cbegin(), term2_end = chr2.terms.cend(); term2 != term2_end; ++term2)
set<wstring>::size_type comparisons = 0;
matching.clear();
for (set<wstring>::const_iterator term = chr.terms.cbegin(), term_end = chr.terms.cend(); term != term_end; ++term) {
// Test for ignored word(s).
if (m_cb->m_ignore.find(*term) != m_cb->m_ignore.cend())
continue;
for (set<wstring>::const_iterator term2 = chr2.terms.cbegin(), term2_end = chr2.terms.cend(); term2 != term2_end; ++term2) {
// Test for ignored word(s).
if (m_cb->m_ignore.find(*term2) != m_cb->m_ignore.cend())
continue;
comparisons++;
if (*term == *term2)
matching++;
matching.insert(*term);
}
}
// If 7/8 terms match, assume related.
if (matching*8 > std::min<std::list<std::wstring>::size_type>(chr.terms.size(), chr2.terms.size())*7)
rel += chr2.chr;
if (comparisons) {
// If the squared number of matching terms is at least 1/2 of all term-pair comparisons, assume related.
set<wstring>::size_type hits = matching.size();
if (hits*hits*2 >= comparisons)
rel += chr2.chr;
}
}
chr.rel.assign(rel.c_str(), rel.length());
@ -121,7 +157,7 @@ unsigned int __stdcall ZRCola::DBSource::character_bank::build_related_worker::p
// ZRCola::DBSource::character_desc_idx
//////////////////////////////////////////////////////////////////////////
void ZRCola::DBSource::character_desc_idx::parse_keywords(const wchar_t *str, list<wstring> &terms)
void ZRCola::DBSource::character_desc_idx::parse_keywords(const wchar_t *str, set<wstring> &terms)
{
wxASSERT_MSG(str, wxT("string is NULL"));
@ -161,15 +197,15 @@ void ZRCola::DBSource::character_desc_idx::parse_keywords(const wchar_t *str, li
if (!term.empty()) {
transform(term.begin(), term.end(), term.begin(), towlower);
terms.push_back(term);
terms.insert(term);
}
}
}
void ZRCola::DBSource::character_desc_idx::add_keywords(const std::list<std::wstring> &terms, wchar_t chr, size_t sub)
void ZRCola::DBSource::character_desc_idx::add_keywords(const set<wstring> &terms, wchar_t chr, size_t sub)
{
for (list<wstring>::const_iterator term = terms.cbegin(), term_end = terms.cend(); term != term_end; ++term) {
for (set<wstring>::const_iterator term = terms.cbegin(), term_end = terms.cend(); term != term_end; ++term) {
if (sub) {
wstring::size_type j_end = term->size();
if (j_end >= sub) {

View File

@ -24,9 +24,9 @@
#include <atlbase.h>
#include <adoint.h>
#include <list>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <vector>
@ -155,7 +155,7 @@ namespace ZRCola {
wchar_t chr; ///< Character
ZRCola::chrcatid_t cat; ///< Category ID
std::wstring desc; ///< Character description
std::list<std::wstring> terms; ///< Search terms
std::set<std::wstring> terms; ///< Search terms
std::wstring rel; ///< Related characters
};
@ -176,7 +176,8 @@ namespace ZRCola {
typedef std::unique_ptr<void, stdex::CloseHandle_delete<void> > thread_type;
public:
build_related_worker(_In_ character_bank *cb, _In_ size_type from, _In_ size_type to);
build_related_worker(_In_ const character_bank *cb, _In_ size_type from, _In_ size_type to);
virtual ~build_related_worker();
inline void join()
{
@ -197,9 +198,13 @@ namespace ZRCola {
static unsigned int __stdcall process(_In_ void *param);
protected:
character_bank *m_cb;
const character_bank *m_cb;
size_type m_from, m_to;
HANDLE m_heap;
};
protected:
std::set<std::wstring> m_ignore;
};
@ -229,11 +234,11 @@ namespace ZRCola {
class character_desc_idx : public std::map<std::wstring, std::vector<wchar_t>, character_desc_idx_less>
{
public:
static void parse_keywords(const wchar_t *str, std::list<std::wstring> &terms);
void add_keywords(const std::list<std::wstring> &terms, wchar_t chr, size_t sub = 0);
static void parse_keywords(const wchar_t *str, std::set<std::wstring> &terms);
void add_keywords(const std::set<std::wstring> &terms, wchar_t chr, size_t sub = 0);
inline void add_keywords(const wchar_t *str, wchar_t chr, size_t sub = 0)
{
std::list<std::wstring> terms;
std::set<std::wstring> terms;
parse_keywords(str, terms);
add_keywords(terms, chr, sub);
}

@ -1 +1 @@
Subproject commit 0ed2d0a53dd3f8ee72fce0ba324aaeb86bf4f4d9
Subproject commit b091bc90bc7445fc117d5818ffd9b5314721fc4c

Binary file not shown.