Related character detection is smarter and more accurate now
This commit is contained in:
parent
beabeb17aa
commit
d7968f539c
@ -35,6 +35,17 @@ ZRCola::DBSource::character_bank::character_bank() : vector<unique_ptr<ZRCola::D
|
|||||||
|
|
||||||
void ZRCola::DBSource::character_bank::build_related()
|
void ZRCola::DBSource::character_bank::build_related()
|
||||||
{
|
{
|
||||||
|
// Initialize ignore list.
|
||||||
|
m_ignore.insert(L"letter");
|
||||||
|
m_ignore.insert(L"modifier");
|
||||||
|
m_ignore.insert(L"symbol");
|
||||||
|
m_ignore.insert(L"accent");
|
||||||
|
m_ignore.insert(L"with");
|
||||||
|
m_ignore.insert(L"and");
|
||||||
|
m_ignore.insert(L"capital");
|
||||||
|
m_ignore.insert(L"small");
|
||||||
|
m_ignore.insert(L"combining");
|
||||||
|
|
||||||
SYSTEM_INFO si;
|
SYSTEM_INFO si;
|
||||||
GetSystemInfo(&si);
|
GetSystemInfo(&si);
|
||||||
|
|
||||||
@ -59,8 +70,9 @@ void ZRCola::DBSource::character_bank::build_related()
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
ZRCola::DBSource::character_bank::build_related_worker::build_related_worker(_In_ character_bank *cb, _In_ size_type from, _In_ size_type to) :
|
ZRCola::DBSource::character_bank::build_related_worker::build_related_worker(_In_ const character_bank *cb, _In_ size_type from, _In_ size_type to) :
|
||||||
thread_type((HANDLE)_beginthreadex(NULL, 0, process, this, CREATE_SUSPENDED, NULL)),
|
thread_type((HANDLE)_beginthreadex(NULL, 0, process, this, CREATE_SUSPENDED, NULL)),
|
||||||
|
m_heap(HeapCreate(0, 0, 0)),
|
||||||
m_cb(cb),
|
m_cb(cb),
|
||||||
m_from(from),
|
m_from(from),
|
||||||
m_to(to)
|
m_to(to)
|
||||||
@ -70,9 +82,18 @@ ZRCola::DBSource::character_bank::build_related_worker::build_related_worker(_In
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Destructor: destroys the private heap that the constructor created with HeapCreate().
ZRCola::DBSource::character_bank::build_related_worker::~build_related_worker()
|
||||||
|
{
|
||||||
|
// Sanity check: the heap handle is expected to be non-NULL here.
assert(m_heap);
|
||||||
|
// Destroy the heap referenced by m_heap.
HeapDestroy(m_heap);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
unsigned int ZRCola::DBSource::character_bank::build_related_worker::process()
|
unsigned int ZRCola::DBSource::character_bank::build_related_worker::process()
|
||||||
{
|
{
|
||||||
wstring rel;
|
heap_allocator<wchar_t> al(m_heap);
|
||||||
|
basic_string<wchar_t, char_traits<wchar_t>, heap_allocator<wchar_t> > rel(al);
|
||||||
|
set<wstring, less<wstring>, heap_allocator<wstring> > matching(less<wstring>(), al);
|
||||||
|
|
||||||
for (size_type i = m_from; i < m_to; i++) {
|
for (size_type i = m_from; i < m_to; i++) {
|
||||||
ZRCola::DBSource::character &chr = *(m_cb->at(i).get());
|
ZRCola::DBSource::character &chr = *(m_cb->at(i).get());
|
||||||
@ -89,19 +110,34 @@ unsigned int ZRCola::DBSource::character_bank::build_related_worker::process()
|
|||||||
|
|
||||||
// Add all characters that share enough keywords.
|
// Add all characters that share enough keywords.
|
||||||
for (size_type j = 0, j_end = m_cb->size(); j < j_end; j++) {
|
for (size_type j = 0, j_end = m_cb->size(); j < j_end; j++) {
|
||||||
if (i == j) continue;
|
if (i == j || rel.find((wchar_t)j) != wstring::npos)
|
||||||
|
continue;
|
||||||
const ZRCola::DBSource::character &chr2 = *(m_cb->at(j).get());
|
const ZRCola::DBSource::character &chr2 = *(m_cb->at(j).get());
|
||||||
if (&chr2 == NULL) continue;
|
if (&chr2 == NULL)
|
||||||
|
continue;
|
||||||
|
|
||||||
std::list<std::wstring>::size_type matching = 0;
|
set<wstring>::size_type comparisons = 0;
|
||||||
for (std::list<std::wstring>::const_iterator term = chr.terms.cbegin(), term_end = chr.terms.cend(); term != term_end; ++term)
|
matching.clear();
|
||||||
for (std::list<std::wstring>::const_iterator term2 = chr2.terms.cbegin(), term2_end = chr2.terms.cend(); term2 != term2_end; ++term2)
|
for (set<wstring>::const_iterator term = chr.terms.cbegin(), term_end = chr.terms.cend(); term != term_end; ++term) {
|
||||||
|
// Test for ignored word(s).
|
||||||
|
if (m_cb->m_ignore.find(*term) != m_cb->m_ignore.cend())
|
||||||
|
continue;
|
||||||
|
for (set<wstring>::const_iterator term2 = chr2.terms.cbegin(), term2_end = chr2.terms.cend(); term2 != term2_end; ++term2) {
|
||||||
|
// Test for ignored word(s).
|
||||||
|
if (m_cb->m_ignore.find(*term2) != m_cb->m_ignore.cend())
|
||||||
|
continue;
|
||||||
|
comparisons++;
|
||||||
if (*term == *term2)
|
if (*term == *term2)
|
||||||
matching++;
|
matching.insert(*term);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// If 7/8 terms match, assume related.
|
if (comparisons) {
|
||||||
if (matching*8 > std::min<std::list<std::wstring>::size_type>(chr.terms.size(), chr2.terms.size())*7)
|
// If the number of matching terms squared is at least 1/2 of the term-pair comparisons, assume related.
|
||||||
rel += chr2.chr;
|
set<wstring>::size_type hits = matching.size();
|
||||||
|
if (hits*hits*2 >= comparisons)
|
||||||
|
rel += chr2.chr;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
chr.rel.assign(rel.c_str(), rel.length());
|
chr.rel.assign(rel.c_str(), rel.length());
|
||||||
@ -121,7 +157,7 @@ unsigned int __stdcall ZRCola::DBSource::character_bank::build_related_worker::p
|
|||||||
// ZRCola::DBSource::character_desc_idx
|
// ZRCola::DBSource::character_desc_idx
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
void ZRCola::DBSource::character_desc_idx::parse_keywords(const wchar_t *str, list<wstring> &terms)
|
void ZRCola::DBSource::character_desc_idx::parse_keywords(const wchar_t *str, set<wstring> &terms)
|
||||||
{
|
{
|
||||||
wxASSERT_MSG(str, wxT("string is NULL"));
|
wxASSERT_MSG(str, wxT("string is NULL"));
|
||||||
|
|
||||||
@ -161,15 +197,15 @@ void ZRCola::DBSource::character_desc_idx::parse_keywords(const wchar_t *str, li
|
|||||||
|
|
||||||
if (!term.empty()) {
|
if (!term.empty()) {
|
||||||
transform(term.begin(), term.end(), term.begin(), towlower);
|
transform(term.begin(), term.end(), term.begin(), towlower);
|
||||||
terms.push_back(term);
|
terms.insert(term);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void ZRCola::DBSource::character_desc_idx::add_keywords(const std::list<std::wstring> &terms, wchar_t chr, size_t sub)
|
void ZRCola::DBSource::character_desc_idx::add_keywords(const set<wstring> &terms, wchar_t chr, size_t sub)
|
||||||
{
|
{
|
||||||
for (list<wstring>::const_iterator term = terms.cbegin(), term_end = terms.cend(); term != term_end; ++term) {
|
for (set<wstring>::const_iterator term = terms.cbegin(), term_end = terms.cend(); term != term_end; ++term) {
|
||||||
if (sub) {
|
if (sub) {
|
||||||
wstring::size_type j_end = term->size();
|
wstring::size_type j_end = term->size();
|
||||||
if (j_end >= sub) {
|
if (j_end >= sub) {
|
||||||
|
@ -24,9 +24,9 @@
|
|||||||
|
|
||||||
#include <atlbase.h>
|
#include <atlbase.h>
|
||||||
#include <adoint.h>
|
#include <adoint.h>
|
||||||
#include <list>
|
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
#include <set>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
@ -155,7 +155,7 @@ namespace ZRCola {
|
|||||||
wchar_t chr; ///< Character
|
wchar_t chr; ///< Character
|
||||||
ZRCola::chrcatid_t cat; ///< Category ID
|
ZRCola::chrcatid_t cat; ///< Category ID
|
||||||
std::wstring desc; ///< Character description
|
std::wstring desc; ///< Character description
|
||||||
std::list<std::wstring> terms; ///< Search terms
|
std::set<std::wstring> terms; ///< Search terms
|
||||||
std::wstring rel; ///< Related characters
|
std::wstring rel; ///< Related characters
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -176,7 +176,8 @@ namespace ZRCola {
|
|||||||
typedef std::unique_ptr<void, stdex::CloseHandle_delete<void> > thread_type;
|
typedef std::unique_ptr<void, stdex::CloseHandle_delete<void> > thread_type;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
build_related_worker(_In_ character_bank *cb, _In_ size_type from, _In_ size_type to);
|
build_related_worker(_In_ const character_bank *cb, _In_ size_type from, _In_ size_type to);
|
||||||
|
virtual ~build_related_worker();
|
||||||
|
|
||||||
inline void join()
|
inline void join()
|
||||||
{
|
{
|
||||||
@ -197,9 +198,13 @@ namespace ZRCola {
|
|||||||
static unsigned int __stdcall process(_In_ void *param);
|
static unsigned int __stdcall process(_In_ void *param);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
character_bank *m_cb;
|
const character_bank *m_cb;
|
||||||
size_type m_from, m_to;
|
size_type m_from, m_to;
|
||||||
|
HANDLE m_heap;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
protected:
|
||||||
|
std::set<std::wstring> m_ignore;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -229,11 +234,11 @@ namespace ZRCola {
|
|||||||
class character_desc_idx : public std::map<std::wstring, std::vector<wchar_t>, character_desc_idx_less>
|
class character_desc_idx : public std::map<std::wstring, std::vector<wchar_t>, character_desc_idx_less>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
static void parse_keywords(const wchar_t *str, std::list<std::wstring> &terms);
|
static void parse_keywords(const wchar_t *str, std::set<std::wstring> &terms);
|
||||||
void add_keywords(const std::list<std::wstring> &terms, wchar_t chr, size_t sub = 0);
|
void add_keywords(const std::set<std::wstring> &terms, wchar_t chr, size_t sub = 0);
|
||||||
inline void add_keywords(const wchar_t *str, wchar_t chr, size_t sub = 0)
|
inline void add_keywords(const wchar_t *str, wchar_t chr, size_t sub = 0)
|
||||||
{
|
{
|
||||||
std::list<std::wstring> terms;
|
std::set<std::wstring> terms;
|
||||||
parse_keywords(str, terms);
|
parse_keywords(str, terms);
|
||||||
add_keywords(terms, chr, sub);
|
add_keywords(terms, chr, sub);
|
||||||
}
|
}
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit 0ed2d0a53dd3f8ee72fce0ba324aaeb86bf4f4d9
|
Subproject commit b091bc90bc7445fc117d5818ffd9b5314721fc4c
|
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user