Related character detection is smarter and more accurate now
This commit is contained in:
parent
beabeb17aa
commit
d7968f539c
@ -35,6 +35,17 @@ ZRCola::DBSource::character_bank::character_bank() : vector<unique_ptr<ZRCola::D
|
||||
|
||||
void ZRCola::DBSource::character_bank::build_related()
|
||||
{
|
||||
// Initialize ignore list.
|
||||
m_ignore.insert(L"letter");
|
||||
m_ignore.insert(L"modifier");
|
||||
m_ignore.insert(L"symbol");
|
||||
m_ignore.insert(L"accent");
|
||||
m_ignore.insert(L"with");
|
||||
m_ignore.insert(L"and");
|
||||
m_ignore.insert(L"capital");
|
||||
m_ignore.insert(L"small");
|
||||
m_ignore.insert(L"combining");
|
||||
|
||||
SYSTEM_INFO si;
|
||||
GetSystemInfo(&si);
|
||||
|
||||
@ -59,8 +70,9 @@ void ZRCola::DBSource::character_bank::build_related()
|
||||
}
|
||||
|
||||
|
||||
ZRCola::DBSource::character_bank::build_related_worker::build_related_worker(_In_ character_bank *cb, _In_ size_type from, _In_ size_type to) :
|
||||
ZRCola::DBSource::character_bank::build_related_worker::build_related_worker(_In_ const character_bank *cb, _In_ size_type from, _In_ size_type to) :
|
||||
thread_type((HANDLE)_beginthreadex(NULL, 0, process, this, CREATE_SUSPENDED, NULL)),
|
||||
m_heap(HeapCreate(0, 0, 0)),
|
||||
m_cb(cb),
|
||||
m_from(from),
|
||||
m_to(to)
|
||||
@ -70,9 +82,18 @@ ZRCola::DBSource::character_bank::build_related_worker::build_related_worker(_In
|
||||
}
|
||||
|
||||
|
||||
// Destructor: releases the heap handle stored in m_heap (obtained from HeapCreate() in the constructor's initializer list) by calling HeapDestroy().
ZRCola::DBSource::character_bank::build_related_worker::~build_related_worker()
|
||||
{
|
||||
// The handle is expected to be non-NULL; this is only verified where assert() is active.
assert(m_heap);
|
||||
HeapDestroy(m_heap);
|
||||
}
|
||||
|
||||
|
||||
unsigned int ZRCola::DBSource::character_bank::build_related_worker::process()
|
||||
{
|
||||
wstring rel;
|
||||
heap_allocator<wchar_t> al(m_heap);
|
||||
basic_string<wchar_t, char_traits<wchar_t>, heap_allocator<wchar_t> > rel(al);
|
||||
set<wstring, less<wstring>, heap_allocator<wstring> > matching(less<wstring>(), al);
|
||||
|
||||
for (size_type i = m_from; i < m_to; i++) {
|
||||
ZRCola::DBSource::character &chr = *(m_cb->at(i).get());
|
||||
@ -89,19 +110,34 @@ unsigned int ZRCola::DBSource::character_bank::build_related_worker::process()
|
||||
|
||||
// Add all characters that share enough keywords.
|
||||
for (size_type j = 0, j_end = m_cb->size(); j < j_end; j++) {
|
||||
if (i == j) continue;
|
||||
if (i == j || rel.find((wchar_t)j) != wstring::npos)
|
||||
continue;
|
||||
const ZRCola::DBSource::character &chr2 = *(m_cb->at(j).get());
|
||||
if (&chr2 == NULL) continue;
|
||||
if (&chr2 == NULL)
|
||||
continue;
|
||||
|
||||
std::list<std::wstring>::size_type matching = 0;
|
||||
for (std::list<std::wstring>::const_iterator term = chr.terms.cbegin(), term_end = chr.terms.cend(); term != term_end; ++term)
|
||||
for (std::list<std::wstring>::const_iterator term2 = chr2.terms.cbegin(), term2_end = chr2.terms.cend(); term2 != term2_end; ++term2)
|
||||
set<wstring>::size_type comparisons = 0;
|
||||
matching.clear();
|
||||
for (set<wstring>::const_iterator term = chr.terms.cbegin(), term_end = chr.terms.cend(); term != term_end; ++term) {
|
||||
// Test for ignored word(s).
|
||||
if (m_cb->m_ignore.find(*term) != m_cb->m_ignore.cend())
|
||||
continue;
|
||||
for (set<wstring>::const_iterator term2 = chr2.terms.cbegin(), term2_end = chr2.terms.cend(); term2 != term2_end; ++term2) {
|
||||
// Test for ignored word(s).
|
||||
if (m_cb->m_ignore.find(*term2) != m_cb->m_ignore.cend())
|
||||
continue;
|
||||
comparisons++;
|
||||
if (*term == *term2)
|
||||
matching++;
|
||||
matching.insert(*term);
|
||||
}
|
||||
}
|
||||
|
||||
// If 7/8 terms match, assume related.
|
||||
if (matching*8 > std::min<std::list<std::wstring>::size_type>(chr.terms.size(), chr2.terms.size())*7)
|
||||
rel += chr2.chr;
|
||||
if (comparisons) {
|
||||
// If the squared number of matching terms reaches at least 1/2 of the performed term comparisons, assume related.
|
||||
set<wstring>::size_type hits = matching.size();
|
||||
if (hits*hits*2 >= comparisons)
|
||||
rel += chr2.chr;
|
||||
}
|
||||
}
|
||||
|
||||
chr.rel.assign(rel.c_str(), rel.length());
|
||||
@ -121,7 +157,7 @@ unsigned int __stdcall ZRCola::DBSource::character_bank::build_related_worker::p
|
||||
// ZRCola::DBSource::character_desc_idx
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void ZRCola::DBSource::character_desc_idx::parse_keywords(const wchar_t *str, list<wstring> &terms)
|
||||
void ZRCola::DBSource::character_desc_idx::parse_keywords(const wchar_t *str, set<wstring> &terms)
|
||||
{
|
||||
wxASSERT_MSG(str, wxT("string is NULL"));
|
||||
|
||||
@ -161,15 +197,15 @@ void ZRCola::DBSource::character_desc_idx::parse_keywords(const wchar_t *str, li
|
||||
|
||||
if (!term.empty()) {
|
||||
transform(term.begin(), term.end(), term.begin(), towlower);
|
||||
terms.push_back(term);
|
||||
terms.insert(term);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void ZRCola::DBSource::character_desc_idx::add_keywords(const std::list<std::wstring> &terms, wchar_t chr, size_t sub)
|
||||
void ZRCola::DBSource::character_desc_idx::add_keywords(const set<wstring> &terms, wchar_t chr, size_t sub)
|
||||
{
|
||||
for (list<wstring>::const_iterator term = terms.cbegin(), term_end = terms.cend(); term != term_end; ++term) {
|
||||
for (set<wstring>::const_iterator term = terms.cbegin(), term_end = terms.cend(); term != term_end; ++term) {
|
||||
if (sub) {
|
||||
wstring::size_type j_end = term->size();
|
||||
if (j_end >= sub) {
|
||||
|
@ -24,9 +24,9 @@
|
||||
|
||||
#include <atlbase.h>
|
||||
#include <adoint.h>
|
||||
#include <list>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
@ -155,7 +155,7 @@ namespace ZRCola {
|
||||
wchar_t chr; ///< Character
|
||||
ZRCola::chrcatid_t cat; ///< Category ID
|
||||
std::wstring desc; ///< Character description
|
||||
std::list<std::wstring> terms; ///< Search terms
|
||||
std::set<std::wstring> terms; ///< Search terms
|
||||
std::wstring rel; ///< Related characters
|
||||
};
|
||||
|
||||
@ -176,7 +176,8 @@ namespace ZRCola {
|
||||
typedef std::unique_ptr<void, stdex::CloseHandle_delete<void> > thread_type;
|
||||
|
||||
public:
|
||||
build_related_worker(_In_ character_bank *cb, _In_ size_type from, _In_ size_type to);
|
||||
build_related_worker(_In_ const character_bank *cb, _In_ size_type from, _In_ size_type to);
|
||||
virtual ~build_related_worker();
|
||||
|
||||
inline void join()
|
||||
{
|
||||
@ -197,9 +198,13 @@ namespace ZRCola {
|
||||
static unsigned int __stdcall process(_In_ void *param);
|
||||
|
||||
protected:
|
||||
character_bank *m_cb;
|
||||
const character_bank *m_cb;
|
||||
size_type m_from, m_to;
|
||||
HANDLE m_heap;
|
||||
};
|
||||
|
||||
protected:
|
||||
std::set<std::wstring> m_ignore;
|
||||
};
|
||||
|
||||
|
||||
@ -229,11 +234,11 @@ namespace ZRCola {
|
||||
class character_desc_idx : public std::map<std::wstring, std::vector<wchar_t>, character_desc_idx_less>
|
||||
{
|
||||
public:
|
||||
static void parse_keywords(const wchar_t *str, std::list<std::wstring> &terms);
|
||||
void add_keywords(const std::list<std::wstring> &terms, wchar_t chr, size_t sub = 0);
|
||||
static void parse_keywords(const wchar_t *str, std::set<std::wstring> &terms);
|
||||
void add_keywords(const std::set<std::wstring> &terms, wchar_t chr, size_t sub = 0);
|
||||
/// Convenience overload: splits `str` into terms with parse_keywords() and forwards them,
/// together with `chr` and `sub` (default 0), to the container-taking add_keywords() overload.
inline void add_keywords(const wchar_t *str, wchar_t chr, size_t sub = 0)
|
||||
{
|
||||
std::list<std::wstring> terms;
|
||||
std::set<std::wstring> terms;
|
||||
parse_keywords(str, terms);
|
||||
add_keywords(terms, chr, sub);
|
||||
}
|
||||
|
@ -1 +1 @@
|
||||
Subproject commit 0ed2d0a53dd3f8ee72fce0ba324aaeb86bf4f4d9
|
||||
Subproject commit b091bc90bc7445fc117d5818ffd9b5314721fc4c
|
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user