Inactive related characters removed

This commit is contained in:
Simon Rozman 2016-05-26 08:53:38 +02:00
parent 7bbe92d359
commit d70b72b0c1
5 changed files with 129 additions and 56 deletions

View File

@ -22,7 +22,7 @@
using namespace std;
bool ZRCola::DBSource::character_desc_idx::add_keywords(const wchar_t *str, wchar_t chr, size_t sub)
void ZRCola::DBSource::character_desc_idx::parse_keywords(const wchar_t *str, list<wstring> &terms)
{
wxASSERT_MSG(str, wxT("string is NULL"));
@ -30,7 +30,7 @@ bool ZRCola::DBSource::character_desc_idx::add_keywords(const wchar_t *str, wcha
// Skip white space.
for (;;) {
if (*str == 0)
return true;
return;
else if (!iswspace(*str))
break;
else
@ -62,23 +62,29 @@ bool ZRCola::DBSource::character_desc_idx::add_keywords(const wchar_t *str, wcha
if (!term.empty()) {
transform(term.begin(), term.end(), term.begin(), towlower);
if (sub) {
wstring::size_type j_end = term.size();
if (j_end >= sub) {
// Insert all keyword substrings "sub" or more characters long.
for (wstring::size_type i = 0, i_end = j_end - sub; i <= i_end; ++i) {
for (wstring::size_type j = i + sub; j <= j_end; ++j)
add_keyword(term.substr(i, j - i), chr);
}
}
} else {
// Insert exact keyword only.
add_keyword(term, chr);
}
terms.push_back(term);
}
}
}
return true;
void ZRCola::DBSource::character_desc_idx::add_keywords(const std::list<std::wstring> &terms, wchar_t chr, size_t sub)
{
for (list<wstring>::const_iterator term = terms.cbegin(), term_end = terms.cend(); term != term_end; ++term) {
if (sub) {
wstring::size_type j_end = term->size();
if (j_end >= sub) {
// Insert all keyword substrings "sub" or more characters long.
for (wstring::size_type i = 0, i_end = j_end - sub; i <= i_end; ++i) {
for (wstring::size_type j = i + sub; j <= j_end; ++j)
add_keyword(term->substr(i, j - i), chr);
}
}
} else {
// Insert exact keyword only.
add_keyword(*term, chr);
}
}
}
@ -772,12 +778,15 @@ bool ZRCola::DBSource::GetCharacter(const ATL::CComPtr<ADORecordset>& rs, charac
wxVERIFY(SUCCEEDED(flds->get_Item(ATL::CComVariant(L"opis_en"), &f)));
wxCHECK(GetValue(f, chr.desc), false);
}
ZRCola::DBSource::character_desc_idx::parse_keywords(chr.desc.c_str(), chr.terms);
wstring keywords;
{
ATL::CComPtr<ADOField> f;
wxVERIFY(SUCCEEDED(flds->get_Item(ATL::CComVariant(L"klj_bes_en"), &f)));
wxCHECK(GetValue(f, chr.keywords), false);
wxCHECK(GetValue(f, keywords), false);
}
ZRCola::DBSource::character_desc_idx::parse_keywords(keywords.c_str(), chr.terms);
{
ATL::CComPtr<ADOField> f;

View File

@ -24,6 +24,7 @@
#include <atlbase.h>
#include <adoint.h>
#include <list>
#include <map>
#include <string>
#include <vector>
@ -119,11 +120,42 @@ namespace ZRCola {
///
class character {
public:
wchar_t chr; ///< Character
ZRCola::chrcatid_t cat; ///< Category ID
std::wstring desc; ///< Character description
std::wstring keywords; ///< Additional keywords
std::wstring rel; ///< Related characters
///
/// Constructs a blank character record
///
/// Sets the character code and the first two elements of the category ID data to zero.
/// The remaining members are left default-constructed.
///
inline character()
{
chr = 0;
cat.data[0] = 0;
cat.data[1] = 0;
}
///
/// Constructs a copy of another character record
///
/// \param[in] othr  Record to copy; chr, cat, desc, terms and rel are copied member by member
///
inline character(_In_ const character &othr) :
chr (othr.chr),
cat (othr.cat),
desc (othr.desc),
terms(othr.terms),
rel (othr.rel)
{
}
///
/// Tests two character records for equality
///
/// \param[in] othr  Record to compare against
///
/// \returns true when chr, cat, desc, terms and rel all compare equal; false otherwise
///
inline bool operator==(_In_ const character &othr) const
{
return
chr == othr.chr &&
cat == othr.cat &&
desc == othr.desc &&
terms == othr.terms &&
rel == othr.rel;
}
///
/// Tests two character records for inequality
///
/// \param[in] othr  Record to compare against
///
/// \returns The negation of operator==()
///
inline bool operator!=(_In_ const character &othr) const
{
return !operator==(othr);
}
wchar_t chr; ///< Character
ZRCola::chrcatid_t cat; ///< Category ID
std::wstring desc; ///< Character description
std::list<std::wstring> terms; ///< Search terms
std::wstring rel; ///< Related characters
};
@ -153,7 +185,14 @@ namespace ZRCola {
class character_desc_idx : public std::map<std::wstring, std::vector<wchar_t>, character_desc_idx_less>
{
public:
bool add_keywords(const wchar_t *str, wchar_t chr, size_t sub = 0);
static void parse_keywords(const wchar_t *str, std::list<std::wstring> &terms);
void add_keywords(const std::list<std::wstring> &terms, wchar_t chr, size_t sub = 0);
///
/// Parses a string into terms and adds them to the index
///
/// \param[in] str  String handed to parse_keywords() to be split into terms
/// \param[in] chr  Character passed along with the terms
/// \param[in] sub  Forwarded unchanged to the list-based add_keywords() overload (defaults to 0)
///
inline void add_keywords(const wchar_t *str, wchar_t chr, size_t sub = 0)
{
    // Collect the parsed terms in a temporary list, then delegate to the list-based overload.
    std::list<std::wstring> parsed;
    parse_keywords(str, parsed);
    add_keywords(parsed, chr, sub);
}
void save(ZRCola::textindex<wchar_t, wchar_t, unsigned __int32> &idx) const;

View File

@ -393,47 +393,70 @@ int _tmain(int argc, _TCHAR *argv[])
if (src.SelectCharacters(rs)) {
size_t count = src.GetRecordsetCount(rs);
if (count < 0xffffffff) { // 4G check (-1 is reserved for error condition)
ZRCola::DBSource::character chr;
ZRCola::character_db db;
ZRCola::DBSource::character_desc_idx idxChrDsc, idxChrDscSub;
vector<unique_ptr<ZRCola::DBSource::character> > chrs;
chrs.resize(0x10000);
// Phase 1: Parse characters and build indexes.
while (!ZRCola::DBSource::IsEOF(rs)) {
// Read character from the database.
unique_ptr<ZRCola::DBSource::character> c(new ZRCola::DBSource::character);
if (src.GetCharacter(rs, *c)) {
const ZRCola::DBSource::character &chr = *c.get();
chrs[chr.chr].swap(c);
// Add description (and keywords) to index.
idxChrDsc .add_keywords(chr.terms, chr.chr, 0);
idxChrDscSub.add_keywords(chr.terms, chr.chr, 3);
} else
has_errors = true;
wxVERIFY(SUCCEEDED(rs->MoveNext()));
}
// Phase 2: Build related character lists.
for (size_t i = 0, i_end = chrs.size(); i < i_end; i++) {
ZRCola::DBSource::character &chr = *(chrs[i].get());
if (&chr == NULL) continue;
// Remove all nonexistent or inactive related characters.
for (wstring::size_type i = chr.rel.length(); i--;) {
if (!chrs[chr.rel[i]])
chr.rel.erase(i, 1);
}
}
ZRCola::character_db db;
// Preallocate memory.
db.idxChr.reserve(count);
db.data .reserve(count*4);
// Parse characters and build index and data.
while (!ZRCola::DBSource::IsEOF(rs)) {
// Read character from the database.
if (src.GetCharacter(rs, chr)) {
// Add character to index and data.
unsigned __int32 idx = db.data.size();
db.data.push_back((unsigned __int16)chr.chr);
for (wstring::size_type i = 0; i < sizeof(ZRCola::chrcatid_t)/sizeof(unsigned __int16); i++)
db.data.push_back(((const unsigned __int16*)chr.cat.data)[i]);
wstring::size_type n_desc = chr.desc.length();
wxASSERT_MSG(n_desc <= 0xffff, wxT("character description too long"));
db.data.push_back((unsigned __int16)n_desc);
wstring::size_type n_rel = chr.rel.length();
wxASSERT_MSG(n_rel <= 0xffff, wxT("too many related characters"));
db.data.push_back((unsigned __int16)n_rel);
for (wstring::size_type i = 0; i < n_desc; i++)
db.data.push_back(chr.desc[i]);
for (wstring::size_type i = 0; i < n_rel; i++)
db.data.push_back(chr.rel[i]);
db.idxChr.push_back(idx);
// Phase 3: Parse characters and build index and data.
for (size_t i = 0, i_end = chrs.size(); i < i_end; i++) {
const ZRCola::DBSource::character &chr = *(chrs[i].get());
if (&chr == NULL) continue;
// Add description (and keywords) to index.
idxChrDsc .add_keywords(chr.desc .c_str(), chr.chr, 0);
idxChrDsc .add_keywords(chr.keywords.c_str(), chr.chr, 0);
idxChrDscSub.add_keywords(chr.desc .c_str(), chr.chr, 3);
idxChrDscSub.add_keywords(chr.keywords.c_str(), chr.chr, 3);
// Add character to index and data.
unsigned __int32 idx = db.data.size();
db.data.push_back((unsigned __int16)chr.chr);
for (wstring::size_type i = 0; i < sizeof(ZRCola::chrcatid_t)/sizeof(unsigned __int16); i++)
db.data.push_back(((const unsigned __int16*)chr.cat.data)[i]);
wstring::size_type n_desc = chr.desc.length();
wxASSERT_MSG(n_desc <= 0xffff, wxT("character description too long"));
db.data.push_back((unsigned __int16)n_desc);
wstring::size_type n_rel = chr.rel.length();
wxASSERT_MSG(n_rel <= 0xffff, wxT("too many related characters"));
db.data.push_back((unsigned __int16)n_rel);
for (wstring::size_type i = 0; i < n_desc; i++)
db.data.push_back(chr.desc[i]);
for (wstring::size_type i = 0; i < n_rel; i++)
db.data.push_back(chr.rel[i]);
db.idxChr.push_back(idx);
// Mark category used.
categories_used.insert(chr.cat);
} else
has_errors = true;
wxVERIFY(SUCCEEDED(rs->MoveNext()));
// Mark category used.
categories_used.insert(chr.cat);
}
// Sort indices.

View File

@ -52,4 +52,6 @@
#include <codecvt>
#include <cwctype>
#include <fstream>
#include <memory>
#include <set>
#include <vector>

Binary file not shown.