Inactive related characters removed
This commit is contained in:
parent
7bbe92d359
commit
d70b72b0c1
@ -22,7 +22,7 @@
|
||||
using namespace std;
|
||||
|
||||
|
||||
bool ZRCola::DBSource::character_desc_idx::add_keywords(const wchar_t *str, wchar_t chr, size_t sub)
|
||||
void ZRCola::DBSource::character_desc_idx::parse_keywords(const wchar_t *str, list<wstring> &terms)
|
||||
{
|
||||
wxASSERT_MSG(str, wxT("string is NULL"));
|
||||
|
||||
@ -30,7 +30,7 @@ bool ZRCola::DBSource::character_desc_idx::add_keywords(const wchar_t *str, wcha
|
||||
// Skip white space.
|
||||
for (;;) {
|
||||
if (*str == 0)
|
||||
return true;
|
||||
return;
|
||||
else if (!iswspace(*str))
|
||||
break;
|
||||
else
|
||||
@ -62,23 +62,29 @@ bool ZRCola::DBSource::character_desc_idx::add_keywords(const wchar_t *str, wcha
|
||||
|
||||
if (!term.empty()) {
|
||||
transform(term.begin(), term.end(), term.begin(), towlower);
|
||||
if (sub) {
|
||||
wstring::size_type j_end = term.size();
|
||||
if (j_end >= sub) {
|
||||
// Insert all keyword substrings "sub" or more characters long.
|
||||
for (wstring::size_type i = 0, i_end = j_end - sub; i <= i_end; ++i) {
|
||||
for (wstring::size_type j = i + sub; j <= j_end; ++j)
|
||||
add_keyword(term.substr(i, j - i), chr);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Insert exact keyword only.
|
||||
add_keyword(term, chr);
|
||||
}
|
||||
terms.push_back(term);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
|
||||
void ZRCola::DBSource::character_desc_idx::add_keywords(const std::list<std::wstring> &terms, wchar_t chr, size_t sub)
|
||||
{
|
||||
for (list<wstring>::const_iterator term = terms.cbegin(), term_end = terms.cend(); term != term_end; ++term) {
|
||||
if (sub) {
|
||||
wstring::size_type j_end = term->size();
|
||||
if (j_end >= sub) {
|
||||
// Insert all keyword substrings "sub" or more characters long.
|
||||
for (wstring::size_type i = 0, i_end = j_end - sub; i <= i_end; ++i) {
|
||||
for (wstring::size_type j = i + sub; j <= j_end; ++j)
|
||||
add_keyword(term->substr(i, j - i), chr);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Insert exact keyword only.
|
||||
add_keyword(*term, chr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -772,12 +778,15 @@ bool ZRCola::DBSource::GetCharacter(const ATL::CComPtr<ADORecordset>& rs, charac
|
||||
wxVERIFY(SUCCEEDED(flds->get_Item(ATL::CComVariant(L"opis_en"), &f)));
|
||||
wxCHECK(GetValue(f, chr.desc), false);
|
||||
}
|
||||
ZRCola::DBSource::character_desc_idx::parse_keywords(chr.desc.c_str(), chr.terms);
|
||||
|
||||
wstring keywords;
|
||||
{
|
||||
ATL::CComPtr<ADOField> f;
|
||||
wxVERIFY(SUCCEEDED(flds->get_Item(ATL::CComVariant(L"klj_bes_en"), &f)));
|
||||
wxCHECK(GetValue(f, chr.keywords), false);
|
||||
wxCHECK(GetValue(f, keywords), false);
|
||||
}
|
||||
ZRCola::DBSource::character_desc_idx::parse_keywords(keywords.c_str(), chr.terms);
|
||||
|
||||
{
|
||||
ATL::CComPtr<ADOField> f;
|
||||
|
@ -24,6 +24,7 @@
|
||||
|
||||
#include <atlbase.h>
|
||||
#include <adoint.h>
|
||||
#include <list>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
@ -119,11 +120,42 @@ namespace ZRCola {
|
||||
///
|
||||
class character {
|
||||
public:
|
||||
wchar_t chr; ///< Character
|
||||
ZRCola::chrcatid_t cat; ///> Category ID
|
||||
std::wstring desc; ///< Character description
|
||||
std::wstring keywords; ///< Additional keywords
|
||||
std::wstring rel; ///< Related characters
|
||||
inline character()
|
||||
{
|
||||
chr = 0;
|
||||
cat.data[0] = 0;
|
||||
cat.data[1] = 0;
|
||||
}
|
||||
|
||||
inline character(_In_ const character &othr) :
|
||||
chr (othr.chr),
|
||||
cat (othr.cat),
|
||||
desc (othr.desc),
|
||||
terms(othr.terms),
|
||||
rel (othr.rel)
|
||||
{
|
||||
}
|
||||
|
||||
inline bool operator==(_In_ const character &othr) const
|
||||
{
|
||||
return
|
||||
chr == othr.chr &&
|
||||
cat == othr.cat &&
|
||||
desc == othr.desc &&
|
||||
terms == othr.terms &&
|
||||
rel == othr.rel;
|
||||
}
|
||||
|
||||
inline bool operator!=(_In_ const character &othr) const
|
||||
{
|
||||
return !operator==(othr);
|
||||
}
|
||||
|
||||
wchar_t chr; ///< Character
|
||||
ZRCola::chrcatid_t cat; ///< Category ID
|
||||
std::wstring desc; ///< Character description
|
||||
std::list<std::wstring> terms; ///< Search terms
|
||||
std::wstring rel; ///< Related characters
|
||||
};
|
||||
|
||||
|
||||
@ -153,7 +185,14 @@ namespace ZRCola {
|
||||
class character_desc_idx : public std::map<std::wstring, std::vector<wchar_t>, character_desc_idx_less>
|
||||
{
|
||||
public:
|
||||
bool add_keywords(const wchar_t *str, wchar_t chr, size_t sub = 0);
|
||||
static void parse_keywords(const wchar_t *str, std::list<std::wstring> &terms);
|
||||
void add_keywords(const std::list<std::wstring> &terms, wchar_t chr, size_t sub = 0);
|
||||
inline void add_keywords(const wchar_t *str, wchar_t chr, size_t sub = 0)
|
||||
{
|
||||
std::list<std::wstring> terms;
|
||||
parse_keywords(str, terms);
|
||||
add_keywords(terms, chr, sub);
|
||||
}
|
||||
|
||||
void save(ZRCola::textindex<wchar_t, wchar_t, unsigned __int32> &idx) const;
|
||||
|
||||
|
@ -393,47 +393,70 @@ int _tmain(int argc, _TCHAR *argv[])
|
||||
if (src.SelectCharacters(rs)) {
|
||||
size_t count = src.GetRecordsetCount(rs);
|
||||
if (count < 0xffffffff) { // 4G check (-1 is reserved for error condition)
|
||||
ZRCola::DBSource::character chr;
|
||||
ZRCola::character_db db;
|
||||
ZRCola::DBSource::character_desc_idx idxChrDsc, idxChrDscSub;
|
||||
|
||||
vector<unique_ptr<ZRCola::DBSource::character> > chrs;
|
||||
chrs.resize(0x10000);
|
||||
|
||||
// Phase 1: Parse characters and build indexes.
|
||||
while (!ZRCola::DBSource::IsEOF(rs)) {
|
||||
// Read character from the database.
|
||||
unique_ptr<ZRCola::DBSource::character> c(new ZRCola::DBSource::character);
|
||||
if (src.GetCharacter(rs, *c)) {
|
||||
const ZRCola::DBSource::character &chr = *c.get();
|
||||
chrs[chr.chr].swap(c);
|
||||
|
||||
// Add description (and keywords) to index.
|
||||
idxChrDsc .add_keywords(chr.terms, chr.chr, 0);
|
||||
idxChrDscSub.add_keywords(chr.terms, chr.chr, 3);
|
||||
} else
|
||||
has_errors = true;
|
||||
|
||||
wxVERIFY(SUCCEEDED(rs->MoveNext()));
|
||||
}
|
||||
|
||||
// Phase 2: Build related character lists.
|
||||
for (size_t i = 0, i_end = chrs.size(); i < i_end; i++) {
|
||||
ZRCola::DBSource::character &chr = *(chrs[i].get());
|
||||
if (&chr == NULL) continue;
|
||||
|
||||
// Remove all non-existent or inactive related characters.
|
||||
for (wstring::size_type i = chr.rel.length(); i--;) {
|
||||
if (!chrs[chr.rel[i]])
|
||||
chr.rel.erase(i, 1);
|
||||
}
|
||||
}
|
||||
|
||||
ZRCola::character_db db;
|
||||
|
||||
// Preallocate memory.
|
||||
db.idxChr.reserve(count);
|
||||
db.data .reserve(count*4);
|
||||
|
||||
// Parse characters and build index and data.
|
||||
while (!ZRCola::DBSource::IsEOF(rs)) {
|
||||
// Read character from the database.
|
||||
if (src.GetCharacter(rs, chr)) {
|
||||
// Add character to index and data.
|
||||
unsigned __int32 idx = db.data.size();
|
||||
db.data.push_back((unsigned __int16)chr.chr);
|
||||
for (wstring::size_type i = 0; i < sizeof(ZRCola::chrcatid_t)/sizeof(unsigned __int16); i++)
|
||||
db.data.push_back(((const unsigned __int16*)chr.cat.data)[i]);
|
||||
wstring::size_type n_desc = chr.desc.length();
|
||||
wxASSERT_MSG(n_desc <= 0xffff, wxT("character description too long"));
|
||||
db.data.push_back((unsigned __int16)n_desc);
|
||||
wstring::size_type n_rel = chr.rel.length();
|
||||
wxASSERT_MSG(n_rel <= 0xffff, wxT("too many related characters"));
|
||||
db.data.push_back((unsigned __int16)n_rel);
|
||||
for (wstring::size_type i = 0; i < n_desc; i++)
|
||||
db.data.push_back(chr.desc[i]);
|
||||
for (wstring::size_type i = 0; i < n_rel; i++)
|
||||
db.data.push_back(chr.rel[i]);
|
||||
db.idxChr.push_back(idx);
|
||||
// Phase 3: Parse characters and build index and data.
|
||||
for (size_t i = 0, i_end = chrs.size(); i < i_end; i++) {
|
||||
const ZRCola::DBSource::character &chr = *(chrs[i].get());
|
||||
if (&chr == NULL) continue;
|
||||
|
||||
// Add description (and keywords) to index.
|
||||
idxChrDsc .add_keywords(chr.desc .c_str(), chr.chr, 0);
|
||||
idxChrDsc .add_keywords(chr.keywords.c_str(), chr.chr, 0);
|
||||
idxChrDscSub.add_keywords(chr.desc .c_str(), chr.chr, 3);
|
||||
idxChrDscSub.add_keywords(chr.keywords.c_str(), chr.chr, 3);
|
||||
// Add character to index and data.
|
||||
unsigned __int32 idx = db.data.size();
|
||||
db.data.push_back((unsigned __int16)chr.chr);
|
||||
for (wstring::size_type i = 0; i < sizeof(ZRCola::chrcatid_t)/sizeof(unsigned __int16); i++)
|
||||
db.data.push_back(((const unsigned __int16*)chr.cat.data)[i]);
|
||||
wstring::size_type n_desc = chr.desc.length();
|
||||
wxASSERT_MSG(n_desc <= 0xffff, wxT("character description too long"));
|
||||
db.data.push_back((unsigned __int16)n_desc);
|
||||
wstring::size_type n_rel = chr.rel.length();
|
||||
wxASSERT_MSG(n_rel <= 0xffff, wxT("too many related characters"));
|
||||
db.data.push_back((unsigned __int16)n_rel);
|
||||
for (wstring::size_type i = 0; i < n_desc; i++)
|
||||
db.data.push_back(chr.desc[i]);
|
||||
for (wstring::size_type i = 0; i < n_rel; i++)
|
||||
db.data.push_back(chr.rel[i]);
|
||||
db.idxChr.push_back(idx);
|
||||
|
||||
// Mark category used.
|
||||
categories_used.insert(chr.cat);
|
||||
} else
|
||||
has_errors = true;
|
||||
|
||||
wxVERIFY(SUCCEEDED(rs->MoveNext()));
|
||||
// Mark category used.
|
||||
categories_used.insert(chr.cat);
|
||||
}
|
||||
|
||||
// Sort indices.
|
||||
|
@ -52,4 +52,6 @@
|
||||
#include <codecvt>
|
||||
#include <cwctype>
|
||||
#include <fstream>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user