Inactive related characters removed
This commit is contained in:
parent
7bbe92d359
commit
d70b72b0c1
@ -22,7 +22,7 @@
|
|||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
|
|
||||||
bool ZRCola::DBSource::character_desc_idx::add_keywords(const wchar_t *str, wchar_t chr, size_t sub)
|
void ZRCola::DBSource::character_desc_idx::parse_keywords(const wchar_t *str, list<wstring> &terms)
|
||||||
{
|
{
|
||||||
wxASSERT_MSG(str, wxT("string is NULL"));
|
wxASSERT_MSG(str, wxT("string is NULL"));
|
||||||
|
|
||||||
@ -30,7 +30,7 @@ bool ZRCola::DBSource::character_desc_idx::add_keywords(const wchar_t *str, wcha
|
|||||||
// Skip white space.
|
// Skip white space.
|
||||||
for (;;) {
|
for (;;) {
|
||||||
if (*str == 0)
|
if (*str == 0)
|
||||||
return true;
|
return;
|
||||||
else if (!iswspace(*str))
|
else if (!iswspace(*str))
|
||||||
break;
|
break;
|
||||||
else
|
else
|
||||||
@ -62,23 +62,29 @@ bool ZRCola::DBSource::character_desc_idx::add_keywords(const wchar_t *str, wcha
|
|||||||
|
|
||||||
if (!term.empty()) {
|
if (!term.empty()) {
|
||||||
transform(term.begin(), term.end(), term.begin(), towlower);
|
transform(term.begin(), term.end(), term.begin(), towlower);
|
||||||
if (sub) {
|
terms.push_back(term);
|
||||||
wstring::size_type j_end = term.size();
|
|
||||||
if (j_end >= sub) {
|
|
||||||
// Insert all keyword substrings "sub" or more characters long.
|
|
||||||
for (wstring::size_type i = 0, i_end = j_end - sub; i <= i_end; ++i) {
|
|
||||||
for (wstring::size_type j = i + sub; j <= j_end; ++j)
|
|
||||||
add_keyword(term.substr(i, j - i), chr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Insert exact keyword only.
|
|
||||||
add_keyword(term, chr);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
|
||||||
|
void ZRCola::DBSource::character_desc_idx::add_keywords(const std::list<std::wstring> &terms, wchar_t chr, size_t sub)
|
||||||
|
{
|
||||||
|
for (list<wstring>::const_iterator term = terms.cbegin(), term_end = terms.cend(); term != term_end; ++term) {
|
||||||
|
if (sub) {
|
||||||
|
wstring::size_type j_end = term->size();
|
||||||
|
if (j_end >= sub) {
|
||||||
|
// Insert all keyword substrings "sub" or more characters long.
|
||||||
|
for (wstring::size_type i = 0, i_end = j_end - sub; i <= i_end; ++i) {
|
||||||
|
for (wstring::size_type j = i + sub; j <= j_end; ++j)
|
||||||
|
add_keyword(term->substr(i, j - i), chr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Insert exact keyword only.
|
||||||
|
add_keyword(*term, chr);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -772,12 +778,15 @@ bool ZRCola::DBSource::GetCharacter(const ATL::CComPtr<ADORecordset>& rs, charac
|
|||||||
wxVERIFY(SUCCEEDED(flds->get_Item(ATL::CComVariant(L"opis_en"), &f)));
|
wxVERIFY(SUCCEEDED(flds->get_Item(ATL::CComVariant(L"opis_en"), &f)));
|
||||||
wxCHECK(GetValue(f, chr.desc), false);
|
wxCHECK(GetValue(f, chr.desc), false);
|
||||||
}
|
}
|
||||||
|
ZRCola::DBSource::character_desc_idx::parse_keywords(chr.desc.c_str(), chr.terms);
|
||||||
|
|
||||||
|
wstring keywords;
|
||||||
{
|
{
|
||||||
ATL::CComPtr<ADOField> f;
|
ATL::CComPtr<ADOField> f;
|
||||||
wxVERIFY(SUCCEEDED(flds->get_Item(ATL::CComVariant(L"klj_bes_en"), &f)));
|
wxVERIFY(SUCCEEDED(flds->get_Item(ATL::CComVariant(L"klj_bes_en"), &f)));
|
||||||
wxCHECK(GetValue(f, chr.keywords), false);
|
wxCHECK(GetValue(f, keywords), false);
|
||||||
}
|
}
|
||||||
|
ZRCola::DBSource::character_desc_idx::parse_keywords(keywords.c_str(), chr.terms);
|
||||||
|
|
||||||
{
|
{
|
||||||
ATL::CComPtr<ADOField> f;
|
ATL::CComPtr<ADOField> f;
|
||||||
|
@ -24,6 +24,7 @@
|
|||||||
|
|
||||||
#include <atlbase.h>
|
#include <atlbase.h>
|
||||||
#include <adoint.h>
|
#include <adoint.h>
|
||||||
|
#include <list>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
@ -119,11 +120,42 @@ namespace ZRCola {
|
|||||||
///
|
///
|
||||||
class character {
|
class character {
|
||||||
public:
|
public:
|
||||||
wchar_t chr; ///< Character
|
inline character()
|
||||||
ZRCola::chrcatid_t cat; ///> Category ID
|
{
|
||||||
std::wstring desc; ///< Character description
|
chr = 0;
|
||||||
std::wstring keywords; ///< Additional keywords
|
cat.data[0] = 0;
|
||||||
std::wstring rel; ///< Related characters
|
cat.data[1] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline character(_In_ const character &othr) :
|
||||||
|
chr (othr.chr),
|
||||||
|
cat (othr.cat),
|
||||||
|
desc (othr.desc),
|
||||||
|
terms(othr.terms),
|
||||||
|
rel (othr.rel)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool operator==(_In_ const character &othr) const
|
||||||
|
{
|
||||||
|
return
|
||||||
|
chr == othr.chr &&
|
||||||
|
cat == othr.cat &&
|
||||||
|
desc == othr.desc &&
|
||||||
|
terms == othr.terms &&
|
||||||
|
rel == othr.rel;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool operator!=(_In_ const character &othr) const
|
||||||
|
{
|
||||||
|
return !operator==(othr);
|
||||||
|
}
|
||||||
|
|
||||||
|
wchar_t chr; ///< Character
|
||||||
|
ZRCola::chrcatid_t cat; ///< Category ID
|
||||||
|
std::wstring desc; ///< Character description
|
||||||
|
std::list<std::wstring> terms; ///< Search terms
|
||||||
|
std::wstring rel; ///< Related characters
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -153,7 +185,14 @@ namespace ZRCola {
|
|||||||
class character_desc_idx : public std::map<std::wstring, std::vector<wchar_t>, character_desc_idx_less>
|
class character_desc_idx : public std::map<std::wstring, std::vector<wchar_t>, character_desc_idx_less>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
bool add_keywords(const wchar_t *str, wchar_t chr, size_t sub = 0);
|
static void parse_keywords(const wchar_t *str, std::list<std::wstring> &terms);
|
||||||
|
void add_keywords(const std::list<std::wstring> &terms, wchar_t chr, size_t sub = 0);
|
||||||
|
inline void add_keywords(const wchar_t *str, wchar_t chr, size_t sub = 0)
|
||||||
|
{
|
||||||
|
std::list<std::wstring> terms;
|
||||||
|
parse_keywords(str, terms);
|
||||||
|
add_keywords(terms, chr, sub);
|
||||||
|
}
|
||||||
|
|
||||||
void save(ZRCola::textindex<wchar_t, wchar_t, unsigned __int32> &idx) const;
|
void save(ZRCola::textindex<wchar_t, wchar_t, unsigned __int32> &idx) const;
|
||||||
|
|
||||||
|
@ -393,47 +393,70 @@ int _tmain(int argc, _TCHAR *argv[])
|
|||||||
if (src.SelectCharacters(rs)) {
|
if (src.SelectCharacters(rs)) {
|
||||||
size_t count = src.GetRecordsetCount(rs);
|
size_t count = src.GetRecordsetCount(rs);
|
||||||
if (count < 0xffffffff) { // 4G check (-1 is reserved for error condition)
|
if (count < 0xffffffff) { // 4G check (-1 is reserved for error condition)
|
||||||
ZRCola::DBSource::character chr;
|
|
||||||
ZRCola::character_db db;
|
|
||||||
ZRCola::DBSource::character_desc_idx idxChrDsc, idxChrDscSub;
|
ZRCola::DBSource::character_desc_idx idxChrDsc, idxChrDscSub;
|
||||||
|
|
||||||
|
vector<unique_ptr<ZRCola::DBSource::character> > chrs;
|
||||||
|
chrs.resize(0x10000);
|
||||||
|
|
||||||
|
// Phase 1: Parse characters and build indexes.
|
||||||
|
while (!ZRCola::DBSource::IsEOF(rs)) {
|
||||||
|
// Read character from the database.
|
||||||
|
unique_ptr<ZRCola::DBSource::character> c(new ZRCola::DBSource::character);
|
||||||
|
if (src.GetCharacter(rs, *c)) {
|
||||||
|
const ZRCola::DBSource::character &chr = *c.get();
|
||||||
|
chrs[chr.chr].swap(c);
|
||||||
|
|
||||||
|
// Add description (and keywords) to index.
|
||||||
|
idxChrDsc .add_keywords(chr.terms, chr.chr, 0);
|
||||||
|
idxChrDscSub.add_keywords(chr.terms, chr.chr, 3);
|
||||||
|
} else
|
||||||
|
has_errors = true;
|
||||||
|
|
||||||
|
wxVERIFY(SUCCEEDED(rs->MoveNext()));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Phase 2: Build related character lists.
|
||||||
|
for (size_t i = 0, i_end = chrs.size(); i < i_end; i++) {
|
||||||
|
ZRCola::DBSource::character &chr = *(chrs[i].get());
|
||||||
|
if (&chr == NULL) continue;
|
||||||
|
|
||||||
|
// Remove all unexisting or inactive related characters.
|
||||||
|
for (wstring::size_type i = chr.rel.length(); i--;) {
|
||||||
|
if (!chrs[chr.rel[i]])
|
||||||
|
chr.rel.erase(i, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ZRCola::character_db db;
|
||||||
|
|
||||||
// Preallocate memory.
|
// Preallocate memory.
|
||||||
db.idxChr.reserve(count);
|
db.idxChr.reserve(count);
|
||||||
db.data .reserve(count*4);
|
db.data .reserve(count*4);
|
||||||
|
|
||||||
// Parse characters and build index and data.
|
// Phase 3: Parse characters and build index and data.
|
||||||
while (!ZRCola::DBSource::IsEOF(rs)) {
|
for (size_t i = 0, i_end = chrs.size(); i < i_end; i++) {
|
||||||
// Read character from the database.
|
const ZRCola::DBSource::character &chr = *(chrs[i].get());
|
||||||
if (src.GetCharacter(rs, chr)) {
|
if (&chr == NULL) continue;
|
||||||
// Add character to index and data.
|
|
||||||
unsigned __int32 idx = db.data.size();
|
|
||||||
db.data.push_back((unsigned __int16)chr.chr);
|
|
||||||
for (wstring::size_type i = 0; i < sizeof(ZRCola::chrcatid_t)/sizeof(unsigned __int16); i++)
|
|
||||||
db.data.push_back(((const unsigned __int16*)chr.cat.data)[i]);
|
|
||||||
wstring::size_type n_desc = chr.desc.length();
|
|
||||||
wxASSERT_MSG(n_desc <= 0xffff, wxT("character description too long"));
|
|
||||||
db.data.push_back((unsigned __int16)n_desc);
|
|
||||||
wstring::size_type n_rel = chr.rel.length();
|
|
||||||
wxASSERT_MSG(n_rel <= 0xffff, wxT("too many related characters"));
|
|
||||||
db.data.push_back((unsigned __int16)n_rel);
|
|
||||||
for (wstring::size_type i = 0; i < n_desc; i++)
|
|
||||||
db.data.push_back(chr.desc[i]);
|
|
||||||
for (wstring::size_type i = 0; i < n_rel; i++)
|
|
||||||
db.data.push_back(chr.rel[i]);
|
|
||||||
db.idxChr.push_back(idx);
|
|
||||||
|
|
||||||
// Add description (and keywords) to index.
|
// Add character to index and data.
|
||||||
idxChrDsc .add_keywords(chr.desc .c_str(), chr.chr, 0);
|
unsigned __int32 idx = db.data.size();
|
||||||
idxChrDsc .add_keywords(chr.keywords.c_str(), chr.chr, 0);
|
db.data.push_back((unsigned __int16)chr.chr);
|
||||||
idxChrDscSub.add_keywords(chr.desc .c_str(), chr.chr, 3);
|
for (wstring::size_type i = 0; i < sizeof(ZRCola::chrcatid_t)/sizeof(unsigned __int16); i++)
|
||||||
idxChrDscSub.add_keywords(chr.keywords.c_str(), chr.chr, 3);
|
db.data.push_back(((const unsigned __int16*)chr.cat.data)[i]);
|
||||||
|
wstring::size_type n_desc = chr.desc.length();
|
||||||
|
wxASSERT_MSG(n_desc <= 0xffff, wxT("character description too long"));
|
||||||
|
db.data.push_back((unsigned __int16)n_desc);
|
||||||
|
wstring::size_type n_rel = chr.rel.length();
|
||||||
|
wxASSERT_MSG(n_rel <= 0xffff, wxT("too many related characters"));
|
||||||
|
db.data.push_back((unsigned __int16)n_rel);
|
||||||
|
for (wstring::size_type i = 0; i < n_desc; i++)
|
||||||
|
db.data.push_back(chr.desc[i]);
|
||||||
|
for (wstring::size_type i = 0; i < n_rel; i++)
|
||||||
|
db.data.push_back(chr.rel[i]);
|
||||||
|
db.idxChr.push_back(idx);
|
||||||
|
|
||||||
// Mark category used.
|
// Mark category used.
|
||||||
categories_used.insert(chr.cat);
|
categories_used.insert(chr.cat);
|
||||||
} else
|
|
||||||
has_errors = true;
|
|
||||||
|
|
||||||
wxVERIFY(SUCCEEDED(rs->MoveNext()));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sort indices.
|
// Sort indices.
|
||||||
|
@ -52,4 +52,6 @@
|
|||||||
#include <codecvt>
|
#include <codecvt>
|
||||||
#include <cwctype>
|
#include <cwctype>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
#include <memory>
|
||||||
#include <set>
|
#include <set>
|
||||||
|
#include <vector>
|
||||||
|
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user