dual-table lookup
git-svn-id: https://cld2.googlecode.com/svn/trunk@98 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
@@ -14,6 +14,7 @@
|
|||||||
|
|
||||||
//
|
//
|
||||||
// Author: dsites@google.com (Dick Sites)
|
// Author: dsites@google.com (Dick Sites)
|
||||||
|
// Updated 2014.01 for dual table lookup
|
||||||
//
|
//
|
||||||
|
|
||||||
#include "scoreonescriptspan.h"
|
#include "scoreonescriptspan.h"
|
||||||
@@ -548,6 +549,13 @@ void JustOneItemToVector(ScriptScanner* scanner, const char* text,
|
|||||||
// Debugging. Not thread safe. Defined in getonescriptspan
|
// Debugging. Not thread safe. Defined in getonescriptspan
|
||||||
char* DisplayPiece(const char* next_byte_, int byte_length_);
|
char* DisplayPiece(const char* next_byte_, int byte_length_);
|
||||||
|
|
||||||
|
// Debug-print helper for hit-buffer "indirect" values.
//
// LinearizeAll treats the high bit (0x80000000) of an indirect value as a
// flag selecting the second (dual) base table, with the remaining 31 bits
// being the subscript into that table. Printed with %d, a flagged value
// would show up as a large negative number, so this maps it to
// 2000000000 + subscript instead: second-table entries are then easy to
// spot in a dump as "2B and something".
//
//   x        raw indirect value as stored in the hit buffer
//   returns  x unchanged when the high bit is clear; otherwise the low
//            31 bits plus 2000000000, saturated at the largest int so the
//            result never wraps back to a negative number.
//
// Assumes int is 32 bits wide, as the 0x80000000 flag itself does.
static inline int PrintableIndirect(int x) {
  const unsigned int kTable2Flag = 0x80000000u;   // high bit: use table 2
  const unsigned int kTable2Bias = 2000000000u;   // the "2B" display offset
  const unsigned int kLargestInt = 0x7fffffffu;   // INT_MAX for 32-bit int

  if ((x & kTable2Flag) == 0) {
    // Ordinary first-table entry: print as-is.
    return x;
  }

  // Unsigned arithmetic: at most 0x7fffffff + 2000000000, which cannot wrap
  // a 32-bit unsigned, but can exceed what an int can hold.
  unsigned int biased = (x & ~kTable2Flag) + kTable2Bias;
  if (biased > kLargestInt) {
    biased = kLargestInt;
  }
  return (int)biased;
}
|
||||||
void DumpHitBuffer(FILE* df, const char* text,
|
void DumpHitBuffer(FILE* df, const char* text,
|
||||||
const ScoringHitBuffer* hitbuffer) {
|
const ScoringHitBuffer* hitbuffer) {
|
||||||
fprintf(df,
|
fprintf(df,
|
||||||
@@ -558,11 +566,12 @@ void DumpHitBuffer(FILE* df, const char* text,
|
|||||||
for (int i = 0; i < hitbuffer->maxscoringhits; ++i) {
|
for (int i = 0; i < hitbuffer->maxscoringhits; ++i) {
|
||||||
if (i < hitbuffer->next_base) {
|
if (i < hitbuffer->next_base) {
|
||||||
fprintf(df, "Q[%d]%d,%d,%s ",
|
fprintf(df, "Q[%d]%d,%d,%s ",
|
||||||
i, hitbuffer->base[i].offset, hitbuffer->base[i].indirect,
|
i, hitbuffer->base[i].offset,
|
||||||
|
PrintableIndirect(hitbuffer->base[i].indirect),
|
||||||
DisplayPiece(&text[hitbuffer->base[i].offset], 6));
|
DisplayPiece(&text[hitbuffer->base[i].offset], 6));
|
||||||
}
|
}
|
||||||
if (i < hitbuffer->next_delta) {
|
if (i < hitbuffer->next_delta) {
|
||||||
fprintf(df, "L[%d]%d,%d,%s ",
|
fprintf(df, "DL[%d]%d,%d,%s ",
|
||||||
i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
|
i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
|
||||||
DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
|
DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
|
||||||
}
|
}
|
||||||
@@ -579,12 +588,13 @@ void DumpHitBuffer(FILE* df, const char* text,
|
|||||||
if (hitbuffer->next_base > 50) {
|
if (hitbuffer->next_base > 50) {
|
||||||
int i = hitbuffer->next_base;
|
int i = hitbuffer->next_base;
|
||||||
fprintf(df, "Q[%d]%d,%d,%s ",
|
fprintf(df, "Q[%d]%d,%d,%s ",
|
||||||
i, hitbuffer->base[i].offset, hitbuffer->base[i].indirect,
|
i, hitbuffer->base[i].offset,
|
||||||
|
PrintableIndirect(hitbuffer->base[i].indirect),
|
||||||
DisplayPiece(&text[hitbuffer->base[i].offset], 6));
|
DisplayPiece(&text[hitbuffer->base[i].offset], 6));
|
||||||
}
|
}
|
||||||
if (hitbuffer->next_delta > 50) {
|
if (hitbuffer->next_delta > 50) {
|
||||||
int i = hitbuffer->next_delta;
|
int i = hitbuffer->next_delta;
|
||||||
fprintf(df, "L[%d]%d,%d,%s ",
|
fprintf(df, "DL[%d]%d,%d,%s ",
|
||||||
i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
|
i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
|
||||||
DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
|
DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
|
||||||
}
|
}
|
||||||
@@ -844,16 +854,19 @@ uint32 DefaultLangProb(ULScript ulscript) {
|
|||||||
void LinearizeAll(ScoringContext* scoringcontext, bool score_cjk,
|
void LinearizeAll(ScoringContext* scoringcontext, bool score_cjk,
|
||||||
ScoringHitBuffer* hitbuffer) {
|
ScoringHitBuffer* hitbuffer) {
|
||||||
const CLD2TableSummary* base_obj; // unigram or quadgram
|
const CLD2TableSummary* base_obj; // unigram or quadgram
|
||||||
|
const CLD2TableSummary* base_obj2; // quadgram dual table
|
||||||
const CLD2TableSummary* delta_obj; // bigram or octagram
|
const CLD2TableSummary* delta_obj; // bigram or octagram
|
||||||
const CLD2TableSummary* distinct_obj; // bigram or octagram
|
const CLD2TableSummary* distinct_obj; // bigram or octagram
|
||||||
uint16 base_hit;
|
uint16 base_hit;
|
||||||
if (score_cjk) {
|
if (score_cjk) {
|
||||||
base_obj = scoringcontext->scoringtables->unigram_compat_obj;
|
base_obj = scoringcontext->scoringtables->unigram_compat_obj;
|
||||||
|
base_obj2 = scoringcontext->scoringtables->unigram_compat_obj;
|
||||||
delta_obj = scoringcontext->scoringtables->deltabi_obj;
|
delta_obj = scoringcontext->scoringtables->deltabi_obj;
|
||||||
distinct_obj = scoringcontext->scoringtables->distinctbi_obj;
|
distinct_obj = scoringcontext->scoringtables->distinctbi_obj;
|
||||||
base_hit = UNIHIT;
|
base_hit = UNIHIT;
|
||||||
} else {
|
} else {
|
||||||
base_obj = scoringcontext->scoringtables->quadgram_obj;
|
base_obj = scoringcontext->scoringtables->quadgram_obj;
|
||||||
|
base_obj2 = scoringcontext->scoringtables->quadgram_obj2;
|
||||||
delta_obj = scoringcontext->scoringtables->deltaocta_obj;
|
delta_obj = scoringcontext->scoringtables->deltaocta_obj;
|
||||||
distinct_obj = scoringcontext->scoringtables->distinctocta_obj;
|
distinct_obj = scoringcontext->scoringtables->distinctocta_obj;
|
||||||
base_hit = QUADHIT;
|
base_hit = QUADHIT;
|
||||||
@@ -911,12 +924,18 @@ void LinearizeAll(ScoringContext* scoringcontext, bool score_cjk,
|
|||||||
else {
|
else {
|
||||||
// Add one or two base entries
|
// Add one or two base entries
|
||||||
int indirect = hitbuffer->base[base_i].indirect;
|
int indirect = hitbuffer->base[base_i].indirect;
|
||||||
|
// First, get right scoring table
|
||||||
|
const CLD2TableSummary* local_base_obj = base_obj;
|
||||||
|
if ((indirect & 0x80000000u) != 0) {
|
||||||
|
local_base_obj = base_obj2;
|
||||||
|
indirect &= ~0x80000000u;
|
||||||
|
}
|
||||||
++base_i;
|
++base_i;
|
||||||
// One langprob in kQuadInd[0..SingleSize),
|
// One langprob in kQuadInd[0..SingleSize),
|
||||||
// two in kQuadInd[SingleSize..Size)
|
// two in kQuadInd[SingleSize..Size)
|
||||||
if (indirect < static_cast<int>(base_obj->kCLDTableSizeOne)) {
|
if (indirect < static_cast<int>(local_base_obj->kCLDTableSizeOne)) {
|
||||||
// Up to three languages at indirect
|
// Up to three languages at indirect
|
||||||
uint32 langprob = base_obj->kCLDTableInd[indirect];
|
uint32 langprob = local_base_obj->kCLDTableInd[indirect];
|
||||||
if (langprob > 0) {
|
if (langprob > 0) {
|
||||||
hitbuffer->linear[linear_i].offset = base_off;
|
hitbuffer->linear[linear_i].offset = base_off;
|
||||||
hitbuffer->linear[linear_i].type = base_hit;
|
hitbuffer->linear[linear_i].type = base_hit;
|
||||||
@@ -925,9 +944,9 @@ void LinearizeAll(ScoringContext* scoringcontext, bool score_cjk,
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Up to six languages at start + 2 * (indirect - start)
|
// Up to six languages at start + 2 * (indirect - start)
|
||||||
indirect += (indirect - base_obj->kCLDTableSizeOne);
|
indirect += (indirect - local_base_obj->kCLDTableSizeOne);
|
||||||
uint32 langprob = base_obj->kCLDTableInd[indirect];
|
uint32 langprob = local_base_obj->kCLDTableInd[indirect];
|
||||||
uint32 langprob2 = base_obj->kCLDTableInd[indirect + 1];
|
uint32 langprob2 = local_base_obj->kCLDTableInd[indirect + 1];
|
||||||
if (langprob > 0) {
|
if (langprob > 0) {
|
||||||
hitbuffer->linear[linear_i].offset = base_off;
|
hitbuffer->linear[linear_i].offset = base_off;
|
||||||
hitbuffer->linear[linear_i].type = base_hit;
|
hitbuffer->linear[linear_i].type = base_hit;
|
||||||
@@ -966,6 +985,7 @@ void ChunkAll(int letter_offset, bool score_cjk, ScoringHitBuffer* hitbuffer) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int linear_i = 0;
|
int linear_i = 0;
|
||||||
|
int linear_off_end = hitbuffer->next_linear;
|
||||||
int text_i = letter_offset; // Next unseen text offset
|
int text_i = letter_offset; // Next unseen text offset
|
||||||
int next_chunk_start = 0;
|
int next_chunk_start = 0;
|
||||||
int bases_left = hitbuffer->next_base;
|
int bases_left = hitbuffer->next_base;
|
||||||
@@ -985,7 +1005,7 @@ void ChunkAll(int letter_offset, bool score_cjk, ScoringHitBuffer* hitbuffer) {
|
|||||||
++next_chunk_start;
|
++next_chunk_start;
|
||||||
|
|
||||||
int base_count = 0;
|
int base_count = 0;
|
||||||
while (base_count < base_len) {
|
while ((base_count < base_len) && (linear_i < linear_off_end)) {
|
||||||
if (hitbuffer->linear[linear_i].type == base_hit) {++base_count;}
|
if (hitbuffer->linear[linear_i].type == base_hit) {++base_count;}
|
||||||
++linear_i;
|
++linear_i;
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user