Enable Dynamic Mode for CLD2. See issue 6 for more information on dynamic mode:
https://code.google.com/p/cld2/issues/detail?id=6 git-svn-id: https://cld2.googlecode.com/svn/trunk@151 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
376
internal/cld2_dynamic_data_extractor.cc
Normal file
376
internal/cld2_dynamic_data_extractor.cc
Normal file
@@ -0,0 +1,376 @@
|
||||
// Copyright 2014 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cld2_dynamic_data.h"
|
||||
#include "cld2_dynamic_data_extractor.h"
|
||||
#include "cld2_dynamic_data_loader.h" // for verifying the written data
|
||||
#include "integral_types.h"
|
||||
#include "cld2tablesummary.h"
|
||||
#include "utf8statetable.h"
|
||||
|
||||
using namespace std;
|
||||
namespace CLD2DynamicDataExtractor {
|
||||
// File-local verbosity flag. The extractor functions in this file test it
// with `if (DEBUG)` before printing diagnostics to stdout; zero means silent.
static int DEBUG = 0;

// Stores the caller's requested debug level in the file-local DEBUG flag.
//
// debug: new value for the flag; any non-zero value enables diagnostics.
void setDebug(int debug) {
  DEBUG = debug;
}
|
||||
|
||||
int advance(FILE* f, CLD2::uint32 position) {
|
||||
const char ZERO = 0;
|
||||
int pad = position - ftell(f);
|
||||
if (DEBUG) cout << " Adding " << pad << " bytes of padding" << endl;
|
||||
while (pad-- > 0) {
|
||||
fwrite(&ZERO,1,1,f);
|
||||
}
|
||||
return pad;
|
||||
}
|
||||
|
||||
void writeChunk(FILE *f, const void* data, CLD2::uint32 startAt, CLD2::uint32 length) {
|
||||
if (DEBUG) cout << "Write chunk @" << startAt << ", len=" << length << endl;
|
||||
advance(f, startAt);
|
||||
if (DEBUG) cout << " Writing " << length << " bytes of data" << endl;
|
||||
fwrite(data, 1, length, f);
|
||||
}
|
||||
|
||||
// Writes a single 4-byte header field to the output stream. Every numeric
// header field is serialized as exactly four bytes, in host byte order.
static void writeHeaderWord(FILE* out, const void* field) {
  fwrite(field, 4, 1, out);
}

// Serializes the given scoring tables into a dynamic-mode data file.
//
// data:     tables to serialize (UTF-8 property object, expected-score array
//           and the seven CLD2 table summaries).
// fileName: path of the file to create or overwrite.
//
// Layout: marker string, fixed header fields, one header per table, then the
// data chunks at the offsets computed by alignAll(). The order of every write
// here is hardcoded and MUST NOT BE CHANGED, else the file will de-sync with
// the reading code.
//
// The file is opened in binary mode ("wb"); text mode would corrupt the
// payload on platforms that translate newlines. If the file cannot be opened
// an error is printed to stderr and nothing is written.
void writeDataFile(const CLD2::ScoringTables* data, const char* fileName) {
  const char ZERO = 0;
  const int NUM_TABLES = 7;
  const CLD2::CLD2TableSummary* tableSummaries[NUM_TABLES];
  tableSummaries[0] = data->unigram_compat_obj;
  tableSummaries[1] = data->deltabi_obj;
  tableSummaries[2] = data->distinctbi_obj;
  tableSummaries[3] = data->quadgram_obj;
  tableSummaries[4] = data->quadgram_obj2;
  tableSummaries[5] = data->deltaocta_obj;
  tableSummaries[6] = data->distinctocta_obj;

  // Build the complete header in memory before touching the file.
  CLD2DynamicData::TableHeader tableHeaders[NUM_TABLES];
  CLD2DynamicData::FileHeader fileHeader;
  fileHeader.numTablesEncoded = NUM_TABLES;
  fileHeader.tableHeaders = tableHeaders;
  initUtf8Headers(&fileHeader, data->unigram_obj);
  initDeltaHeaders(&fileHeader, data->kExpectedScore);
  initTableHeaders(tableSummaries, NUM_TABLES, tableHeaders);
  alignAll(&fileHeader, 16); // Align all sections to 128-bit boundaries

  for (int x = 0; x < CLD2DynamicData::DATA_FILE_MARKER_LENGTH; x++) {
    fileHeader.sanityString[x] = CLD2DynamicData::DATA_FILE_MARKER[x];
  }

  FILE* outFile = fopen(fileName, "wb");
  if (outFile == NULL) {
    std::cerr << "writeDataFile: unable to open '" << fileName
              << "' for writing" << std::endl;
    return;
  }

  // File header: marker followed by the fixed-size fields.
  fwrite(fileHeader.sanityString, 1, CLD2DynamicData::DATA_FILE_MARKER_LENGTH, outFile);
  writeHeaderWord(outFile, &(fileHeader.totalFileSizeBytes));
  writeHeaderWord(outFile, &(fileHeader.utf8PropObj_state0));
  writeHeaderWord(outFile, &(fileHeader.utf8PropObj_state0_size));
  writeHeaderWord(outFile, &(fileHeader.utf8PropObj_total_size));
  writeHeaderWord(outFile, &(fileHeader.utf8PropObj_max_expand));
  writeHeaderWord(outFile, &(fileHeader.utf8PropObj_entry_shift));
  writeHeaderWord(outFile, &(fileHeader.utf8PropObj_bytes_per_entry));
  writeHeaderWord(outFile, &(fileHeader.utf8PropObj_losub));
  writeHeaderWord(outFile, &(fileHeader.utf8PropObj_hiadd));
  writeHeaderWord(outFile, &(fileHeader.startOf_utf8PropObj_state_table));
  writeHeaderWord(outFile, &(fileHeader.lengthOf_utf8PropObj_state_table));
  writeHeaderWord(outFile, &(fileHeader.startOf_utf8PropObj_remap_base));
  writeHeaderWord(outFile, &(fileHeader.lengthOf_utf8PropObj_remap_base));
  writeHeaderWord(outFile, &(fileHeader.startOf_utf8PropObj_remap_string));
  writeHeaderWord(outFile, &(fileHeader.lengthOf_utf8PropObj_remap_string));
  writeHeaderWord(outFile, &(fileHeader.startOf_utf8PropObj_fast_state));
  writeHeaderWord(outFile, &(fileHeader.lengthOf_utf8PropObj_fast_state));
  writeHeaderWord(outFile, &(fileHeader.startOf_kAvgDeltaOctaScore));
  writeHeaderWord(outFile, &(fileHeader.lengthOf_kAvgDeltaOctaScore));
  writeHeaderWord(outFile, &(fileHeader.numTablesEncoded));

  // One fixed-size header per table.
  for (int x = 0; x < NUM_TABLES; x++) {
    CLD2DynamicData::TableHeader& tHeader = fileHeader.tableHeaders[x];
    writeHeaderWord(outFile, &(tHeader.kCLDTableSizeOne));
    writeHeaderWord(outFile, &(tHeader.kCLDTableSize));
    writeHeaderWord(outFile, &(tHeader.kCLDTableKeyMask));
    writeHeaderWord(outFile, &(tHeader.kCLDTableBuildDate));
    writeHeaderWord(outFile, &(tHeader.startOf_kCLDTable));
    writeHeaderWord(outFile, &(tHeader.lengthOf_kCLDTable));
    writeHeaderWord(outFile, &(tHeader.startOf_kCLDTableInd));
    writeHeaderWord(outFile, &(tHeader.lengthOf_kCLDTableInd));
    writeHeaderWord(outFile, &(tHeader.startOf_kRecognizedLangScripts));
    writeHeaderWord(outFile, &(tHeader.lengthOf_kRecognizedLangScripts));
  }

  // Write data blob
  // 1. UTF8 Object
  writeChunk(outFile,
             data->unigram_obj->state_table,
             fileHeader.startOf_utf8PropObj_state_table,
             fileHeader.lengthOf_utf8PropObj_state_table);
  // FIXME: Unsafe to rely on this since RemapEntry is not a bit-packed structure
  writeChunk(outFile,
             data->unigram_obj->remap_base,
             fileHeader.startOf_utf8PropObj_remap_base,
             fileHeader.lengthOf_utf8PropObj_remap_base);
  // String lengths include the terminator, so the payload is written without
  // it and the NUL is appended explicitly.
  writeChunk(outFile,
             data->unigram_obj->remap_string,
             fileHeader.startOf_utf8PropObj_remap_string,
             fileHeader.lengthOf_utf8PropObj_remap_string - 1);
  fwrite(&ZERO, 1, 1, outFile); // null terminator
  if (fileHeader.startOf_utf8PropObj_fast_state > 0) {
    // alignAll() leaves this offset at 0 when there is no fast_state table.
    writeChunk(outFile,
               data->unigram_obj->fast_state,
               fileHeader.startOf_utf8PropObj_fast_state,
               fileHeader.lengthOf_utf8PropObj_fast_state - 1);
    fwrite(&ZERO, 1, 1, outFile); // null terminator
  }

  // 2. kAvgDeltaOctaScore array
  writeChunk(outFile,
             data->kExpectedScore,
             fileHeader.startOf_kAvgDeltaOctaScore,
             fileHeader.lengthOf_kAvgDeltaOctaScore);

  // 3. Each table
  for (int x = 0; x < NUM_TABLES; x++) {
    const CLD2::CLD2TableSummary* summary = tableSummaries[x];
    CLD2DynamicData::TableHeader& tHeader = fileHeader.tableHeaders[x];
    // NB: Safe to directly write IndirectProbBucket4 as it is just an alias for CLD2::uint32
    writeChunk(outFile,
               summary->kCLDTable,
               tHeader.startOf_kCLDTable,
               tHeader.lengthOf_kCLDTable);
    writeChunk(outFile,
               summary->kCLDTableInd,
               tHeader.startOf_kCLDTableInd,
               tHeader.lengthOf_kCLDTableInd);
    writeChunk(outFile,
               summary->kRecognizedLangScripts,
               tHeader.startOf_kRecognizedLangScripts,
               tHeader.lengthOf_kRecognizedLangScripts - 1);
    fwrite(&ZERO, 1, 1, outFile); // null terminator
  }

  // fclose() flushes buffered data, so a failure here means the file on disk
  // may be incomplete.
  if (fclose(outFile) != 0) {
    std::cerr << "writeDataFile: error while closing '" << fileName << "'" << std::endl;
  }
}
|
||||
|
||||
// Fills one TableHeader per table summary with the scalar table properties
// and the byte length of each of the table's three data sections. Start
// offsets are not touched here.
//
// summaries:    array of `numSummaries` table summaries to describe.
// numSummaries: number of entries in `summaries` and `tableHeaders`.
// tableHeaders: output array, one header per summary, same order.
void initTableHeaders(const CLD2::CLD2TableSummary** summaries,
                      int numSummaries, CLD2DynamicData::TableHeader* tableHeaders) {
  for (int idx = 0; idx < numSummaries; ++idx) {
    const CLD2::CLD2TableSummary* source = summaries[idx];
    CLD2DynamicData::TableHeader* target = &tableHeaders[idx];

    // Scalar properties are copied verbatim.
    target->kCLDTableSizeOne = source->kCLDTableSizeOne;
    target->kCLDTableSize = source->kCLDTableSize;
    target->kCLDTableKeyMask = source->kCLDTableKeyMask;
    target->kCLDTableBuildDate = source->kCLDTableBuildDate;

    // Main table: one IndirectProbBucket4 per bucket.
    const CLD2::uint32 bucketBytes = sizeof(CLD2::IndirectProbBucket4);
    const CLD2::uint32 bucketCount = source->kCLDTableSize;
    const CLD2::uint32 mainTableBytes = bucketBytes * bucketCount;

    // Indirect table: kCLDTableSizeOne entries of 32 bits each.
    CLD2::uint32 indirectBytes = source->kCLDTableSizeOne * sizeof(CLD2::uint32);

    // XXX HACK FIXME: cld2_generated_cjk_compatible.cc has a kCLDTableSizeOne
    // of zero, so the size of table 0's indirect array is hardcoded instead
    // of being derived from the summary.
    if (idx == 0) { // cld2_generated_cjk_compatible.cc
      indirectBytes = 239*2*4;
    }

    // Script list is a C string; its stored length includes the terminator.
    const CLD2::uint32 scriptListBytes =
        strlen(source->kRecognizedLangScripts) + 1;

    // Only lengths are recorded; alignAll() assigns the start offsets.
    target->lengthOf_kCLDTable = mainTableBytes;
    target->lengthOf_kCLDTableInd = indirectBytes;
    target->lengthOf_kRecognizedLangScripts = scriptListBytes;
  }
}
|
||||
|
||||
// Assuming that all fields have been set in the specified header, re-align
|
||||
// the starting positions of all data chunks to be aligned along 64-bit
|
||||
// boundaries for maximum efficiency.
|
||||
void alignAll(CLD2DynamicData::FileHeader* header, int alignment) {
|
||||
CLD2::uint32 totalPadding = 0;
|
||||
if (DEBUG) { std::cout << "Align for " << (alignment*8) << " bits." << std::endl; }
|
||||
CLD2::uint32 headerSize = CLD2DynamicData::calculateHeaderSize(
|
||||
header->numTablesEncoded);
|
||||
CLD2::uint32 offset = headerSize;
|
||||
|
||||
{ // scoping block
|
||||
int stateTablePad = alignment - (offset % alignment);
|
||||
if (stateTablePad == alignment) stateTablePad = 0;
|
||||
totalPadding += stateTablePad;
|
||||
if (DEBUG) { std::cout << "Alignment for stateTable adjusted by " << stateTablePad << std::endl; }
|
||||
offset += stateTablePad;
|
||||
header->startOf_utf8PropObj_state_table = offset;
|
||||
offset += header->lengthOf_utf8PropObj_state_table;
|
||||
}
|
||||
|
||||
{ // scoping block
|
||||
int remapPad = alignment - (offset % alignment);
|
||||
if (remapPad == alignment) remapPad = 0;
|
||||
totalPadding += remapPad;
|
||||
if (DEBUG) { std::cout << "Alignment for remap adjusted by " << remapPad << std::endl; }
|
||||
offset += remapPad;
|
||||
header->startOf_utf8PropObj_remap_base = offset;
|
||||
offset += header->lengthOf_utf8PropObj_remap_base;
|
||||
}
|
||||
|
||||
{ // scoping block
|
||||
int remapStringPad = alignment - (offset % alignment);
|
||||
if (remapStringPad == alignment) remapStringPad = 0;
|
||||
totalPadding += remapStringPad;
|
||||
if (DEBUG) { std::cout << "Alignment for remapString adjusted by " << remapStringPad << std::endl; }
|
||||
offset += remapStringPad;
|
||||
header->startOf_utf8PropObj_remap_string = offset;
|
||||
offset += header->lengthOf_utf8PropObj_remap_string; // null terminator already counted in initUtf8Headers
|
||||
}
|
||||
|
||||
{ // scoping block
|
||||
int fastStatePad = alignment - (offset % alignment);
|
||||
if (fastStatePad == alignment) fastStatePad = 0;
|
||||
totalPadding += fastStatePad;
|
||||
if (DEBUG) { std::cout << "Alignment for fastState adjusted by " << fastStatePad << std::endl; }
|
||||
offset += fastStatePad;
|
||||
if (header->lengthOf_utf8PropObj_fast_state > 0) {
|
||||
header->startOf_utf8PropObj_fast_state = offset;
|
||||
offset += header->lengthOf_utf8PropObj_fast_state; // null terminator already counted in initUtf8Headers
|
||||
} else {
|
||||
header->startOf_utf8PropObj_fast_state = 0;
|
||||
}
|
||||
}
|
||||
|
||||
{ // scoping block
|
||||
int deltaOctaPad = alignment - (offset % alignment);
|
||||
if (deltaOctaPad == alignment) deltaOctaPad = 0;
|
||||
totalPadding += deltaOctaPad;
|
||||
if (DEBUG) { std::cout << "Alignment for deltaOctaScore adjusted by " << deltaOctaPad << std::endl; }
|
||||
offset += deltaOctaPad;
|
||||
header->startOf_kAvgDeltaOctaScore = offset;
|
||||
offset += header->lengthOf_kAvgDeltaOctaScore;
|
||||
}
|
||||
|
||||
// TODO: The rest of the fields
|
||||
for (int x=0; x<header->numTablesEncoded; x++) {
|
||||
CLD2DynamicData::TableHeader& tableHeader = header->tableHeaders[x];
|
||||
int tablePad = alignment - (offset % alignment);
|
||||
if (tablePad == alignment) tablePad = 0;
|
||||
totalPadding += tablePad;
|
||||
if (DEBUG) { std::cout << "Alignment for table " << x << " adjusted by " << tablePad << std::endl; }
|
||||
offset += tablePad;
|
||||
tableHeader.startOf_kCLDTable = offset;
|
||||
offset += tableHeader.lengthOf_kCLDTable;
|
||||
|
||||
int indirectPad = alignment - (offset % alignment);
|
||||
if (indirectPad == alignment) indirectPad = 0;
|
||||
totalPadding += indirectPad;
|
||||
if (DEBUG) { std::cout << "Alignment for tableInd " << x << " adjusted by " << indirectPad << std::endl; }
|
||||
offset += indirectPad;
|
||||
tableHeader.startOf_kCLDTableInd = offset;
|
||||
offset += tableHeader.lengthOf_kCLDTableInd;
|
||||
|
||||
int scriptsPad = alignment - (offset % alignment);
|
||||
if (scriptsPad == alignment) scriptsPad = 0;
|
||||
totalPadding += scriptsPad;
|
||||
if (DEBUG) { std::cout << "Alignment for scriptsPad " << x << " adjusted by " << scriptsPad << std::endl; }
|
||||
offset += scriptsPad;
|
||||
tableHeader.startOf_kRecognizedLangScripts = offset;
|
||||
offset += tableHeader.lengthOf_kRecognizedLangScripts; // null terminator already counted in initTableHeaders
|
||||
}
|
||||
|
||||
// Now that we know exactly how much data we have written, store it in the
|
||||
// header as a sanity check
|
||||
header->totalFileSizeBytes = offset;
|
||||
|
||||
if (DEBUG) {
|
||||
std::cout << "Data aligned." << std::endl;
|
||||
std::cout << "Header size: " << headerSize << " bytes " << std::endl;
|
||||
std::cout << "Data size: " << (offset - totalPadding) << " bytes" << std::endl;
|
||||
std::cout << "Padding size: " << totalPadding << " bytes" << std::endl;
|
||||
|
||||
std::cout << " cld_generated_CjkUni_obj: " << (
|
||||
header->lengthOf_utf8PropObj_state_table +
|
||||
header->lengthOf_utf8PropObj_remap_string +
|
||||
header->lengthOf_utf8PropObj_fast_state)
|
||||
<< " bytes " << std::endl;
|
||||
std::cout << " kAvgDeltaOctaScore: "
|
||||
<< header->lengthOf_kAvgDeltaOctaScore << " bytes " << std::endl;
|
||||
std::cout << " kCjkCompat_obj: " << (
|
||||
header->tableHeaders[0].lengthOf_kCLDTable +
|
||||
header->tableHeaders[0].lengthOf_kCLDTableInd +
|
||||
header->tableHeaders[0].lengthOf_kRecognizedLangScripts + 1)
|
||||
<< " bytes " << std::endl;
|
||||
std::cout << " kCjkDeltaBi_obj: " << (
|
||||
header->tableHeaders[1].lengthOf_kCLDTable +
|
||||
header->tableHeaders[1].lengthOf_kCLDTableInd +
|
||||
header->tableHeaders[1].lengthOf_kRecognizedLangScripts + 1)
|
||||
<< " bytes " << std::endl;
|
||||
std::cout << " kDistinctBiTable_obj: " << (
|
||||
header->tableHeaders[2].lengthOf_kCLDTable +
|
||||
header->tableHeaders[2].lengthOf_kCLDTableInd +
|
||||
header->tableHeaders[2].lengthOf_kRecognizedLangScripts + 1)
|
||||
<< " bytes " << std::endl;
|
||||
std::cout << " kQuad_obj: " << (
|
||||
header->tableHeaders[3].lengthOf_kCLDTable +
|
||||
header->tableHeaders[3].lengthOf_kCLDTableInd +
|
||||
header->tableHeaders[3].lengthOf_kRecognizedLangScripts + 1)
|
||||
<< " bytes " << std::endl;
|
||||
std::cout << " kQuad_obj2: " << (
|
||||
header->tableHeaders[4].lengthOf_kCLDTable +
|
||||
header->tableHeaders[4].lengthOf_kCLDTableInd +
|
||||
header->tableHeaders[4].lengthOf_kRecognizedLangScripts + 1)
|
||||
<< " bytes " << std::endl;
|
||||
std::cout << " kDeltaOcta_obj: " << (
|
||||
header->tableHeaders[5].lengthOf_kCLDTable +
|
||||
header->tableHeaders[5].lengthOf_kCLDTableInd +
|
||||
header->tableHeaders[5].lengthOf_kRecognizedLangScripts + 1)
|
||||
<< " bytes " << std::endl;
|
||||
std::cout << " kDistinctOcta_obj: " << (
|
||||
header->tableHeaders[6].lengthOf_kCLDTable +
|
||||
header->tableHeaders[6].lengthOf_kCLDTableInd +
|
||||
header->tableHeaders[6].lengthOf_kRecognizedLangScripts + 1)
|
||||
<< " bytes " << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Records the size of the kAvgDeltaOctaScore section in the file header and
// resets its start offset to 0.
//
// header:     header whose start/length fields for kAvgDeltaOctaScore are set.
// deltaArray: the score array itself; currently unused because the length is
//             hardcoded.
void initDeltaHeaders(CLD2DynamicData::FileHeader* header, const short* deltaArray) {
  // TODO: Don't hardcode 614*4. Get constant from generated_language.cc?
  // NOTE(review): this value is later used as a byte count while the array
  // elements are `short` -- confirm whether 614*4 is elements or bytes.
  static const int kHardcodedScoreLength = 614 * 4; // from cld_generated_score_quad_octa_1024_256.cc
  (void)deltaArray;

  header->lengthOf_kAvgDeltaOctaScore = kHardcodedScoreLength;
  header->startOf_kAvgDeltaOctaScore = 0;
}
|
||||
|
||||
// Copies the scalar fields of a UTF8PropObj into the file header and records
// the byte length of each of its variable-size sections.
//
// header:     header to populate (utf8PropObj_* and lengthOf_utf8PropObj_*).
// utf8Object: source object; its fast_state pointer may be NULL.
void initUtf8Headers(CLD2DynamicData::FileHeader* header, const CLD2::UTF8PropObj* utf8Object) {
  // Section lengths first; they do not depend on the scalar copies below.
  header->lengthOf_utf8PropObj_state_table = utf8Object->total_size;
  header->lengthOf_utf8PropObj_remap_base = sizeof(CLD2::RemapEntry); // TODO: Can this ever have more than one entry?

  // NOTE(review): both lengths below are measured with strlen, which assumes
  // the data is NUL-terminated with no interior zero bytes -- confirm.
  const char* remapText =
      reinterpret_cast<const char*>(utf8Object->remap_string);
  header->lengthOf_utf8PropObj_remap_string = strlen(remapText) + 1; // note null terminator

  if (utf8Object->fast_state != NULL) {
    const char* fastStateText =
        reinterpret_cast<const char*>(utf8Object->fast_state);
    header->lengthOf_utf8PropObj_fast_state = strlen(fastStateText) + 1; // note null terminator
  } else {
    header->lengthOf_utf8PropObj_fast_state = 0; // not applicable
  }

  // Scalar properties, copied one-to-one.
  header->utf8PropObj_state0 = utf8Object->state0;
  header->utf8PropObj_state0_size = utf8Object->state0_size;
  header->utf8PropObj_total_size = utf8Object->total_size;
  header->utf8PropObj_max_expand = utf8Object->max_expand;
  header->utf8PropObj_entry_shift = utf8Object->entry_shift;
  header->utf8PropObj_bytes_per_entry = utf8Object->bytes_per_entry;
  header->utf8PropObj_losub = utf8Object->losub;
  header->utf8PropObj_hiadd = utf8Object->hiadd;
}
|
||||
} // End namespace CLD2DynamicDataExtractor
|
Reference in New Issue
Block a user