Enable Dynamic Mode for CLD2. See issue 6 for more information on dynamic mode:

https://code.google.com/p/cld2/issues/detail?id=6


git-svn-id: https://cld2.googlecode.com/svn/trunk@151 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
andrewhayden@google.com
2014-03-03 15:20:05 +00:00
parent 0fb71b3bda
commit cffbd73e13
12 changed files with 1541 additions and 23 deletions

View File

@@ -0,0 +1,236 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cld2_dynamic_data.h"
#include "integral_types.h"
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <iostream>
namespace CLD2DynamicData {
// File-local verbosity flag. The verification code in this file tests it with
// `if (DEBUG)` before printing progress to stdout; zero keeps it quiet.
static int DEBUG = 0;

// Stores the requested debug level in the file-local DEBUG flag.
void setDebug(int debug) {
  DEBUG = debug;
}
// Byte-wise comparison of two buffers over `length` bytes.
//
// Returns true when every byte matches (including when `length` is zero or
// negative, in which case nothing is inspected). On the first mismatch the
// offending offset is written to stderr, followed by up to five bytes of
// context on either side, and false is returned.
bool mem_compare(const void* data1, const void* data2, const int length) {
  const unsigned char* lhs = static_cast<const unsigned char*>(data1);
  const unsigned char* rhs = static_cast<const unsigned char*>(data2);

  // Walk forward until the buffers diverge or we run out of bytes.
  int pos = 0;
  while (pos < length && lhs[pos] == rhs[pos]) {
    ++pos;
  }
  if (pos >= length) {
    return true;  // no difference found
  }

  std::cerr << "mem difference at data[" << pos << "]: decimal "
            << static_cast<unsigned int>(lhs[pos]) << " != decimal "
            << static_cast<unsigned int>(rhs[pos]) << std::endl;

  // Context window: lower edge clamped at zero, upper edge clamped at length.
  const int windowStart = (pos > 5) ? (pos - 5) : 0;
  for (int ctx = windowStart; ctx < length && ctx <= pos + 5; ++ctx) {
    std::cerr << "[" << ctx << "]: " << static_cast<unsigned int>(lhs[ctx])
              << " <-> " << static_cast<unsigned int>(rhs[ctx])
              << (pos == ctx ? " [FIRST ERROR DETECTED HERE] " : "")
              << std::endl;
  }
  return false;
}
// Computes the byte size of a file header that describes `numTables` tables:
// the marker string (stored without a null terminator), the 20 fixed uint32
// fields of FileHeader, and 10 uint32 fields for each table header.
CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables) {
  const CLD2::uint32 fixedFieldBytes = 20 * sizeof(CLD2::uint32);
  const CLD2::uint32 perTableBytes = 10 * sizeof(CLD2::uint32);
  return DATA_FILE_MARKER_LENGTH + fixedFieldBytes + (numTables * perTableBytes);
}
// Prints every field of `header` to stdout, one "name: value" line per field,
// followed by one indented section per encoded table.
//
// The marker string is stored without a null terminator, so it is copied into
// a local, terminated buffer before printing. Table sections are labelled with
// the names of the seven tables in their fixed encoding order; any table
// index beyond the known names is labelled "unknown" rather than reading past
// the end of the name list.
void dumpHeader(FileHeader* header) {
  char safeString[DATA_FILE_MARKER_LENGTH + 1];
  memcpy(safeString, header->sanityString, DATA_FILE_MARKER_LENGTH);
  safeString[DATA_FILE_MARKER_LENGTH] = 0;
  std::cout << "sanityString: " << safeString << std::endl;
  std::cout << "totalFileSizeBytes: " << header->totalFileSizeBytes << std::endl;
  std::cout << "utf8PropObj_state0: " << header->utf8PropObj_state0 << std::endl;
  std::cout << "utf8PropObj_state0_size: " << header->utf8PropObj_state0_size << std::endl;
  std::cout << "utf8PropObj_total_size: " << header->utf8PropObj_total_size << std::endl;
  std::cout << "utf8PropObj_max_expand: " << header->utf8PropObj_max_expand << std::endl;
  std::cout << "utf8PropObj_entry_shift: " << header->utf8PropObj_entry_shift << std::endl;
  std::cout << "utf8PropObj_bytes_per_entry: " << header->utf8PropObj_bytes_per_entry << std::endl;
  std::cout << "utf8PropObj_losub: " << header->utf8PropObj_losub << std::endl;
  std::cout << "utf8PropObj_hiadd: " << header->utf8PropObj_hiadd << std::endl;
  std::cout << "startOf_utf8PropObj_state_table: " << header->startOf_utf8PropObj_state_table << std::endl;
  std::cout << "lengthOf_utf8PropObj_state_table: " << header->lengthOf_utf8PropObj_state_table << std::endl;
  std::cout << "startOf_utf8PropObj_remap_base: " << header->startOf_utf8PropObj_remap_base << std::endl;
  std::cout << "lengthOf_utf8PropObj_remap_base: " << header->lengthOf_utf8PropObj_remap_base << std::endl;
  std::cout << "startOf_utf8PropObj_remap_string: " << header->startOf_utf8PropObj_remap_string << std::endl;
  std::cout << "lengthOf_utf8PropObj_remap_string: " << header->lengthOf_utf8PropObj_remap_string << std::endl;
  std::cout << "startOf_utf8PropObj_fast_state: " << header->startOf_utf8PropObj_fast_state << std::endl;
  std::cout << "lengthOf_utf8PropObj_fast_state: " << header->lengthOf_utf8PropObj_fast_state << std::endl;
  std::cout << "startOf_kAvgDeltaOctaScore: " << header->startOf_kAvgDeltaOctaScore << std::endl;
  std::cout << "lengthOf_kAvgDeltaOctaScore: " << header->lengthOf_kAvgDeltaOctaScore << std::endl;
  std::cout << "numTablesEncoded: " << header->numTablesEncoded << std::endl;

  // Display names, in the order the tables are encoded.
  static const char* const tableNames[] = {
    "unigram_compat_obj",
    "deltabi_obj",
    "distinctbi_obj",
    "quadgram_obj",
    "quadgram_obj2",
    "deltaocta_obj",
    "distinctocta_obj",
  };
  const CLD2::uint32 numKnownNames = sizeof(tableNames) / sizeof(tableNames[0]);

  // Unsigned index: numTablesEncoded is a uint32 read from the header.
  for (CLD2::uint32 x = 0; x < header->numTablesEncoded; x++) {
    TableHeader& tHeader = header->tableHeaders[x];
    // A header may claim more tables than we have names for; never index
    // past the end of tableNames.
    const char* tableName = (x < numKnownNames) ? tableNames[x] : "unknown";
    std::cout << "Table " << (x+1) << ": (" << tableName << ")" << std::endl;
    std::cout << " kCLDTableSizeOne: " << tHeader.kCLDTableSizeOne << std::endl;
    std::cout << " kCLDTableSize: " << tHeader.kCLDTableSize << std::endl;
    std::cout << " kCLDTableKeyMask: " << tHeader.kCLDTableKeyMask << std::endl;
    std::cout << " kCLDTableBuildDate: " << tHeader.kCLDTableBuildDate << std::endl;
    std::cout << " startOf_kCLDTable: " << tHeader.startOf_kCLDTable << std::endl;
    std::cout << " lengthOf_kCLDTable: " << tHeader.lengthOf_kCLDTable << std::endl;
    std::cout << " startOf_kCLDTableInd: " << tHeader.startOf_kCLDTableInd << std::endl;
    std::cout << " lengthOf_kCLDTableInd: " << tHeader.lengthOf_kCLDTableInd << std::endl;
    std::cout << " startOf_kRecognizedLangScripts: " << tHeader.startOf_kRecognizedLangScripts << std::endl;
    std::cout << " lengthOf_kRecognizedLangScripts: " << tHeader.lengthOf_kRecognizedLangScripts << std::endl;
  }
}
// Field-comparison helpers for verify().
//
// Both macros expect pointers named `loadedData` and `realData` to be in scope
// and must be used inside a function returning bool: on a mismatch they print
// a diagnostic to stderr and `return false` from the enclosing function.
//
// Each body is wrapped in do { ... } while (0) so that an invocation followed
// by a semicolon forms exactly one statement and can be used safely in an
// unbraced if/else.

// Compares the scalar member `name` of both objects with operator!=.
#define CHECK_EQUALS(name) do {\
  if (loadedData->name != realData->name) {\
    std::cerr << #name << ": " << loadedData->name << " != " << realData->name << std::endl;\
    return false;\
  }\
} while (0)

// Compares `size` bytes of the buffers that member `name` points at, using
// mem_compare (which reports the first differing offset).
#define CHECK_MEM_EQUALS(name,size) do {\
  if (!mem_compare(loadedData->name,realData->name,size)) {\
    std::cerr << #name << ": data mismatch." << std::endl;\
    return false;\
  }\
} while (0)
// Compares two ScoringTables field by field and block by block.
//
// Checks, in order: the scalar members and data blocks of unigram_obj, the
// kExpectedScore array, and then each of the seven table summaries. Returns
// true only if everything matches; the first mismatch is reported on stderr
// (by the CHECK_* macros) and false is returned immediately.
bool verify(const CLD2::ScoringTables* realData, const CLD2::ScoringTables* loadedData) {
  const int NUM_TABLES = 7;
  // Both lists use the same fixed table order so that entries pair up by index.
  const CLD2::CLD2TableSummary* realTableSummaries[NUM_TABLES] = {
    realData->unigram_compat_obj,
    realData->deltabi_obj,
    realData->distinctbi_obj,
    realData->quadgram_obj,
    realData->quadgram_obj2,
    realData->deltaocta_obj,
    realData->distinctocta_obj,
  };
  const CLD2::CLD2TableSummary* loadedTableSummaries[NUM_TABLES] = {
    loadedData->unigram_compat_obj,
    loadedData->deltabi_obj,
    loadedData->distinctbi_obj,
    loadedData->quadgram_obj,
    loadedData->quadgram_obj2,
    loadedData->deltaocta_obj,
    loadedData->distinctocta_obj,
  };

  // 1. UTF8 property object: scalars first, then the data blocks.
  CHECK_EQUALS(unigram_obj->state0);
  CHECK_EQUALS(unigram_obj->state0_size);
  CHECK_EQUALS(unigram_obj->total_size);
  CHECK_EQUALS(unigram_obj->max_expand);
  CHECK_EQUALS(unigram_obj->entry_shift);
  CHECK_EQUALS(unigram_obj->bytes_per_entry);
  CHECK_EQUALS(unigram_obj->losub);
  CHECK_EQUALS(unigram_obj->hiadd);
  CHECK_MEM_EQUALS(unigram_obj->state_table, realData->unigram_obj->total_size);
  CHECK_MEM_EQUALS(unigram_obj->remap_base, sizeof(CLD2::RemapEntry)); // TODO: can this have more than one entry?
  CHECK_MEM_EQUALS(unigram_obj->remap_string,
      strlen(reinterpret_cast<const char*>(realData->unigram_obj->remap_string)) + 1); // null terminator included

  // fast_state is optional: it must be present on both sides or on neither.
  const bool loadedHasFastState = (loadedData->unigram_obj->fast_state != NULL);
  const bool realHasFastState = (realData->unigram_obj->fast_state != NULL);
  if (!loadedHasFastState && realHasFastState) {
    std::cerr << "unigram_obj->fast_state is missing." << std::endl;
    return false;
  }
  if (loadedHasFastState && !realHasFastState) {
    std::cerr << "unigram_obj->fast_state shouldn't be present." << std::endl;
    return false;
  }
  if (loadedHasFastState) {
    CHECK_MEM_EQUALS(unigram_obj->fast_state,
        strlen(reinterpret_cast<const char*>(realData->unigram_obj->fast_state)) + 1); // null terminator included
  }
  if (DEBUG) std::cout << "verified." << std::endl;

  // 2. Expected-score array.
  if (DEBUG) std::cout << "Verifying kExpectedScore... ";
  CHECK_MEM_EQUALS(kExpectedScore, 614*4); // TODO: Don't hardcode 614*4.
  if (DEBUG) std::cout << "verified." << std::endl;

  // 3. Each table
  for (int tableIndex = 0; tableIndex < NUM_TABLES; ++tableIndex) {
    if (DEBUG) std::cout << "Verifying table " << (tableIndex+1) << "... ";
    // These deliberately shadow the function parameters: the CHECK_* macros
    // always operate on whatever is called realData / loadedData.
    const CLD2::CLD2TableSummary* realData = realTableSummaries[tableIndex];
    const CLD2::CLD2TableSummary* loadedData = loadedTableSummaries[tableIndex];

    // Block lengths are derived from the reference data so that the byte
    // comparisons below know how much to inspect.
    const CLD2::uint32 tableSizeBytes =
        sizeof(CLD2::IndirectProbBucket4) * realData->kCLDTableSize;
    // XXX HACK FIXME: cld2_generated_cjk_compatible.cc (table 0) has a
    // kCLDTableSizeOne of zero, so its indirect-table size is hardcoded.
    const CLD2::uint32 indirectTableSizeBytes = (tableIndex == 0)
        ? 239*2*4
        : realData->kCLDTableSizeOne * sizeof(CLD2::uint32);
    const CLD2::uint32 recognizedScriptsSizeBytes =
        strlen(realData->kRecognizedLangScripts) + 1; // null terminator included

    // Verify the table data
    CHECK_EQUALS(kCLDTableSizeOne);
    CHECK_EQUALS(kCLDTableSize);
    CHECK_EQUALS(kCLDTableKeyMask);
    CHECK_EQUALS(kCLDTableBuildDate);
    CHECK_MEM_EQUALS(kCLDTable, tableSizeBytes);
    CHECK_MEM_EQUALS(kCLDTableInd, indirectTableSizeBytes);
    CHECK_MEM_EQUALS(kRecognizedLangScripts, recognizedScriptsSizeBytes);
    if (DEBUG) std::cout << "verified." << std::endl;
  }
  if (DEBUG) std::cout << "All data verified successfully." << std::endl;
  return true;
}
// Reports whether the host stores multi-byte integers least-significant byte
// first.
//
// A known 32-bit pattern is inspected through an `unsigned char` pointer,
// which C++ permits for any object; reading the inactive member of a union is
// undefined behaviour in C++. As noted on
// http://stackoverflow.com/questions/1001307, a check of this kind is
// typically folded to a constant by an optimizing compiler, so branches based
// on it are expected to cost nothing at run time.
bool isLittleEndian() {
  const uint32_t probe = 0x01020304;
  const unsigned char* firstByte =
      reinterpret_cast<const unsigned char*>(&probe);
  // On a little-endian host the lowest-order byte (0x04) is stored first.
  return *firstByte == 4;
}
// Checks that the CLD2 fixed-width integer typedefs have the sizes their names
// promise (1, 2 and 4 bytes). The first type found to have an unexpected size
// is reported on stderr and false is returned; true means all three are fine.
bool coreAssumptionsOk() {
  if (sizeof(CLD2::uint8) != 1) {
    std::cerr << "uint8 is " << (sizeof(CLD2::uint8) * 8)
              << " bits instead of 8!" << std::endl;
  } else if (sizeof(CLD2::uint16) != 2) {
    std::cerr << "uint16 is " << (sizeof(CLD2::uint16) * 8)
              << " bits instead of 16!" << std::endl;
  } else if (sizeof(CLD2::uint32) != 4) {
    std::cerr << "uint32 is " << (sizeof(CLD2::uint32) * 8)
              << " bits instead of 32!" << std::endl;
  } else {
    return true;
  }
  return false;
}
} // End namespace CLD2DynamicData

View File

@@ -0,0 +1,216 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_
#define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_
#include "integral_types.h"
#include "cld2tablesummary.h"
#include "utf8statetable.h"
#include "scoreonescriptspan.h"
/*
There are two primary parts to a CLD2 dynamic data file:
1. A header, wherein trivial data, block lengths and block offsets are kept
2. A data block, wherein the large binary blocks are kept
By reading the header, an application can determine the offsets and lengths of
all the data blocks for all tables. Offsets in the header are expressed
relative to the first byte of the file, inclusive of the header itself; thus,
any offset whose value is less than the length of the header is invalid.
Any offset whose value is zero indicates a field that is null in the
underlying CLD2 data; a real example of this is the fast_state field of the
UTF8PropObj, which may be null.
The size of the header can be precalculated by calling calculateHeaderSize(),
which will indicate the exact size of the header for a data file that contains
a given number of CLD2TableSummary objects.
Notes on endianness:
The data format is only suitable for little-endian machines. For big-endian
systems, a tedious transformation would need to be made first to reverse the
byte order of significant portions of the binary - not just the lengths, but
also some of the underlying table data.
Note on 32/64 bit:
The data format is agnostic to 32/64 bit pointers. All the offsets within the
data blob itself are 32-bit values relative to the start of the file, and the
file should certainly never be gigabytes in size!
When the file is ultimately read by the loading code and mmap()'d, new
pointers are generated at whatever size the system uses, initialized to the
start of the mmap, and incremented by the 32-bit offset. This should be safe
regardless of 32- or 64-bit architectures.
--------------------------------------------------------------------
FIELD
--------------------------------------------------------------------
DATA_FILE_MARKER (no null terminator)
total file size (sanity check, uint32)
--------------------------------------------------------------------
UTF8PropObj: const uint32 state0
UTF8PropObj: const uint32 state0_size
UTF8PropObj: const uint32 total_size
UTF8PropObj: const int max_expand
UTF8PropObj: const int entry_shift (coerced to 32 bits)
UTF8PropObj: const int bytes_per_entry (coerced to 32 bits)
UTF8PropObj: const uint32 losub
UTF8PropObj: const uint32 hiadd
offset of UTF8PropObj: const uint8* state_table
length of UTF8PropObj: const uint8* state_table
offset of UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
length of UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
offset of UTF8PropObj: const uint8* remap_string
length of UTF8PropObj: const uint8* remap_string
offset of UTF8PropObj: const uint8* fast_state
length of UTF8PropObj: const uint8* fast_state
--------------------------------------------------------------------
start of const short kAvgDeltaOctaScore[]
length of const short kAvgDeltaOctaScore[]
--------------------------------------------------------------------
number of CLD2TableSummary objects encoded (n)
[Table 1]: CLD2TableSummary: uint32 kCLDTableSizeOne
[Table 1]: CLD2TableSummary: uint32 kCLDTableSize
[Table 1]: CLD2TableSummary: uint32 kCLDTableKeyMask
[Table 1]: CLD2TableSummary: uint32 kCLDTableBuildDate
[Table 1]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
[Table 1]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
[Table 1]: offset of CLD2TableSummary: const uint32* kCLDTableInd
[Table 1]: length of CLD2TableSummary: const uint32* kCLDTableInd
[Table 1]: offset of CLD2TableSummary: const char* kRecognizedLangScripts
[Table 1]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1
.
.
.
[Table n]: CLD2TableSummary: uint32 kCLDTableSizeOne
[Table n]: CLD2TableSummary: uint32 kCLDTableSize
[Table n]: CLD2TableSummary: uint32 kCLDTableKeyMask
[Table n]: CLD2TableSummary: uint32 kCLDTableBuildDate
[Table n]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
[Table n]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
[Table n]: offset of CLD2TableSummary: const uint32* kCLDTableInd
[Table n]: length of CLD2TableSummary: const uint32* kCLDTableInd
[Table n]: offset of CLD2TableSummary: const char* kRecognizedLangScripts
[Table n]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1
--------------------------------------------------------------------
Immediately after the header fields comes the data block. The data block has
the following content, in this order (note that padding is applied in order to
keep lookups word-aligned):
UTF8PropObj: const uint8* state_table
UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
UTF8PropObj: const uint8* remap_string
UTF8PropObj: const uint8* fast_state
const short kAvgDeltaOctaScore[]
[Table 1]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable
[Table 1]: CLD2TableSummary: const uint32* kCLDTableInd
[Table 1]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator)
.
.
.
[Table n]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable
[Table n]: CLD2TableSummary: const uint32* kCLDTableInd
[Table n]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator)
It is STRONGLY recommended that the chunks within the data block be kept
128-bit aligned for efficiency reasons, although the code will work without
such alignment: the main lookup tables have randomly-accessed groups of four
4-byte entries, and these must be 16-byte aligned to avoid the performance
cost of multiple cache misses per group.
*/
namespace CLD2DynamicData {
// Marker string that the extractor writes as the very first bytes of a data
// file. It is stored in the file without its null terminator.
static const char* DATA_FILE_MARKER = "cld2_data_file00";
// Number of marker bytes stored in the file (the terminator is not counted).
static const int DATA_FILE_MARKER_LENGTH = 16; // Keep aligned to 128 bits
// Nicer version of memcmp that shows the offset at which bytes differ.
// Returns true if the first `length` bytes of both buffers are identical;
// otherwise reports the first differing offset on stderr and returns false.
bool mem_compare(const void* data1, const void* data2, const int length);
// Enable or disable debugging; 0 to disable, 1 to enable
void setDebug(int debug);
// Header record describing one CLD2TableSummary. A file header carries one of
// these per encoded table.
struct TableHeader {
  // Scalar values copied straight from the CLD2TableSummary.
  CLD2::uint32 kCLDTableSizeOne;
  CLD2::uint32 kCLDTableSize;
  CLD2::uint32 kCLDTableKeyMask;
  CLD2::uint32 kCLDTableBuildDate;
  // Location of each data block: offset from the first byte of the file, and
  // length in bytes.
  CLD2::uint32 startOf_kCLDTable;
  CLD2::uint32 lengthOf_kCLDTable;
  CLD2::uint32 startOf_kCLDTableInd;
  CLD2::uint32 lengthOf_kCLDTableInd;
  CLD2::uint32 startOf_kRecognizedLangScripts;
  CLD2::uint32 lengthOf_kRecognizedLangScripts;  // includes the null terminator
};
// Top-level structure for a CLD2 Data File Header.
// Contains all the primitive fields for the header as well as an array of
// headers for the individual tables.
typedef struct {
// Marker fields help recognize and verify the data file
char sanityString[DATA_FILE_MARKER_LENGTH];
CLD2::uint32 totalFileSizeBytes;
// UTF8 primitives
CLD2::uint32 utf8PropObj_state0;
CLD2::uint32 utf8PropObj_state0_size;
CLD2::uint32 utf8PropObj_total_size;
CLD2::uint32 utf8PropObj_max_expand;
CLD2::uint32 utf8PropObj_entry_shift;
CLD2::uint32 utf8PropObj_bytes_per_entry;
CLD2::uint32 utf8PropObj_losub;
CLD2::uint32 utf8PropObj_hiadd;
CLD2::uint32 startOf_utf8PropObj_state_table;
CLD2::uint32 lengthOf_utf8PropObj_state_table;
CLD2::uint32 startOf_utf8PropObj_remap_base;
CLD2::uint32 lengthOf_utf8PropObj_remap_base;
CLD2::uint32 startOf_utf8PropObj_remap_string;
CLD2::uint32 lengthOf_utf8PropObj_remap_string;
CLD2::uint32 startOf_utf8PropObj_fast_state;
CLD2::uint32 lengthOf_utf8PropObj_fast_state;
// Average delta-octa-score bits
CLD2::uint32 startOf_kAvgDeltaOctaScore;
CLD2::uint32 lengthOf_kAvgDeltaOctaScore;
// Table bits
CLD2::uint32 numTablesEncoded;
TableHeader* tableHeaders;
} FileHeader;
// Calculate the exact size of a header that encodes the specified number of
// tables. This can be used to reserve space within the data file,
// calculate offsets, and so on.
CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables);
// Dump a given header to stdout as a human-readable string.
void dumpHeader(FileHeader* header);
// Verify that a given pair of scoring tables match precisely.
// Returns true if everything that is compared matches; on the first mismatch
// a diagnostic is written to stderr and false is returned.
bool verify(const CLD2::ScoringTables* realData, const CLD2::ScoringTables* loadedData);
// Return true iff the program is running in little-endian mode.
bool isLittleEndian();
// Return true iff the core size assumptions are ok on this platform.
bool coreAssumptionsOk();
} // End namespace CLD2DynamicData
#endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_

View File

@@ -0,0 +1,376 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include "cld2_dynamic_data.h"
#include "cld2_dynamic_data_extractor.h"
#include "cld2_dynamic_data_loader.h" // for verifying the written data
#include "integral_types.h"
#include "cld2tablesummary.h"
#include "utf8statetable.h"
using namespace std;
namespace CLD2DynamicDataExtractor {
// File-local verbosity flag. The extractor routines in this file test it with
// `if (DEBUG)` before printing progress to stdout; zero keeps them quiet.
static int DEBUG = 0;

// Stores the requested debug level in the file-local DEBUG flag.
void setDebug(int debug) {
  DEBUG = debug;
}
// Pads `f` with zero bytes until its current position reaches `position`.
//
// Returns the number of padding bytes actually written. That is zero when the
// file is already at or beyond `position`, and also when the current position
// cannot be determined (in which case a message is written to stderr and
// nothing is written to the file). Padding stops early if a write fails.
int advance(FILE* f, CLD2::uint32 position) {
  const char ZERO = 0;
  const long current = ftell(f);
  if (current < 0) {
    std::cerr << "advance: unable to determine the current file position."
              << std::endl;
    return 0;
  }
  const long pad = static_cast<long>(position) - current;
  if (DEBUG) std::cout << " Adding " << pad << " bytes of padding" << std::endl;
  // Count the bytes as they are written; the countdown variable itself ends
  // up at zero (or below), so it cannot serve as the return value.
  int written = 0;
  for (long remaining = pad; remaining > 0; --remaining) {
    if (fwrite(&ZERO, 1, 1, f) != 1) {
      break;  // write error: report only what actually reached the file
    }
    ++written;
  }
  return written;
}
// Writes one data block to `f` at file offset `startAt`: first pads the file
// up to that offset via advance(), then writes `length` bytes from `data`.
// Progress is echoed to stdout when DEBUG is set.
void writeChunk(FILE *f, const void* data, CLD2::uint32 startAt, CLD2::uint32 length) {
  if (DEBUG) {
    std::cout << "Write chunk @" << startAt << ", len=" << length << std::endl;
  }
  advance(f, startAt);
  if (DEBUG) {
    std::cout << " Writing " << length << " bytes of data" << std::endl;
  }
  fwrite(data, 1, length, f);
}
void writeDataFile(const CLD2::ScoringTables* data, const char* fileName) {
// The order here is hardcoded and MUST NOT BE CHANGED, else you will de-sync
// with the reading code.
const char ZERO = 0;
const int NUM_TABLES = 7;
const CLD2::CLD2TableSummary* tableSummaries[NUM_TABLES];
tableSummaries[0] = data->unigram_compat_obj;
tableSummaries[1] = data->deltabi_obj;
tableSummaries[2] = data->distinctbi_obj;
tableSummaries[3] = data->quadgram_obj;
tableSummaries[4] = data->quadgram_obj2;
tableSummaries[5] = data->deltaocta_obj;
tableSummaries[6] = data->distinctocta_obj;
CLD2DynamicData::TableHeader tableHeaders[NUM_TABLES];
CLD2DynamicData::FileHeader fileHeader;
fileHeader.numTablesEncoded = NUM_TABLES;
fileHeader.tableHeaders = tableHeaders;
initUtf8Headers(&fileHeader, data->unigram_obj);
initDeltaHeaders(&fileHeader, data->kExpectedScore);
initTableHeaders(tableSummaries, NUM_TABLES, tableHeaders);
alignAll(&fileHeader, 16); // Align all sections to 128-bit boundaries
// We are ready to rock.
for (int x=0; x<CLD2DynamicData::DATA_FILE_MARKER_LENGTH; x++)
fileHeader.sanityString[x] = CLD2DynamicData::DATA_FILE_MARKER[x];
FILE* outFile = fopen(fileName, "w");
fwrite(fileHeader.sanityString, 1, CLD2DynamicData::DATA_FILE_MARKER_LENGTH, outFile);
fwrite(&(fileHeader.totalFileSizeBytes), 4, 1, outFile);
fwrite(&(fileHeader.utf8PropObj_state0), 4, 1, outFile);
fwrite(&(fileHeader.utf8PropObj_state0_size), 4, 1, outFile);
fwrite(&(fileHeader.utf8PropObj_total_size), 4, 1, outFile);
fwrite(&(fileHeader.utf8PropObj_max_expand), 4, 1, outFile);
fwrite(&(fileHeader.utf8PropObj_entry_shift), 4, 1, outFile);
fwrite(&(fileHeader.utf8PropObj_bytes_per_entry), 4, 1, outFile);
fwrite(&(fileHeader.utf8PropObj_losub), 4, 1, outFile);
fwrite(&(fileHeader.utf8PropObj_hiadd), 4, 1, outFile);
fwrite(&(fileHeader.startOf_utf8PropObj_state_table), 4, 1, outFile);
fwrite(&(fileHeader.lengthOf_utf8PropObj_state_table), 4, 1, outFile);
fwrite(&(fileHeader.startOf_utf8PropObj_remap_base), 4, 1, outFile);
fwrite(&(fileHeader.lengthOf_utf8PropObj_remap_base), 4, 1, outFile);
fwrite(&(fileHeader.startOf_utf8PropObj_remap_string), 4, 1, outFile);
fwrite(&(fileHeader.lengthOf_utf8PropObj_remap_string), 4, 1, outFile);
fwrite(&(fileHeader.startOf_utf8PropObj_fast_state), 4, 1, outFile);
fwrite(&(fileHeader.lengthOf_utf8PropObj_fast_state), 4, 1, outFile);
fwrite(&(fileHeader.startOf_kAvgDeltaOctaScore), 4, 1, outFile);
fwrite(&(fileHeader.lengthOf_kAvgDeltaOctaScore), 4, 1, outFile);
fwrite(&(fileHeader.numTablesEncoded), 4, 1, outFile);
for (int x=0; x<NUM_TABLES; x++) {
CLD2DynamicData::TableHeader& tHeader = fileHeader.tableHeaders[x];
fwrite(&(tHeader.kCLDTableSizeOne), 4, 1, outFile);
fwrite(&(tHeader.kCLDTableSize), 4, 1, outFile);
fwrite(&(tHeader.kCLDTableKeyMask), 4, 1, outFile);
fwrite(&(tHeader.kCLDTableBuildDate), 4, 1, outFile);
fwrite(&(tHeader.startOf_kCLDTable), 4, 1, outFile);
fwrite(&(tHeader.lengthOf_kCLDTable), 4, 1, outFile);
fwrite(&(tHeader.startOf_kCLDTableInd), 4, 1, outFile);
fwrite(&(tHeader.lengthOf_kCLDTableInd), 4, 1, outFile);
fwrite(&(tHeader.startOf_kRecognizedLangScripts), 4, 1, outFile);
fwrite(&(tHeader.lengthOf_kRecognizedLangScripts), 4, 1, outFile);
}
// Write data blob
// 1. UTF8 Object
writeChunk(outFile,
data->unigram_obj->state_table,
fileHeader.startOf_utf8PropObj_state_table,
fileHeader.lengthOf_utf8PropObj_state_table);
// FIXME: Unsafe to rely on this since RemapEntry is not a bit-packed structure
writeChunk(outFile,
data->unigram_obj->remap_base,
fileHeader.startOf_utf8PropObj_remap_base,
fileHeader.lengthOf_utf8PropObj_remap_base);
writeChunk(outFile,
data->unigram_obj->remap_string,
fileHeader.startOf_utf8PropObj_remap_string,
fileHeader.lengthOf_utf8PropObj_remap_string - 1);
fwrite(&ZERO,1,1,outFile); // null terminator
if (fileHeader.startOf_utf8PropObj_fast_state > 0) {
writeChunk(outFile,
data->unigram_obj->fast_state,
fileHeader.startOf_utf8PropObj_fast_state,
fileHeader.lengthOf_utf8PropObj_fast_state - 1);
fwrite(&ZERO,1,1,outFile); // null terminator
}
// 2. kAvgDeltaOctaScore array
writeChunk(outFile,
data->kExpectedScore,
fileHeader.startOf_kAvgDeltaOctaScore,
fileHeader.lengthOf_kAvgDeltaOctaScore);
// 3. Each table
for (int x=0; x<NUM_TABLES; x++) {
const CLD2::CLD2TableSummary* summary = tableSummaries[x];
CLD2DynamicData::TableHeader& tHeader = fileHeader.tableHeaders[x];
// NB: Safe to directly write IndirectProbBucket4 as it is just an alias for CLD2::uint32
writeChunk(outFile,
summary->kCLDTable,
tHeader.startOf_kCLDTable,
tHeader.lengthOf_kCLDTable);
writeChunk(outFile,
summary->kCLDTableInd,
tHeader.startOf_kCLDTableInd,
tHeader.lengthOf_kCLDTableInd);
writeChunk(outFile,
summary->kRecognizedLangScripts,
tHeader.startOf_kRecognizedLangScripts,
tHeader.lengthOf_kRecognizedLangScripts - 1);
fwrite(&ZERO,1,1,outFile); // null terminator
}
fclose(outFile);
}
// Fills tableHeaders[0..numSummaries) from the matching entries of
// `summaries`: the four scalar values are copied and the byte length of each
// data block is computed. Start offsets are not set here.
void initTableHeaders(const CLD2::CLD2TableSummary** summaries,
    int numSummaries, CLD2DynamicData::TableHeader* tableHeaders) {
  for (int idx = 0; idx < numSummaries; ++idx) {
    const CLD2::CLD2TableSummary& source = *summaries[idx];
    CLD2DynamicData::TableHeader* target = &tableHeaders[idx];

    // Scalars are carried over unchanged.
    target->kCLDTableSizeOne = source.kCLDTableSizeOne;
    target->kCLDTableSize = source.kCLDTableSize;
    target->kCLDTableKeyMask = source.kCLDTableKeyMask;
    target->kCLDTableBuildDate = source.kCLDTableBuildDate;

    // Main table: one IndirectProbBucket4 per bucket.
    target->lengthOf_kCLDTable =
        sizeof(CLD2::IndirectProbBucket4) * source.kCLDTableSize;

    // Indirect table: one uint32 per entry, except for the first table.
    // XXX HACK FIXME: cld2_generated_cjk_compatible.cc (table 0) has a
    // kCLDTableSizeOne of zero, so its size is hardcoded instead.
    if (idx == 0) {
      target->lengthOf_kCLDTableInd = 239*2*4;
    } else {
      target->lengthOf_kCLDTableInd =
          source.kCLDTableSizeOne * sizeof(CLD2::uint32);
    }

    // Script list: string length plus its null terminator.
    target->lengthOf_kRecognizedLangScripts =
        strlen(source.kRecognizedLangScripts) + 1;
  }
}
// Assuming that all fields have been set in the specified header, re-align
// the starting positions of all data chunks so that each one begins on a
// multiple of `alignment` bytes (writeDataFile passes 16, i.e. 128-bit
// boundaries) for maximum efficiency.
void alignAll(CLD2DynamicData::FileHeader* header, int alignment) {
CLD2::uint32 totalPadding = 0;
if (DEBUG) { std::cout << "Align for " << (alignment*8) << " bits." << std::endl; }
CLD2::uint32 headerSize = CLD2DynamicData::calculateHeaderSize(
header->numTablesEncoded);
CLD2::uint32 offset = headerSize;
{ // scoping block
int stateTablePad = alignment - (offset % alignment);
if (stateTablePad == alignment) stateTablePad = 0;
totalPadding += stateTablePad;
if (DEBUG) { std::cout << "Alignment for stateTable adjusted by " << stateTablePad << std::endl; }
offset += stateTablePad;
header->startOf_utf8PropObj_state_table = offset;
offset += header->lengthOf_utf8PropObj_state_table;
}
{ // scoping block
int remapPad = alignment - (offset % alignment);
if (remapPad == alignment) remapPad = 0;
totalPadding += remapPad;
if (DEBUG) { std::cout << "Alignment for remap adjusted by " << remapPad << std::endl; }
offset += remapPad;
header->startOf_utf8PropObj_remap_base = offset;
offset += header->lengthOf_utf8PropObj_remap_base;
}
{ // scoping block
int remapStringPad = alignment - (offset % alignment);
if (remapStringPad == alignment) remapStringPad = 0;
totalPadding += remapStringPad;
if (DEBUG) { std::cout << "Alignment for remapString adjusted by " << remapStringPad << std::endl; }
offset += remapStringPad;
header->startOf_utf8PropObj_remap_string = offset;
offset += header->lengthOf_utf8PropObj_remap_string; // null terminator already counted in initUtf8Headers
}
{ // scoping block
int fastStatePad = alignment - (offset % alignment);
if (fastStatePad == alignment) fastStatePad = 0;
totalPadding += fastStatePad;
if (DEBUG) { std::cout << "Alignment for fastState adjusted by " << fastStatePad << std::endl; }
offset += fastStatePad;
if (header->lengthOf_utf8PropObj_fast_state > 0) {
header->startOf_utf8PropObj_fast_state = offset;
offset += header->lengthOf_utf8PropObj_fast_state; // null terminator already counted in initUtf8Headers
} else {
header->startOf_utf8PropObj_fast_state = 0;
}
}
{ // scoping block
int deltaOctaPad = alignment - (offset % alignment);
if (deltaOctaPad == alignment) deltaOctaPad = 0;
totalPadding += deltaOctaPad;
if (DEBUG) { std::cout << "Alignment for deltaOctaScore adjusted by " << deltaOctaPad << std::endl; }
offset += deltaOctaPad;
header->startOf_kAvgDeltaOctaScore = offset;
offset += header->lengthOf_kAvgDeltaOctaScore;
}
// TODO: The rest of the fields
for (int x=0; x<header->numTablesEncoded; x++) {
CLD2DynamicData::TableHeader& tableHeader = header->tableHeaders[x];
int tablePad = alignment - (offset % alignment);
if (tablePad == alignment) tablePad = 0;
totalPadding += tablePad;
if (DEBUG) { std::cout << "Alignment for table " << x << " adjusted by " << tablePad << std::endl; }
offset += tablePad;
tableHeader.startOf_kCLDTable = offset;
offset += tableHeader.lengthOf_kCLDTable;
int indirectPad = alignment - (offset % alignment);
if (indirectPad == alignment) indirectPad = 0;
totalPadding += indirectPad;
if (DEBUG) { std::cout << "Alignment for tableInd " << x << " adjusted by " << indirectPad << std::endl; }
offset += indirectPad;
tableHeader.startOf_kCLDTableInd = offset;
offset += tableHeader.lengthOf_kCLDTableInd;
int scriptsPad = alignment - (offset % alignment);
if (scriptsPad == alignment) scriptsPad = 0;
totalPadding += scriptsPad;
if (DEBUG) { std::cout << "Alignment for scriptsPad " << x << " adjusted by " << scriptsPad << std::endl; }
offset += scriptsPad;
tableHeader.startOf_kRecognizedLangScripts = offset;
offset += tableHeader.lengthOf_kRecognizedLangScripts; // null terminator already counted in initTableHeaders
}
// Now that we know exactly how much data we have written, store it in the
// header as a sanity check
header->totalFileSizeBytes = offset;
if (DEBUG) {
std::cout << "Data aligned." << std::endl;
std::cout << "Header size: " << headerSize << " bytes " << std::endl;
std::cout << "Data size: " << (offset - totalPadding) << " bytes" << std::endl;
std::cout << "Padding size: " << totalPadding << " bytes" << std::endl;
std::cout << " cld_generated_CjkUni_obj: " << (
header->lengthOf_utf8PropObj_state_table +
header->lengthOf_utf8PropObj_remap_string +
header->lengthOf_utf8PropObj_fast_state)
<< " bytes " << std::endl;
std::cout << " kAvgDeltaOctaScore: "
<< header->lengthOf_kAvgDeltaOctaScore << " bytes " << std::endl;
std::cout << " kCjkCompat_obj: " << (
header->tableHeaders[0].lengthOf_kCLDTable +
header->tableHeaders[0].lengthOf_kCLDTableInd +
header->tableHeaders[0].lengthOf_kRecognizedLangScripts + 1)
<< " bytes " << std::endl;
std::cout << " kCjkDeltaBi_obj: " << (
header->tableHeaders[1].lengthOf_kCLDTable +
header->tableHeaders[1].lengthOf_kCLDTableInd +
header->tableHeaders[1].lengthOf_kRecognizedLangScripts + 1)
<< " bytes " << std::endl;
std::cout << " kDistinctBiTable_obj: " << (
header->tableHeaders[2].lengthOf_kCLDTable +
header->tableHeaders[2].lengthOf_kCLDTableInd +
header->tableHeaders[2].lengthOf_kRecognizedLangScripts + 1)
<< " bytes " << std::endl;
std::cout << " kQuad_obj: " << (
header->tableHeaders[3].lengthOf_kCLDTable +
header->tableHeaders[3].lengthOf_kCLDTableInd +
header->tableHeaders[3].lengthOf_kRecognizedLangScripts + 1)
<< " bytes " << std::endl;
std::cout << " kQuad_obj2: " << (
header->tableHeaders[4].lengthOf_kCLDTable +
header->tableHeaders[4].lengthOf_kCLDTableInd +
header->tableHeaders[4].lengthOf_kRecognizedLangScripts + 1)
<< " bytes " << std::endl;
std::cout << " kDeltaOcta_obj: " << (
header->tableHeaders[5].lengthOf_kCLDTable +
header->tableHeaders[5].lengthOf_kCLDTableInd +
header->tableHeaders[5].lengthOf_kRecognizedLangScripts + 1)
<< " bytes " << std::endl;
std::cout << " kDistinctOcta_obj: " << (
header->tableHeaders[6].lengthOf_kCLDTable +
header->tableHeaders[6].lengthOf_kCLDTableInd +
header->tableHeaders[6].lengthOf_kRecognizedLangScripts + 1)
<< " bytes " << std::endl;
}
}
// Fills in the kAvgDeltaOctaScore section of the file header: the section
// length is a fixed value and the start offset is zeroed as a placeholder.
// Note that deltaArray is accepted but not read here.
void initDeltaHeaders(CLD2DynamicData::FileHeader* header, const short* deltaArray) {
  CLD2DynamicData::FileHeader& out = *header;
  // TODO: Don't hardcode 614*4. Get constant from generated_language.cc?
  out.lengthOf_kAvgDeltaOctaScore = 614 * 4; // from cld_generated_score_quad_octa_1024_256.cc
  out.startOf_kAvgDeltaOctaScore = 0;
}
// Copies the scalar members of utf8Object into the file header and records
// the byte lengths of its variable-size members. The remap string (and the
// fast-state data, when present) are measured with strlen and counted
// including their null terminator; an absent fast_state is recorded with a
// length of 0.
void initUtf8Headers(CLD2DynamicData::FileHeader* header, const CLD2::UTF8PropObj* utf8Object) {
  const CLD2::UTF8PropObj& src = *utf8Object;
  CLD2DynamicData::FileHeader& dst = *header;

  // Plain scalar fields are mirrored one-to-one.
  dst.utf8PropObj_state0 = src.state0;
  dst.utf8PropObj_state0_size = src.state0_size;
  dst.utf8PropObj_total_size = src.total_size;
  dst.utf8PropObj_max_expand = src.max_expand;
  dst.utf8PropObj_entry_shift = src.entry_shift;
  dst.utf8PropObj_bytes_per_entry = src.bytes_per_entry;
  dst.utf8PropObj_losub = src.losub;
  dst.utf8PropObj_hiadd = src.hiadd;

  // Lengths of the blobs that get written after the header.
  dst.lengthOf_utf8PropObj_state_table = src.total_size;
  dst.lengthOf_utf8PropObj_remap_base = sizeof(CLD2::RemapEntry); // TODO: Can this ever have more than one entry?

  const char* remapChars = reinterpret_cast<const char*>(src.remap_string);
  dst.lengthOf_utf8PropObj_remap_string = strlen(remapChars) + 1; // note null terminator

  if (src.fast_state != NULL) {
    const char* fastChars = reinterpret_cast<const char*>(src.fast_state);
    dst.lengthOf_utf8PropObj_fast_state = strlen(fastChars) + 1; // note null terminator
  } else {
    dst.lengthOf_utf8PropObj_fast_state = 0; // not applicable
  }
}
} // End namespace CLD2DynamicDataExtractor

View File

@@ -0,0 +1,54 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_EXTRACTOR_H_
#define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_EXTRACTOR_H_
#include "cld2_dynamic_data.h"
#include "integral_types.h"
#include "cld2tablesummary.h"
#include "utf8statetable.h"
#include "scoreonescriptspan.h"
// Declarations for the routines that lay out a CLD2 dynamic data file header
// and write the data file.
namespace CLD2DynamicDataExtractor {
// Enable or disable debugging; 0 to disable, 1 to enable
void setDebug(int debug);
// Populates all the UTF8-related fields of the header, including the lengths
// of the non-primitive data (state table, remap base, remap string and fast
// state). Nothing is returned; the results are stored in the header.
void initUtf8Headers(CLD2DynamicData::FileHeader* header,
const CLD2::UTF8PropObj* utf8Object);
// Populates all the AvgDeltaOctaScore-related fields of the header.
void initDeltaHeaders(CLD2DynamicData::FileHeader* header,
const short* deltaArray);
// Populates all fields of all table headers for the specified table summaries.
// Tables are laid out back-to-back in the order that they are specified in the
// input array of summaries, and the headers are filled in in the same order.
void initTableHeaders(const CLD2::CLD2TableSummary** summaries,
int numSummaries, CLD2DynamicData::TableHeader* tableSummaryHeaders);
// Align all entries in the data block along boundaries that are multiples of
// the specified number of bytes. For example, to align everything along 64-bit
// boundaries, pass an alignment of 8 (bytes).
void alignAll(CLD2DynamicData::FileHeader* header, int alignment);
// Write the dynamic data file to disk.
void writeDataFile(const CLD2::ScoringTables* data, const char* fileName);
} // End namespace CLD2DynamicDataExtractor
#endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_EXTRACTOR_H_

View File

@@ -0,0 +1,212 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <assert.h>
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include "cld2_dynamic_data.h"
#include "cld2_dynamic_data_loader.h"
#include "integral_types.h"
#include "cld2tablesummary.h"
#include "utf8statetable.h"
#include "scoreonescriptspan.h"
namespace CLD2DynamicDataLoader {
static int DEBUG=0;
// Reads one 4-byte header field from inFile into dest. Returns the number of
// bytes consumed: 4 on success, 0 if the read came up short.
static int readHeaderField(void* dest, FILE* inFile) {
  return 4 * static_cast<int>(fread(dest, 4, 1, inFile));
}

// Shared failure path for loadHeader: closes the file, releases the header
// together with its table-header array, and returns NULL for the caller to
// propagate.
static CLD2DynamicData::FileHeader* discardHeader(
    CLD2DynamicData::FileHeader* fileHeader, FILE* inFile) {
  fclose(inFile);
  delete[] fileHeader->tableHeaders; // allocated with new[]; may still be NULL
  delete fileHeader;
  return NULL;
}

// Reads and validates the header of the dynamic data file at fileName.
//
// Returns a newly allocated FileHeader whose tableHeaders member points at a
// new[]-allocated array of numTablesEncoded entries, or NULL if the file
// cannot be opened, does not start with the expected marker, is truncated,
// or if the header size or total file size does not match what the header
// declares. The file is always closed before returning.
CLD2DynamicData::FileHeader* loadHeader(const char* fileName) {
  // TODO: force null-terminate char* strings for safety
  // The file is raw binary data, so open it in binary mode.
  FILE* inFile = fopen(fileName, "rb");
  if (inFile == NULL) {
    return NULL;
  }
  int bytesRead = 0;
  // Value-initialize so that a short read never leaves garbage in the header.
  CLD2DynamicData::FileHeader* fileHeader = new CLD2DynamicData::FileHeader();
  fileHeader->tableHeaders = NULL;
  bytesRead += static_cast<int>(fread(fileHeader->sanityString, 1,
      CLD2DynamicData::DATA_FILE_MARKER_LENGTH, inFile));
  if (!CLD2DynamicData::mem_compare(fileHeader->sanityString,
      CLD2DynamicData::DATA_FILE_MARKER, CLD2DynamicData::DATA_FILE_MARKER_LENGTH)) {
    std::cerr << "Malformed header: bad file marker!" << std::endl;
    return discardHeader(fileHeader, inFile);
  }
  bytesRead += readHeaderField(&(fileHeader->totalFileSizeBytes), inFile);
  bytesRead += readHeaderField(&(fileHeader->utf8PropObj_state0), inFile);
  bytesRead += readHeaderField(&(fileHeader->utf8PropObj_state0_size), inFile);
  bytesRead += readHeaderField(&(fileHeader->utf8PropObj_total_size), inFile);
  bytesRead += readHeaderField(&(fileHeader->utf8PropObj_max_expand), inFile);
  bytesRead += readHeaderField(&(fileHeader->utf8PropObj_entry_shift), inFile);
  bytesRead += readHeaderField(&(fileHeader->utf8PropObj_bytes_per_entry), inFile);
  bytesRead += readHeaderField(&(fileHeader->utf8PropObj_losub), inFile);
  bytesRead += readHeaderField(&(fileHeader->utf8PropObj_hiadd), inFile);
  bytesRead += readHeaderField(&(fileHeader->startOf_utf8PropObj_state_table), inFile);
  bytesRead += readHeaderField(&(fileHeader->lengthOf_utf8PropObj_state_table), inFile);
  bytesRead += readHeaderField(&(fileHeader->startOf_utf8PropObj_remap_base), inFile);
  bytesRead += readHeaderField(&(fileHeader->lengthOf_utf8PropObj_remap_base), inFile);
  bytesRead += readHeaderField(&(fileHeader->startOf_utf8PropObj_remap_string), inFile);
  bytesRead += readHeaderField(&(fileHeader->lengthOf_utf8PropObj_remap_string), inFile);
  bytesRead += readHeaderField(&(fileHeader->startOf_utf8PropObj_fast_state), inFile);
  bytesRead += readHeaderField(&(fileHeader->lengthOf_utf8PropObj_fast_state), inFile);
  bytesRead += readHeaderField(&(fileHeader->startOf_kAvgDeltaOctaScore), inFile);
  bytesRead += readHeaderField(&(fileHeader->lengthOf_kAvgDeltaOctaScore), inFile);
  // The table count sizes the allocation below, so refuse to continue if it
  // could not actually be read from the file.
  const int countBytes = readHeaderField(&(fileHeader->numTablesEncoded), inFile);
  bytesRead += countBytes;
  if (countBytes != 4) {
    std::cerr << "Malformed header: truncated before table count!" << std::endl;
    return discardHeader(fileHeader, inFile);
  }
  fileHeader->tableHeaders = new CLD2DynamicData::TableHeader[fileHeader->numTablesEncoded];
  for (int x=0; x<fileHeader->numTablesEncoded; x++) {
    CLD2DynamicData::TableHeader &tHeader = fileHeader->tableHeaders[x];
    bytesRead += readHeaderField(&(tHeader.kCLDTableSizeOne), inFile);
    bytesRead += readHeaderField(&(tHeader.kCLDTableSize), inFile);
    bytesRead += readHeaderField(&(tHeader.kCLDTableKeyMask), inFile);
    bytesRead += readHeaderField(&(tHeader.kCLDTableBuildDate), inFile);
    bytesRead += readHeaderField(&(tHeader.startOf_kCLDTable), inFile);
    bytesRead += readHeaderField(&(tHeader.lengthOf_kCLDTable), inFile);
    bytesRead += readHeaderField(&(tHeader.startOf_kCLDTableInd), inFile);
    bytesRead += readHeaderField(&(tHeader.lengthOf_kCLDTableInd), inFile);
    bytesRead += readHeaderField(&(tHeader.startOf_kRecognizedLangScripts), inFile);
    bytesRead += readHeaderField(&(tHeader.lengthOf_kRecognizedLangScripts), inFile);
  }
  // Confirm header size is correct.
  int expectedHeaderSize = CLD2DynamicData::calculateHeaderSize(fileHeader->numTablesEncoded);
  if (expectedHeaderSize != bytesRead) {
    std::cerr << "Header size mismatch! Expected " << expectedHeaderSize << ", but read " << bytesRead << std::endl;
    return discardHeader(fileHeader, inFile);
  }
  // Confirm file size is correct.
  fseek(inFile, 0, SEEK_END);
  int actualSize = ftell(inFile);
  if (actualSize != fileHeader->totalFileSizeBytes) {
    std::cerr << "File size mismatch! Expected " << fileHeader->totalFileSizeBytes << ", but found " << actualSize << std::endl;
    return discardHeader(fileHeader, inFile);
  }
  fclose(inFile);
  return fileHeader;
}
// Releases everything allocated by loadDataFile and zeroes the caller's
// handles: on return *scoringTables == NULL, *mmapAddress == NULL and
// *mmapLength == 0. The three pointer arguments themselves must be non-NULL;
// a NULL *scoringTables or *mmapAddress is tolerated and simply skipped.
void unloadData(CLD2::ScoringTables** scoringTables, void** mmapAddress, int* mmapLength) {
  CLD2::ScoringTables* tables = *scoringTables;
  if (tables != NULL) {
    // unigram_obj was obtained from malloc in loadDataFile, hence free().
    free(const_cast<CLD2::UTF8PropObj*>(tables->unigram_obj));
    tables->unigram_obj = NULL;
    // unigram_compat_obj is &tableSummaries[0] from loadDataFile, i.e. the
    // start of a new[]-allocated array that backs all of the table pointers,
    // so it has to be released with delete[].
    delete[] tables->unigram_compat_obj;
    tables->unigram_compat_obj = NULL;
    delete tables;
    *scoringTables = NULL;
  }
  if (*mmapAddress != NULL) {
    munmap(*mmapAddress, *mmapLength);
    *mmapAddress = NULL;
  }
  *mmapLength = 0;
}
CLD2::ScoringTables* loadDataFile(const char* fileName, void** mmapAddressOut, int* mmapLengthOut) {
CLD2DynamicData::FileHeader* fileHeader = loadHeader(fileName);
if (fileHeader == NULL) {
return NULL;
}
// Initialize the memory map
int inFileHandle = open(fileName, O_RDONLY);
void* mapped = mmap(NULL, fileHeader->totalFileSizeBytes,
PROT_READ, MAP_PRIVATE, inFileHandle, 0);
// Record the map address. This allows callers to unmap
*mmapAddressOut=mapped;
*mmapLengthOut=fileHeader->totalFileSizeBytes;
close(inFileHandle);
// 1. UTF8 Object
const CLD2::uint8* state_table = static_cast<const CLD2::uint8*>(mapped) +
fileHeader->startOf_utf8PropObj_state_table;
// FIXME: Unsafe to rely on this since RemapEntry is not a bit-packed structure
const CLD2::RemapEntry* remap_base =
reinterpret_cast<const CLD2::RemapEntry*>(
static_cast<const CLD2::uint8*>(mapped) +
fileHeader->startOf_utf8PropObj_remap_base);
const CLD2::uint8* remap_string = static_cast<const CLD2::uint8*>(mapped) +
fileHeader->startOf_utf8PropObj_remap_string;
const CLD2::uint8* fast_state =
fileHeader->startOf_utf8PropObj_fast_state == 0 ? 0 :
static_cast<const CLD2::uint8*>(mapped) +
fileHeader->startOf_utf8PropObj_fast_state;
// Populate intermediate object. Horrible casting required because the struct
// is all read-only integers, and doesn't have a constructor. Yikes.
// TODO: It might actually be less horrible to memcpy the data in <shudder>
const CLD2::UTF8PropObj* unigram_obj = reinterpret_cast<CLD2::UTF8PropObj*>(malloc(sizeof(CLD2::UTF8PropObj)));
*const_cast<CLD2::uint32*>(&unigram_obj->state0) = fileHeader->utf8PropObj_state0;
*const_cast<CLD2::uint32*>(&unigram_obj->state0_size) = fileHeader->utf8PropObj_state0_size;
*const_cast<CLD2::uint32*>(&unigram_obj->total_size) = fileHeader->utf8PropObj_total_size;
*const_cast<int*>(&unigram_obj->max_expand) = fileHeader->utf8PropObj_max_expand;
*const_cast<int*>(&unigram_obj->entry_shift) = fileHeader->utf8PropObj_entry_shift;
*const_cast<int*>(&unigram_obj->bytes_per_entry) = fileHeader->utf8PropObj_bytes_per_entry;
*const_cast<CLD2::uint32*>(&unigram_obj->losub) = fileHeader->utf8PropObj_losub;
*const_cast<CLD2::uint32*>(&unigram_obj->hiadd) = fileHeader->utf8PropObj_hiadd;
*const_cast<const CLD2::uint8**>(&unigram_obj->state_table) = state_table;
*const_cast<const CLD2::RemapEntry**>(&unigram_obj->remap_base) = remap_base;
*const_cast<const CLD2::uint8**>(&unigram_obj->remap_string) = remap_string;
*const_cast<const CLD2::uint8**>(&unigram_obj->fast_state) = fast_state;
// 2. kAvgDeltaOctaScore array
const short* read_kAvgDeltaOctaScore = reinterpret_cast<const short*>(
static_cast<const CLD2::uint8*>(mapped) +
fileHeader->startOf_kAvgDeltaOctaScore);
// 3. Each table
CLD2::CLD2TableSummary* tableSummaries = new CLD2::CLD2TableSummary[fileHeader->numTablesEncoded];
for (int x=0; x<fileHeader->numTablesEncoded; x++) {
CLD2::CLD2TableSummary &summary = tableSummaries[x];
CLD2DynamicData::TableHeader& tHeader = fileHeader->tableHeaders[x];
const CLD2::IndirectProbBucket4* kCLDTable =
reinterpret_cast<const CLD2::IndirectProbBucket4*>(
static_cast<CLD2::uint8*>(mapped) + tHeader.startOf_kCLDTable);
const CLD2::uint32* kCLDTableInd =
reinterpret_cast<const CLD2::uint32*>(
static_cast<CLD2::uint8*>(mapped) + tHeader.startOf_kCLDTableInd);
const char* kRecognizedLangScripts =
static_cast<const char*>(mapped) + tHeader.startOf_kRecognizedLangScripts;
summary.kCLDTable = kCLDTable;
summary.kCLDTableInd = kCLDTableInd;
summary.kCLDTableSizeOne = tHeader.kCLDTableSizeOne;
summary.kCLDTableSize = tHeader.kCLDTableSize;
summary.kCLDTableKeyMask = tHeader.kCLDTableKeyMask;
summary.kCLDTableBuildDate = tHeader.kCLDTableBuildDate;
summary.kRecognizedLangScripts = kRecognizedLangScripts;
}
// Tie everything together
CLD2::ScoringTables* result = new CLD2::ScoringTables;
result->unigram_obj = unigram_obj;
result->unigram_compat_obj = &tableSummaries[0];
result->deltabi_obj = &tableSummaries[1];
result->distinctbi_obj = &tableSummaries[2];
result->quadgram_obj = &tableSummaries[3];
result->quadgram_obj2 = &tableSummaries[4];
result->deltaocta_obj = &tableSummaries[5];
result->distinctocta_obj = &tableSummaries[6];
result->kExpectedScore = read_kAvgDeltaOctaScore;
delete fileHeader->tableHeaders;
delete fileHeader;
return result;
}
}

View File

@@ -0,0 +1,52 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_LOADER_H_
#define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_LOADER_H_
#include "scoreonescriptspan.h"
#include "cld2_dynamic_data.h"
namespace CLD2DynamicDataLoader {
// Read a header from the specified file and return it, or NULL if the file
// cannot be opened or fails validation (bad marker, header size mismatch or
// file size mismatch).
// The header returned is dynamically allocated; when done you must release
// the array of TableHeaders (it is allocated with new[]) as well as the
// returned FileHeader*.
CLD2DynamicData::FileHeader* loadHeader(const char* fileName);
// Load data directly into a ScoringTables structure using a private, read-only
// mmap and return the newly-allocated structure. Returns NULL if the header
// cannot be loaded.
// The out-parameter "mmapAddressOut" is a pointer to a void*; the starting
// address of the mmap()'d block will be written here.
// The out-parameter "mmapLengthOut" is a pointer to an int; the length of the
// mmap()'d block will be written here.
// It is up to the caller to release the returned structure and the mapping;
// use unloadData() to do so.
CLD2::ScoringTables* loadDataFile(const char* fileName,
void** mmapAddressOut, int* mmapLengthOut);
// Given pointers to the data from a previous invocation of loadDataFile,
// unloads the data safely - freeing and deleting any malloc'd/new'd objects.
// When this method returns, the mmap has been deleted, as have all the scoring
// tables; the pointers passed in are all zeroed, such that:
// *scoringTables == NULL
// *mmapAddress == NULL
// *mmapLength == 0
// This is the only safe way to unload data that was previously loaded, as there
// is an unfortunate mixture of new and malloc involved in building the
// in-memory representation of the data.
void unloadData(CLD2::ScoringTables** scoringTables,
void** mmapAddress, int* mmapLength);
} // End namespace CLD2DynamicDataLoader
#endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_LOADER_H_

View File

@@ -0,0 +1,162 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <assert.h>
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include "cld2_dynamic_data.h"
#include "cld2_dynamic_data_extractor.h"
#include "cld2_dynamic_data_loader.h"
#include "integral_types.h"
#include "cld2tablesummary.h"
#include "utf8statetable.h"
#include "scoreonescriptspan.h"
// We need these in order to set up a real data object to pass around.
// Scoring-table objects defined in other translation units. main() below
// gathers their addresses into a ScoringTables instance that is dumped to a
// file and/or compared against data loaded back from a file.
namespace CLD2 {
extern const UTF8PropObj cld_generated_CjkUni_obj;
extern const CLD2TableSummary kCjkCompat_obj;
extern const CLD2TableSummary kCjkDeltaBi_obj;
extern const CLD2TableSummary kDistinctBiTable_obj;
extern const CLD2TableSummary kQuad_obj;
extern const CLD2TableSummary kQuad_obj2;
extern const CLD2TableSummary kDeltaOcta_obj;
extern const CLD2TableSummary kDistinctOcta_obj;
extern const short kAvgDeltaOctaScore[];
}
// Entry point of the CLD2 dynamic data tool.
//
// Parses the command line, then depending on the selected mode:
//   --dump   writes the linked-in scoring tables to FILE and verifies them,
//   --verify compares FILE against the linked-in scoring tables,
//   --head   prints the header of FILE.
// Returns 0 on success, -2 if the core assumptions check fails, and -1 for
// every other failure (big-endian host, bad usage, unreadable file, failed
// verification).
int main(int argc, char** argv) {
  if (!CLD2DynamicData::isLittleEndian()) {
    std::cerr << "System is big-endian: currently not supported." << std::endl;
    return -1;
  }
  if (!CLD2DynamicData::coreAssumptionsOk()) {
    std::cerr << "Core assumptions violated, unsafe to continue." << std::endl;
    return -2;
  }
  // Get command-line flags
  char* fileName = NULL;
  const char* USAGE = "\
CLD2 Dynamic Data Tool:\n\
Dump, verify or print summaries of scoring tables for CLD2.\n\
\n\
The files output by this tool are suitable for all little-endian platforms,\n\
and should work on both 32- and 64-bit platforms.\n\
\n\
IMPORTANT: The files output by this tool WILL NOT work on big-endian platforms.\n\
\n\
Usage:\n\
--dump [FILE] Dump the scoring tables that this tool was linked against\n\
to the specified file. The tables are automatically verified\n\
after writing, just as if the tool was run again with\n\
'--verify'.\n\
--verify [FILE] Verify that a given file precisely matches the scoring\n\
tables that this tool was linked against. This can be used\n\
to verify that a file is compatible.\n\
--head [FILE] Print headers from the specified file to stdout.\n\
--verbose Be verbose.\n\
";
  int mode = 0; //1=dump, 2=verify, 3=head
  for (int i = 1; i < argc; ++i) {
    if (strcmp(argv[i], "--verbose") == 0) {
      CLD2DynamicDataExtractor::setDebug(1);
      CLD2DynamicData::setDebug(1);
    }
    else if (strcmp(argv[i], "--dump") == 0
        || strcmp(argv[i], "--verify") == 0
        || strcmp(argv[i], "--head") == 0) {
      // set mode flag properly
      if (strcmp(argv[i], "--dump") == 0) mode=1;
      else if (strcmp(argv[i], "--verify") == 0) mode=2;
      else mode=3;
      // Each mode option consumes the following argument as its file name.
      if (i < argc - 1) {
        fileName = argv[++i];
      } else {
        std::cerr << "missing file name argument" << std::endl << std::endl;
        std::cerr << USAGE;
        return -1;
      }
    } else if (strcmp(argv[i], "--help") == 0) {
      std::cout << USAGE;
      return 0;
    } else {
      std::cerr << "Unsupported option: " << argv[i] << std::endl << std::endl;
      std::cerr << USAGE;
      return -1;
    }
  }
  if (mode == 0) {
    std::cerr << USAGE;
    return -1;
  }
  CLD2::ScoringTables realData = {
    &CLD2::cld_generated_CjkUni_obj,
    &CLD2::kCjkCompat_obj,
    &CLD2::kCjkDeltaBi_obj,
    &CLD2::kDistinctBiTable_obj,
    &CLD2::kQuad_obj,
    &CLD2::kQuad_obj2,
    &CLD2::kDeltaOcta_obj,
    &CLD2::kDistinctOcta_obj,
    CLD2::kAvgDeltaOctaScore,
  };
  if (mode == 1) { // dump
    CLD2DynamicDataExtractor::writeDataFile(
        static_cast<const CLD2::ScoringTables*>(&realData),
        fileName);
  } else if (mode == 3) { // head
    CLD2DynamicData::FileHeader* header = CLD2DynamicDataLoader::loadHeader(fileName);
    if (header == NULL) {
      std::cerr << "Cannot read header from file: " << fileName << std::endl;
      return -1;
    }
    CLD2DynamicData::dumpHeader(header);
    // tableHeaders is an array allocated with new[] by loadHeader.
    delete[] header->tableHeaders;
    delete header;
  }
  if (mode == 1 || mode == 2) { // dump || verify (so perform verification)
    void* mmapAddress = NULL;
    int mmapLength = 0;
    CLD2::ScoringTables* loadedData = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength);
    if (loadedData == NULL) {
      std::cerr << "Failed to read data file: " << fileName << std::endl;
      return -1;
    }
    bool result = CLD2DynamicData::verify(
        static_cast<const CLD2::ScoringTables*>(&realData),
        static_cast<const CLD2::ScoringTables*>(loadedData));
    CLD2DynamicDataLoader::unloadData(&loadedData, &mmapAddress, &mmapLength);
    if (loadedData != NULL || mmapAddress != NULL || mmapLength != 0) {
      std::cerr << "Warning: failed to clean up memory for ScoringTables." << std::endl;
    }
    if (!result) {
      std::cerr << "Verification failed!" << std::endl;
      return -1;
    }
  }
  return 0;
}

View File

@@ -252,10 +252,34 @@ void FinishHtmlOut(int flags) {
#endif
}
#ifdef CLD2_DYNAMIC_MODE
int RunTests (int flags, bool get_vector, const char* data_file) {
#else
int RunTests (int flags, bool get_vector) {
#endif
fprintf(stdout, "CLD2 version: %s\n", CLD2::DetectLanguageVersion());
InitHtmlOut(flags);
bool any_fail = false;
#ifdef CLD2_DYNAMIC_MODE
fprintf(stdout, "[DYNAMIC] Test running in dynamic data mode!\n");
bool dataLoaded = CLD2::isDataLoaded();
if (dataLoaded) {
fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true prior to loading data!\n");
any_fail = true;
}
fprintf(stdout, "[DYNAMIC] Attempting translation prior to loading data\n");
any_fail |= !OneTest(flags, get_vector, UNKNOWN_LANGUAGE, kTeststr_en, strlen(kTeststr_en));
fprintf(stdout, "[DYNAMIC] Loading data from: %s\n", data_file);
CLD2::loadData(data_file);
dataLoaded = CLD2::isDataLoaded();
if (!dataLoaded) {
fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned false after loading data!\n");
any_fail = true;
}
fprintf(stdout, "[DYNAMIC] Data loaded, normal tests commencing\n");
#endif
int i = 0;
while (kTestPair[i].text != NULL) {
Language lang_expected = kTestPair[i].lang;
@@ -265,6 +289,19 @@ int RunTests (int flags, bool get_vector) {
any_fail |= (!ok);
++i;
}
#ifdef CLD2_DYNAMIC_MODE
fprintf(stdout, "[DYNAMIC] Normal tests complete, attempting to unload data\n");
CLD2::unloadData();
dataLoaded = CLD2::isDataLoaded();
if (dataLoaded) {
fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true after unloading data!\n");
any_fail = true;
}
fprintf(stdout, "[DYNAMIC] Attempting translation after unloading data\n");
any_fail |= !OneTest(flags, get_vector, UNKNOWN_LANGUAGE, kTeststr_en, strlen(kTeststr_en));
#endif
if (any_fail) {
fprintf(stderr, "FAIL\n");
fprintf(stdout, "FAIL\n");
@@ -283,6 +320,7 @@ int main(int argc, char** argv) {
// Get command-line flags
int flags = 0;
bool get_vector = false;
const char* data_file = NULL;
for (int i = 1; i < argc; ++i) {
if (strcmp(argv[i], "--html") == 0) {flags |= CLD2::kCLDFlagHtml;}
if (strcmp(argv[i], "--cr") == 0) {flags |= CLD2::kCLDFlagCr;}
@@ -290,8 +328,17 @@ int main(int argc, char** argv) {
if (strcmp(argv[i], "--quiet") == 0) {flags |= CLD2::kCLDFlagQuiet;}
if (strcmp(argv[i], "--echo") == 0) {flags |= CLD2::kCLDFlagEcho;}
if (strcmp(argv[i], "--vector") == 0) {get_vector = true;}
if (strcmp(argv[i], "--data-file") == 0) { data_file = argv[++i];}
}
#ifdef CLD2_DYNAMIC_MODE
if (data_file == NULL) {
fprintf(stderr, "When running in dynamic mode, you must specify --data-file [FILE]\n");
return -1;
}
return CLD2::RunTests(flags, get_vector, data_file);
#else
return CLD2::RunTests(flags, get_vector);
#endif
}

View File

@@ -28,6 +28,10 @@
#include "lang_script.h"
#include "utf8statetable.h"
#ifdef CLD2_DYNAMIC_MODE
#include "cld2_dynamic_data.h"
#include "cld2_dynamic_data_loader.h"
#endif
#include "cld2tablesummary.h"
#include "compact_lang_det_impl.h"
#include "compact_lang_det_hint_code.h"
@@ -63,20 +67,58 @@ extern const CLD2TableSummary kDeltaOcta_obj;
extern const CLD2TableSummary kDistinctOcta_obj;
extern const short kAvgDeltaOctaScore[];
// This initializes kScoringtables.quadgram_obj etc.
static const ScoringTables kScoringtables = {
&cld_generated_CjkUni_obj,
&kCjkCompat_obj,
&kCjkDeltaBi_obj,
&kDistinctBiTable_obj,
#ifdef CLD2_DYNAMIC_MODE
// CLD2_DYNAMIC_MODE is defined:
// Data will be read from an mmap opened at runtime.
static ScoringTables kScoringtables = {
NULL, //&cld_generated_CjkUni_obj,
NULL, //&kCjkCompat_obj,
NULL, //&kCjkDeltaBi_obj,
NULL, //&kDistinctBiTable_obj,
NULL, //&kQuad_obj,
NULL, //&kQuad_obj2,
NULL, //&kDeltaOcta_obj,
NULL, //&kDistinctOcta_obj,
NULL, //kAvgDeltaOctaScore,
};
static bool dynamicDataLoaded = false;
static ScoringTables* dynamicTables = NULL;
static void* mmapAddress = NULL;
static int mmapLength = 0;
&kQuad_obj,
&kQuad_obj2, // Dual lookup tables
&kDeltaOcta_obj,
&kDistinctOcta_obj,
// Reports the current value of the dynamicDataLoaded flag, which loadData()
// sets and unloadData() clears.
bool isDataLoaded() { return dynamicDataLoaded; }
kAvgDeltaOctaScore,
};
void loadData(const char* fileName) {
if (isDataLoaded()) {
unloadData();
}
dynamicTables = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength);
kScoringtables = *dynamicTables;
dynamicDataLoaded = true;
};
void unloadData() {
if (!dynamicDataLoaded) return;
dynamicDataLoaded = false;
// unloading will null all the pointers out.
CLD2DynamicDataLoader::unloadData(&dynamicTables, &mmapAddress, &mmapLength);
}
#else
// This initializes kScoringtables.quadgram_obj etc.
static const ScoringTables kScoringtables = {
&cld_generated_CjkUni_obj,
&kCjkCompat_obj,
&kCjkDeltaBi_obj,
&kDistinctBiTable_obj,
&kQuad_obj,
&kQuad_obj2, // Dual lookup tables
&kDeltaOcta_obj,
&kDistinctOcta_obj,
kAvgDeltaOctaScore,
};
#endif // #ifdef CLD2_DYNAMIC_MODE
static const bool FLAGS_cld_no_minimum_bytes = false;
@@ -1622,6 +1664,19 @@ Language DetectLanguageSummaryV2(
}
}
#ifdef CLD2_DYNAMIC_MODE
// In dynamic mode, we immediately return UNKNOWN_LANGUAGE if the data file
// hasn't been loaded yet. This is the only sane thing we can do, as there
// are no scoring tables to consult.
bool dataLoaded = isDataLoaded();
if ((flags & kCLDFlagVerbose) != 0) {
fprintf(stderr, "Data loaded: %s\n", (dataLoaded ? "true" : "false"));
}
if (!dataLoaded) {
return UNKNOWN_LANGUAGE;
}
#endif
// Exit now if no text
if (buffer_length == 0) {return UNKNOWN_LANGUAGE;}
if (kScoringtables.quadgram_obj == NULL) {return UNKNOWN_LANGUAGE;}

View File

@@ -42,16 +42,18 @@ typedef int32 Encoding;
static const Encoding UNKNOWN_ENCODING = 0;
// Linker supplies the right tables; see ScoringTables compact_lang_det_impl.cc
// These are here JUST for printing versions
extern const UTF8PropObj cld_generated_CjkUni_obj;
extern const CLD2TableSummary kCjkDeltaBi_obj;
extern const CLD2TableSummary kDistinctBiTable_obj;
extern const CLD2TableSummary kQuad_obj;
extern const CLD2TableSummary kDeltaOcta_obj;
extern const CLD2TableSummary kDistinctOcta_obj;
extern const CLD2TableSummary kOcta2_obj;
extern const short kAvgDeltaOctaScore[];
#ifndef CLD2_DYNAMIC_MODE
// Linker supplies the right tables; see ScoringTables compact_lang_det_impl.cc
// These are here JUST for printing versions
extern const UTF8PropObj cld_generated_CjkUni_obj;
extern const CLD2TableSummary kCjkDeltaBi_obj;
extern const CLD2TableSummary kDistinctBiTable_obj;
extern const CLD2TableSummary kQuad_obj;
extern const CLD2TableSummary kDeltaOcta_obj;
extern const CLD2TableSummary kDistinctOcta_obj;
extern const CLD2TableSummary kOcta2_obj;
extern const short kAvgDeltaOctaScore[];
#endif
bool FLAGS_cld_version = false;
bool FLAGS_cld_html = true;
@@ -201,6 +203,7 @@ void DumpLanguages(Language summary_lang,
int main(int argc, char** argv) {
if (FLAGS_cld_version) {
#ifndef CLD2_DYNAMIC_MODE
printf("%s %4dKB uni build date, bytes\n",
"........",
cld_generated_CjkUni_obj.total_size >> 10);
@@ -216,11 +219,14 @@ int main(int argc, char** argv) {
kDeltaOcta_obj.kCLDTableBuildDate,
(kDeltaOcta_obj.kCLDTableSize *
sizeof(IndirectProbBucket4)) >> 10);
#else
printf("FLAGS_cld_version doesn't work with dynamic data mode\n");
#endif
exit(0);
} // End FLAGS_cld_version
int flags = 0;
bool get_vector = false;
const char* data_file = NULL;
const char* fname = NULL;
for (int i = 1; i < argc; ++i) {
if (argv[i][0] != '-') {fname = argv[i];}
@@ -230,8 +236,19 @@ int main(int argc, char** argv) {
if (strcmp(argv[i], "--verbose") == 0) {flags |= kCLDFlagVerbose;}
if (strcmp(argv[i], "--echo") == 0) {flags |= kCLDFlagEcho;}
if (strcmp(argv[i], "--vector") == 0) {get_vector = true;}
if (strcmp(argv[i], "--data-file") == 0) { data_file = argv[++i];}
}
#ifdef CLD2_DYNAMIC_MODE
if (data_file == NULL) {
fprintf(stderr, "When running in dynamic mode, you must specify --data-file [FILE]\n");
return -1;
}
fprintf(stdout, "Loading data from: %s\n", data_file);
CLD2::loadData(data_file);
fprintf(stdout, "Data loaded, test commencing\n");
#endif
FILE* fin;
if (fname == NULL) {
fin = stdin;

71
internal/compile_dynamic.sh Executable file
View File

@@ -0,0 +1,71 @@
#!/bin/sh
#
# Copyright 2013 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Builds the dynamic-mode CLD2 targets: the dynamic data tool, the
# Chromium-flavored test binary, the unit tests and the shared library.
# Must be run from the directory that contains the listed source files.

# Stop at the first failed compile. Without this, every "compiled" message
# below would be printed (and the script would exit 0) even when g++ failed.
set -e

# Source groups shared by the targets below. They are expanded unquoted on
# purpose so that the shell splits them into one argument per file.
DYNAMIC_DATA_SOURCES="cld2_dynamic_data.h cld2_dynamic_data.cc"
EXTRACTOR_SOURCES="cld2_dynamic_data_extractor.h cld2_dynamic_data_extractor.cc"
LOADER_SOURCES="cld2_dynamic_data_loader.h cld2_dynamic_data_loader.cc"

# Core detector sources, needed by every target.
CORE_SOURCES="cldutil.cc cldutil_shared.cc compact_lang_det.cc compact_lang_det_hint_code.cc \
compact_lang_det_impl.cc debug.cc fixunicodevalue.cc \
generated_entities.cc generated_language.cc generated_ulscript.cc \
getonescriptspan.cc lang_script.cc offsetmap.cc scoreonescriptspan.cc \
tote.cc utf8statetable.cc"

# Generated scoring tables. Only the data tool compiles them in; the
# CLD2_DYNAMIC_MODE targets are built without them.
TABLE_SOURCES="cld_generated_cjk_uni_prop_80.cc cld2_generated_cjk_compatible.cc \
cld_generated_cjk_delta_bi_4.cc generated_distinct_bi_0.cc \
cld2_generated_quadchrome0122_2.cc cld2_generated_deltaoctachrome0122.cc \
cld2_generated_distinctoctachrome0122.cc cld_generated_score_quad_octa_0122_2.cc"

# The data tool, which can be used to read and write CLD2 dynamic data files
g++ -O2 -m64 cld2_dynamic_data_tool.cc \
  $DYNAMIC_DATA_SOURCES \
  $EXTRACTOR_SOURCES \
  $LOADER_SOURCES \
  $CORE_SOURCES \
  $TABLE_SOURCES \
  -o cld2_dynamic_data_tool
echo " cld2_dynamic_data_tool compiled"

# Tests for Chromium flavored dynamic CLD2
g++ -O2 -m64 -D CLD2_DYNAMIC_MODE compact_lang_det_test.cc \
  $DYNAMIC_DATA_SOURCES \
  $EXTRACTOR_SOURCES \
  $LOADER_SOURCES \
  $CORE_SOURCES \
  -o compact_lang_det_dynamic_test_chrome
echo " compact_lang_det_dynamic_test_chrome compiled"

# Unit tests, in dynamic mode
g++ -O2 -m64 -g3 -D CLD2_DYNAMIC_MODE cld2_unittest.cc \
  $DYNAMIC_DATA_SOURCES \
  $LOADER_SOURCES \
  $CORE_SOURCES \
  -o cld2_dynamic_unittest
echo " cld2_dynamic_unittest compiled"

# Shared library, in dynamic mode
g++ -shared -fPIC -O2 -m64 -D CLD2_DYNAMIC_MODE \
  $DYNAMIC_DATA_SOURCES \
  $LOADER_SOURCES \
  $CORE_SOURCES \
  -o libcld2_dynamic.so
echo " libcld2_dynamic.so compiled"

View File

@@ -295,6 +295,26 @@ Flag meanings:
void DumpResultChunkVector(FILE* f, const char* src,
ResultChunkVector* resultchunkvector);
#ifdef CLD2_DYNAMIC_MODE
// The functions below are declared only when compiled with CLD2_DYNAMIC_MODE.

// If compiled with dynamic mode, load data from the specified file location.
// If other data has already been loaded, it is discarded and the data is read
// in from the specified file location again (even if the file has not changed).
//   fileName: location of the data file to load.
// WARNING: Before calling this method, language detection will always fail
// and will always return the unknown language.
void loadData(const char* fileName);
// If compiled with dynamic mode, unload the previously-loaded data.
// WARNING: After calling this method, language detection will no longer work
// and will always return the unknown language.
void unloadData();
// Returns true if and only if data has been loaded via a call to loadData(...)
// and has not been subsequently unloaded via a call to unloadData().
bool isDataLoaded();
#endif // #ifdef CLD2_DYNAMIC_MODE
}; // End namespace CLD2
#endif // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_