Enable Dynamic Mode for CLD2. See issue 6 for more information on dynamic mode:
https://code.google.com/p/cld2/issues/detail?id=6 git-svn-id: https://cld2.googlecode.com/svn/trunk@151 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
236
internal/cld2_dynamic_data.cc
Normal file
236
internal/cld2_dynamic_data.cc
Normal file
@@ -0,0 +1,236 @@
|
||||
// Copyright 2014 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "cld2_dynamic_data.h"
|
||||
#include "integral_types.h"
|
||||
#include <assert.h>
|
||||
#include <iostream>
|
||||
#include <stdint.h>
|
||||
|
||||
namespace CLD2DynamicData {
|
||||
// File-local verbosity flag. Zero keeps verify() quiet about its progress;
// any non-zero value makes it print progress messages to stdout.
static int DEBUG = 0;

// Stores the requested verbosity level in the file-local DEBUG flag.
void setDebug(int debug)
{
  DEBUG = debug;
}
|
||||
|
||||
// Byte-wise comparison of two buffers that, unlike memcmp, explains where the
// buffers diverge.
//
// data1, data2: buffers to compare; both must be readable for `length` bytes.
// length:       number of bytes to compare; zero or negative compares nothing.
//
// Returns true when the first `length` bytes are identical. On the first
// mismatch it writes the offending offset, plus up to five bytes of context on
// either side, to stderr and returns false.
bool mem_compare(const void* data1, const void* data2, const int length) {
  const unsigned char* const lhs = static_cast<const unsigned char*>(data1);
  const unsigned char* const rhs = static_cast<const unsigned char*>(data2);

  // Scan forward to the first offset where the two buffers disagree.
  int mismatch = 0;
  while (mismatch < length && lhs[mismatch] == rhs[mismatch]) {
    ++mismatch;
  }
  if (mismatch >= length) {
    return true;  // ran off the end without finding a difference
  }

  std::cerr << "mem difference at data[" << mismatch << "]: decimal "
            << static_cast<unsigned int>(lhs[mismatch]) << " != decimal "
            << static_cast<unsigned int>(rhs[mismatch]) << std::endl;

  // Show a window of context around the mismatch, clamped to the buffer.
  const int windowStart = (mismatch > 5) ? (mismatch - 5) : 0;
  for (int pos = windowStart; pos < length && pos <= mismatch + 5; ++pos) {
    const char* const marker =
        (pos == mismatch) ? " [FIRST ERROR DETECTED HERE] " : "";
    std::cerr << "[" << pos << "]: " << static_cast<unsigned int>(lhs[pos])
              << " <-> " << static_cast<unsigned int>(rhs[pos])
              << marker
              << std::endl;
  }
  return false;
}
|
||||
|
||||
// Computes the exact on-disk size, in bytes, of a file header that describes
// `numTables` tables: the marker string (stored without a null terminator),
// the fixed block of 20 uint32 fields, and 10 uint32 fields per table.
CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables) {
  const CLD2::uint32 fixedFieldBytes = 20 * sizeof(CLD2::uint32);
  const CLD2::uint32 bytesPerTableHeader = 10 * sizeof(CLD2::uint32);

  CLD2::uint32 total = DATA_FILE_MARKER_LENGTH;  // NB: no null terminator
  total += fixedFieldBytes;
  total += numTables * bytesPerTableHeader;
  return total;
}
|
||||
|
||||
// Prints every field of `header`, including each per-table header, to stdout
// in "name: value" form, one field per line.
//
// header: header to print; must be non-null, and header->tableHeaders must
//         point at header->numTablesEncoded entries.
//
// Only the seven table slots known to this file have symbolic names. Any
// additional table is labelled "unknown" rather than indexing past the end of
// the name list.
void dumpHeader(FileHeader* header) {
  // The marker is stored without a null terminator, so copy it into a
  // terminated buffer before streaming it.
  char safeString[DATA_FILE_MARKER_LENGTH + 1];
  for (int i = 0; i < DATA_FILE_MARKER_LENGTH; i++) {
    safeString[i] = header->sanityString[i];
  }
  safeString[DATA_FILE_MARKER_LENGTH] = 0;

  std::cout << "sanityString: " << safeString << std::endl;
  std::cout << "totalFileSizeBytes: " << header->totalFileSizeBytes << std::endl;
  std::cout << "utf8PropObj_state0: " << header->utf8PropObj_state0 << std::endl;
  std::cout << "utf8PropObj_state0_size: " << header->utf8PropObj_state0_size << std::endl;
  std::cout << "utf8PropObj_total_size: " << header->utf8PropObj_total_size << std::endl;
  std::cout << "utf8PropObj_max_expand: " << header->utf8PropObj_max_expand << std::endl;
  std::cout << "utf8PropObj_entry_shift: " << header->utf8PropObj_entry_shift << std::endl;
  std::cout << "utf8PropObj_bytes_per_entry: " << header->utf8PropObj_bytes_per_entry << std::endl;
  std::cout << "utf8PropObj_losub: " << header->utf8PropObj_losub << std::endl;
  std::cout << "utf8PropObj_hiadd: " << header->utf8PropObj_hiadd << std::endl;
  std::cout << "startOf_utf8PropObj_state_table: " << header->startOf_utf8PropObj_state_table << std::endl;
  std::cout << "lengthOf_utf8PropObj_state_table: " << header->lengthOf_utf8PropObj_state_table << std::endl;
  std::cout << "startOf_utf8PropObj_remap_base: " << header->startOf_utf8PropObj_remap_base << std::endl;
  std::cout << "lengthOf_utf8PropObj_remap_base: " << header->lengthOf_utf8PropObj_remap_base << std::endl;
  std::cout << "startOf_utf8PropObj_remap_string: " << header->startOf_utf8PropObj_remap_string << std::endl;
  std::cout << "lengthOf_utf8PropObj_remap_string: " << header->lengthOf_utf8PropObj_remap_string << std::endl;
  std::cout << "startOf_utf8PropObj_fast_state: " << header->startOf_utf8PropObj_fast_state << std::endl;
  std::cout << "lengthOf_utf8PropObj_fast_state: " << header->lengthOf_utf8PropObj_fast_state << std::endl;
  std::cout << "startOf_kAvgDeltaOctaScore: " << header->startOf_kAvgDeltaOctaScore << std::endl;
  std::cout << "lengthOf_kAvgDeltaOctaScore: " << header->lengthOf_kAvgDeltaOctaScore << std::endl;
  std::cout << "numTablesEncoded: " << header->numTablesEncoded << std::endl;

  // Names of the table slots, in the order in which they are encoded.
  const char* const tableNames[] = {
    "unigram_compat_obj",
    "deltabi_obj",
    "distinctbi_obj",
    "quadgram_obj",
    "quadgram_obj2",
    "deltaocta_obj",
    "distinctocta_obj",
  };
  const CLD2::uint32 numKnownNames = sizeof(tableNames) / sizeof(tableNames[0]);

  // numTablesEncoded is unsigned, so iterate with a matching unsigned index.
  for (CLD2::uint32 x = 0; x < header->numTablesEncoded; x++) {
    TableHeader& tHeader = header->tableHeaders[x];
    // Guard the name lookup: the header may describe more tables than we
    // have names for.
    const char* const tableName = (x < numKnownNames) ? tableNames[x] : "unknown";

    std::cout << "Table " << (x+1) << ": (" << tableName << ")" << std::endl;
    std::cout << " kCLDTableSizeOne: " << tHeader.kCLDTableSizeOne << std::endl;
    std::cout << " kCLDTableSize: " << tHeader.kCLDTableSize << std::endl;
    std::cout << " kCLDTableKeyMask: " << tHeader.kCLDTableKeyMask << std::endl;
    std::cout << " kCLDTableBuildDate: " << tHeader.kCLDTableBuildDate << std::endl;
    std::cout << " startOf_kCLDTable: " << tHeader.startOf_kCLDTable << std::endl;
    std::cout << " lengthOf_kCLDTable: " << tHeader.lengthOf_kCLDTable << std::endl;
    std::cout << " startOf_kCLDTableInd: " << tHeader.startOf_kCLDTableInd << std::endl;
    std::cout << " lengthOf_kCLDTableInd: " << tHeader.lengthOf_kCLDTableInd << std::endl;
    std::cout << " startOf_kRecognizedLangScripts: " << tHeader.startOf_kRecognizedLangScripts << std::endl;
    std::cout << " lengthOf_kRecognizedLangScripts: " << tHeader.lengthOf_kRecognizedLangScripts << std::endl;
  }
}
|
||||
|
||||
// Compares loadedData->name against realData->name. On a mismatch it reports
// both values on stderr and makes the *enclosing function* return false.
// Requires identifiers `loadedData` and `realData` to be in scope at the point
// of use. Wrapped in do { } while (0) so that the macro behaves as a single
// statement (safe in unbraced if/else) and must be followed by a semicolon.
#define CHECK_EQUALS(name) do {\
  if (loadedData->name != realData->name) {\
    std::cerr << #name << ": " << loadedData->name << " != " << realData->name << std::endl;\
    return false;\
  }\
} while (0)
|
||||
|
||||
// Compares `size` bytes at loadedData->name against realData->name using
// mem_compare(). On a mismatch it reports the field name on stderr and makes
// the *enclosing function* return false. Requires identifiers `loadedData` and
// `realData` to be in scope at the point of use. Wrapped in do { } while (0)
// so that the macro behaves as a single statement (safe in unbraced if/else)
// and must be followed by a semicolon.
#define CHECK_MEM_EQUALS(name,size) do {\
  if (!mem_compare(loadedData->name,realData->name,size)) {\
    std::cerr << #name << ": data mismatch." << std::endl;\
    return false;\
  }\
} while (0)
|
||||
|
||||
// Checks that `loadedData` carries the same content as `realData`: the UTF8
// property object, the expected-score array and each of the seven scoring
// tables. Returns true when everything compared is equal. On the first
// difference a message is written to stderr and false is returned. Progress
// messages go to stdout when the file-local DEBUG flag is non-zero.
//
// NOTE: CHECK_EQUALS / CHECK_MEM_EQUALS expand to expressions on the
// identifiers `realData` and `loadedData` and return false from this function
// on mismatch, which is why those two names are deliberately shadowed inside
// the per-table loop.
bool verify(const CLD2::ScoringTables* realData, const CLD2::ScoringTables* loadedData) {
  const int NUM_TABLES = 7;

  // The tables of both data sets, listed in the same fixed order.
  const CLD2::CLD2TableSummary* realTableSummaries[NUM_TABLES] = {
    realData->unigram_compat_obj,
    realData->deltabi_obj,
    realData->distinctbi_obj,
    realData->quadgram_obj,
    realData->quadgram_obj2,
    realData->deltaocta_obj,
    realData->distinctocta_obj,
  };
  const CLD2::CLD2TableSummary* loadedTableSummaries[NUM_TABLES] = {
    loadedData->unigram_compat_obj,
    loadedData->deltabi_obj,
    loadedData->distinctbi_obj,
    loadedData->quadgram_obj,
    loadedData->quadgram_obj2,
    loadedData->deltaocta_obj,
    loadedData->distinctocta_obj,
  };

  // 1. UTF8 property object: scalar fields first, then the data blocks.
  CHECK_EQUALS(unigram_obj->state0);
  CHECK_EQUALS(unigram_obj->state0_size);
  CHECK_EQUALS(unigram_obj->total_size);
  CHECK_EQUALS(unigram_obj->max_expand);
  CHECK_EQUALS(unigram_obj->entry_shift);
  CHECK_EQUALS(unigram_obj->bytes_per_entry);
  CHECK_EQUALS(unigram_obj->losub);
  CHECK_EQUALS(unigram_obj->hiadd);
  CHECK_MEM_EQUALS(unigram_obj->state_table, realData->unigram_obj->total_size);
  // TODO: can this have more than one entry?
  CHECK_MEM_EQUALS(unigram_obj->remap_base, sizeof(CLD2::RemapEntry));
  // Length includes the null terminator.
  CHECK_MEM_EQUALS(unigram_obj->remap_string,
      strlen(reinterpret_cast<const char*>(realData->unigram_obj->remap_string)) + 1);

  // fast_state is optional: it has to be present in both data sets or in
  // neither, and when present its contents must match.
  const bool loadedHasFastState = loadedData->unigram_obj->fast_state != NULL;
  const bool realHasFastState = realData->unigram_obj->fast_state != NULL;
  if (realHasFastState && !loadedHasFastState) {
    std::cerr << "unigram_obj->fast_state is missing." << std::endl;
    return false;
  }
  if (loadedHasFastState && !realHasFastState) {
    std::cerr << "unigram_obj->fast_state shouldn't be present." << std::endl;
    return false;
  }
  if (loadedHasFastState) {
    // Length includes the null terminator.
    CHECK_MEM_EQUALS(unigram_obj->fast_state,
        strlen(reinterpret_cast<const char*>(realData->unigram_obj->fast_state)) + 1);
  }
  if (DEBUG) { std::cout << "verified." << std::endl; }

  // 2. Expected-score array
  if (DEBUG) { std::cout << "Verifying kExpectedScore... "; }
  CHECK_MEM_EQUALS(kExpectedScore, 614*4); // TODO: Don't hardcode 614*4.
  if (DEBUG) { std::cout << "verified." << std::endl; }

  // 3. Each table
  for (int tableIndex = 0; tableIndex < NUM_TABLES; ++tableIndex) {
    if (DEBUG) { std::cout << "Verifying table " << (tableIndex+1) << "... "; }

    // Shadow the outer names so the CHECK_* macros operate on this table.
    const CLD2::CLD2TableSummary* realData = realTableSummaries[tableIndex];
    const CLD2::CLD2TableSummary* loadedData = loadedTableSummaries[tableIndex];

    // We need to calculate the table lengths to do the memcmp
    const CLD2::uint32 tableSizeBytes =
        sizeof(CLD2::IndirectProbBucket4) * realData->kCLDTableSize;

    // XXX HACK FIXME: cld2_generated_cjk_compatible.cc (table 0) has a
    // kCLDTableSizeOne of zero, so its indirect table size is hard-coded.
    const CLD2::uint32 indirectTableSizeBytes = (tableIndex == 0)
        ? 239*2*4
        : realData->kCLDTableSizeOne * sizeof(CLD2::uint32);

    // Length includes the null terminator.
    const CLD2::uint32 recognizedScriptsSizeBytes =
        strlen(realData->kRecognizedLangScripts) + 1;

    // Verify the table data
    CHECK_EQUALS(kCLDTableSizeOne);
    CHECK_EQUALS(kCLDTableSize);
    CHECK_EQUALS(kCLDTableKeyMask);
    CHECK_EQUALS(kCLDTableBuildDate);
    CHECK_MEM_EQUALS(kCLDTable, tableSizeBytes);
    CHECK_MEM_EQUALS(kCLDTableInd, indirectTableSizeBytes);
    CHECK_MEM_EQUALS(kRecognizedLangScripts, recognizedScriptsSizeBytes);
    if (DEBUG) { std::cout << "verified." << std::endl; }
  }

  if (DEBUG) { std::cout << "All data verified successfully." << std::endl; }
  return true;
}
|
||||
|
||||
// Reports whether this platform stores the least-significant byte of a 32-bit
// integer at the lowest address (little-endian).
//
// The check inspects the first byte of a known constant through an
// `unsigned char` pointer, which C++ permits for any object; reading the
// inactive member of a union (the previous approach) is undefined behaviour
// in C++.
//
// As noted on http://stackoverflow.com/questions/1001307, compilers are highly
// likely to fold this kind of check into a constant, so branches that depend
// on it can be eliminated at compile time.
bool isLittleEndian() {
  const uint32_t probe = 0x01020304;
  const unsigned char* const firstByte =
      reinterpret_cast<const unsigned char*>(&probe);
  // On a little-endian machine the low-order byte (0x04) comes first.
  return *firstByte == 4;
}
|
||||
|
||||
bool coreAssumptionsOk() {
|
||||
if (sizeof(CLD2::uint8) != 1) {
|
||||
std::cerr << "uint8 is " << (sizeof(CLD2::uint8) * 8)
|
||||
<< " bits instead of 8!" << std::endl;
|
||||
return false;
|
||||
}
|
||||
if (sizeof(CLD2::uint16) != 2) {
|
||||
std::cerr << "uint16 is " << (sizeof(CLD2::uint16) * 8)
|
||||
<< " bits instead of 16!" << std::endl;
|
||||
return false;
|
||||
}
|
||||
if (sizeof(CLD2::uint32) != 4) {
|
||||
std::cerr << "uint32 is " << (sizeof(CLD2::uint32) * 8)
|
||||
<< " bits instead of 32!" << std::endl;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // End namespace CLD2DynamicData
|
216
internal/cld2_dynamic_data.h
Normal file
216
internal/cld2_dynamic_data.h
Normal file
@@ -0,0 +1,216 @@
|
||||
// Copyright 2014 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_
|
||||
#define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_
|
||||
|
||||
#include "integral_types.h"
|
||||
#include "cld2tablesummary.h"
|
||||
#include "utf8statetable.h"
|
||||
#include "scoreonescriptspan.h"
|
||||
|
||||
/*
|
||||
There are two primary parts to a CLD2 dynamic data file:
|
||||
1. A header, wherein trivial data, block lengths and block offsets are kept
|
||||
2. A data block, wherein the large binary blocks are kept
|
||||
|
||||
By reading the header, an application can determine the offsets and lengths of
|
||||
all the data blocks for all tables. Offsets in the header are expressed
|
||||
relative to the first byte of the file, inclusive of the header itself; thus,
|
||||
any offset whose value is less than the length of the header is invalid.
|
||||
|
||||
Any offset whose value is zero indicates a field that is null in the
|
||||
underlying CLD2 data; a real example of this is the fast_state field of the
|
||||
UTF8PropObj, which may be null.
|
||||
|
||||
The size of the header can be precalculated by calling calculateHeaderSize(),
|
||||
which will indicate the exact size of the header for a data file that contains
|
||||
a given number of CLD2TableSummary objects.
|
||||
|
||||
Notes on endianness:
|
||||
The data format is only suitable for little-endian machines. For big-endian
|
||||
systems, a tedious transformation would need to be made first to reverse the
|
||||
byte order of significant portions of the binary - not just the lengths, but
|
||||
also some of the underlying table data.
|
||||
|
||||
Note on 32/64 bit:
|
||||
The data format is agnostic to 32/64 bit pointers. All the offsets within the
|
||||
data blob itself are 32-bit values relative to the start of the file, and the
|
||||
file should certainly never be gigabytes in size!
|
||||
When the file is ultimately read by the loading code and mmap()'d, new
|
||||
pointers are generated at whatever size the system uses, initialized to the
|
||||
start of the mmap, and incremented by the 32-bit offset. This should be safe
|
||||
regardless of 32- or 64-bit architectures.
|
||||
|
||||
--------------------------------------------------------------------
|
||||
FIELD
|
||||
--------------------------------------------------------------------
|
||||
DATA_FILE_MARKER (no null terminator)
|
||||
total file size (sanity check, uint32)
|
||||
--------------------------------------------------------------------
|
||||
UTF8PropObj: const uint32 state0
|
||||
UTF8PropObj: const uint32 state0_size
|
||||
UTF8PropObj: const uint32 total_size
|
||||
UTF8PropObj: const int max_expand
|
||||
UTF8PropObj: const int entry_shift (coerced to 32 bits)
|
||||
UTF8PropObj: const int bytes_per_entry (coerced to 32 bits)
|
||||
UTF8PropObj: const uint32 losub
|
||||
UTF8PropObj: const uint32 hiadd
|
||||
offset of UTF8PropObj: const uint8* state_table
|
||||
length of UTF8PropObj: const uint8* state_table
|
||||
offset of UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
|
||||
length of UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
|
||||
offset of UTF8PropObj: const uint8* remap_string
|
||||
length of UTF8PropObj: const uint8* remap_string
|
||||
offset of UTF8PropObj: const uint8* fast_state
|
||||
length of UTF8PropObj: const uint8* fast_state
|
||||
--------------------------------------------------------------------
|
||||
start of const short kAvgDeltaOctaScore[]
|
||||
length of const short kAvgDeltaOctaScore[]
|
||||
--------------------------------------------------------------------
|
||||
number of CLD2TableSummary objects encoded (n)
|
||||
[Table 1]: CLD2TableSummary: uint32 kCLDTableSizeOne
|
||||
[Table 1]: CLD2TableSummary: uint32 kCLDTableSize
|
||||
[Table 1]: CLD2TableSummary: uint32 kCLDTableKeyMask
|
||||
[Table 1]: CLD2TableSummary: uint32 kCLDTableBuildDate
|
||||
[Table 1]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
|
||||
[Table 1]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
|
||||
[Table 1]: offset of CLD2TableSummary: const uint32* kCLDTableInd
|
||||
[Table 1]: length of CLD2TableSummary: const uint32* kCLDTableInd
|
||||
[Table 1]: offset of CLD2TableSummary: const char* kRecognizedLangScripts
|
||||
[Table 1]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1
|
||||
.
|
||||
.
|
||||
.
|
||||
[Table n]: CLD2TableSummary: uint32 kCLDTableSizeOne
|
||||
[Table n]: CLD2TableSummary: uint32 kCLDTableSize
|
||||
[Table n]: CLD2TableSummary: uint32 kCLDTableKeyMask
|
||||
[Table n]: CLD2TableSummary: uint32 kCLDTableBuildDate
|
||||
[Table n]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
|
||||
[Table n]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable
|
||||
[Table n]: offset of CLD2TableSummary: const uint32* kCLDTableInd
|
||||
[Table n]: length of CLD2TableSummary: const uint32* kCLDTableInd
|
||||
[Table n]: offset of CLD2TableSummary: const char* kRecognizedLangScripts
|
||||
[Table n]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1
|
||||
--------------------------------------------------------------------
|
||||
|
||||
|
||||
Immediately after the header fields comes the data block. The data block has
|
||||
the following content, in this order (note that padding is applied in order to
|
||||
keep lookups word-aligned):
|
||||
|
||||
UTF8PropObj: const uint8* state_table
|
||||
UTF8PropObj: const RemapEntry* remap_base (4-byte struct)
|
||||
UTF8PropObj: const uint8* remap_string
|
||||
UTF8PropObj: const uint8* fast_state
|
||||
const short kAvgDeltaOctaScore[]
|
||||
[Table 1]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable
|
||||
[Table 1]: CLD2TableSummary: const uint32* kCLDTableInd
|
||||
[Table 1]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator)
|
||||
.
|
||||
.
|
||||
.
|
||||
[Table n]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable
|
||||
[Table n]: CLD2TableSummary: const uint32* kCLDTableInd
|
||||
[Table n]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator)
|
||||
|
||||
|
||||
It is STRONGLY recommended that the chunks within the data block be kept
|
||||
128-bit aligned for efficiency reasons, although the code will work without
|
||||
such alignment: the main lookup tables have randomly-accessed groups of four
|
||||
4-byte entries, and these must be 16-byte aligned to avoid the performance
|
||||
cost of multiple cache misses per group.
|
||||
*/
|
||||
namespace CLD2DynamicData {
|
||||
|
||||
// Number of marker bytes stored at the very start of a data file. The marker
// is written without a null terminator. Keep aligned to 128 bits.
static const int DATA_FILE_MARKER_LENGTH = 16;

// Marker text that identifies a CLD2 dynamic data file.
static const char* DATA_FILE_MARKER = "cld2_data_file00";
|
||||
|
||||
// Nicer version of memcmp that shows the offset at which bytes differ
|
||||
bool mem_compare(const void* data1, const void* data2, const int length);
|
||||
|
||||
// Enable or disable debugging; 0 to disable, 1 to enable
|
||||
void setDebug(int debug);
|
||||
|
||||
// Lower-level structure for individual tables. There are n table headers in
|
||||
// a given file header.
|
||||
typedef struct {
|
||||
CLD2::uint32 kCLDTableSizeOne;
|
||||
CLD2::uint32 kCLDTableSize;
|
||||
CLD2::uint32 kCLDTableKeyMask;
|
||||
CLD2::uint32 kCLDTableBuildDate;
|
||||
CLD2::uint32 startOf_kCLDTable;
|
||||
CLD2::uint32 lengthOf_kCLDTable;
|
||||
CLD2::uint32 startOf_kCLDTableInd;
|
||||
CLD2::uint32 lengthOf_kCLDTableInd;
|
||||
CLD2::uint32 startOf_kRecognizedLangScripts;
|
||||
CLD2::uint32 lengthOf_kRecognizedLangScripts;
|
||||
} TableHeader;
|
||||
|
||||
|
||||
// Top-level structure for a CLD2 Data File Header.
|
||||
// Contains all the primitive fields for the header as well as an array of
|
||||
// headers for the individual tables.
|
||||
typedef struct {
|
||||
// Marker fields help recognize and verify the data file
|
||||
char sanityString[DATA_FILE_MARKER_LENGTH];
|
||||
CLD2::uint32 totalFileSizeBytes;
|
||||
|
||||
// UTF8 primitives
|
||||
CLD2::uint32 utf8PropObj_state0;
|
||||
CLD2::uint32 utf8PropObj_state0_size;
|
||||
CLD2::uint32 utf8PropObj_total_size;
|
||||
CLD2::uint32 utf8PropObj_max_expand;
|
||||
CLD2::uint32 utf8PropObj_entry_shift;
|
||||
CLD2::uint32 utf8PropObj_bytes_per_entry;
|
||||
CLD2::uint32 utf8PropObj_losub;
|
||||
CLD2::uint32 utf8PropObj_hiadd;
|
||||
CLD2::uint32 startOf_utf8PropObj_state_table;
|
||||
CLD2::uint32 lengthOf_utf8PropObj_state_table;
|
||||
CLD2::uint32 startOf_utf8PropObj_remap_base;
|
||||
CLD2::uint32 lengthOf_utf8PropObj_remap_base;
|
||||
CLD2::uint32 startOf_utf8PropObj_remap_string;
|
||||
CLD2::uint32 lengthOf_utf8PropObj_remap_string;
|
||||
CLD2::uint32 startOf_utf8PropObj_fast_state;
|
||||
CLD2::uint32 lengthOf_utf8PropObj_fast_state;
|
||||
|
||||
// Average delta-octa-score bits
|
||||
CLD2::uint32 startOf_kAvgDeltaOctaScore;
|
||||
CLD2::uint32 lengthOf_kAvgDeltaOctaScore;
|
||||
|
||||
// Table bits
|
||||
CLD2::uint32 numTablesEncoded;
|
||||
TableHeader* tableHeaders;
|
||||
} FileHeader;
|
||||
|
||||
// Calculate the exact size of a header that encodes the specified number of
|
||||
// tables. This can be used to reserve space within the data file,
|
||||
// calculate offsets, and so on.
|
||||
CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables);
|
||||
|
||||
// Dump a given header to stdout as a human-readable string.
|
||||
void dumpHeader(FileHeader* header);
|
||||
|
||||
// Verify that a given pair of scoring tables match precisely
|
||||
// Returns true if they match; otherwise reports the first mismatch on stderr
// and returns false.
|
||||
bool verify(const CLD2::ScoringTables* realData, const CLD2::ScoringTables* loadedData);
|
||||
|
||||
// Return true iff the program is running in little-endian mode.
|
||||
bool isLittleEndian();
|
||||
|
||||
// Return true iff the core size assumptions are ok on this platform.
|
||||
bool coreAssumptionsOk();
|
||||
|
||||
} // End namespace CLD2DynamicData
|
||||
#endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_
|
376
internal/cld2_dynamic_data_extractor.cc
Normal file
376
internal/cld2_dynamic_data_extractor.cc
Normal file
@@ -0,0 +1,376 @@
|
||||
// Copyright 2014 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cld2_dynamic_data.h"
|
||||
#include "cld2_dynamic_data_extractor.h"
|
||||
#include "cld2_dynamic_data_loader.h" // for verifying the written data
|
||||
#include "integral_types.h"
|
||||
#include "cld2tablesummary.h"
|
||||
#include "utf8statetable.h"
|
||||
|
||||
using namespace std;
|
||||
namespace CLD2DynamicDataExtractor {
|
||||
// File-local verbosity flag. Zero keeps the extractor quiet; any non-zero
// value makes the writing routines in this file print progress to stdout.
static int DEBUG = 0;

// Stores the requested verbosity level in the file-local DEBUG flag.
void setDebug(int debug)
{
  DEBUG = debug;
}
|
||||
|
||||
int advance(FILE* f, CLD2::uint32 position) {
|
||||
const char ZERO = 0;
|
||||
int pad = position - ftell(f);
|
||||
if (DEBUG) cout << " Adding " << pad << " bytes of padding" << endl;
|
||||
while (pad-- > 0) {
|
||||
fwrite(&ZERO,1,1,f);
|
||||
}
|
||||
return pad;
|
||||
}
|
||||
|
||||
void writeChunk(FILE *f, const void* data, CLD2::uint32 startAt, CLD2::uint32 length) {
|
||||
if (DEBUG) cout << "Write chunk @" << startAt << ", len=" << length << endl;
|
||||
advance(f, startAt);
|
||||
if (DEBUG) cout << " Writing " << length << " bytes of data" << endl;
|
||||
fwrite(data, 1, length, f);
|
||||
}
|
||||
|
||||
void writeDataFile(const CLD2::ScoringTables* data, const char* fileName) {
|
||||
// The order here is hardcoded and MUST NOT BE CHANGED, else you will de-sync
|
||||
// with the reading code.
|
||||
const char ZERO = 0;
|
||||
const int NUM_TABLES = 7;
|
||||
const CLD2::CLD2TableSummary* tableSummaries[NUM_TABLES];
|
||||
tableSummaries[0] = data->unigram_compat_obj;
|
||||
tableSummaries[1] = data->deltabi_obj;
|
||||
tableSummaries[2] = data->distinctbi_obj;
|
||||
tableSummaries[3] = data->quadgram_obj;
|
||||
tableSummaries[4] = data->quadgram_obj2;
|
||||
tableSummaries[5] = data->deltaocta_obj;
|
||||
tableSummaries[6] = data->distinctocta_obj;
|
||||
|
||||
CLD2DynamicData::TableHeader tableHeaders[NUM_TABLES];
|
||||
CLD2DynamicData::FileHeader fileHeader;
|
||||
fileHeader.numTablesEncoded = NUM_TABLES;
|
||||
fileHeader.tableHeaders = tableHeaders;
|
||||
initUtf8Headers(&fileHeader, data->unigram_obj);
|
||||
initDeltaHeaders(&fileHeader, data->kExpectedScore);
|
||||
initTableHeaders(tableSummaries, NUM_TABLES, tableHeaders);
|
||||
alignAll(&fileHeader, 16); // Align all sections to 128-bit boundaries
|
||||
|
||||
// We are ready to rock.
|
||||
for (int x=0; x<CLD2DynamicData::DATA_FILE_MARKER_LENGTH; x++)
|
||||
fileHeader.sanityString[x] = CLD2DynamicData::DATA_FILE_MARKER[x];
|
||||
FILE* outFile = fopen(fileName, "w");
|
||||
fwrite(fileHeader.sanityString, 1, CLD2DynamicData::DATA_FILE_MARKER_LENGTH, outFile);
|
||||
fwrite(&(fileHeader.totalFileSizeBytes), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.utf8PropObj_state0), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.utf8PropObj_state0_size), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.utf8PropObj_total_size), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.utf8PropObj_max_expand), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.utf8PropObj_entry_shift), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.utf8PropObj_bytes_per_entry), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.utf8PropObj_losub), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.utf8PropObj_hiadd), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.startOf_utf8PropObj_state_table), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.lengthOf_utf8PropObj_state_table), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.startOf_utf8PropObj_remap_base), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.lengthOf_utf8PropObj_remap_base), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.startOf_utf8PropObj_remap_string), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.lengthOf_utf8PropObj_remap_string), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.startOf_utf8PropObj_fast_state), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.lengthOf_utf8PropObj_fast_state), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.startOf_kAvgDeltaOctaScore), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.lengthOf_kAvgDeltaOctaScore), 4, 1, outFile);
|
||||
fwrite(&(fileHeader.numTablesEncoded), 4, 1, outFile);
|
||||
for (int x=0; x<NUM_TABLES; x++) {
|
||||
CLD2DynamicData::TableHeader& tHeader = fileHeader.tableHeaders[x];
|
||||
fwrite(&(tHeader.kCLDTableSizeOne), 4, 1, outFile);
|
||||
fwrite(&(tHeader.kCLDTableSize), 4, 1, outFile);
|
||||
fwrite(&(tHeader.kCLDTableKeyMask), 4, 1, outFile);
|
||||
fwrite(&(tHeader.kCLDTableBuildDate), 4, 1, outFile);
|
||||
fwrite(&(tHeader.startOf_kCLDTable), 4, 1, outFile);
|
||||
fwrite(&(tHeader.lengthOf_kCLDTable), 4, 1, outFile);
|
||||
fwrite(&(tHeader.startOf_kCLDTableInd), 4, 1, outFile);
|
||||
fwrite(&(tHeader.lengthOf_kCLDTableInd), 4, 1, outFile);
|
||||
fwrite(&(tHeader.startOf_kRecognizedLangScripts), 4, 1, outFile);
|
||||
fwrite(&(tHeader.lengthOf_kRecognizedLangScripts), 4, 1, outFile);
|
||||
}
|
||||
|
||||
// Write data blob
|
||||
// 1. UTF8 Object
|
||||
writeChunk(outFile,
|
||||
data->unigram_obj->state_table,
|
||||
fileHeader.startOf_utf8PropObj_state_table,
|
||||
fileHeader.lengthOf_utf8PropObj_state_table);
|
||||
// FIXME: Unsafe to rely on this since RemapEntry is not a bit-packed structure
|
||||
writeChunk(outFile,
|
||||
data->unigram_obj->remap_base,
|
||||
fileHeader.startOf_utf8PropObj_remap_base,
|
||||
fileHeader.lengthOf_utf8PropObj_remap_base);
|
||||
writeChunk(outFile,
|
||||
data->unigram_obj->remap_string,
|
||||
fileHeader.startOf_utf8PropObj_remap_string,
|
||||
fileHeader.lengthOf_utf8PropObj_remap_string - 1);
|
||||
fwrite(&ZERO,1,1,outFile); // null terminator
|
||||
if (fileHeader.startOf_utf8PropObj_fast_state > 0) {
|
||||
writeChunk(outFile,
|
||||
data->unigram_obj->fast_state,
|
||||
fileHeader.startOf_utf8PropObj_fast_state,
|
||||
fileHeader.lengthOf_utf8PropObj_fast_state - 1);
|
||||
fwrite(&ZERO,1,1,outFile); // null terminator
|
||||
}
|
||||
|
||||
// 2. kAvgDeltaOctaScore array
|
||||
writeChunk(outFile,
|
||||
data->kExpectedScore,
|
||||
fileHeader.startOf_kAvgDeltaOctaScore,
|
||||
fileHeader.lengthOf_kAvgDeltaOctaScore);
|
||||
|
||||
// 3. Each table
|
||||
for (int x=0; x<NUM_TABLES; x++) {
|
||||
const CLD2::CLD2TableSummary* summary = tableSummaries[x];
|
||||
CLD2DynamicData::TableHeader& tHeader = fileHeader.tableHeaders[x];
|
||||
// NB: Safe to directly write IndirectProbBucket4 as it is just an alias for CLD2::uint32
|
||||
writeChunk(outFile,
|
||||
summary->kCLDTable,
|
||||
tHeader.startOf_kCLDTable,
|
||||
tHeader.lengthOf_kCLDTable);
|
||||
writeChunk(outFile,
|
||||
summary->kCLDTableInd,
|
||||
tHeader.startOf_kCLDTableInd,
|
||||
tHeader.lengthOf_kCLDTableInd);
|
||||
writeChunk(outFile,
|
||||
summary->kRecognizedLangScripts,
|
||||
tHeader.startOf_kRecognizedLangScripts,
|
||||
tHeader.lengthOf_kRecognizedLangScripts - 1);
|
||||
fwrite(&ZERO,1,1,outFile); // null terminator
|
||||
}
|
||||
fclose(outFile);
|
||||
}
|
||||
|
||||
// Fills in the primitive fields and the block lengths of `numSummaries` table
// headers from the corresponding table summaries.
//
// summaries:    array of `numSummaries` table summary pointers.
// numSummaries: number of entries in both arrays.
// tableHeaders: output array; entry i is filled from summaries[i].
//
// Only the lengthOf_* fields are set here; the startOf_* offsets are not
// touched by this function.
void initTableHeaders(const CLD2::CLD2TableSummary** summaries,
    int numSummaries, CLD2DynamicData::TableHeader* tableHeaders) {
  for (int index = 0; index < numSummaries; ++index) {
    const CLD2::CLD2TableSummary& source = *summaries[index];
    CLD2DynamicData::TableHeader* const target = &tableHeaders[index];

    // Copy the primitive bits
    target->kCLDTableSizeOne = source.kCLDTableSizeOne;
    target->kCLDTableSize = source.kCLDTableSize;
    target->kCLDTableKeyMask = source.kCLDTableKeyMask;
    target->kCLDTableBuildDate = source.kCLDTableBuildDate;

    // Calculate size information
    const CLD2::uint32 tableSizeBytes =
        sizeof(CLD2::IndirectProbBucket4) * source.kCLDTableSize;

    // XXX HACK FIXME: cld2_generated_cjk_compatible.cc (table 0) has a
    // kCLDTableSizeOne of zero, so its indirect table size is hard-coded.
    const CLD2::uint32 indirectTableSizeBytes = (index == 0)
        ? 239*2*4
        : source.kCLDTableSizeOne * sizeof(CLD2::uint32);

    // The stored length counts the null terminator.
    const CLD2::uint32 recognizedScriptsSizeBytes =
        strlen(source.kRecognizedLangScripts) + 1;

    // Place size information into header. We'll align on byte boundaries later.
    target->lengthOf_kCLDTable = tableSizeBytes;
    target->lengthOf_kCLDTableInd = indirectTableSizeBytes;
    target->lengthOf_kRecognizedLangScripts = recognizedScriptsSizeBytes;
  }
}
|
||||
|
||||
// Assuming that all fields have been set in the specified header, re-align
|
||||
// the starting positions of all data chunks to be aligned along 64-bit
|
||||
// boundaries for maximum efficiency.
|
||||
void alignAll(CLD2DynamicData::FileHeader* header, int alignment) {
|
||||
CLD2::uint32 totalPadding = 0;
|
||||
if (DEBUG) { std::cout << "Align for " << (alignment*8) << " bits." << std::endl; }
|
||||
CLD2::uint32 headerSize = CLD2DynamicData::calculateHeaderSize(
|
||||
header->numTablesEncoded);
|
||||
CLD2::uint32 offset = headerSize;
|
||||
|
||||
{ // scoping block
|
||||
int stateTablePad = alignment - (offset % alignment);
|
||||
if (stateTablePad == alignment) stateTablePad = 0;
|
||||
totalPadding += stateTablePad;
|
||||
if (DEBUG) { std::cout << "Alignment for stateTable adjusted by " << stateTablePad << std::endl; }
|
||||
offset += stateTablePad;
|
||||
header->startOf_utf8PropObj_state_table = offset;
|
||||
offset += header->lengthOf_utf8PropObj_state_table;
|
||||
}
|
||||
|
||||
{ // scoping block
|
||||
int remapPad = alignment - (offset % alignment);
|
||||
if (remapPad == alignment) remapPad = 0;
|
||||
totalPadding += remapPad;
|
||||
if (DEBUG) { std::cout << "Alignment for remap adjusted by " << remapPad << std::endl; }
|
||||
offset += remapPad;
|
||||
header->startOf_utf8PropObj_remap_base = offset;
|
||||
offset += header->lengthOf_utf8PropObj_remap_base;
|
||||
}
|
||||
|
||||
{ // scoping block
|
||||
int remapStringPad = alignment - (offset % alignment);
|
||||
if (remapStringPad == alignment) remapStringPad = 0;
|
||||
totalPadding += remapStringPad;
|
||||
if (DEBUG) { std::cout << "Alignment for remapString adjusted by " << remapStringPad << std::endl; }
|
||||
offset += remapStringPad;
|
||||
header->startOf_utf8PropObj_remap_string = offset;
|
||||
offset += header->lengthOf_utf8PropObj_remap_string; // null terminator already counted in initUtf8Headers
|
||||
}
|
||||
|
||||
{ // scoping block
|
||||
int fastStatePad = alignment - (offset % alignment);
|
||||
if (fastStatePad == alignment) fastStatePad = 0;
|
||||
totalPadding += fastStatePad;
|
||||
if (DEBUG) { std::cout << "Alignment for fastState adjusted by " << fastStatePad << std::endl; }
|
||||
offset += fastStatePad;
|
||||
if (header->lengthOf_utf8PropObj_fast_state > 0) {
|
||||
header->startOf_utf8PropObj_fast_state = offset;
|
||||
offset += header->lengthOf_utf8PropObj_fast_state; // null terminator already counted in initUtf8Headers
|
||||
} else {
|
||||
header->startOf_utf8PropObj_fast_state = 0;
|
||||
}
|
||||
}
|
||||
|
||||
{ // scoping block
|
||||
int deltaOctaPad = alignment - (offset % alignment);
|
||||
if (deltaOctaPad == alignment) deltaOctaPad = 0;
|
||||
totalPadding += deltaOctaPad;
|
||||
if (DEBUG) { std::cout << "Alignment for deltaOctaScore adjusted by " << deltaOctaPad << std::endl; }
|
||||
offset += deltaOctaPad;
|
||||
header->startOf_kAvgDeltaOctaScore = offset;
|
||||
offset += header->lengthOf_kAvgDeltaOctaScore;
|
||||
}
|
||||
|
||||
// TODO: The rest of the fields
|
||||
for (int x=0; x<header->numTablesEncoded; x++) {
|
||||
CLD2DynamicData::TableHeader& tableHeader = header->tableHeaders[x];
|
||||
int tablePad = alignment - (offset % alignment);
|
||||
if (tablePad == alignment) tablePad = 0;
|
||||
totalPadding += tablePad;
|
||||
if (DEBUG) { std::cout << "Alignment for table " << x << " adjusted by " << tablePad << std::endl; }
|
||||
offset += tablePad;
|
||||
tableHeader.startOf_kCLDTable = offset;
|
||||
offset += tableHeader.lengthOf_kCLDTable;
|
||||
|
||||
int indirectPad = alignment - (offset % alignment);
|
||||
if (indirectPad == alignment) indirectPad = 0;
|
||||
totalPadding += indirectPad;
|
||||
if (DEBUG) { std::cout << "Alignment for tableInd " << x << " adjusted by " << indirectPad << std::endl; }
|
||||
offset += indirectPad;
|
||||
tableHeader.startOf_kCLDTableInd = offset;
|
||||
offset += tableHeader.lengthOf_kCLDTableInd;
|
||||
|
||||
int scriptsPad = alignment - (offset % alignment);
|
||||
if (scriptsPad == alignment) scriptsPad = 0;
|
||||
totalPadding += scriptsPad;
|
||||
if (DEBUG) { std::cout << "Alignment for scriptsPad " << x << " adjusted by " << scriptsPad << std::endl; }
|
||||
offset += scriptsPad;
|
||||
tableHeader.startOf_kRecognizedLangScripts = offset;
|
||||
offset += tableHeader.lengthOf_kRecognizedLangScripts; // null terminator already counted in initTableHeaders
|
||||
}
|
||||
|
||||
// Now that we know exactly how much data we have written, store it in the
|
||||
// header as a sanity check
|
||||
header->totalFileSizeBytes = offset;
|
||||
|
||||
if (DEBUG) {
|
||||
std::cout << "Data aligned." << std::endl;
|
||||
std::cout << "Header size: " << headerSize << " bytes " << std::endl;
|
||||
std::cout << "Data size: " << (offset - totalPadding) << " bytes" << std::endl;
|
||||
std::cout << "Padding size: " << totalPadding << " bytes" << std::endl;
|
||||
|
||||
std::cout << " cld_generated_CjkUni_obj: " << (
|
||||
header->lengthOf_utf8PropObj_state_table +
|
||||
header->lengthOf_utf8PropObj_remap_string +
|
||||
header->lengthOf_utf8PropObj_fast_state)
|
||||
<< " bytes " << std::endl;
|
||||
std::cout << " kAvgDeltaOctaScore: "
|
||||
<< header->lengthOf_kAvgDeltaOctaScore << " bytes " << std::endl;
|
||||
std::cout << " kCjkCompat_obj: " << (
|
||||
header->tableHeaders[0].lengthOf_kCLDTable +
|
||||
header->tableHeaders[0].lengthOf_kCLDTableInd +
|
||||
header->tableHeaders[0].lengthOf_kRecognizedLangScripts + 1)
|
||||
<< " bytes " << std::endl;
|
||||
std::cout << " kCjkDeltaBi_obj: " << (
|
||||
header->tableHeaders[1].lengthOf_kCLDTable +
|
||||
header->tableHeaders[1].lengthOf_kCLDTableInd +
|
||||
header->tableHeaders[1].lengthOf_kRecognizedLangScripts + 1)
|
||||
<< " bytes " << std::endl;
|
||||
std::cout << " kDistinctBiTable_obj: " << (
|
||||
header->tableHeaders[2].lengthOf_kCLDTable +
|
||||
header->tableHeaders[2].lengthOf_kCLDTableInd +
|
||||
header->tableHeaders[2].lengthOf_kRecognizedLangScripts + 1)
|
||||
<< " bytes " << std::endl;
|
||||
std::cout << " kQuad_obj: " << (
|
||||
header->tableHeaders[3].lengthOf_kCLDTable +
|
||||
header->tableHeaders[3].lengthOf_kCLDTableInd +
|
||||
header->tableHeaders[3].lengthOf_kRecognizedLangScripts + 1)
|
||||
<< " bytes " << std::endl;
|
||||
std::cout << " kQuad_obj2: " << (
|
||||
header->tableHeaders[4].lengthOf_kCLDTable +
|
||||
header->tableHeaders[4].lengthOf_kCLDTableInd +
|
||||
header->tableHeaders[4].lengthOf_kRecognizedLangScripts + 1)
|
||||
<< " bytes " << std::endl;
|
||||
std::cout << " kDeltaOcta_obj: " << (
|
||||
header->tableHeaders[5].lengthOf_kCLDTable +
|
||||
header->tableHeaders[5].lengthOf_kCLDTableInd +
|
||||
header->tableHeaders[5].lengthOf_kRecognizedLangScripts + 1)
|
||||
<< " bytes " << std::endl;
|
||||
std::cout << " kDistinctOcta_obj: " << (
|
||||
header->tableHeaders[6].lengthOf_kCLDTable +
|
||||
header->tableHeaders[6].lengthOf_kCLDTableInd +
|
||||
header->tableHeaders[6].lengthOf_kRecognizedLangScripts + 1)
|
||||
<< " bytes " << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Records the length of the kAvgDeltaOctaScore chunk in the file header.
// 'deltaArray' is currently not consulted; the length is a fixed constant.
// NOTE(review): deltaArray is an array of short. If lengthOf_* is a byte
// count, 614 * 4 would cover only part of a 614*4-element array - confirm
// against the code that writes and verifies this chunk.
void initDeltaHeaders(CLD2DynamicData::FileHeader* header, const short* deltaArray) {
  // TODO: Don't hardcode 614*4. Get constant from generated_language.cc?
  // Value taken from cld_generated_score_quad_octa_1024_256.cc
  const int kDeltaLength = 614 * 4;
  header->lengthOf_kAvgDeltaOctaScore = kDeltaLength;
  // The start offset is left at zero in this function.
  header->startOf_kAvgDeltaOctaScore = 0;
}
|
||||
|
||||
// Copies the scalar fields of the UTF8 property object into the file header
// and records the byte length of each of its variable-sized parts.
// NOTE(review): the remap_string and fast_state lengths are measured with
// strlen(), i.e. up to the first zero byte - confirm that both are really
// null-terminated strings without embedded zeros.
void initUtf8Headers(CLD2DynamicData::FileHeader* header, const CLD2::UTF8PropObj* utf8Object) {
  const CLD2::UTF8PropObj& source = *utf8Object;

  // Scalar fields
  header->utf8PropObj_state0 = source.state0;
  header->utf8PropObj_state0_size = source.state0_size;
  header->utf8PropObj_total_size = source.total_size;
  header->utf8PropObj_max_expand = source.max_expand;
  header->utf8PropObj_entry_shift = source.entry_shift;
  header->utf8PropObj_bytes_per_entry = source.bytes_per_entry;
  header->utf8PropObj_losub = source.losub;
  header->utf8PropObj_hiadd = source.hiadd;

  // Chunk lengths
  header->lengthOf_utf8PropObj_state_table = source.total_size;
  // TODO: Can this ever have more than one entry?
  header->lengthOf_utf8PropObj_remap_base = sizeof(CLD2::RemapEntry);

  const char* remapText = reinterpret_cast<const char*>(source.remap_string);
  header->lengthOf_utf8PropObj_remap_string =
      strlen(remapText) + 1;  // note null terminator

  if (source.fast_state != NULL) {
    const char* fastText = reinterpret_cast<const char*>(source.fast_state);
    header->lengthOf_utf8PropObj_fast_state =
        strlen(fastText) + 1;  // note null terminator
  } else {
    header->lengthOf_utf8PropObj_fast_state = 0;  // not applicable
  }
}
|
||||
} // End namespace CLD2DynamicDataExtractor
|
54
internal/cld2_dynamic_data_extractor.h
Normal file
54
internal/cld2_dynamic_data_extractor.h
Normal file
@@ -0,0 +1,54 @@
|
||||
// Copyright 2014 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_EXTRACTOR_H_
|
||||
#define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_EXTRACTOR_H_
|
||||
|
||||
#include "cld2_dynamic_data.h"
|
||||
#include "integral_types.h"
|
||||
#include "cld2tablesummary.h"
|
||||
#include "utf8statetable.h"
|
||||
#include "scoreonescriptspan.h"
|
||||
|
||||
namespace CLD2DynamicDataExtractor {
|
||||
|
||||
// Enable or disable debugging; 0 to disable, 1 to enable
|
||||
void setDebug(int debug);
|
||||
|
||||
// Populates all the UTF8-related fields of the header: the scalar properties
|
||||
// and the lengths of the variable-sized parts. Nothing is returned.
|
||||
void initUtf8Headers(CLD2DynamicData::FileHeader* header,
|
||||
const CLD2::UTF8PropObj* utf8Object);
|
||||
|
||||
// Populates all the AvgDeltaOctaScore-related fields of the header.
// The implementation currently uses a fixed length and ignores deltaArray.
|
||||
void initDeltaHeaders(CLD2DynamicData::FileHeader* header,
|
||||
const short* deltaArray);
|
||||
|
||||
// Populates the scalar and length fields of the table headers for the
// specified table summaries; start offsets are assigned by alignAll().
|
||||
// Tables are laid out back-to-back in the order that they are specified in the
|
||||
// input array of summaries, and the headers are filled in in the same order.
|
||||
void initTableHeaders(const CLD2::CLD2TableSummary** summaries,
|
||||
int numSummaries, CLD2DynamicData::TableHeader* tableSummaryHeaders);
|
||||
|
||||
// Align all entries in the data block along boundaries that are multiples of
|
||||
// the specified number of bytes. For example, to align everything along 64-bit
|
||||
// boundaries, pass an alignment of 8 (bytes).
|
||||
void alignAll(CLD2DynamicData::FileHeader* header, int alignment);
|
||||
|
||||
// Write the dynamic data file to disk.
|
||||
void writeDataFile(const CLD2::ScoringTables* data, const char* fileName);
|
||||
|
||||
|
||||
} // End namespace CLD2DynamicDataExtractor
|
||||
#endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_EXTRACTOR_H_
|
212
internal/cld2_dynamic_data_loader.cc
Normal file
212
internal/cld2_dynamic_data_loader.cc
Normal file
@@ -0,0 +1,212 @@
|
||||
// Copyright 2014 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#include <fstream>
#include <iostream>
|
||||
|
||||
#include "cld2_dynamic_data.h"
|
||||
#include "cld2_dynamic_data_loader.h"
|
||||
#include "integral_types.h"
|
||||
#include "cld2tablesummary.h"
|
||||
#include "utf8statetable.h"
|
||||
#include "scoreonescriptspan.h"
|
||||
|
||||
namespace CLD2DynamicDataLoader {
|
||||
static int DEBUG=0;
|
||||
|
||||
// Reads one 4-byte header field from 'inFile' into 'field' and returns the
// number of bytes actually read (4 on success, 0 on a short read).
static int readHeaderField(FILE* inFile, void* field) {
  return 4 * static_cast<int>(fread(field, 4, 1, inFile));
}

// Closes 'inFile' and releases a header that loadHeader failed to complete,
// including its table header array (if one was allocated). Always returns
// NULL so that callers can write "return abandonHeader(...)".
static CLD2DynamicData::FileHeader* abandonHeader(
    FILE* inFile, CLD2DynamicData::FileHeader* fileHeader) {
  fclose(inFile);
  delete[] fileHeader->tableHeaders;  // allocated with new[]; NULL is fine
  delete fileHeader;
  return NULL;
}

// Reads and validates the header of the specified dynamic data file.
// Returns a newly allocated FileHeader (whose tableHeaders array is allocated
// with new[]) on success. Returns NULL if the file cannot be opened, the file
// marker is wrong, the header is shorter than expected, or the size recorded
// in the header does not match the actual file size. The file is always
// closed before returning.
// NOTE(review): numTablesEncoded comes straight from the file and is not
// bounded before the table header array is allocated.
CLD2DynamicData::FileHeader* loadHeader(const char* fileName) {
  // TODO: force null-terminate char* strings for safety
  FILE* inFile = fopen(fileName, "rb");
  if (inFile == NULL) {
    return NULL;
  }

  CLD2DynamicData::FileHeader* fileHeader = new CLD2DynamicData::FileHeader;
  // Give the fields that drive allocation/cleanup defined values so that a
  // truncated file cannot leave them uninitialized.
  fileHeader->tableHeaders = NULL;
  fileHeader->numTablesEncoded = 0;
  fileHeader->totalFileSizeBytes = 0;

  int bytesRead = 0;
  const int markerBytes = static_cast<int>(fread(
      fileHeader->sanityString, 1, CLD2DynamicData::DATA_FILE_MARKER_LENGTH, inFile));
  bytesRead += markerBytes;
  if (markerBytes != static_cast<int>(CLD2DynamicData::DATA_FILE_MARKER_LENGTH) ||
      !CLD2DynamicData::mem_compare(fileHeader->sanityString,
                                    CLD2DynamicData::DATA_FILE_MARKER,
                                    CLD2DynamicData::DATA_FILE_MARKER_LENGTH)) {
    std::cerr << "Malformed header: bad file marker!" << std::endl;
    return abandonHeader(inFile, fileHeader);
  }

  // Fixed-size fields, in on-disk order.
  bytesRead += readHeaderField(inFile, &(fileHeader->totalFileSizeBytes));
  bytesRead += readHeaderField(inFile, &(fileHeader->utf8PropObj_state0));
  bytesRead += readHeaderField(inFile, &(fileHeader->utf8PropObj_state0_size));
  bytesRead += readHeaderField(inFile, &(fileHeader->utf8PropObj_total_size));
  bytesRead += readHeaderField(inFile, &(fileHeader->utf8PropObj_max_expand));
  bytesRead += readHeaderField(inFile, &(fileHeader->utf8PropObj_entry_shift));
  bytesRead += readHeaderField(inFile, &(fileHeader->utf8PropObj_bytes_per_entry));
  bytesRead += readHeaderField(inFile, &(fileHeader->utf8PropObj_losub));
  bytesRead += readHeaderField(inFile, &(fileHeader->utf8PropObj_hiadd));
  bytesRead += readHeaderField(inFile, &(fileHeader->startOf_utf8PropObj_state_table));
  bytesRead += readHeaderField(inFile, &(fileHeader->lengthOf_utf8PropObj_state_table));
  bytesRead += readHeaderField(inFile, &(fileHeader->startOf_utf8PropObj_remap_base));
  bytesRead += readHeaderField(inFile, &(fileHeader->lengthOf_utf8PropObj_remap_base));
  bytesRead += readHeaderField(inFile, &(fileHeader->startOf_utf8PropObj_remap_string));
  bytesRead += readHeaderField(inFile, &(fileHeader->lengthOf_utf8PropObj_remap_string));
  bytesRead += readHeaderField(inFile, &(fileHeader->startOf_utf8PropObj_fast_state));
  bytesRead += readHeaderField(inFile, &(fileHeader->lengthOf_utf8PropObj_fast_state));
  bytesRead += readHeaderField(inFile, &(fileHeader->startOf_kAvgDeltaOctaScore));
  bytesRead += readHeaderField(inFile, &(fileHeader->lengthOf_kAvgDeltaOctaScore));
  bytesRead += readHeaderField(inFile, &(fileHeader->numTablesEncoded));

  // One TableHeader per encoded table.
  fileHeader->tableHeaders =
      new CLD2DynamicData::TableHeader[fileHeader->numTablesEncoded];
  for (int x=0; x<fileHeader->numTablesEncoded; x++) {
    CLD2DynamicData::TableHeader &tHeader = fileHeader->tableHeaders[x];
    bytesRead += readHeaderField(inFile, &(tHeader.kCLDTableSizeOne));
    bytesRead += readHeaderField(inFile, &(tHeader.kCLDTableSize));
    bytesRead += readHeaderField(inFile, &(tHeader.kCLDTableKeyMask));
    bytesRead += readHeaderField(inFile, &(tHeader.kCLDTableBuildDate));
    bytesRead += readHeaderField(inFile, &(tHeader.startOf_kCLDTable));
    bytesRead += readHeaderField(inFile, &(tHeader.lengthOf_kCLDTable));
    bytesRead += readHeaderField(inFile, &(tHeader.startOf_kCLDTableInd));
    bytesRead += readHeaderField(inFile, &(tHeader.lengthOf_kCLDTableInd));
    bytesRead += readHeaderField(inFile, &(tHeader.startOf_kRecognizedLangScripts));
    bytesRead += readHeaderField(inFile, &(tHeader.lengthOf_kRecognizedLangScripts));
  }

  // Confirm header size is correct.
  int expectedHeaderSize = CLD2DynamicData::calculateHeaderSize(fileHeader->numTablesEncoded);
  if (expectedHeaderSize != bytesRead) {
    std::cerr << "Header size mismatch! Expected " << expectedHeaderSize << ", but read " << bytesRead << std::endl;
    return abandonHeader(inFile, fileHeader);
  }

  // Confirm file size is correct. A failed seek/tell is reported as size -1,
  // which can never match.
  long actualSize = -1;
  if (fseek(inFile, 0, SEEK_END) == 0) {
    actualSize = ftell(inFile);
  }
  if (actualSize < 0 ||
      static_cast<unsigned long>(actualSize) != fileHeader->totalFileSizeBytes) {
    std::cerr << "File size mismatch! Expected " << fileHeader->totalFileSizeBytes << ", but found " << actualSize << std::endl;
    return abandonHeader(inFile, fileHeader);
  }

  fclose(inFile);
  return fileHeader;
}
|
||||
|
||||
// Releases everything created by loadDataFile: the malloc'd UTF8 object, the
// new[]-allocated array of table summaries (reachable through
// unigram_compat_obj, which is element 0 of that array), the ScoringTables
// object itself, and finally the memory map. On return *scoringTables and
// *mmapAddress are NULL and *mmapLength is 0. NULL arguments, and pointers
// that already hold NULL, are tolerated.
void unloadData(CLD2::ScoringTables** scoringTables, void** mmapAddress, int* mmapLength) {
  if (scoringTables != NULL && *scoringTables != NULL) {
    CLD2::ScoringTables* tables = *scoringTables;

    // unigram_obj was allocated with malloc in loadDataFile.
    free(const_cast<CLD2::UTF8PropObj*>(tables->unigram_obj));
    tables->unigram_obj = NULL;

    // tableSummaries[0] from loadDataFile: the whole array was allocated with
    // new[], so it must be released with delete[].
    delete[] tables->unigram_compat_obj;
    tables->unigram_compat_obj = NULL;

    delete tables;
    *scoringTables = NULL;
  }

  if (mmapAddress != NULL) {
    if (*mmapAddress != NULL && mmapLength != NULL) {
      munmap(*mmapAddress, *mmapLength);
    }
    *mmapAddress = NULL;
  }
  if (mmapLength != NULL) {
    *mmapLength = 0;
  }
}
|
||||
|
||||
CLD2::ScoringTables* loadDataFile(const char* fileName, void** mmapAddressOut, int* mmapLengthOut) {
|
||||
CLD2DynamicData::FileHeader* fileHeader = loadHeader(fileName);
|
||||
if (fileHeader == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Initialize the memory map
|
||||
int inFileHandle = open(fileName, O_RDONLY);
|
||||
void* mapped = mmap(NULL, fileHeader->totalFileSizeBytes,
|
||||
PROT_READ, MAP_PRIVATE, inFileHandle, 0);
|
||||
// Record the map address. This allows callers to unmap
|
||||
*mmapAddressOut=mapped;
|
||||
*mmapLengthOut=fileHeader->totalFileSizeBytes;
|
||||
close(inFileHandle);
|
||||
|
||||
// 1. UTF8 Object
|
||||
const CLD2::uint8* state_table = static_cast<const CLD2::uint8*>(mapped) +
|
||||
fileHeader->startOf_utf8PropObj_state_table;
|
||||
// FIXME: Unsafe to rely on this since RemapEntry is not a bit-packed structure
|
||||
const CLD2::RemapEntry* remap_base =
|
||||
reinterpret_cast<const CLD2::RemapEntry*>(
|
||||
static_cast<const CLD2::uint8*>(mapped) +
|
||||
fileHeader->startOf_utf8PropObj_remap_base);
|
||||
const CLD2::uint8* remap_string = static_cast<const CLD2::uint8*>(mapped) +
|
||||
fileHeader->startOf_utf8PropObj_remap_string;
|
||||
const CLD2::uint8* fast_state =
|
||||
fileHeader->startOf_utf8PropObj_fast_state == 0 ? 0 :
|
||||
static_cast<const CLD2::uint8*>(mapped) +
|
||||
fileHeader->startOf_utf8PropObj_fast_state;
|
||||
|
||||
// Populate intermediate object. Horrible casting required because the struct
|
||||
// is all read-only integers, and doesn't have a constructor. Yikes.
|
||||
// TODO: It might actually be less horrible to memcpy the data in <shudder>
|
||||
const CLD2::UTF8PropObj* unigram_obj = reinterpret_cast<CLD2::UTF8PropObj*>(malloc(sizeof(CLD2::UTF8PropObj)));
|
||||
*const_cast<CLD2::uint32*>(&unigram_obj->state0) = fileHeader->utf8PropObj_state0;
|
||||
*const_cast<CLD2::uint32*>(&unigram_obj->state0_size) = fileHeader->utf8PropObj_state0_size;
|
||||
*const_cast<CLD2::uint32*>(&unigram_obj->total_size) = fileHeader->utf8PropObj_total_size;
|
||||
*const_cast<int*>(&unigram_obj->max_expand) = fileHeader->utf8PropObj_max_expand;
|
||||
*const_cast<int*>(&unigram_obj->entry_shift) = fileHeader->utf8PropObj_entry_shift;
|
||||
*const_cast<int*>(&unigram_obj->bytes_per_entry) = fileHeader->utf8PropObj_bytes_per_entry;
|
||||
*const_cast<CLD2::uint32*>(&unigram_obj->losub) = fileHeader->utf8PropObj_losub;
|
||||
*const_cast<CLD2::uint32*>(&unigram_obj->hiadd) = fileHeader->utf8PropObj_hiadd;
|
||||
*const_cast<const CLD2::uint8**>(&unigram_obj->state_table) = state_table;
|
||||
*const_cast<const CLD2::RemapEntry**>(&unigram_obj->remap_base) = remap_base;
|
||||
*const_cast<const CLD2::uint8**>(&unigram_obj->remap_string) = remap_string;
|
||||
*const_cast<const CLD2::uint8**>(&unigram_obj->fast_state) = fast_state;
|
||||
|
||||
// 2. kAvgDeltaOctaScore array
|
||||
const short* read_kAvgDeltaOctaScore = reinterpret_cast<const short*>(
|
||||
static_cast<const CLD2::uint8*>(mapped) +
|
||||
fileHeader->startOf_kAvgDeltaOctaScore);
|
||||
|
||||
// 3. Each table
|
||||
CLD2::CLD2TableSummary* tableSummaries = new CLD2::CLD2TableSummary[fileHeader->numTablesEncoded];
|
||||
for (int x=0; x<fileHeader->numTablesEncoded; x++) {
|
||||
CLD2::CLD2TableSummary &summary = tableSummaries[x];
|
||||
CLD2DynamicData::TableHeader& tHeader = fileHeader->tableHeaders[x];
|
||||
const CLD2::IndirectProbBucket4* kCLDTable =
|
||||
reinterpret_cast<const CLD2::IndirectProbBucket4*>(
|
||||
static_cast<CLD2::uint8*>(mapped) + tHeader.startOf_kCLDTable);
|
||||
const CLD2::uint32* kCLDTableInd =
|
||||
reinterpret_cast<const CLD2::uint32*>(
|
||||
static_cast<CLD2::uint8*>(mapped) + tHeader.startOf_kCLDTableInd);
|
||||
const char* kRecognizedLangScripts =
|
||||
static_cast<const char*>(mapped) + tHeader.startOf_kRecognizedLangScripts;
|
||||
|
||||
summary.kCLDTable = kCLDTable;
|
||||
summary.kCLDTableInd = kCLDTableInd;
|
||||
summary.kCLDTableSizeOne = tHeader.kCLDTableSizeOne;
|
||||
summary.kCLDTableSize = tHeader.kCLDTableSize;
|
||||
summary.kCLDTableKeyMask = tHeader.kCLDTableKeyMask;
|
||||
summary.kCLDTableBuildDate = tHeader.kCLDTableBuildDate;
|
||||
summary.kRecognizedLangScripts = kRecognizedLangScripts;
|
||||
}
|
||||
|
||||
// Tie everything together
|
||||
CLD2::ScoringTables* result = new CLD2::ScoringTables;
|
||||
result->unigram_obj = unigram_obj;
|
||||
result->unigram_compat_obj = &tableSummaries[0];
|
||||
result->deltabi_obj = &tableSummaries[1];
|
||||
result->distinctbi_obj = &tableSummaries[2];
|
||||
result->quadgram_obj = &tableSummaries[3];
|
||||
result->quadgram_obj2 = &tableSummaries[4];
|
||||
result->deltaocta_obj = &tableSummaries[5];
|
||||
result->distinctocta_obj = &tableSummaries[6];
|
||||
result->kExpectedScore = read_kAvgDeltaOctaScore;
|
||||
delete fileHeader->tableHeaders;
|
||||
delete fileHeader;
|
||||
return result;
|
||||
}
|
||||
}
|
52
internal/cld2_dynamic_data_loader.h
Normal file
52
internal/cld2_dynamic_data_loader.h
Normal file
@@ -0,0 +1,52 @@
|
||||
// Copyright 2014 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_LOADER_H_
|
||||
#define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_LOADER_H_
|
||||
|
||||
#include "scoreonescriptspan.h"
|
||||
#include "cld2_dynamic_data.h"
|
||||
|
||||
namespace CLD2DynamicDataLoader {
|
||||
|
||||
// Read a header from the specified file and return it, or NULL if the file
// cannot be opened or its header fails validation.
|
||||
// The header returned is dynamically allocated; you must 'delete[]' the array
|
||||
// of TableHeaders as well as 'delete' the returned FileHeader* when done.
|
||||
CLD2DynamicData::FileHeader* loadHeader(const char* fileName);
|
||||
|
||||
// Load data directly into a ScoringTables structure using a private, read-only
|
||||
// mmap and return the newly-allocated structure.
|
||||
// The out-parameter "mmapAddressOut" is a pointer to a void*; the starting
|
||||
// address of the mmap()'d block will be written here.
|
||||
// The out-parameter "mmapLengthOut" is a pointer to an int; the length of the
|
||||
// mmap()'d block will be written here.
|
||||
// It is up to the caller to release the result by calling unloadData().
|
||||
CLD2::ScoringTables* loadDataFile(const char* fileName,
|
||||
void** mmapAddressOut, int* mmapLengthOut);
|
||||
|
||||
// Given pointers to the data from a previous invocation of loadDataFile,
|
||||
// unloads the data safely - freeing and deleting any malloc'd/new'd objects.
|
||||
// When this method returns, the mmap has been deleted, as have all the scoring
|
||||
// tables; the pointers passed in are all zeroed, such that:
|
||||
// *scoringTables == NULL
|
||||
// *mmapAddress == NULL
|
||||
// *mmapLength == 0
|
||||
// This is the only safe way to unload data that was previously loaded, as there
|
||||
// is an unfortunate mixture of new and malloc involved in building the
|
||||
// in-memory representation of the data.
|
||||
void unloadData(CLD2::ScoringTables** scoringTables,
|
||||
void** mmapAddress, int* mmapLength);
|
||||
|
||||
} // End namespace CLD2DynamicDataLoader
|
||||
#endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_LOADER_H_
|
162
internal/cld2_dynamic_data_tool.cc
Normal file
162
internal/cld2_dynamic_data_tool.cc
Normal file
@@ -0,0 +1,162 @@
|
||||
// Copyright 2014 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <fcntl.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/mman.h>
|
||||
|
||||
#include "cld2_dynamic_data.h"
|
||||
#include "cld2_dynamic_data_extractor.h"
|
||||
#include "cld2_dynamic_data_loader.h"
|
||||
#include "integral_types.h"
|
||||
#include "cld2tablesummary.h"
|
||||
#include "utf8statetable.h"
|
||||
#include "scoreonescriptspan.h"
|
||||
|
||||
// We need these in order to set up a real data object to pass around.
|
||||
namespace CLD2 {
|
||||
// UTF8 property object; passed as the first member of ScoringTables in main().
extern const UTF8PropObj cld_generated_CjkUni_obj;
|
||||
// The seven table summaries, listed in the order main() places them in
// ScoringTables.
extern const CLD2TableSummary kCjkCompat_obj;
|
||||
extern const CLD2TableSummary kCjkDeltaBi_obj;
|
||||
extern const CLD2TableSummary kDistinctBiTable_obj;
|
||||
extern const CLD2TableSummary kQuad_obj;
|
||||
extern const CLD2TableSummary kQuad_obj2;
|
||||
extern const CLD2TableSummary kDeltaOcta_obj;
|
||||
extern const CLD2TableSummary kDistinctOcta_obj;
|
||||
// Score array; passed as the last member of ScoringTables in main().
extern const short kAvgDeltaOctaScore[];
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
if (!CLD2DynamicData::isLittleEndian()) {
|
||||
std::cerr << "System is big-endian: currently not supported." << std::endl;
|
||||
return -1;
|
||||
}
|
||||
if (!CLD2DynamicData::coreAssumptionsOk()) {
|
||||
std::cerr << "Core assumptions violated, unsafe to continue." << std::endl;
|
||||
return -2;
|
||||
}
|
||||
|
||||
// Get command-line flags
|
||||
int flags = 0;
|
||||
bool get_vector = false;
|
||||
char* fileName = NULL;
|
||||
const char* USAGE = "\
|
||||
CLD2 Dynamic Data Tool:\n\
|
||||
Dump, verify or print summaries of scoring tables for CLD2.\n\
|
||||
\n\
|
||||
The files output by this tool are suitable for all little-endian platforms,\n\
|
||||
and should work on both 32- and 64-bit platforms.\n\
|
||||
\n\
|
||||
IMPORTANT: The files output by this tool WILL NOT work on big-endian platforms.\n\
|
||||
\n\
|
||||
Usage:\n\
|
||||
--dump [FILE] Dump the scoring tables that this tool was linked against\n\
|
||||
to the specified file. The tables are automatically verified\n\
|
||||
after writing, just as if the tool was run again with\n\
|
||||
'--verify'.\n\
|
||||
--verify [FILE] Verify that a given file precisely matches the scoring\n\
|
||||
tables that this tool was linked against. This can be used\n\
|
||||
to verify that a file is compatible.\n\
|
||||
--head [FILE] Print headers from the specified file to stdout.\n\
|
||||
--verbose Be verbose.\n\
|
||||
";
|
||||
int mode = 0; //1=dump, 2=verify, 3=head
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (strcmp(argv[i], "--verbose") == 0) {
|
||||
CLD2DynamicDataExtractor::setDebug(1);
|
||||
CLD2DynamicData::setDebug(1);
|
||||
}
|
||||
else if (strcmp(argv[i], "--dump") == 0
|
||||
|| strcmp(argv[i], "--verify") == 0
|
||||
|| strcmp(argv[i], "--head") == 0) {
|
||||
|
||||
// set mode flag properly
|
||||
if (strcmp(argv[i], "--dump") == 0) mode=1;
|
||||
else if (strcmp(argv[i], "--verify") == 0) mode=2;
|
||||
else mode=3;
|
||||
if (i < argc - 1) {
|
||||
fileName = argv[++i];
|
||||
} else {
|
||||
std::cerr << "missing file name argument" << std::endl << std::endl;
|
||||
std::cerr << USAGE;
|
||||
return -1;
|
||||
}
|
||||
} else if (strcmp(argv[i], "--help") == 0) {
|
||||
std::cout << USAGE;
|
||||
return 0;
|
||||
} else {
|
||||
std::cerr << "Unsupported option: " << argv[i] << std::endl << std::endl;
|
||||
std::cerr << USAGE;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (mode == 0) {
|
||||
std::cerr << USAGE;
|
||||
return -1;
|
||||
}
|
||||
|
||||
CLD2::ScoringTables realData = {
|
||||
&CLD2::cld_generated_CjkUni_obj,
|
||||
&CLD2::kCjkCompat_obj,
|
||||
&CLD2::kCjkDeltaBi_obj,
|
||||
&CLD2::kDistinctBiTable_obj,
|
||||
&CLD2::kQuad_obj,
|
||||
&CLD2::kQuad_obj2,
|
||||
&CLD2::kDeltaOcta_obj,
|
||||
&CLD2::kDistinctOcta_obj,
|
||||
CLD2::kAvgDeltaOctaScore,
|
||||
};
|
||||
if (mode == 1) { // dump
|
||||
CLD2DynamicDataExtractor::writeDataFile(
|
||||
static_cast<const CLD2::ScoringTables*>(&realData),
|
||||
fileName);
|
||||
} else if (mode == 3) { // head
|
||||
CLD2DynamicData::FileHeader* header = CLD2DynamicDataLoader::loadHeader(fileName);
|
||||
if (header == NULL) {
|
||||
std::cerr << "Cannot read header from file: " << fileName << std::endl;
|
||||
return -1;
|
||||
}
|
||||
CLD2DynamicData::dumpHeader(header);
|
||||
delete header->tableHeaders;
|
||||
delete header;
|
||||
}
|
||||
|
||||
if (mode == 1 || mode == 2) { // dump || verify (so perform verification)
|
||||
void* mmapAddress = NULL;
|
||||
int mmapLength = 0;
|
||||
CLD2::ScoringTables* loadedData = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength);
|
||||
|
||||
if (loadedData == NULL) {
|
||||
std::cerr << "Failed to read data file: " << fileName << std::endl;
|
||||
return -1;
|
||||
}
|
||||
bool result = CLD2DynamicData::verify(
|
||||
static_cast<const CLD2::ScoringTables*>(&realData),
|
||||
static_cast<const CLD2::ScoringTables*>(loadedData));
|
||||
CLD2DynamicDataLoader::unloadData(&loadedData, &mmapAddress, &mmapLength);
|
||||
if (loadedData != NULL || mmapAddress != NULL || mmapLength != 0) {
|
||||
std::cerr << "Warning: failed to clean up memory for ScoringTables." << std::endl;
|
||||
}
|
||||
if (!result) {
|
||||
std::cerr << "Verification failed!" << std::endl;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
@@ -252,10 +252,34 @@ void FinishHtmlOut(int flags) {
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef CLD2_DYNAMIC_MODE
|
||||
int RunTests (int flags, bool get_vector, const char* data_file) {
|
||||
#else
|
||||
int RunTests (int flags, bool get_vector) {
|
||||
#endif
|
||||
fprintf(stdout, "CLD2 version: %s\n", CLD2::DetectLanguageVersion());
|
||||
InitHtmlOut(flags);
|
||||
bool any_fail = false;
|
||||
|
||||
#ifdef CLD2_DYNAMIC_MODE
|
||||
fprintf(stdout, "[DYNAMIC] Test running in dynamic data mode!\n");
|
||||
bool dataLoaded = CLD2::isDataLoaded();
|
||||
if (dataLoaded) {
|
||||
fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true prior to loading data!\n");
|
||||
any_fail = true;
|
||||
}
|
||||
fprintf(stdout, "[DYNAMIC] Attempting translation prior to loading data\n");
|
||||
any_fail |= !OneTest(flags, get_vector, UNKNOWN_LANGUAGE, kTeststr_en, strlen(kTeststr_en));
|
||||
fprintf(stdout, "[DYNAMIC] Loading data from: %s\n", data_file);
|
||||
CLD2::loadData(data_file);
|
||||
dataLoaded = CLD2::isDataLoaded();
|
||||
if (!dataLoaded) {
|
||||
fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned false after loading data!\n");
|
||||
any_fail = true;
|
||||
}
|
||||
fprintf(stdout, "[DYNAMIC] Data loaded, normal tests commencing\n");
|
||||
#endif
|
||||
|
||||
int i = 0;
|
||||
while (kTestPair[i].text != NULL) {
|
||||
Language lang_expected = kTestPair[i].lang;
|
||||
@@ -265,6 +289,19 @@ int RunTests (int flags, bool get_vector) {
|
||||
any_fail |= (!ok);
|
||||
++i;
|
||||
}
|
||||
|
||||
#ifdef CLD2_DYNAMIC_MODE
|
||||
fprintf(stdout, "[DYNAMIC] Normal tests complete, attempting to unload data\n");
|
||||
CLD2::unloadData();
|
||||
dataLoaded = CLD2::isDataLoaded();
|
||||
if (dataLoaded) {
|
||||
fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true after unloading data!\n");
|
||||
any_fail = true;
|
||||
}
|
||||
fprintf(stdout, "[DYNAMIC] Attempting translation after unloading data\n");
|
||||
any_fail |= !OneTest(flags, get_vector, UNKNOWN_LANGUAGE, kTeststr_en, strlen(kTeststr_en));
|
||||
#endif
|
||||
|
||||
if (any_fail) {
|
||||
fprintf(stderr, "FAIL\n");
|
||||
fprintf(stdout, "FAIL\n");
|
||||
@@ -283,6 +320,7 @@ int main(int argc, char** argv) {
|
||||
// Get command-line flags
|
||||
int flags = 0;
|
||||
bool get_vector = false;
|
||||
const char* data_file = NULL;
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (strcmp(argv[i], "--html") == 0) {flags |= CLD2::kCLDFlagHtml;}
|
||||
if (strcmp(argv[i], "--cr") == 0) {flags |= CLD2::kCLDFlagCr;}
|
||||
@@ -290,8 +328,17 @@ int main(int argc, char** argv) {
|
||||
if (strcmp(argv[i], "--quiet") == 0) {flags |= CLD2::kCLDFlagQuiet;}
|
||||
if (strcmp(argv[i], "--echo") == 0) {flags |= CLD2::kCLDFlagEcho;}
|
||||
if (strcmp(argv[i], "--vector") == 0) {get_vector = true;}
|
||||
if (strcmp(argv[i], "--data-file") == 0) { data_file = argv[++i];}
|
||||
}
|
||||
|
||||
#ifdef CLD2_DYNAMIC_MODE
|
||||
if (data_file == NULL) {
|
||||
fprintf(stderr, "When running in dynamic mode, you must specify --data-file [FILE]\n");
|
||||
return -1;
|
||||
}
|
||||
return CLD2::RunTests(flags, get_vector, data_file);
|
||||
#else
|
||||
return CLD2::RunTests(flags, get_vector);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@@ -28,6 +28,10 @@
|
||||
#include "lang_script.h"
|
||||
#include "utf8statetable.h"
|
||||
|
||||
#ifdef CLD2_DYNAMIC_MODE
|
||||
#include "cld2_dynamic_data.h"
|
||||
#include "cld2_dynamic_data_loader.h"
|
||||
#endif
|
||||
#include "cld2tablesummary.h"
|
||||
#include "compact_lang_det_impl.h"
|
||||
#include "compact_lang_det_hint_code.h"
|
||||
@@ -63,20 +67,58 @@ extern const CLD2TableSummary kDeltaOcta_obj;
|
||||
extern const CLD2TableSummary kDistinctOcta_obj;
|
||||
extern const short kAvgDeltaOctaScore[];
|
||||
|
||||
// This initializes kScoringtables.quadgram_obj etc.
|
||||
static const ScoringTables kScoringtables = {
|
||||
&cld_generated_CjkUni_obj,
|
||||
&kCjkCompat_obj,
|
||||
&kCjkDeltaBi_obj,
|
||||
&kDistinctBiTable_obj,
|
||||
#ifdef CLD2_DYNAMIC_MODE
|
||||
// CLD2_DYNAMIC_MODE is defined:
|
||||
// Data will be read from an mmap opened at runtime.
|
||||
// Active scoring tables for dynamic mode. Every entry starts out NULL; the
// trailing comment on each line names the statically linked table that the
// non-dynamic build places in that slot. loadData() overwrites this struct
// with a copy of the tables returned by the data loader.
static ScoringTables kScoringtables = {
|
||||
NULL, //&cld_generated_CjkUni_obj,
|
||||
NULL, //&kCjkCompat_obj,
|
||||
NULL, //&kCjkDeltaBi_obj,
|
||||
NULL, //&kDistinctBiTable_obj,
|
||||
NULL, //&kQuad_obj,
|
||||
NULL, //&kQuad_obj2,
|
||||
NULL, //&kDeltaOcta_obj,
|
||||
NULL, //&kDistinctOcta_obj,
|
||||
NULL, //kAvgDeltaOctaScore,
|
||||
};
|
||||
// True between a successful loadData() and the next unloadData().
static bool dynamicDataLoaded = false;
|
||||
// Table set handed back by CLD2DynamicDataLoader::loadDataFile(); passed back
// to CLD2DynamicDataLoader::unloadData() for release.
static ScoringTables* dynamicTables = NULL;
|
||||
// Out-parameters filled in by loadDataFile() and returned to unloadData().
// NOTE(review): presumably the base address and byte length of the mmap'ed
// data file -- confirm against cld2_dynamic_data_loader.cc.
static void* mmapAddress = NULL;
|
||||
static int mmapLength = 0;
|
||||
|
||||
&kQuad_obj,
|
||||
&kQuad_obj2, // Dual lookup tables
|
||||
&kDeltaOcta_obj,
|
||||
&kDistinctOcta_obj,
|
||||
// Reports the dynamicDataLoaded flag, which loadData() sets and unloadData()
// clears.
bool isDataLoaded() { return dynamicDataLoaded; }
|
||||
|
||||
kAvgDeltaOctaScore,
|
||||
};
|
||||
void loadData(const char* fileName) {
|
||||
if (isDataLoaded()) {
|
||||
unloadData();
|
||||
}
|
||||
dynamicTables = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength);
|
||||
kScoringtables = *dynamicTables;
|
||||
dynamicDataLoaded = true;
|
||||
};
|
||||
|
||||
void unloadData() {
|
||||
if (!dynamicDataLoaded) return;
|
||||
dynamicDataLoaded = false;
|
||||
// unloading will null all the pointers out.
|
||||
CLD2DynamicDataLoader::unloadData(&dynamicTables, &mmapAddress, &mmapLength);
|
||||
}
|
||||
#else
|
||||
// Static (non-dynamic) build: the scoring tables are the generated table
// objects supplied at link time, so they are fixed for the life of the
// process and the struct can be const.
// This initializes kScoringtables.quadgram_obj etc.
|
||||
static const ScoringTables kScoringtables = {
|
||||
&cld_generated_CjkUni_obj,
|
||||
&kCjkCompat_obj,
|
||||
&kCjkDeltaBi_obj,
|
||||
&kDistinctBiTable_obj,
|
||||
|
||||
&kQuad_obj,
|
||||
&kQuad_obj2, // Dual lookup tables
|
||||
&kDeltaOcta_obj,
|
||||
&kDistinctOcta_obj,
|
||||
|
||||
kAvgDeltaOctaScore,
|
||||
};
|
||||
#endif // #ifdef CLD2_DYNAMIC_MODE
|
||||
|
||||
|
||||
static const bool FLAGS_cld_no_minimum_bytes = false;
|
||||
@@ -1622,6 +1664,19 @@ Language DetectLanguageSummaryV2(
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CLD2_DYNAMIC_MODE
|
||||
// In dynamic mode, we immediately return UNKNOWN_LANGUAGE if the data file
|
||||
// hasn't been loaded yet. This is the only sane thing we can do, as there
|
||||
// are no scoring tables to consult.
|
||||
bool dataLoaded = isDataLoaded();
|
||||
if ((flags & kCLDFlagVerbose) != 0) {
|
||||
fprintf(stderr, "Data loaded: %s\n", (dataLoaded ? "true" : "false"));
|
||||
}
|
||||
if (!dataLoaded) {
|
||||
return UNKNOWN_LANGUAGE;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Exit now if no text
|
||||
if (buffer_length == 0) {return UNKNOWN_LANGUAGE;}
|
||||
if (kScoringtables.quadgram_obj == NULL) {return UNKNOWN_LANGUAGE;}
|
||||
|
@@ -42,16 +42,18 @@ typedef int32 Encoding;
|
||||
static const Encoding UNKNOWN_ENCODING = 0;
|
||||
|
||||
|
||||
// Linker supplies the right tables; see ScoringTables compact_lang_det_impl.cc
|
||||
// These are here JUST for printing versions
|
||||
extern const UTF8PropObj cld_generated_CjkUni_obj;
|
||||
extern const CLD2TableSummary kCjkDeltaBi_obj;
|
||||
extern const CLD2TableSummary kDistinctBiTable_obj;
|
||||
extern const CLD2TableSummary kQuad_obj;
|
||||
extern const CLD2TableSummary kDeltaOcta_obj;
|
||||
extern const CLD2TableSummary kDistinctOcta_obj;
|
||||
extern const CLD2TableSummary kOcta2_obj;
|
||||
extern const short kAvgDeltaOctaScore[];
|
||||
#ifndef CLD2_DYNAMIC_MODE
|
||||
// Linker supplies the right tables; see ScoringTables compact_lang_det_impl.cc
|
||||
// These are here JUST for printing versions
|
||||
extern const UTF8PropObj cld_generated_CjkUni_obj;
|
||||
extern const CLD2TableSummary kCjkDeltaBi_obj;
|
||||
extern const CLD2TableSummary kDistinctBiTable_obj;
|
||||
extern const CLD2TableSummary kQuad_obj;
|
||||
extern const CLD2TableSummary kDeltaOcta_obj;
|
||||
extern const CLD2TableSummary kDistinctOcta_obj;
|
||||
extern const CLD2TableSummary kOcta2_obj;
|
||||
extern const short kAvgDeltaOctaScore[];
|
||||
#endif
|
||||
|
||||
bool FLAGS_cld_version = false;
|
||||
bool FLAGS_cld_html = true;
|
||||
@@ -201,6 +203,7 @@ void DumpLanguages(Language summary_lang,
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
if (FLAGS_cld_version) {
|
||||
#ifndef CLD2_DYNAMIC_MODE
|
||||
printf("%s %4dKB uni build date, bytes\n",
|
||||
"........",
|
||||
cld_generated_CjkUni_obj.total_size >> 10);
|
||||
@@ -216,11 +219,14 @@ int main(int argc, char** argv) {
|
||||
kDeltaOcta_obj.kCLDTableBuildDate,
|
||||
(kDeltaOcta_obj.kCLDTableSize *
|
||||
sizeof(IndirectProbBucket4)) >> 10);
|
||||
#else
|
||||
printf("FLAGS_cld_version doesn't work with dynamic data mode\n");
|
||||
#endif
|
||||
exit(0);
|
||||
} // End FLAGS_cld_version
|
||||
|
||||
int flags = 0;
|
||||
bool get_vector = false;
|
||||
const char* data_file = NULL;
|
||||
const char* fname = NULL;
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (argv[i][0] != '-') {fname = argv[i];}
|
||||
@@ -230,8 +236,19 @@ int main(int argc, char** argv) {
|
||||
if (strcmp(argv[i], "--verbose") == 0) {flags |= kCLDFlagVerbose;}
|
||||
if (strcmp(argv[i], "--echo") == 0) {flags |= kCLDFlagEcho;}
|
||||
if (strcmp(argv[i], "--vector") == 0) {get_vector = true;}
|
||||
if (strcmp(argv[i], "--data-file") == 0) { data_file = argv[++i];}
|
||||
}
|
||||
|
||||
#ifdef CLD2_DYNAMIC_MODE
|
||||
if (data_file == NULL) {
|
||||
fprintf(stderr, "When running in dynamic mode, you must specify --data-file [FILE]\n");
|
||||
return -1;
|
||||
}
|
||||
fprintf(stdout, "Loading data from: %s\n", data_file);
|
||||
CLD2::loadData(data_file);
|
||||
fprintf(stdout, "Data loaded, test commencing\n");
|
||||
#endif
|
||||
|
||||
FILE* fin;
|
||||
if (fname == NULL) {
|
||||
fin = stdin;
|
||||
|
71
internal/compile_dynamic.sh
Executable file
71
internal/compile_dynamic.sh
Executable file
@@ -0,0 +1,71 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# Copyright 2013 Google Inc. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
#     http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Builds the four dynamic-mode targets. Every target is attempted even if an
# earlier one fails; a success line is printed only when g++ actually
# succeeded, failures are reported on stderr, and the script exits non-zero
# if any target failed.
status=0

# The data tool, which can be used to read and write CLD2 dynamic data files
if g++ -O2 -m64 cld2_dynamic_data_tool.cc \
  cld2_dynamic_data.h cld2_dynamic_data.cc \
  cld2_dynamic_data_extractor.h cld2_dynamic_data_extractor.cc \
  cld2_dynamic_data_loader.h cld2_dynamic_data_loader.cc \
  cldutil.cc cldutil_shared.cc compact_lang_det.cc compact_lang_det_hint_code.cc \
  compact_lang_det_impl.cc debug.cc fixunicodevalue.cc \
  generated_entities.cc generated_language.cc generated_ulscript.cc \
  getonescriptspan.cc lang_script.cc offsetmap.cc scoreonescriptspan.cc \
  tote.cc utf8statetable.cc \
  cld_generated_cjk_uni_prop_80.cc cld2_generated_cjk_compatible.cc \
  cld_generated_cjk_delta_bi_4.cc generated_distinct_bi_0.cc \
  cld2_generated_quadchrome0122_2.cc cld2_generated_deltaoctachrome0122.cc \
  cld2_generated_distinctoctachrome0122.cc cld_generated_score_quad_octa_0122_2.cc \
  -o cld2_dynamic_data_tool
then
  echo " cld2_dynamic_data_tool compiled"
else
  echo " cld2_dynamic_data_tool FAILED to compile" >&2
  status=1
fi

# Tests for Chromium flavored dynamic CLD2
if g++ -O2 -m64 -D CLD2_DYNAMIC_MODE compact_lang_det_test.cc \
  cld2_dynamic_data.h cld2_dynamic_data.cc \
  cld2_dynamic_data_extractor.h cld2_dynamic_data_extractor.cc \
  cld2_dynamic_data_loader.h cld2_dynamic_data_loader.cc \
  cldutil.cc cldutil_shared.cc compact_lang_det.cc compact_lang_det_hint_code.cc \
  compact_lang_det_impl.cc debug.cc fixunicodevalue.cc \
  generated_entities.cc generated_language.cc generated_ulscript.cc \
  getonescriptspan.cc lang_script.cc offsetmap.cc scoreonescriptspan.cc \
  tote.cc utf8statetable.cc \
  -o compact_lang_det_dynamic_test_chrome
then
  echo " compact_lang_det_dynamic_test_chrome compiled"
else
  echo " compact_lang_det_dynamic_test_chrome FAILED to compile" >&2
  status=1
fi

# Unit tests, in dynamic mode
if g++ -O2 -m64 -g3 -D CLD2_DYNAMIC_MODE cld2_unittest.cc \
  cld2_dynamic_data.h cld2_dynamic_data.cc \
  cld2_dynamic_data_loader.h cld2_dynamic_data_loader.cc \
  cldutil.cc cldutil_shared.cc compact_lang_det.cc compact_lang_det_hint_code.cc \
  compact_lang_det_impl.cc debug.cc fixunicodevalue.cc \
  generated_entities.cc generated_language.cc generated_ulscript.cc \
  getonescriptspan.cc lang_script.cc offsetmap.cc scoreonescriptspan.cc \
  tote.cc utf8statetable.cc \
  -o cld2_dynamic_unittest
then
  echo " cld2_dynamic_unittest compiled"
else
  echo " cld2_dynamic_unittest FAILED to compile" >&2
  status=1
fi

# Shared library, in dynamic mode
if g++ -shared -fPIC -O2 -m64 -D CLD2_DYNAMIC_MODE \
  cld2_dynamic_data.h cld2_dynamic_data.cc \
  cld2_dynamic_data_loader.h cld2_dynamic_data_loader.cc \
  cldutil.cc cldutil_shared.cc compact_lang_det.cc compact_lang_det_hint_code.cc \
  compact_lang_det_impl.cc debug.cc fixunicodevalue.cc \
  generated_entities.cc generated_language.cc generated_ulscript.cc \
  getonescriptspan.cc lang_script.cc offsetmap.cc scoreonescriptspan.cc \
  tote.cc utf8statetable.cc \
  -o libcld2_dynamic.so
then
  echo " libcld2_dynamic.so compiled"
else
  echo " libcld2_dynamic.so FAILED to compile" >&2
  status=1
fi

exit $status
|
||||
|
@@ -295,6 +295,26 @@ Flag meanings:
|
||||
void DumpResultChunkVector(FILE* f, const char* src,
|
||||
ResultChunkVector* resultchunkvector);
|
||||
|
||||
#ifdef CLD2_DYNAMIC_MODE
|
||||
|
||||
// If compiled with dynamic mode, load data from the specified file location.
|
||||
// If other data has already been loaded, it is discarded and the data is read
|
||||
// in from the specified file location again (even if the file has not changed).
|
||||
// WARNING: Before calling this method, language detection will always fail
|
||||
// and will always return the unknown language.
|
||||
void loadData(const char* fileName);
|
||||
|
||||
// If compiled with dynamic mode, unload the previously-loaded data.
|
||||
// WARNING: After calling this method, language detection will no longer work
|
||||
// and will always return the unknown language.
|
||||
void unloadData();
|
||||
|
||||
// Returns true if and only if data has been loaded via a call to loadData(...)
|
||||
// and has not been subsequently unloaded via a call to unloadData().
|
||||
bool isDataLoaded();
|
||||
|
||||
#endif // #ifdef CLD2_DYNAMIC_MODE
|
||||
|
||||
}; // End namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
|
||||
|
Reference in New Issue
Block a user