From cffbd73e131353ca23de449b3b1a5182028a85a1 Mon Sep 17 00:00:00 2001 From: "andrewhayden@google.com" Date: Mon, 3 Mar 2014 15:20:05 +0000 Subject: [PATCH] Enable Dynamic Mode for CLD2. See issue 6 for more information on dynamic mode: https://code.google.com/p/cld2/issues/detail?id=6 git-svn-id: https://cld2.googlecode.com/svn/trunk@151 b252ecd4-b096-bf77-eb8e-91563289f87e --- internal/cld2_dynamic_data.cc | 236 +++++++++++++++ internal/cld2_dynamic_data.h | 216 ++++++++++++++ internal/cld2_dynamic_data_extractor.cc | 376 ++++++++++++++++++++++++ internal/cld2_dynamic_data_extractor.h | 54 ++++ internal/cld2_dynamic_data_loader.cc | 212 +++++++++++++ internal/cld2_dynamic_data_loader.h | 52 ++++ internal/cld2_dynamic_data_tool.cc | 162 ++++++++++ internal/cld2_unittest.cc | 47 +++ internal/compact_lang_det_impl.cc | 79 ++++- internal/compact_lang_det_test.cc | 39 ++- internal/compile_dynamic.sh | 71 +++++ public/compact_lang_det.h | 20 ++ 12 files changed, 1541 insertions(+), 23 deletions(-) create mode 100644 internal/cld2_dynamic_data.cc create mode 100644 internal/cld2_dynamic_data.h create mode 100644 internal/cld2_dynamic_data_extractor.cc create mode 100644 internal/cld2_dynamic_data_extractor.h create mode 100644 internal/cld2_dynamic_data_loader.cc create mode 100644 internal/cld2_dynamic_data_loader.h create mode 100644 internal/cld2_dynamic_data_tool.cc create mode 100755 internal/compile_dynamic.sh diff --git a/internal/cld2_dynamic_data.cc b/internal/cld2_dynamic_data.cc new file mode 100644 index 0000000..b867344 --- /dev/null +++ b/internal/cld2_dynamic_data.cc @@ -0,0 +1,236 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cld2_dynamic_data.h" +#include "integral_types.h" +#include +#include +#include + +namespace CLD2DynamicData { +static int DEBUG=0; +void setDebug(int debug) { + DEBUG=debug; +} + +bool mem_compare(const void* data1, const void* data2, const int length) { + const unsigned char* raw1 = static_cast(data1); + const unsigned char* raw2 = static_cast(data2); + for (int x=0; x " << (unsigned int) raw2[y] + << ( x == y ? " [FIRST ERROR DETECTED HERE] " : "") + << std::endl; + } + return false; + } + } + return true; +} + +CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables) { + return DATA_FILE_MARKER_LENGTH // NB: no null terminator + + (20 * sizeof(CLD2::uint32)) // 20 uint32 fields in the struct + + (numTables * (10 * sizeof(CLD2::uint32))); // 10 uint32 per table +} + +void dumpHeader(FileHeader* header) { + char safeString[DATA_FILE_MARKER_LENGTH + 1]; + memcpy(safeString, header->sanityString, DATA_FILE_MARKER_LENGTH); + safeString[DATA_FILE_MARKER_LENGTH] = 0; + std::cout << "sanityString: " << safeString << std::endl; + std::cout << "totalFileSizeBytes: " << header->totalFileSizeBytes << std::endl; + std::cout << "utf8PropObj_state0: " << header->utf8PropObj_state0 << std::endl; + std::cout << "utf8PropObj_state0_size: " << header->utf8PropObj_state0_size << std::endl; + std::cout << "utf8PropObj_total_size: " << header->utf8PropObj_total_size << std::endl; + std::cout << "utf8PropObj_max_expand: " << header->utf8PropObj_max_expand << std::endl; + std::cout << "utf8PropObj_entry_shift: " << header->utf8PropObj_entry_shift << 
std::endl; + std::cout << "utf8PropObj_bytes_per_entry: " << header->utf8PropObj_bytes_per_entry << std::endl; + std::cout << "utf8PropObj_losub: " << header->utf8PropObj_losub << std::endl; + std::cout << "utf8PropObj_hiadd: " << header->utf8PropObj_hiadd << std::endl; + std::cout << "startOf_utf8PropObj_state_table: " << header->startOf_utf8PropObj_state_table << std::endl; + std::cout << "lengthOf_utf8PropObj_state_table: " << header->lengthOf_utf8PropObj_state_table << std::endl; + std::cout << "startOf_utf8PropObj_remap_base: " << header->startOf_utf8PropObj_remap_base << std::endl; + std::cout << "lengthOf_utf8PropObj_remap_base: " << header->lengthOf_utf8PropObj_remap_base << std::endl; + std::cout << "startOf_utf8PropObj_remap_string: " << header->startOf_utf8PropObj_remap_string << std::endl; + std::cout << "lengthOf_utf8PropObj_remap_string: " << header->lengthOf_utf8PropObj_remap_string << std::endl; + std::cout << "startOf_utf8PropObj_fast_state: " << header->startOf_utf8PropObj_fast_state << std::endl; + std::cout << "lengthOf_utf8PropObj_fast_state: " << header->lengthOf_utf8PropObj_fast_state << std::endl; + std::cout << "startOf_kAvgDeltaOctaScore: " << header->startOf_kAvgDeltaOctaScore << std::endl; + std::cout << "lengthOf_kAvgDeltaOctaScore: " << header->lengthOf_kAvgDeltaOctaScore << std::endl; + std::cout << "numTablesEncoded: " << header->numTablesEncoded << std::endl; + + const char* tableNames[7]; + tableNames[0]="unigram_compat_obj"; + tableNames[1]="deltabi_obj"; + tableNames[2]="distinctbi_obj"; + tableNames[3]="quadgram_obj"; + tableNames[4]="quadgram_obj2"; + tableNames[5]="deltaocta_obj"; + tableNames[6]="distinctocta_obj"; + + for (int x=0; xnumTablesEncoded; x++) { + TableHeader& tHeader = header->tableHeaders[x]; + + std::cout << "Table " << (x+1) << ": (" << tableNames[x] << ")" << std::endl; + std::cout << " kCLDTableSizeOne: " << tHeader.kCLDTableSizeOne << std::endl; + std::cout << " kCLDTableSize: " << tHeader.kCLDTableSize << 
std::endl; + std::cout << " kCLDTableKeyMask: " << tHeader.kCLDTableKeyMask << std::endl; + std::cout << " kCLDTableBuildDate: " << tHeader.kCLDTableBuildDate << std::endl; + std::cout << " startOf_kCLDTable: " << tHeader.startOf_kCLDTable << std::endl; + std::cout << " lengthOf_kCLDTable: " << tHeader.lengthOf_kCLDTable << std::endl; + std::cout << " startOf_kCLDTableInd: " << tHeader.startOf_kCLDTableInd << std::endl; + std::cout << " lengthOf_kCLDTableInd: " << tHeader.lengthOf_kCLDTableInd << std::endl; + std::cout << " startOf_kRecognizedLangScripts: " << tHeader.startOf_kRecognizedLangScripts << std::endl; + std::cout << " lengthOf_kRecognizedLangScripts: " << tHeader.lengthOf_kRecognizedLangScripts << std::endl; + } +} + +#define CHECK_EQUALS(name) if (loadedData->name != realData->name) {\ + std::cerr << #name << ": " << loadedData->name << " != " << realData->name << std::endl;\ + return false;\ +} + +#define CHECK_MEM_EQUALS(name,size) if (!mem_compare(loadedData->name,realData->name,size)) {\ + std::cerr << #name << ": data mismatch." 
<< std::endl;\ + return false;\ +} + +bool verify(const CLD2::ScoringTables* realData, const CLD2::ScoringTables* loadedData) { + const int NUM_TABLES = 7; + const CLD2::CLD2TableSummary* realTableSummaries[NUM_TABLES]; + realTableSummaries[0] = realData->unigram_compat_obj; + realTableSummaries[1] = realData->deltabi_obj; + realTableSummaries[2] = realData->distinctbi_obj; + realTableSummaries[3] = realData->quadgram_obj; + realTableSummaries[4] = realData->quadgram_obj2; + realTableSummaries[5] = realData->deltaocta_obj; + realTableSummaries[6] = realData->distinctocta_obj; + + const CLD2::CLD2TableSummary* loadedTableSummaries[NUM_TABLES]; + loadedTableSummaries[0] = loadedData->unigram_compat_obj; + loadedTableSummaries[1] = loadedData->deltabi_obj; + loadedTableSummaries[2] = loadedData->distinctbi_obj; + loadedTableSummaries[3] = loadedData->quadgram_obj; + loadedTableSummaries[4] = loadedData->quadgram_obj2; + loadedTableSummaries[5] = loadedData->deltaocta_obj; + loadedTableSummaries[6] = loadedData->distinctocta_obj; + + CHECK_EQUALS(unigram_obj->state0); + CHECK_EQUALS(unigram_obj->state0_size); + CHECK_EQUALS(unigram_obj->total_size); + CHECK_EQUALS(unigram_obj->max_expand); + CHECK_EQUALS(unigram_obj->entry_shift); + CHECK_EQUALS(unigram_obj->bytes_per_entry); + CHECK_EQUALS(unigram_obj->losub); + CHECK_EQUALS(unigram_obj->hiadd); + CHECK_MEM_EQUALS(unigram_obj->state_table, realData->unigram_obj->total_size); + CHECK_MEM_EQUALS(unigram_obj->remap_base, sizeof(CLD2::RemapEntry)); // TODO: can this have more than one entry? + CHECK_MEM_EQUALS(unigram_obj->remap_string, strlen( + reinterpret_cast(realData->unigram_obj->remap_string)) + 1); // null terminator included + + if (loadedData->unigram_obj->fast_state == NULL) { + if (realData->unigram_obj->fast_state != NULL) { + std::cerr << "unigram_obj->fast_state is missing." 
<< std::endl; + return false; + } + } else { + if (realData->unigram_obj->fast_state == NULL) { + std::cerr << "unigram_obj->fast_state shouldn't be present." << std::endl; + return false; + } + CHECK_MEM_EQUALS(unigram_obj->fast_state, strlen( + reinterpret_cast(realData->unigram_obj->fast_state)) + 1); // null terminator included + } + if (DEBUG) std::cout << "verified." << std::endl; + + if (DEBUG) std::cout << "Verifying kExpectedScore... "; + CHECK_MEM_EQUALS(kExpectedScore, 614*4); // TODO: Don't hardcode 614*4. + if (DEBUG) std::cout << "verified." << std::endl; + + // 3. Each table + for (int x=0; xkCLDTableSize; + CLD2::uint32 tableSizeBytes = bytesPerBucket * numBuckets; + CLD2::uint32 indirectTableSizeBytes = + realData->kCLDTableSizeOne * sizeof(CLD2::uint32); + + // XXX XXX XXX HACK HACK HACK FIXME FIXME FIXME + // XXX XXX XXX HACK HACK HACK FIXME FIXME FIXME + // XXX XXX XXX HACK HACK HACK FIXME FIXME FIXME + // cld2_generated_cjk_compatible.cc has a kCLDTableSizeOne of zero! + if (x == 0) { // cld2_generated_cjk_compatible.cc + indirectTableSizeBytes = 239*2*4; + } + // XXX XXX XXX HACK HACK HACK FIXME FIXME FIXME + // XXX XXX XXX HACK HACK HACK FIXME FIXME FIXME + // XXX XXX XXX HACK HACK HACK FIXME FIXME FIXME + + CLD2::uint32 recognizedScriptsSizeBytes = + strlen(realData->kRecognizedLangScripts) + 1; // null terminator included + + // Verify the table data + CHECK_EQUALS(kCLDTableSizeOne); + CHECK_EQUALS(kCLDTableSize); + CHECK_EQUALS(kCLDTableKeyMask); + CHECK_EQUALS(kCLDTableBuildDate); + CHECK_MEM_EQUALS(kCLDTable, tableSizeBytes); + CHECK_MEM_EQUALS(kCLDTableInd, indirectTableSizeBytes); + CHECK_MEM_EQUALS(kRecognizedLangScripts, recognizedScriptsSizeBytes); + if (DEBUG) std::cout << "verified." << std::endl; + } + if (DEBUG) std::cout << "All data verified successfully." 
<< std::endl; + return true; +} + +// As noted on http://stackoverflow.com/questions/1001307, gcc is highly likely +// to convert this function's return into a constant - meaning that any +// if-branches based upon it will be eliminated at compile time, allowing +// "free" detection throughout any dependent code. +bool isLittleEndian() { + union { + uint32_t integer; + char bytes[4]; + } test = {0x01020304}; + return test.bytes[0] == 4; +} + +bool coreAssumptionsOk() { + if (sizeof(CLD2::uint8) != 1) { + std::cerr << "uint8 is " << (sizeof(CLD2::uint8) * 8) + << " bits instead of 8!" << std::endl; + return false; + } + if (sizeof(CLD2::uint16) != 2) { + std::cerr << "uint16 is " << (sizeof(CLD2::uint16) * 8) + << " bits instead of 16!" << std::endl; + return false; + } + if (sizeof(CLD2::uint32) != 4) { + std::cerr << "uint32 is " << (sizeof(CLD2::uint32) * 8) + << " bits instead of 32!" << std::endl; + return false; + } + return true; +} + +} // End namespace CLD2DynamicData diff --git a/internal/cld2_dynamic_data.h b/internal/cld2_dynamic_data.h new file mode 100644 index 0000000..693d35b --- /dev/null +++ b/internal/cld2_dynamic_data.h @@ -0,0 +1,216 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_ +#define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_ + +#include "integral_types.h" +#include "cld2tablesummary.h" +#include "utf8statetable.h" +#include "scoreonescriptspan.h" + +/* + There are two primary parts to a CLD2 dynamic data file: + 1. A header, wherein trivial data, block lengths and block offsets are kept + 2. A data block, wherein the large binary blocks are kept + + By reading the header, an application can determine the offsets and lengths of + all the data blocks for all tables. Offsets in the header are expressed + relative to the first byte of the file, inclusive of the header itself; thus, + any offset whose value is less than the length of the header is invalid. + + Any offset whose value is zero indicates a field that is null in the + underlying CLD2 data; a real example of this is the fast_state field of the + UTF8PropObj, which may be null. + + The size of the header can be precalculated by calling calculateHeaderSize(), + which will indicate the exact size of the header for a data file that contains + a given number of CLD2TableSummary objects. + + Notes on endianness: + The data format is only suitable for little-endian machines. For big-endian + systems, a tedious transformation would need to be made first to reverse the + byte order of significant portions of the binary - not just the lengths, but + also some of the underlying table data. + + Note on 32/64 bit: + The data format is agnostic to 32/64 bit pointers. All the offsets within the + data blob itself are 32-bit values relative to the start of the file, and the + file should certainly never be gigabytes in size! + When the file is ultimately read by the loading code and mmap()'d, new + pointers are generated at whatever size the system uses, initialized to the + start of the mmap, and incremented by the 32-bit offset. This should be safe + regardless of 32- or 64-bit architectures. 
+ + -------------------------------------------------------------------- + FIELD + -------------------------------------------------------------------- + DATA_FILE_MARKER (no null terminator) + total file size (sanity check, uint32) + -------------------------------------------------------------------- + UTF8PropObj: const uint32 state0 + UTF8PropObj: const uint32 state0_size + UTF8PropObj: const uint32 total_size + UTF8PropObj: const int max_expand + UTF8PropObj: const int entry_shift (coerced to 32 bits) + UTF8PropObj: const int bytes_per_entry (coerced to 32 bits) + UTF8PropObj: const uint32 losub + UTF8PropObj: const uint32 hiadd + offset of UTF8PropObj: const uint8* state_table + length of UTF8PropObj: const uint8* state_table + offset of UTF8PropObj: const RemapEntry* remap_base (4-byte struct) + length of UTF8PropObj: const RemapEntry* remap_base (4-byte struct) + offset of UTF8PropObj: const uint8* remap_string + length of UTF8PropObj: const uint8* remap_string + offset of UTF8PropObj: const uint8* fast_state + length of UTF8PropObj: const uint8* fast_state + -------------------------------------------------------------------- + start of const short kAvgDeltaOctaScore[] + length of const short kAvgDeltaOctaScore[] + -------------------------------------------------------------------- + number of CLD2TableSummary objects encoded (n) + [Table 1]: CLD2TableSummary: uint32 kCLDTableSizeOne + [Table 1]: CLD2TableSummary: uint32 kCLDTableSize + [Table 1]: CLD2TableSummary: uint32 kCLDTableKeyMask + [Table 1]: CLD2TableSummary: uint32 kCLDTableBuildDate + [Table 1]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable + [Table 1]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable + [Table 1]: offset of CLD2TableSummary: const uint32* kCLDTableInd + [Table 1]: length of CLD2TableSummary: const uint32* kCLDTableInd + [Table 1]: offset of CLD2TableSummary: const char* kRecognizedLangScripts + [Table 1]: length of CLD2TableSummary: const 
char* kRecognizedLangScripts + 1 + . + . + . + [Table n]: CLD2TableSummary: uint32 kCLDTableSizeOne + [Table n]: CLD2TableSummary: uint32 kCLDTableSize + [Table n]: CLD2TableSummary: uint32 kCLDTableKeyMask + [Table n]: CLD2TableSummary: uint32 kCLDTableBuildDate + [Table n]: offset of CLD2TableSummary: const IndirectProbBucket4* kCLDTable + [Table n]: length of CLD2TableSummary: const IndirectProbBucket4* kCLDTable + [Table n]: offset of CLD2TableSummary: const uint32* kCLDTableInd + [Table n]: length of CLD2TableSummary: const uint32* kCLDTableInd + [Table n]: offset of CLD2TableSummary: const char* kRecognizedLangScripts + [Table n]: length of CLD2TableSummary: const char* kRecognizedLangScripts + 1 + -------------------------------------------------------------------- + + + Immediately after the header fields comes the data block. The data block has + the following content, in this order (note that padding is applied in order to + keep lookups word-aligned): + + UTF8PropObj: const uint8* state_table + UTF8PropObj: const RemapEntry* remap_base (4-byte struct) + UTF8PropObj: const uint8* remap_string + UTF8PropObj: const uint8* fast_state + const short kAvgDeltaOctaScore[] + [Table 1]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable + [Table 1]: CLD2TableSummary: const uint32* kCLDTableInd + [Table 1]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator) + . + . + . + [Table n]: CLD2TableSummary: const IndirectProbBucket4* kCLDTable + [Table n]: CLD2TableSummary: const uint32* kCLDTableInd + [Table n]: CLD2TableSummary: const char* kRecognizedLangScripts (with null terminator) + + + It is STRONGLY recommended that the chunks within the data block be kept + 128-bit aligned for efficiency reasons, although the code will work without + such alignment: the main lookup tables have randomly-accessed groups of four + 4-byte entries, and these must be 16-byte aligned to avoid the performance + cost of multiple cache misses per group. 
+*/ +namespace CLD2DynamicData { + +static const char* DATA_FILE_MARKER = "cld2_data_file00"; +static const int DATA_FILE_MARKER_LENGTH = 16; // Keep aligned to 128 bits + +// Nicer version of memcmp that shows the offset at which bytes differ +bool mem_compare(const void* data1, const void* data2, const int length); + +// Enable or disable debugging; 0 to disable, 1 to enable +void setDebug(int debug); + +// Lower-level structure for individual tables. There are n table headers in +// a given file header. +typedef struct { + CLD2::uint32 kCLDTableSizeOne; + CLD2::uint32 kCLDTableSize; + CLD2::uint32 kCLDTableKeyMask; + CLD2::uint32 kCLDTableBuildDate; + CLD2::uint32 startOf_kCLDTable; + CLD2::uint32 lengthOf_kCLDTable; + CLD2::uint32 startOf_kCLDTableInd; + CLD2::uint32 lengthOf_kCLDTableInd; + CLD2::uint32 startOf_kRecognizedLangScripts; + CLD2::uint32 lengthOf_kRecognizedLangScripts; +} TableHeader; + + +// Top-level structure for a CLD2 Data File Header. +// Contains all the primitive fields for the header as well as an array of +// headers for the individual tables. 
+typedef struct { + // Marker fields help recognize and verify the data file + char sanityString[DATA_FILE_MARKER_LENGTH]; + CLD2::uint32 totalFileSizeBytes; + + // UTF8 primitives + CLD2::uint32 utf8PropObj_state0; + CLD2::uint32 utf8PropObj_state0_size; + CLD2::uint32 utf8PropObj_total_size; + CLD2::uint32 utf8PropObj_max_expand; + CLD2::uint32 utf8PropObj_entry_shift; + CLD2::uint32 utf8PropObj_bytes_per_entry; + CLD2::uint32 utf8PropObj_losub; + CLD2::uint32 utf8PropObj_hiadd; + CLD2::uint32 startOf_utf8PropObj_state_table; + CLD2::uint32 lengthOf_utf8PropObj_state_table; + CLD2::uint32 startOf_utf8PropObj_remap_base; + CLD2::uint32 lengthOf_utf8PropObj_remap_base; + CLD2::uint32 startOf_utf8PropObj_remap_string; + CLD2::uint32 lengthOf_utf8PropObj_remap_string; + CLD2::uint32 startOf_utf8PropObj_fast_state; + CLD2::uint32 lengthOf_utf8PropObj_fast_state; + + // Average delta-octa-score bits + CLD2::uint32 startOf_kAvgDeltaOctaScore; + CLD2::uint32 lengthOf_kAvgDeltaOctaScore; + + // Table bits + CLD2::uint32 numTablesEncoded; + TableHeader* tableHeaders; +} FileHeader; + +// Calculate the exact size of a header that encodes the specified number of +// tables. This can be used to reserve space within the data file, +// calculate offsets, and so on. +CLD2::uint32 calculateHeaderSize(CLD2::uint32 numTables); + +// Dump a given header to stdout as a human-readable string. +void dumpHeader(FileHeader* header); + +// Verify that a given pair of scoring tables match precisely. Returns true on +// a match; on a mismatch, reports it on stderr and returns false. +bool verify(const CLD2::ScoringTables* realData, const CLD2::ScoringTables* loadedData); + +// Return true iff the program is running in little-endian mode. +bool isLittleEndian(); + +// Return true iff the core size assumptions are ok on this platform. 
+bool coreAssumptionsOk(); + +} // End namespace CLD2DynamicData +#endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_H_ diff --git a/internal/cld2_dynamic_data_extractor.cc b/internal/cld2_dynamic_data_extractor.cc new file mode 100644 index 0000000..36ce41e --- /dev/null +++ b/internal/cld2_dynamic_data_extractor.cc @@ -0,0 +1,376 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "cld2_dynamic_data.h" +#include "cld2_dynamic_data_extractor.h" +#include "cld2_dynamic_data_loader.h" // for verifying the written data +#include "integral_types.h" +#include "cld2tablesummary.h" +#include "utf8statetable.h" + +using namespace std; +namespace CLD2DynamicDataExtractor { +static int DEBUG=0; +void setDebug(int debug) { + DEBUG=debug; +} + +int advance(FILE* f, CLD2::uint32 position) { + const char ZERO = 0; + int pad = position - ftell(f); + if (DEBUG) cout << " Adding " << pad << " bytes of padding" << endl; + while (pad-- > 0) { + fwrite(&ZERO,1,1,f); + } + return pad; +} + +void writeChunk(FILE *f, const void* data, CLD2::uint32 startAt, CLD2::uint32 length) { + if (DEBUG) cout << "Write chunk @" << startAt << ", len=" << length << endl; + advance(f, startAt); + if (DEBUG) cout << " Writing " << length << " bytes of data" << endl; + fwrite(data, 1, length, f); +} + +void writeDataFile(const CLD2::ScoringTables* data, const char* fileName) { + // The order here is hardcoded and MUST NOT BE 
CHANGED, else you will de-sync + // with the reading code. + const char ZERO = 0; + const int NUM_TABLES = 7; + const CLD2::CLD2TableSummary* tableSummaries[NUM_TABLES]; + tableSummaries[0] = data->unigram_compat_obj; + tableSummaries[1] = data->deltabi_obj; + tableSummaries[2] = data->distinctbi_obj; + tableSummaries[3] = data->quadgram_obj; + tableSummaries[4] = data->quadgram_obj2; + tableSummaries[5] = data->deltaocta_obj; + tableSummaries[6] = data->distinctocta_obj; + + CLD2DynamicData::TableHeader tableHeaders[NUM_TABLES]; + CLD2DynamicData::FileHeader fileHeader; + fileHeader.numTablesEncoded = NUM_TABLES; + fileHeader.tableHeaders = tableHeaders; + initUtf8Headers(&fileHeader, data->unigram_obj); + initDeltaHeaders(&fileHeader, data->kExpectedScore); + initTableHeaders(tableSummaries, NUM_TABLES, tableHeaders); + alignAll(&fileHeader, 16); // Align all sections to 128-bit boundaries + + // We are ready to rock. + for (int x=0; xunigram_obj->state_table, + fileHeader.startOf_utf8PropObj_state_table, + fileHeader.lengthOf_utf8PropObj_state_table); + // FIXME: Unsafe to rely on this since RemapEntry is not a bit-packed structure + writeChunk(outFile, + data->unigram_obj->remap_base, + fileHeader.startOf_utf8PropObj_remap_base, + fileHeader.lengthOf_utf8PropObj_remap_base); + writeChunk(outFile, + data->unigram_obj->remap_string, + fileHeader.startOf_utf8PropObj_remap_string, + fileHeader.lengthOf_utf8PropObj_remap_string - 1); + fwrite(&ZERO,1,1,outFile); // null terminator + if (fileHeader.startOf_utf8PropObj_fast_state > 0) { + writeChunk(outFile, + data->unigram_obj->fast_state, + fileHeader.startOf_utf8PropObj_fast_state, + fileHeader.lengthOf_utf8PropObj_fast_state - 1); + fwrite(&ZERO,1,1,outFile); // null terminator + } + + // 2. kAvgDeltaOctaScore array + writeChunk(outFile, + data->kExpectedScore, + fileHeader.startOf_kAvgDeltaOctaScore, + fileHeader.lengthOf_kAvgDeltaOctaScore); + + // 3. 
Each table + for (int x=0; xkCLDTable, + tHeader.startOf_kCLDTable, + tHeader.lengthOf_kCLDTable); + writeChunk(outFile, + summary->kCLDTableInd, + tHeader.startOf_kCLDTableInd, + tHeader.lengthOf_kCLDTableInd); + writeChunk(outFile, + summary->kRecognizedLangScripts, + tHeader.startOf_kRecognizedLangScripts, + tHeader.lengthOf_kRecognizedLangScripts - 1); + fwrite(&ZERO,1,1,outFile); // null terminator + } + fclose(outFile); +} + +void initTableHeaders(const CLD2::CLD2TableSummary** summaries, + int numSummaries, CLD2DynamicData::TableHeader* tableHeaders) { + for (int x=0; xkCLDTableSizeOne; + tableHeader.kCLDTableSize = summary->kCLDTableSize; + tableHeader.kCLDTableKeyMask = summary->kCLDTableKeyMask; + tableHeader.kCLDTableBuildDate = summary->kCLDTableBuildDate; + + // Calculate size information + CLD2::uint32 bytesPerBucket = sizeof(CLD2::IndirectProbBucket4); + CLD2::uint32 numBuckets = summary->kCLDTableSize; + CLD2::uint32 tableSizeBytes = bytesPerBucket * numBuckets; + CLD2::uint32 indirectTableSizeBytes = + summary->kCLDTableSizeOne * sizeof(CLD2::uint32); + + // XXX XXX XXX HACK HACK HACK FIXME FIXME FIXME + // XXX XXX XXX HACK HACK HACK FIXME FIXME FIXME + // XXX XXX XXX HACK HACK HACK FIXME FIXME FIXME + // cld2_generated_cjk_compatible.cc has a kCLDTableSizeOne of zero! + if (x == 0) { // cld2_generated_cjk_compatible.cc + indirectTableSizeBytes = 239*2*4; + } + // XXX XXX XXX HACK HACK HACK FIXME FIXME FIXME + // XXX XXX XXX HACK HACK HACK FIXME FIXME FIXME + // XXX XXX XXX HACK HACK HACK FIXME FIXME FIXME + + CLD2::uint32 recognizedScriptsSizeBytes = + strlen(summary->kRecognizedLangScripts) + 1; // note null terminator + + // Place size information into header. We'll align on byte boundaries later. 
+ tableHeader.lengthOf_kCLDTable = tableSizeBytes; + tableHeader.lengthOf_kCLDTableInd = indirectTableSizeBytes; + tableHeader.lengthOf_kRecognizedLangScripts = + recognizedScriptsSizeBytes; // null terminator counted above + } +} + +// Assuming that all length fields have been set in the specified header, +// assign the starting position of every data chunk so that each chunk begins +// on a multiple of `alignment` bytes (writeDataFile passes 16, i.e. 128 bits). +void alignAll(CLD2DynamicData::FileHeader* header, int alignment) { + CLD2::uint32 totalPadding = 0; + if (DEBUG) { std::cout << "Align for " << (alignment*8) << " bits." << std::endl; } + CLD2::uint32 headerSize = CLD2DynamicData::calculateHeaderSize( + header->numTablesEncoded); + CLD2::uint32 offset = headerSize; + + { // scoping block + int stateTablePad = alignment - (offset % alignment); + if (stateTablePad == alignment) stateTablePad = 0; + totalPadding += stateTablePad; + if (DEBUG) { std::cout << "Alignment for stateTable adjusted by " << stateTablePad << std::endl; } + offset += stateTablePad; + header->startOf_utf8PropObj_state_table = offset; + offset += header->lengthOf_utf8PropObj_state_table; + } + + { // scoping block + int remapPad = alignment - (offset % alignment); + if (remapPad == alignment) remapPad = 0; + totalPadding += remapPad; + if (DEBUG) { std::cout << "Alignment for remap adjusted by " << remapPad << std::endl; } + offset += remapPad; + header->startOf_utf8PropObj_remap_base = offset; + offset += header->lengthOf_utf8PropObj_remap_base; + } + + { // scoping block + int remapStringPad = alignment - (offset % alignment); + if (remapStringPad == alignment) remapStringPad = 0; + totalPadding += remapStringPad; + if (DEBUG) { std::cout << "Alignment for remapString adjusted by " << remapStringPad << std::endl; } + offset += remapStringPad; + header->startOf_utf8PropObj_remap_string = offset; + offset += header->lengthOf_utf8PropObj_remap_string; // null terminator already counted in initUtf8Headers + } + + { // scoping block 
+ int fastStatePad = alignment - (offset % alignment); + if (fastStatePad == alignment) fastStatePad = 0; + totalPadding += fastStatePad; + if (DEBUG) { std::cout << "Alignment for fastState adjusted by " << fastStatePad << std::endl; } + offset += fastStatePad; + if (header->lengthOf_utf8PropObj_fast_state > 0) { + header->startOf_utf8PropObj_fast_state = offset; + offset += header->lengthOf_utf8PropObj_fast_state; // null terminator already counted in initUtf8Headers + } else { + header->startOf_utf8PropObj_fast_state = 0; + } + } + + { // scoping block + int deltaOctaPad = alignment - (offset % alignment); + if (deltaOctaPad == alignment) deltaOctaPad = 0; + totalPadding += deltaOctaPad; + if (DEBUG) { std::cout << "Alignment for deltaOctaScore adjusted by " << deltaOctaPad << std::endl; } + offset += deltaOctaPad; + header->startOf_kAvgDeltaOctaScore = offset; + offset += header->lengthOf_kAvgDeltaOctaScore; + } + + // TODO: The rest of the fields + for (int x=0; xnumTablesEncoded; x++) { + CLD2DynamicData::TableHeader& tableHeader = header->tableHeaders[x]; + int tablePad = alignment - (offset % alignment); + if (tablePad == alignment) tablePad = 0; + totalPadding += tablePad; + if (DEBUG) { std::cout << "Alignment for table " << x << " adjusted by " << tablePad << std::endl; } + offset += tablePad; + tableHeader.startOf_kCLDTable = offset; + offset += tableHeader.lengthOf_kCLDTable; + + int indirectPad = alignment - (offset % alignment); + if (indirectPad == alignment) indirectPad = 0; + totalPadding += indirectPad; + if (DEBUG) { std::cout << "Alignment for tableInd " << x << " adjusted by " << indirectPad << std::endl; } + offset += indirectPad; + tableHeader.startOf_kCLDTableInd = offset; + offset += tableHeader.lengthOf_kCLDTableInd; + + int scriptsPad = alignment - (offset % alignment); + if (scriptsPad == alignment) scriptsPad = 0; + totalPadding += scriptsPad; + if (DEBUG) { std::cout << "Alignment for scriptsPad " << x << " adjusted by " << scriptsPad 
<< std::endl; } + offset += scriptsPad; + tableHeader.startOf_kRecognizedLangScripts = offset; + offset += tableHeader.lengthOf_kRecognizedLangScripts; // null terminator already counted in initTableHeaders + } + + // Now that we know exactly how much data we have written, store it in the + // header as a sanity check + header->totalFileSizeBytes = offset; + + if (DEBUG) { + std::cout << "Data aligned." << std::endl; + std::cout << "Header size: " << headerSize << " bytes " << std::endl; + std::cout << "Data size: " << (offset - totalPadding) << " bytes" << std::endl; + std::cout << "Padding size: " << totalPadding << " bytes" << std::endl; + + std::cout << " cld_generated_CjkUni_obj: " << ( + header->lengthOf_utf8PropObj_state_table + + header->lengthOf_utf8PropObj_remap_string + + header->lengthOf_utf8PropObj_fast_state) + << " bytes " << std::endl; + std::cout << " kAvgDeltaOctaScore: " + << header->lengthOf_kAvgDeltaOctaScore << " bytes " << std::endl; + std::cout << " kCjkCompat_obj: " << ( + header->tableHeaders[0].lengthOf_kCLDTable + + header->tableHeaders[0].lengthOf_kCLDTableInd + + header->tableHeaders[0].lengthOf_kRecognizedLangScripts + 1) + << " bytes " << std::endl; + std::cout << " kCjkDeltaBi_obj: " << ( + header->tableHeaders[1].lengthOf_kCLDTable + + header->tableHeaders[1].lengthOf_kCLDTableInd + + header->tableHeaders[1].lengthOf_kRecognizedLangScripts + 1) + << " bytes " << std::endl; + std::cout << " kDistinctBiTable_obj: " << ( + header->tableHeaders[2].lengthOf_kCLDTable + + header->tableHeaders[2].lengthOf_kCLDTableInd + + header->tableHeaders[2].lengthOf_kRecognizedLangScripts + 1) + << " bytes " << std::endl; + std::cout << " kQuad_obj: " << ( + header->tableHeaders[3].lengthOf_kCLDTable + + header->tableHeaders[3].lengthOf_kCLDTableInd + + header->tableHeaders[3].lengthOf_kRecognizedLangScripts + 1) + << " bytes " << std::endl; + std::cout << " kQuad_obj2: " << ( + header->tableHeaders[4].lengthOf_kCLDTable + + 
header->tableHeaders[4].lengthOf_kCLDTableInd + + header->tableHeaders[4].lengthOf_kRecognizedLangScripts + 1) + << " bytes " << std::endl; + std::cout << " kDeltaOcta_obj: " << ( + header->tableHeaders[5].lengthOf_kCLDTable + + header->tableHeaders[5].lengthOf_kCLDTableInd + + header->tableHeaders[5].lengthOf_kRecognizedLangScripts + 1) + << " bytes " << std::endl; + std::cout << " kDistinctOcta_obj: " << ( + header->tableHeaders[6].lengthOf_kCLDTable + + header->tableHeaders[6].lengthOf_kCLDTableInd + + header->tableHeaders[6].lengthOf_kRecognizedLangScripts + 1) + << " bytes " << std::endl; + } +} + +void initDeltaHeaders(CLD2DynamicData::FileHeader* header, const short* deltaArray) { + // TODO: Don't hardcode 614*4. Get constant from generated_language.cc? + header->startOf_kAvgDeltaOctaScore = 0; + header->lengthOf_kAvgDeltaOctaScore = 614 * 4; // from cld_generated_score_quad_octa_1024_256.cc +} + +void initUtf8Headers(CLD2DynamicData::FileHeader* header, const CLD2::UTF8PropObj* utf8Object) { + header->utf8PropObj_state0 = utf8Object->state0; + header->utf8PropObj_state0_size = utf8Object->state0_size; + header->utf8PropObj_total_size = utf8Object->total_size; + header->utf8PropObj_max_expand = utf8Object->max_expand; + header->utf8PropObj_entry_shift = utf8Object->entry_shift; + header->utf8PropObj_bytes_per_entry = utf8Object->bytes_per_entry; + header->utf8PropObj_losub = utf8Object->losub; + header->utf8PropObj_hiadd = utf8Object->hiadd; + header->lengthOf_utf8PropObj_state_table = utf8Object->total_size; + header->lengthOf_utf8PropObj_remap_base = sizeof(CLD2::RemapEntry); // TODO: Can this ever have more than one entry? 
+ header->lengthOf_utf8PropObj_remap_string = strlen( + reinterpret_cast(utf8Object->remap_string)) + 1; // note null terminator + if (utf8Object->fast_state == NULL) { + header->lengthOf_utf8PropObj_fast_state = 0; // not applicable + } else { + header->lengthOf_utf8PropObj_fast_state = strlen( + reinterpret_cast(utf8Object->fast_state)) + 1; // note null terminator + } +} +} // End namespace CLD2DynamicDataExtractor diff --git a/internal/cld2_dynamic_data_extractor.h b/internal/cld2_dynamic_data_extractor.h new file mode 100644 index 0000000..8850dd7 --- /dev/null +++ b/internal/cld2_dynamic_data_extractor.h @@ -0,0 +1,54 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_EXTRACTOR_H_ +#define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_EXTRACTOR_H_ + +#include "cld2_dynamic_data.h" +#include "integral_types.h" +#include "cld2tablesummary.h" +#include "utf8statetable.h" +#include "scoreonescriptspan.h" + +namespace CLD2DynamicDataExtractor { + +// Enable or disable debugging; 0 to disable, 1 to enable +void setDebug(int debug); + +// Populates all the UTF8-related fields of the header, and returns the total +// space required within the binary blob to represent the non-primitive data. +void initUtf8Headers(CLD2DynamicData::FileHeader* header, + const CLD2::UTF8PropObj* utf8Object); + +// Populates all the AvgDeltaOctaScore-related fields of the header. 
+void initDeltaHeaders(CLD2DynamicData::FileHeader* header, + const short* deltaArray); + +// Populates all fields of all table headers for the specified table summaries. +// Tables are laid out back-to-back in the order that they are specified in the +// input array of summaries, and the headers are filled in in the same order. +void initTableHeaders(const CLD2::CLD2TableSummary** summaries, + int numSummaries, CLD2DynamicData::TableHeader* tableSummaryHeaders); + +// Align all entries in the data block along boundaries that are multiples of +// the specified number of bytes. For example, to align everything along 64-bit +// boundaries, pass an alignment of 8 (bytes). +void alignAll(CLD2DynamicData::FileHeader* header, int alignment); + +// Write the dynamic data file to disk. +void writeDataFile(const CLD2::ScoringTables* data, const char* fileName); + + +} // End namespace CLD2DynamicDataExtractor +#endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_EXTRACTOR_H_ diff --git a/internal/cld2_dynamic_data_loader.cc b/internal/cld2_dynamic_data_loader.cc new file mode 100644 index 0000000..a6087c5 --- /dev/null +++ b/internal/cld2_dynamic_data_loader.cc @@ -0,0 +1,212 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cld2_dynamic_data.h" +#include "cld2_dynamic_data_loader.h" +#include "integral_types.h" +#include "cld2tablesummary.h" +#include "utf8statetable.h" +#include "scoreonescriptspan.h" + +namespace CLD2DynamicDataLoader { +static int DEBUG=0; + +CLD2DynamicData::FileHeader* loadHeader(const char* fileName) { + // TODO: force null-terminate char* strings for safety + FILE* inFile = fopen(fileName, "r"); + if (inFile == NULL) { + return NULL; + } + + int bytesRead = 0; + CLD2DynamicData::FileHeader* fileHeader = new CLD2DynamicData::FileHeader; + bytesRead += fread(fileHeader->sanityString, 1, CLD2DynamicData::DATA_FILE_MARKER_LENGTH, inFile); + if (!CLD2DynamicData::mem_compare(fileHeader->sanityString, CLD2DynamicData::DATA_FILE_MARKER, CLD2DynamicData::DATA_FILE_MARKER_LENGTH)) { + std::cerr << "Malformed header: bad file marker!" << std::endl; + delete fileHeader; + return NULL; + } + + bytesRead += 4 * fread(&(fileHeader->totalFileSizeBytes), 4, 1, inFile); + bytesRead += 4 * fread(&(fileHeader->utf8PropObj_state0), 4, 1, inFile); + bytesRead += 4 * fread(&(fileHeader->utf8PropObj_state0_size), 4, 1, inFile); + bytesRead += 4 * fread(&(fileHeader->utf8PropObj_total_size), 4, 1, inFile); + bytesRead += 4 * fread(&(fileHeader->utf8PropObj_max_expand), 4, 1, inFile); + bytesRead += 4 * fread(&(fileHeader->utf8PropObj_entry_shift), 4, 1, inFile); + bytesRead += 4 * fread(&(fileHeader->utf8PropObj_bytes_per_entry), 4, 1, inFile); + bytesRead += 4 * fread(&(fileHeader->utf8PropObj_losub), 4, 1, inFile); + bytesRead += 4 * fread(&(fileHeader->utf8PropObj_hiadd), 4, 1, inFile); + bytesRead += 4 * fread(&(fileHeader->startOf_utf8PropObj_state_table), 4, 1, inFile); + bytesRead += 4 * fread(&(fileHeader->lengthOf_utf8PropObj_state_table), 4, 1, inFile); + bytesRead += 4 * fread(&(fileHeader->startOf_utf8PropObj_remap_base), 4, 1, inFile); + bytesRead += 4 * 
fread(&(fileHeader->lengthOf_utf8PropObj_remap_base), 4, 1, inFile); + bytesRead += 4 * fread(&(fileHeader->startOf_utf8PropObj_remap_string), 4, 1, inFile); + bytesRead += 4 * fread(&(fileHeader->lengthOf_utf8PropObj_remap_string), 4, 1, inFile); + bytesRead += 4 * fread(&(fileHeader->startOf_utf8PropObj_fast_state), 4, 1, inFile); + bytesRead += 4 * fread(&(fileHeader->lengthOf_utf8PropObj_fast_state), 4, 1, inFile); + bytesRead += 4 * fread(&(fileHeader->startOf_kAvgDeltaOctaScore), 4, 1, inFile); + bytesRead += 4 * fread(&(fileHeader->lengthOf_kAvgDeltaOctaScore), 4, 1, inFile); + bytesRead += 4 * fread(&(fileHeader->numTablesEncoded), 4, 1, inFile); + + CLD2DynamicData::TableHeader* tableHeaders = new CLD2DynamicData::TableHeader[fileHeader->numTablesEncoded]; + fileHeader->tableHeaders = tableHeaders; + for (int x=0; xnumTablesEncoded; x++) { + CLD2DynamicData::TableHeader &tHeader = fileHeader->tableHeaders[x]; + bytesRead += 4 * fread(&(tHeader.kCLDTableSizeOne), 4, 1, inFile); + bytesRead += 4 * fread(&(tHeader.kCLDTableSize), 4, 1, inFile); + bytesRead += 4 * fread(&(tHeader.kCLDTableKeyMask), 4, 1, inFile); + bytesRead += 4 * fread(&(tHeader.kCLDTableBuildDate), 4, 1, inFile); + bytesRead += 4 * fread(&(tHeader.startOf_kCLDTable), 4, 1, inFile); + bytesRead += 4 * fread(&(tHeader.lengthOf_kCLDTable), 4, 1, inFile); + bytesRead += 4 * fread(&(tHeader.startOf_kCLDTableInd), 4, 1, inFile); + bytesRead += 4 * fread(&(tHeader.lengthOf_kCLDTableInd), 4, 1, inFile); + bytesRead += 4 * fread(&(tHeader.startOf_kRecognizedLangScripts), 4, 1, inFile); + bytesRead += 4 * fread(&(tHeader.lengthOf_kRecognizedLangScripts), 4, 1, inFile); + } + + // Confirm header size is correct. + int expectedHeaderSize = CLD2DynamicData::calculateHeaderSize(fileHeader->numTablesEncoded); + if (expectedHeaderSize != bytesRead) { + std::cerr << "Header size mismatch! 
Expected " << expectedHeaderSize << ", but read " << bytesRead << std::endl; + delete fileHeader; + delete tableHeaders; + return NULL; + } + + // Confirm file size is correct. + fseek(inFile, 0, SEEK_END); + int actualSize = ftell(inFile); + fclose(inFile); + + if (actualSize != fileHeader->totalFileSizeBytes) { + std::cerr << "File size mismatch! Expected " << fileHeader->totalFileSizeBytes << ", but found " << actualSize << std::endl; + delete fileHeader; + delete tableHeaders; + return NULL; + } + return fileHeader; +} + +void unloadData(CLD2::ScoringTables** scoringTables, void** mmapAddress, int* mmapLength) { + free(const_cast((*scoringTables)->unigram_obj)); + (*scoringTables)->unigram_obj = NULL; + delete((*scoringTables)->unigram_compat_obj); // tableSummaries[0] from loadDataFile + (*scoringTables)->unigram_compat_obj = NULL; + delete(*scoringTables); + *scoringTables = NULL; + munmap(*mmapAddress, *mmapLength); + *mmapAddress = NULL; + *mmapLength = 0; +} + +CLD2::ScoringTables* loadDataFile(const char* fileName, void** mmapAddressOut, int* mmapLengthOut) { + CLD2DynamicData::FileHeader* fileHeader = loadHeader(fileName); + if (fileHeader == NULL) { + return NULL; + } + + // Initialize the memory map + int inFileHandle = open(fileName, O_RDONLY); + void* mapped = mmap(NULL, fileHeader->totalFileSizeBytes, + PROT_READ, MAP_PRIVATE, inFileHandle, 0); + // Record the map address. This allows callers to unmap + *mmapAddressOut=mapped; + *mmapLengthOut=fileHeader->totalFileSizeBytes; + close(inFileHandle); + + // 1. 
UTF8 Object + const CLD2::uint8* state_table = static_cast(mapped) + + fileHeader->startOf_utf8PropObj_state_table; + // FIXME: Unsafe to rely on this since RemapEntry is not a bit-packed structure + const CLD2::RemapEntry* remap_base = + reinterpret_cast( + static_cast(mapped) + + fileHeader->startOf_utf8PropObj_remap_base); + const CLD2::uint8* remap_string = static_cast(mapped) + + fileHeader->startOf_utf8PropObj_remap_string; + const CLD2::uint8* fast_state = + fileHeader->startOf_utf8PropObj_fast_state == 0 ? 0 : + static_cast(mapped) + + fileHeader->startOf_utf8PropObj_fast_state; + + // Populate intermediate object. Horrible casting required because the struct + // is all read-only integers, and doesn't have a constructor. Yikes. + // TODO: It might actually be less horrible to memcpy the data in + const CLD2::UTF8PropObj* unigram_obj = reinterpret_cast(malloc(sizeof(CLD2::UTF8PropObj))); + *const_cast(&unigram_obj->state0) = fileHeader->utf8PropObj_state0; + *const_cast(&unigram_obj->state0_size) = fileHeader->utf8PropObj_state0_size; + *const_cast(&unigram_obj->total_size) = fileHeader->utf8PropObj_total_size; + *const_cast(&unigram_obj->max_expand) = fileHeader->utf8PropObj_max_expand; + *const_cast(&unigram_obj->entry_shift) = fileHeader->utf8PropObj_entry_shift; + *const_cast(&unigram_obj->bytes_per_entry) = fileHeader->utf8PropObj_bytes_per_entry; + *const_cast(&unigram_obj->losub) = fileHeader->utf8PropObj_losub; + *const_cast(&unigram_obj->hiadd) = fileHeader->utf8PropObj_hiadd; + *const_cast(&unigram_obj->state_table) = state_table; + *const_cast(&unigram_obj->remap_base) = remap_base; + *const_cast(&unigram_obj->remap_string) = remap_string; + *const_cast(&unigram_obj->fast_state) = fast_state; + + // 2. kAvgDeltaOctaScore array + const short* read_kAvgDeltaOctaScore = reinterpret_cast( + static_cast(mapped) + + fileHeader->startOf_kAvgDeltaOctaScore); + + // 3. 
Each table + CLD2::CLD2TableSummary* tableSummaries = new CLD2::CLD2TableSummary[fileHeader->numTablesEncoded]; + for (int x=0; xnumTablesEncoded; x++) { + CLD2::CLD2TableSummary &summary = tableSummaries[x]; + CLD2DynamicData::TableHeader& tHeader = fileHeader->tableHeaders[x]; + const CLD2::IndirectProbBucket4* kCLDTable = + reinterpret_cast( + static_cast(mapped) + tHeader.startOf_kCLDTable); + const CLD2::uint32* kCLDTableInd = + reinterpret_cast( + static_cast(mapped) + tHeader.startOf_kCLDTableInd); + const char* kRecognizedLangScripts = + static_cast(mapped) + tHeader.startOf_kRecognizedLangScripts; + + summary.kCLDTable = kCLDTable; + summary.kCLDTableInd = kCLDTableInd; + summary.kCLDTableSizeOne = tHeader.kCLDTableSizeOne; + summary.kCLDTableSize = tHeader.kCLDTableSize; + summary.kCLDTableKeyMask = tHeader.kCLDTableKeyMask; + summary.kCLDTableBuildDate = tHeader.kCLDTableBuildDate; + summary.kRecognizedLangScripts = kRecognizedLangScripts; + } + + // Tie everything together + // NOTE(review): indices 0..6 assume numTablesEncoded >= 7 - confirm the writer guarantees this + CLD2::ScoringTables* result = new CLD2::ScoringTables; + result->unigram_obj = unigram_obj; + result->unigram_compat_obj = &tableSummaries[0]; + result->deltabi_obj = &tableSummaries[1]; + result->distinctbi_obj = &tableSummaries[2]; + result->quadgram_obj = &tableSummaries[3]; + result->quadgram_obj2 = &tableSummaries[4]; + result->deltaocta_obj = &tableSummaries[5]; + result->distinctocta_obj = &tableSummaries[6]; + result->kExpectedScore = read_kAvgDeltaOctaScore; + delete[] fileHeader->tableHeaders; // allocated with new[] in loadHeader + delete fileHeader; + return result; +} +} diff --git a/internal/cld2_dynamic_data_loader.h b/internal/cld2_dynamic_data_loader.h new file mode 100644 index 0000000..9beba0f --- /dev/null +++ b/internal/cld2_dynamic_data_loader.h @@ -0,0 +1,52 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CLD2_INTERNAL_CLD2_DYNAMIC_DATA_LOADER_H_ +#define CLD2_INTERNAL_CLD2_DYNAMIC_DATA_LOADER_H_ + +#include "scoreonescriptspan.h" +#include "cld2_dynamic_data.h" + +namespace CLD2DynamicDataLoader { + +// Read a header from the specified file and return it. +// The header returned is dynamically allocated; you must 'delete[]' the array +// of TableHeaders as well as 'delete' the returned FileHeader* when done. +CLD2DynamicData::FileHeader* loadHeader(const char* fileName); + +// Load data directly into a ScoringTables structure using a private, read-only +// mmap and return the newly-allocated structure. +// The out-parameter "mmapAddressOut" is a pointer to a void*; the starting +// address of the mmap()'d block will be written here. +// The out-parameter "mmapLengthOut" is a pointer to an int; the length of the +// mmap()'d block will be written here. +// It is up to the caller to release both by calling unloadData(). +CLD2::ScoringTables* loadDataFile(const char* fileName, + void** mmapAddressOut, int* mmapLengthOut); + +// Given pointers to the data from a previous invocation of loadDataFile, +// unloads the data safely - freeing and deleting any malloc'd/new'd objects. +// When this method returns, the mmap has been deleted, as have all the scoring +// tables; the pointers passed in are all zeroed, such that: +// *scoringTables == NULL +// *mmapAddress == NULL +// *mmapLength == 0 +// This is the only safe way to unload data that was previously loaded, as there +// is an unfortunate mixture of new and malloc involved in building the +// in-memory representation of the data.
+void unloadData(CLD2::ScoringTables** scoringTables, + void** mmapAddress, int* mmapLength); + +} // End namespace CLD2DynamicDataLoader +#endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_LOADER_H_ diff --git a/internal/cld2_dynamic_data_tool.cc b/internal/cld2_dynamic_data_tool.cc new file mode 100644 index 0000000..18f8cd8 --- /dev/null +++ b/internal/cld2_dynamic_data_tool.cc @@ -0,0 +1,162 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cld2_dynamic_data.h" +#include "cld2_dynamic_data_extractor.h" +#include "cld2_dynamic_data_loader.h" +#include "integral_types.h" +#include "cld2tablesummary.h" +#include "utf8statetable.h" +#include "scoreonescriptspan.h" + +// We need these in order to set up a real data object to pass around.
+namespace CLD2 { + extern const UTF8PropObj cld_generated_CjkUni_obj; + extern const CLD2TableSummary kCjkCompat_obj; + extern const CLD2TableSummary kCjkDeltaBi_obj; + extern const CLD2TableSummary kDistinctBiTable_obj; + extern const CLD2TableSummary kQuad_obj; + extern const CLD2TableSummary kQuad_obj2; + extern const CLD2TableSummary kDeltaOcta_obj; + extern const CLD2TableSummary kDistinctOcta_obj; + extern const short kAvgDeltaOctaScore[]; +} + +int main(int argc, char** argv) { + if (!CLD2DynamicData::isLittleEndian()) { + std::cerr << "System is big-endian: currently not supported." << std::endl; + return -1; + } + if (!CLD2DynamicData::coreAssumptionsOk()) { + std::cerr << "Core assumptions violated, unsafe to continue." << std::endl; + return -2; + } + + // Get command-line flags + int flags = 0; + bool get_vector = false; + char* fileName = NULL; + const char* USAGE = "\ +CLD2 Dynamic Data Tool:\n\ +Dump, verify or print summaries of scoring tables for CLD2.\n\ +\n\ +The files output by this tool are suitable for all little-endian platforms,\n\ +and should work on both 32- and 64-bit platforms.\n\ +\n\ +IMPORTANT: The files output by this tool WILL NOT work on big-endian platforms.\n\ +\n\ +Usage:\n\ + --dump [FILE] Dump the scoring tables that this tool was linked against\n\ + to the specified file. The tables are automatically verified\n\ + after writing, just as if the tool was run again with\n\ + '--verify'.\n\ + --verify [FILE] Verify that a given file precisely matches the scoring\n\ + tables that this tool was linked against. 
This can be used\n\ + to verify that a file is compatible.\n\ + --head [FILE] Print headers from the specified file to stdout.\n\ + --verbose Be verbose.\n\ +"; + int mode = 0; //1=dump, 2=verify, 3=head + for (int i = 1; i < argc; ++i) { + if (strcmp(argv[i], "--verbose") == 0) { + CLD2DynamicDataExtractor::setDebug(1); + CLD2DynamicData::setDebug(1); + } + else if (strcmp(argv[i], "--dump") == 0 + || strcmp(argv[i], "--verify") == 0 + || strcmp(argv[i], "--head") == 0) { + + // set mode flag properly + if (strcmp(argv[i], "--dump") == 0) mode=1; + else if (strcmp(argv[i], "--verify") == 0) mode=2; + else mode=3; + if (i < argc - 1) { + fileName = argv[++i]; + } else { + std::cerr << "missing file name argument" << std::endl << std::endl; + std::cerr << USAGE; + return -1; + } + } else if (strcmp(argv[i], "--help") == 0) { + std::cout << USAGE; + return 0; + } else { + std::cerr << "Unsupported option: " << argv[i] << std::endl << std::endl; + std::cerr << USAGE; + return -1; + } + } + + if (mode == 0) { + std::cerr << USAGE; + return -1; + } + + CLD2::ScoringTables realData = { + &CLD2::cld_generated_CjkUni_obj, + &CLD2::kCjkCompat_obj, + &CLD2::kCjkDeltaBi_obj, + &CLD2::kDistinctBiTable_obj, + &CLD2::kQuad_obj, + &CLD2::kQuad_obj2, + &CLD2::kDeltaOcta_obj, + &CLD2::kDistinctOcta_obj, + CLD2::kAvgDeltaOctaScore, + }; + if (mode == 1) { // dump + CLD2DynamicDataExtractor::writeDataFile( + static_cast(&realData), + fileName); + } else if (mode == 3) { // head + CLD2DynamicData::FileHeader* header = CLD2DynamicDataLoader::loadHeader(fileName); + if (header == NULL) { + std::cerr << "Cannot read header from file: " << fileName << std::endl; + return -1; + } + CLD2DynamicData::dumpHeader(header); + delete[] header->tableHeaders; // loadHeader allocates this array with new[] + delete header; + } + + if (mode == 1 || mode == 2) { // dump || verify (so perform verification) + void* mmapAddress = NULL; + int mmapLength = 0; + CLD2::ScoringTables* loadedData = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress,
&mmapLength); + + if (loadedData == NULL) { + std::cerr << "Failed to read data file: " << fileName << std::endl; + return -1; + } + bool result = CLD2DynamicData::verify( + static_cast(&realData), + static_cast(loadedData)); + CLD2DynamicDataLoader::unloadData(&loadedData, &mmapAddress, &mmapLength); + if (loadedData != NULL || mmapAddress != NULL || mmapLength != 0) { + std::cerr << "Warning: failed to clean up memory for ScoringTables." << std::endl; + } + if (!result) { + std::cerr << "Verification failed!" << std::endl; + return -1; + } + } +} diff --git a/internal/cld2_unittest.cc b/internal/cld2_unittest.cc index f7fe899..570c44d 100644 --- a/internal/cld2_unittest.cc +++ b/internal/cld2_unittest.cc @@ -252,10 +252,34 @@ void FinishHtmlOut(int flags) { #endif } +#ifdef CLD2_DYNAMIC_MODE +int RunTests (int flags, bool get_vector, const char* data_file) { +#else int RunTests (int flags, bool get_vector) { +#endif fprintf(stdout, "CLD2 version: %s\n", CLD2::DetectLanguageVersion()); InitHtmlOut(flags); bool any_fail = false; + +#ifdef CLD2_DYNAMIC_MODE + fprintf(stdout, "[DYNAMIC] Test running in dynamic data mode!\n"); + bool dataLoaded = CLD2::isDataLoaded(); + if (dataLoaded) { + fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true prior to loading data!\n"); + any_fail = true; + } + fprintf(stdout, "[DYNAMIC] Attempting translation prior to loading data\n"); + any_fail |= !OneTest(flags, get_vector, UNKNOWN_LANGUAGE, kTeststr_en, strlen(kTeststr_en)); + fprintf(stdout, "[DYNAMIC] Loading data from: %s\n", data_file); + CLD2::loadData(data_file); + dataLoaded = CLD2::isDataLoaded(); + if (!dataLoaded) { + fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned false after loading data!\n"); + any_fail = true; + } + fprintf(stdout, "[DYNAMIC] Data loaded, normal tests commencing\n"); +#endif + int i = 0; while (kTestPair[i].text != NULL) { Language lang_expected = kTestPair[i].lang; @@ -265,6 +289,19 @@ int RunTests (int flags, 
bool get_vector) { any_fail |= (!ok); ++i; } + +#ifdef CLD2_DYNAMIC_MODE + fprintf(stdout, "[DYNAMIC] Normal tests complete, attempting to unload data\n"); + CLD2::unloadData(); + dataLoaded = CLD2::isDataLoaded(); + if (dataLoaded) { + fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true after unloading data!\n"); + any_fail = true; + } + fprintf(stdout, "[DYNAMIC] Attempting translation after unloading data\n"); + any_fail |= !OneTest(flags, get_vector, UNKNOWN_LANGUAGE, kTeststr_en, strlen(kTeststr_en)); +#endif + if (any_fail) { fprintf(stderr, "FAIL\n"); fprintf(stdout, "FAIL\n"); @@ -283,6 +320,7 @@ int main(int argc, char** argv) { // Get command-line flags int flags = 0; bool get_vector = false; + const char* data_file = NULL; for (int i = 1; i < argc; ++i) { if (strcmp(argv[i], "--html") == 0) {flags |= CLD2::kCLDFlagHtml;} if (strcmp(argv[i], "--cr") == 0) {flags |= CLD2::kCLDFlagCr;} @@ -290,8 +328,17 @@ int main(int argc, char** argv) { if (strcmp(argv[i], "--quiet") == 0) {flags |= CLD2::kCLDFlagQuiet;} if (strcmp(argv[i], "--echo") == 0) {flags |= CLD2::kCLDFlagEcho;} if (strcmp(argv[i], "--vector") == 0) {get_vector = true;} + if (strcmp(argv[i], "--data-file") == 0) { data_file = argv[++i];} } +#ifdef CLD2_DYNAMIC_MODE + if (data_file == NULL) { + fprintf(stderr, "When running in dynamic mode, you must specify --data-file [FILE]\n"); + return -1; + } + return CLD2::RunTests(flags, get_vector, data_file); +#else return CLD2::RunTests(flags, get_vector); +#endif } diff --git a/internal/compact_lang_det_impl.cc b/internal/compact_lang_det_impl.cc index e5e9b1b..e01fdce 100644 --- a/internal/compact_lang_det_impl.cc +++ b/internal/compact_lang_det_impl.cc @@ -28,6 +28,10 @@ #include "lang_script.h" #include "utf8statetable.h" +#ifdef CLD2_DYNAMIC_MODE +#include "cld2_dynamic_data.h" +#include "cld2_dynamic_data_loader.h" +#endif #include "cld2tablesummary.h" #include "compact_lang_det_impl.h" #include "compact_lang_det_hint_code.h" 
@@ -63,20 +67,58 @@ extern const CLD2TableSummary kDeltaOcta_obj; extern const CLD2TableSummary kDistinctOcta_obj; extern const short kAvgDeltaOctaScore[]; -// This initializes kScoringtables.quadgram_obj etc. -static const ScoringTables kScoringtables = { - &cld_generated_CjkUni_obj, - &kCjkCompat_obj, - &kCjkDeltaBi_obj, - &kDistinctBiTable_obj, +#ifdef CLD2_DYNAMIC_MODE + // CLD2_DYNAMIC_MODE is defined: + // Data will be read from an mmap opened at runtime. + static ScoringTables kScoringtables = { + NULL, //&cld_generated_CjkUni_obj, + NULL, //&kCjkCompat_obj, + NULL, //&kCjkDeltaBi_obj, + NULL, //&kDistinctBiTable_obj, + NULL, //&kQuad_obj, + NULL, //&kQuad_obj2, + NULL, //&kDeltaOcta_obj, + NULL, //&kDistinctOcta_obj, + NULL, //kAvgDeltaOctaScore, + }; + static bool dynamicDataLoaded = false; + static ScoringTables* dynamicTables = NULL; + static void* mmapAddress = NULL; + static int mmapLength = 0; - &kQuad_obj, - &kQuad_obj2, // Dual lookup tables - &kDeltaOcta_obj, - &kDistinctOcta_obj, + bool isDataLoaded() { return dynamicDataLoaded; } - kAvgDeltaOctaScore, -}; + void loadData(const char* fileName) { + if (isDataLoaded()) { + unloadData(); + } + dynamicTables = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength); + if (dynamicTables != NULL) kScoringtables = *dynamicTables; // NULL means the file could not be loaded + dynamicDataLoaded = (dynamicTables != NULL); // stay "not loaded" on failure instead of dereferencing NULL + }; + + void unloadData() { + if (!dynamicDataLoaded) return; + dynamicDataLoaded = false; + // unloading will null all the pointers out. + CLD2DynamicDataLoader::unloadData(&dynamicTables, &mmapAddress, &mmapLength); + } +#else + // This initializes kScoringtables.quadgram_obj etc.
+ static const ScoringTables kScoringtables = { + &cld_generated_CjkUni_obj, + &kCjkCompat_obj, + &kCjkDeltaBi_obj, + &kDistinctBiTable_obj, + + &kQuad_obj, + &kQuad_obj2, // Dual lookup tables + &kDeltaOcta_obj, + &kDistinctOcta_obj, + + kAvgDeltaOctaScore, + }; +#endif // #ifdef CLD2_DYNAMIC_MODE static const bool FLAGS_cld_no_minimum_bytes = false; @@ -1622,6 +1664,19 @@ Language DetectLanguageSummaryV2( } } +#ifdef CLD2_DYNAMIC_MODE + // In dynamic mode, we immediately return UNKNOWN_LANGUAGE if the data file + // hasn't been loaded yet. This is the only sane thing we can do, as there + // are no scoring tables to consult. + bool dataLoaded = isDataLoaded(); + if ((flags & kCLDFlagVerbose) != 0) { + fprintf(stderr, "Data loaded: %s\n", (dataLoaded ? "true" : "false")); + } + if (!dataLoaded) { + return UNKNOWN_LANGUAGE; + } +#endif + // Exit now if no text if (buffer_length == 0) {return UNKNOWN_LANGUAGE;} if (kScoringtables.quadgram_obj == NULL) {return UNKNOWN_LANGUAGE;} diff --git a/internal/compact_lang_det_test.cc b/internal/compact_lang_det_test.cc index 4d0e47e..f2fb1a7 100644 --- a/internal/compact_lang_det_test.cc +++ b/internal/compact_lang_det_test.cc @@ -42,16 +42,18 @@ typedef int32 Encoding; static const Encoding UNKNOWN_ENCODING = 0; -// Linker supplies the right tables; see ScoringTables compact_lang_det_impl.cc -// These are here JUST for printing versions -extern const UTF8PropObj cld_generated_CjkUni_obj; -extern const CLD2TableSummary kCjkDeltaBi_obj; -extern const CLD2TableSummary kDistinctBiTable_obj; -extern const CLD2TableSummary kQuad_obj; -extern const CLD2TableSummary kDeltaOcta_obj; -extern const CLD2TableSummary kDistinctOcta_obj; -extern const CLD2TableSummary kOcta2_obj; -extern const short kAvgDeltaOctaScore[]; +#ifndef CLD2_DYNAMIC_MODE + // Linker supplies the right tables; see ScoringTables compact_lang_det_impl.cc + // These are here JUST for printing versions + extern const UTF8PropObj cld_generated_CjkUni_obj; + extern 
const CLD2TableSummary kCjkDeltaBi_obj; + extern const CLD2TableSummary kDistinctBiTable_obj; + extern const CLD2TableSummary kQuad_obj; + extern const CLD2TableSummary kDeltaOcta_obj; + extern const CLD2TableSummary kDistinctOcta_obj; + extern const CLD2TableSummary kOcta2_obj; + extern const short kAvgDeltaOctaScore[]; +#endif bool FLAGS_cld_version = false; bool FLAGS_cld_html = true; @@ -201,6 +203,7 @@ void DumpLanguages(Language summary_lang, int main(int argc, char** argv) { if (FLAGS_cld_version) { +#ifndef CLD2_DYNAMIC_MODE printf("%s %4dKB uni build date, bytes\n", "........", cld_generated_CjkUni_obj.total_size >> 10); @@ -216,11 +219,14 @@ int main(int argc, char** argv) { kDeltaOcta_obj.kCLDTableBuildDate, (kDeltaOcta_obj.kCLDTableSize * sizeof(IndirectProbBucket4)) >> 10); +#else + printf("FLAGS_cld_version doesn't work with dynamic data mode\n"); +#endif exit(0); } // End FLAGS_cld_version - int flags = 0; bool get_vector = false; + const char* data_file = NULL; const char* fname = NULL; for (int i = 1; i < argc; ++i) { if (argv[i][0] != '-') {fname = argv[i];} @@ -230,8 +236,19 @@ int main(int argc, char** argv) { if (strcmp(argv[i], "--verbose") == 0) {flags |= kCLDFlagVerbose;} if (strcmp(argv[i], "--echo") == 0) {flags |= kCLDFlagEcho;} if (strcmp(argv[i], "--vector") == 0) {get_vector = true;} + if (strcmp(argv[i], "--data-file") == 0) { data_file = argv[++i];} } +#ifdef CLD2_DYNAMIC_MODE + if (data_file == NULL) { + fprintf(stderr, "When running in dynamic mode, you must specify --data-file [FILE]\n"); + return -1; + } + fprintf(stdout, "Loading data from: %s\n", data_file); + CLD2::loadData(data_file); + fprintf(stdout, "Data loaded, test commencing\n"); +#endif + FILE* fin; if (fname == NULL) { fin = stdin; diff --git a/internal/compile_dynamic.sh b/internal/compile_dynamic.sh new file mode 100755 index 0000000..ecb3047 --- /dev/null +++ b/internal/compile_dynamic.sh @@ -0,0 +1,71 @@ +#!/bin/sh +# +# Copyright 2013 Google Inc. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The data tool, which can be used to read and write CLD2 dynamic data files +g++ -O2 -m64 cld2_dynamic_data_tool.cc \ + cld2_dynamic_data.h cld2_dynamic_data.cc \ + cld2_dynamic_data_extractor.h cld2_dynamic_data_extractor.cc \ + cld2_dynamic_data_loader.h cld2_dynamic_data_loader.cc \ + cldutil.cc cldutil_shared.cc compact_lang_det.cc compact_lang_det_hint_code.cc \ + compact_lang_det_impl.cc debug.cc fixunicodevalue.cc \ + generated_entities.cc generated_language.cc generated_ulscript.cc \ + getonescriptspan.cc lang_script.cc offsetmap.cc scoreonescriptspan.cc \ + tote.cc utf8statetable.cc \ + cld_generated_cjk_uni_prop_80.cc cld2_generated_cjk_compatible.cc \ + cld_generated_cjk_delta_bi_4.cc generated_distinct_bi_0.cc \ + cld2_generated_quadchrome0122_2.cc cld2_generated_deltaoctachrome0122.cc \ + cld2_generated_distinctoctachrome0122.cc cld_generated_score_quad_octa_0122_2.cc \ + -o cld2_dynamic_data_tool +echo " cld2_dynamic_data_tool compiled" + +# Tests for Chromium flavored dynamic CLD2 +g++ -O2 -m64 -D CLD2_DYNAMIC_MODE compact_lang_det_test.cc \ + cld2_dynamic_data.h cld2_dynamic_data.cc \ + cld2_dynamic_data_extractor.h cld2_dynamic_data_extractor.cc \ + cld2_dynamic_data_loader.h cld2_dynamic_data_loader.cc \ + cldutil.cc cldutil_shared.cc compact_lang_det.cc compact_lang_det_hint_code.cc \ + compact_lang_det_impl.cc debug.cc fixunicodevalue.cc \ + generated_entities.cc
generated_language.cc generated_ulscript.cc \ + getonescriptspan.cc lang_script.cc offsetmap.cc scoreonescriptspan.cc \ + tote.cc utf8statetable.cc \ + -o compact_lang_det_dynamic_test_chrome +echo " compact_lang_det_dynamic_test_chrome compiled" + + +# Unit tests, in dynamic mode +g++ -O2 -m64 -g3 -D CLD2_DYNAMIC_MODE cld2_unittest.cc \ + cld2_dynamic_data.h cld2_dynamic_data.cc \ + cld2_dynamic_data_loader.h cld2_dynamic_data_loader.cc \ + cldutil.cc cldutil_shared.cc compact_lang_det.cc compact_lang_det_hint_code.cc \ + compact_lang_det_impl.cc debug.cc fixunicodevalue.cc \ + generated_entities.cc generated_language.cc generated_ulscript.cc \ + getonescriptspan.cc lang_script.cc offsetmap.cc scoreonescriptspan.cc \ + tote.cc utf8statetable.cc \ + -o cld2_dynamic_unittest +echo " cld2_dynamic_unittest compiled" + +# Shared library, in dynamic mode +g++ -shared -fPIC -O2 -m64 -D CLD2_DYNAMIC_MODE \ + cld2_dynamic_data.h cld2_dynamic_data.cc \ + cld2_dynamic_data_loader.h cld2_dynamic_data_loader.cc \ + cldutil.cc cldutil_shared.cc compact_lang_det.cc compact_lang_det_hint_code.cc \ + compact_lang_det_impl.cc debug.cc fixunicodevalue.cc \ + generated_entities.cc generated_language.cc generated_ulscript.cc \ + getonescriptspan.cc lang_script.cc offsetmap.cc scoreonescriptspan.cc \ + tote.cc utf8statetable.cc \ + -o libcld2_dynamic.so +echo " libcld2_dynamic.so compiled" + diff --git a/public/compact_lang_det.h b/public/compact_lang_det.h index 71bb5df..da59abd 100644 --- a/public/compact_lang_det.h +++ b/public/compact_lang_det.h @@ -295,6 +295,26 @@ Flag meanings: void DumpResultChunkVector(FILE* f, const char* src, ResultChunkVector* resultchunkvector); +#ifdef CLD2_DYNAMIC_MODE + +// If compiled with dynamic mode, load data from the specified file location. +// If other data has already been loaded, it is discarded and the data is read +// in from the specified file location again (even if the file has not changed). 
+// WARNING: Before calling this method, language detection will always fail +// and will always return the unknown language. +void loadData(const char* fileName); + +// If compiled with dynamic mode, unload the previously-loaded data. +// WARNING: After calling this method, language detection will no longer work +// and will always return the unknown language. +void unloadData(); + +// Returns true if and only if data has been loaded via a call to loadData(...) +// and has not been subsequently unloaded via a call to unloadData(). +bool isDataLoaded(); + +#endif // #ifdef CLD2_DYNAMIC_MODE + }; // End namespace CLD2 #endif // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_