https://code.google.com/p/cld2/issues/detail?id=24 git-svn-id: https://cld2.googlecode.com/svn/trunk@169 b252ecd4-b096-bf77-eb8e-91563289f87e
273 lines
11 KiB
C++
273 lines
11 KiB
C++
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include <assert.h>
|
|
#include <stdint.h>
|
|
#include <stdio.h>
|
|
#include <fstream>
|
|
#include <fcntl.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include "cld2_dynamic_compat.h" // for win32/posix compatibility
|
|
#include "cld2_dynamic_data.h"
|
|
#include "cld2_dynamic_data_loader.h"
|
|
#include "integral_types.h"
|
|
#include "cld2tablesummary.h"
|
|
#include "utf8statetable.h"
|
|
#include "scoreonescriptspan.h"
|
|
|
|
namespace CLD2DynamicDataLoader {
|
|
// File-local debug switch, initialised to 0 (off). Nothing in the visible
// part of this file reads it.
static int DEBUG=0;
|
|
|
|
// Opens the file named |fileName| and parses the dynamic-data header from it.
//
// Returns a newly allocated header on success. Returns NULL when |fileName| is
// NULL, when the file cannot be opened, or when loadInternal rejects the
// contents. The opened stream is handed over to loadInternal, which is the
// code that closes it; it must not be closed again here.
CLD2DynamicData::FileHeader* loadHeaderFromFile(const char* fileName) {
  if (fileName == NULL) {
    return NULL;
  }

  // The data file is raw binary. "rb" keeps platforms that distinguish text
  // and binary streams from translating bytes while the header is read.
  FILE* inFile = fopen(fileName, "rb");
  if (inFile == NULL) {
    return NULL;
  }

  // No in-memory buffer in file mode; -1 converts to the largest uint32_t,
  // i.e. "no caller-supplied length".
  return loadInternal(inFile, NULL, -1);
}
|
|
|
|
// Parses the dynamic-data header out of an in-memory image.
//
// |basePointer| is the first byte of the image and |length| its size in
// bytes. Returns a newly allocated header, or NULL when |basePointer| is NULL
// or loadInternal rejects the image.
CLD2DynamicData::FileHeader* loadHeaderFromRaw(const void* basePointer,
                                               const uint32_t length) {
  // loadInternal copies from basePointer in raw mode, so a NULL image must be
  // turned away before it gets there.
  if (basePointer == NULL) {
    return NULL;
  }
  return loadInternal(NULL, basePointer, length);
}
|
|
|
|
|
|
#define CLD2_READINT(field) \
|
|
if (sourceIsFile) {\
|
|
bytesRead += 4 * fread(&(header->field), 4, 1, inFile);\
|
|
} else {\
|
|
memcpy(&(header->field), (((char*)(basePointer)) + bytesRead), 4);\
|
|
bytesRead += 4;\
|
|
}
|
|
CLD2DynamicData::FileHeader* loadInternal(FILE* inFile, const void* basePointer, const uint32_t length) {
|
|
const bool sourceIsFile = (inFile != NULL);
|
|
int bytesRead = 0;
|
|
CLD2DynamicData::FileHeader* header = new CLD2DynamicData::FileHeader;
|
|
|
|
// TODO: force null-terminate char* strings for safety
|
|
if (sourceIsFile) {
|
|
bytesRead += fread(header->sanityString, 1, CLD2DynamicData::DATA_FILE_MARKER_LENGTH, inFile);
|
|
} else {
|
|
memcpy(header->sanityString, basePointer, CLD2DynamicData::DATA_FILE_MARKER_LENGTH);
|
|
bytesRead += CLD2DynamicData::DATA_FILE_MARKER_LENGTH;
|
|
}
|
|
|
|
if (!CLD2DynamicData::mem_compare(
|
|
header->sanityString,
|
|
CLD2DynamicData::DATA_FILE_MARKER,
|
|
CLD2DynamicData::DATA_FILE_MARKER_LENGTH)) {
|
|
fprintf(stderr, "Malformed header: bad file marker!\n");
|
|
delete header;
|
|
return NULL;
|
|
}
|
|
|
|
CLD2_READINT(totalFileSizeBytes);
|
|
CLD2_READINT(utf8PropObj_state0);
|
|
CLD2_READINT(utf8PropObj_state0_size);
|
|
CLD2_READINT(utf8PropObj_total_size);
|
|
CLD2_READINT(utf8PropObj_max_expand);
|
|
CLD2_READINT(utf8PropObj_entry_shift);
|
|
CLD2_READINT(utf8PropObj_bytes_per_entry);
|
|
CLD2_READINT(utf8PropObj_losub);
|
|
CLD2_READINT(utf8PropObj_hiadd);
|
|
CLD2_READINT(startOf_utf8PropObj_state_table);
|
|
CLD2_READINT(lengthOf_utf8PropObj_state_table);
|
|
CLD2_READINT(startOf_utf8PropObj_remap_base);
|
|
CLD2_READINT(lengthOf_utf8PropObj_remap_base);
|
|
CLD2_READINT(startOf_utf8PropObj_remap_string);
|
|
CLD2_READINT(lengthOf_utf8PropObj_remap_string);
|
|
CLD2_READINT(startOf_utf8PropObj_fast_state);
|
|
CLD2_READINT(lengthOf_utf8PropObj_fast_state);
|
|
CLD2_READINT(startOf_kAvgDeltaOctaScore);
|
|
CLD2_READINT(lengthOf_kAvgDeltaOctaScore);
|
|
CLD2_READINT(numTablesEncoded);
|
|
|
|
CLD2DynamicData::TableHeader* tableHeaders = new CLD2DynamicData::TableHeader[header->numTablesEncoded];
|
|
header->tableHeaders = tableHeaders;
|
|
for (int x=0; x < (int) header->numTablesEncoded; x++) {
|
|
CLD2DynamicData::TableHeader *header = &(tableHeaders[x]);
|
|
CLD2_READINT(kCLDTableSizeOne);
|
|
CLD2_READINT(kCLDTableSize);
|
|
CLD2_READINT(kCLDTableKeyMask);
|
|
CLD2_READINT(kCLDTableBuildDate);
|
|
CLD2_READINT(startOf_kCLDTable);
|
|
CLD2_READINT(lengthOf_kCLDTable);
|
|
CLD2_READINT(startOf_kCLDTableInd);
|
|
CLD2_READINT(lengthOf_kCLDTableInd);
|
|
CLD2_READINT(startOf_kRecognizedLangScripts);
|
|
CLD2_READINT(lengthOf_kRecognizedLangScripts);
|
|
}
|
|
|
|
// Confirm header size is correct.
|
|
int expectedHeaderSize = CLD2DynamicData::calculateHeaderSize(header->numTablesEncoded);
|
|
if (expectedHeaderSize != bytesRead) {
|
|
fprintf(stderr, "Header size mismatch! Expected %d, but read %d\n", expectedHeaderSize, bytesRead);
|
|
delete header;
|
|
delete[] tableHeaders;
|
|
return NULL;
|
|
}
|
|
|
|
int actualSize = 0;
|
|
if (sourceIsFile) {
|
|
// Confirm file size is correct.
|
|
fseek(inFile, 0, SEEK_END);
|
|
actualSize = ftell(inFile);
|
|
fclose(inFile);
|
|
} else {
|
|
actualSize = length;
|
|
}
|
|
|
|
if (actualSize != header->totalFileSizeBytes) {
|
|
fprintf(stderr, "File size mismatch! Expected %d, but found %d\n", header->totalFileSizeBytes, actualSize);
|
|
delete header;
|
|
delete[] tableHeaders;
|
|
return NULL;
|
|
}
|
|
return header;
|
|
}
|
|
|
|
// Releases scoring tables that were loaded from a file: frees the table
// objects via unloadDataRaw, then unmaps the region recorded in
// |*mmapAddress| / |*mmapLength| and resets both to NULL / 0.
//
// NULL arguments and already-cleared values are skipped rather than
// dereferenced. On win32 this only prints a diagnostic and returns.
void unloadDataFile(CLD2::ScoringTables** scoringTables,
                    void** mmapAddress, uint32_t* mmapLength) {
#ifdef _WIN32
  // See https://code.google.com/p/cld2/issues/detail?id=20
  fprintf(stderr, "dynamic data unloading from file is not currently supported on win32, use raw mode instead.");
  return;
#else // i.e., is POSIX (no support for Mac prior to OSX)
  // Free the table objects first; they hold pointers into the mapping.
  if (scoringTables != NULL && *scoringTables != NULL) {
    CLD2DynamicDataLoader::unloadDataRaw(scoringTables);
  }

  if (mmapAddress != NULL && mmapLength != NULL) {
    // Nothing to unmap when no mapping was recorded.
    if (*mmapAddress != NULL) {
      munmap(*mmapAddress, *mmapLength);
    }
    *mmapAddress = NULL;
    *mmapLength = 0;
  }
#endif // ifdef _WIN32
}
|
|
|
|
// Frees the objects that loadDataInternal allocated for |*scoringTables| and
// sets |*scoringTables| to NULL. The data image the tables point into is not
// touched. A NULL |scoringTables| or NULL |*scoringTables| is a no-op.
void unloadDataRaw(CLD2::ScoringTables** scoringTables) {
  if (scoringTables == NULL || *scoringTables == NULL) {
    return;
  }
  CLD2::ScoringTables* tables = *scoringTables;

  // unigram_obj was obtained with malloc, so it is released with free.
  free(const_cast<CLD2::UTF8PropObj*>(tables->unigram_obj));
  tables->unigram_obj = NULL;

  // unigram_compat_obj is &tableSummaries[0], the start of the array that
  // loadDataInternal allocated with new[]; it must be released with delete[].
  // The other table pointers alias elements of that same array.
  delete[] tables->unigram_compat_obj;
  tables->unigram_compat_obj = NULL;

  delete tables;
  *scoringTables = NULL;
}
|
|
|
|
// Loads scoring tables from the dynamic-data file |fileName| by memory-mapping
// it read-only.
//
// On success returns the tables and stores the mapping's address and length in
// |*mmapAddressOut| / |*mmapLengthOut| so the caller can later unmap it (see
// unloadDataFile). On any failure returns NULL, leaves no mapping behind and
// sets the two outputs to NULL / 0. On win32 this only prints a diagnostic
// and returns NULL.
CLD2::ScoringTables* loadDataFile(const char* fileName,
                                  void** mmapAddressOut, uint32_t* mmapLengthOut) {

#ifdef _WIN32
  // See https://code.google.com/p/cld2/issues/detail?id=20
  fprintf(stderr, "dynamic data loading from file is not currently supported on win32, use raw mode instead.");
  return NULL;
#else // i.e., is POSIX (no support for Mac prior to OSX)
  // Without somewhere to record the mapping the caller could never unmap it.
  if (mmapAddressOut == NULL || mmapLengthOut == NULL) {
    return NULL;
  }
  *mmapAddressOut = NULL;
  *mmapLengthOut = 0;

  CLD2DynamicData::FileHeader* header = loadHeaderFromFile(fileName);
  if (header == NULL) {
    return NULL;
  }
  // Cached because loadDataInternal takes ownership of |header| and frees it.
  const uint32_t mapLength = header->totalFileSizeBytes;

  // Initialize the memory map
  int inFileHandle = OPEN(fileName, O_RDONLY);
  if (inFileHandle < 0) {
    fprintf(stderr, "Unable to open dynamic data file for mapping!\n");
    delete[] header->tableHeaders;
    delete header;
    return NULL;
  }
  void* mapped = mmap(NULL, mapLength, PROT_READ, MAP_PRIVATE, inFileHandle, 0);
  // The descriptor is not needed once mmap has returned; an established
  // mapping remains usable after the descriptor is closed.
  CLOSE(inFileHandle);
  if (mapped == MAP_FAILED) {
    fprintf(stderr, "Unable to memory-map dynamic data file!\n");
    delete[] header->tableHeaders;
    delete header;
    return NULL;
  }

  CLD2::ScoringTables* tables = loadDataInternal(header, mapped, mapLength);
  if (tables == NULL) {
    // Do not leave an orphaned mapping behind when the tables were rejected.
    munmap(mapped, mapLength);
    return NULL;
  }

  // Record the map address. This allows callers to unmap
  *mmapAddressOut = mapped;
  *mmapLengthOut = mapLength;
  return tables;
#endif // ifdef _WIN32
}
|
|
|
|
// Loads scoring tables from an in-memory data image of |length| bytes that
// starts at |basePointer|.
//
// Returns NULL when the header cannot be parsed. The returned tables hold
// pointers into the image, so the image must stay valid while they are used.
CLD2::ScoringTables* loadDataRaw(const void* basePointer, const uint32_t length) {
  CLD2DynamicData::FileHeader* header = loadHeaderFromRaw(basePointer, length);
  // loadDataInternal dereferences the header, so a rejected image stops here.
  if (header == NULL) {
    return NULL;
  }
  return loadDataInternal(header, basePointer, length);
}
|
|
|
|
CLD2::ScoringTables* loadDataInternal(CLD2DynamicData::FileHeader* header, const void* basePointer, const uint32_t length) {
|
|
// 1. UTF8 Object
|
|
const CLD2::uint8* state_table = static_cast<const CLD2::uint8*>(basePointer) +
|
|
header->startOf_utf8PropObj_state_table;
|
|
// FIXME: Unsafe to rely on this since RemapEntry is not a bit-packed structure
|
|
const CLD2::RemapEntry* remap_base =
|
|
reinterpret_cast<const CLD2::RemapEntry*>(
|
|
static_cast<const CLD2::uint8*>(basePointer) +
|
|
header->startOf_utf8PropObj_remap_base);
|
|
const CLD2::uint8* remap_string = static_cast<const CLD2::uint8*>(basePointer) +
|
|
header->startOf_utf8PropObj_remap_string;
|
|
const CLD2::uint8* fast_state =
|
|
header->startOf_utf8PropObj_fast_state == 0 ? 0 :
|
|
static_cast<const CLD2::uint8*>(basePointer) +
|
|
header->startOf_utf8PropObj_fast_state;
|
|
|
|
// Populate intermediate object. Horrible casting required because the struct
|
|
// is all read-only integers, and doesn't have a constructor. Yikes.
|
|
// TODO: It might actually be less horrible to memcpy the data in <shudder>
|
|
const CLD2::UTF8PropObj* unigram_obj = reinterpret_cast<CLD2::UTF8PropObj*>(malloc(sizeof(CLD2::UTF8PropObj)));
|
|
*const_cast<CLD2::uint32*>(&unigram_obj->state0) = header->utf8PropObj_state0;
|
|
*const_cast<CLD2::uint32*>(&unigram_obj->state0_size) = header->utf8PropObj_state0_size;
|
|
*const_cast<CLD2::uint32*>(&unigram_obj->total_size) = header->utf8PropObj_total_size;
|
|
*const_cast<int*>(&unigram_obj->max_expand) = header->utf8PropObj_max_expand;
|
|
*const_cast<int*>(&unigram_obj->entry_shift) = header->utf8PropObj_entry_shift;
|
|
*const_cast<int*>(&unigram_obj->bytes_per_entry) = header->utf8PropObj_bytes_per_entry;
|
|
*const_cast<CLD2::uint32*>(&unigram_obj->losub) = header->utf8PropObj_losub;
|
|
*const_cast<CLD2::uint32*>(&unigram_obj->hiadd) = header->utf8PropObj_hiadd;
|
|
*const_cast<const CLD2::uint8**>(&unigram_obj->state_table) = state_table;
|
|
*const_cast<const CLD2::RemapEntry**>(&unigram_obj->remap_base) = remap_base;
|
|
*const_cast<const CLD2::uint8**>(&unigram_obj->remap_string) = remap_string;
|
|
*const_cast<const CLD2::uint8**>(&unigram_obj->fast_state) = fast_state;
|
|
|
|
// 2. kAvgDeltaOctaScore array
|
|
const short* read_kAvgDeltaOctaScore = reinterpret_cast<const short*>(
|
|
static_cast<const CLD2::uint8*>(basePointer) +
|
|
header->startOf_kAvgDeltaOctaScore);
|
|
|
|
// 3. Each table
|
|
CLD2::CLD2TableSummary* tableSummaries = new CLD2::CLD2TableSummary[header->numTablesEncoded];
|
|
for (int x=0; x < (int) header->numTablesEncoded; x++) {
|
|
CLD2::CLD2TableSummary &summary = tableSummaries[x];
|
|
CLD2DynamicData::TableHeader& tHeader = header->tableHeaders[x];
|
|
const CLD2::IndirectProbBucket4* kCLDTable =
|
|
reinterpret_cast<const CLD2::IndirectProbBucket4*>(
|
|
static_cast<const CLD2::uint8*>(basePointer) + tHeader.startOf_kCLDTable);
|
|
const CLD2::uint32* kCLDTableInd =
|
|
reinterpret_cast<const CLD2::uint32*>(
|
|
static_cast<const CLD2::uint8*>(basePointer) + tHeader.startOf_kCLDTableInd);
|
|
const char* kRecognizedLangScripts =
|
|
static_cast<const char*>(basePointer) + tHeader.startOf_kRecognizedLangScripts;
|
|
|
|
summary.kCLDTable = kCLDTable;
|
|
summary.kCLDTableInd = kCLDTableInd;
|
|
summary.kCLDTableSizeOne = tHeader.kCLDTableSizeOne;
|
|
summary.kCLDTableSize = tHeader.kCLDTableSize;
|
|
summary.kCLDTableKeyMask = tHeader.kCLDTableKeyMask;
|
|
summary.kCLDTableBuildDate = tHeader.kCLDTableBuildDate;
|
|
summary.kRecognizedLangScripts = kRecognizedLangScripts;
|
|
}
|
|
|
|
// Tie everything together
|
|
CLD2::ScoringTables* result = new CLD2::ScoringTables;
|
|
result->unigram_obj = unigram_obj;
|
|
result->unigram_compat_obj = &tableSummaries[0];
|
|
result->deltabi_obj = &tableSummaries[1];
|
|
result->distinctbi_obj = &tableSummaries[2];
|
|
result->quadgram_obj = &tableSummaries[3];
|
|
result->quadgram_obj2 = &tableSummaries[4];
|
|
result->deltaocta_obj = &tableSummaries[5];
|
|
result->distinctocta_obj = &tableSummaries[6];
|
|
result->kExpectedScore = read_kAvgDeltaOctaScore;
|
|
delete[] header->tableHeaders;
|
|
delete header;
|
|
return result;
|
|
}
|
|
|
|
} // namespace CLD2DynamicDataLoader
|