Allow externally-managed mmaps to be used for loading data. For more info:
https://code.google.com/p/cld2/issues/detail?id=7 git-svn-id: https://cld2.googlecode.com/svn/trunk@153 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
@@ -31,136 +31,183 @@
|
||||
namespace CLD2DynamicDataLoader {
|
||||
static int DEBUG=0;
|
||||
|
||||
CLD2DynamicData::FileHeader* loadHeader(const char* fileName) {
|
||||
// TODO: force null-terminate char* strings for safety
|
||||
// Reads and validates the data-file header stored at the start of "fileName".
// Returns a newly allocated FileHeader, or NULL when the file cannot be opened
// or loadInternal() rejects its contents. The opened FILE* is handed to
// loadInternal(), which is the one that closes it on its file-size check.
CLD2DynamicData::FileHeader* loadHeaderFromFile(const char* fileName) {
  // The header is raw binary data, so open in binary mode; "b" is a no-op on
  // POSIX but prevents newline translation on platforms that distinguish it.
  FILE* inFile = fopen(fileName, "rb");
  if (inFile == NULL) {
    return NULL;
  }
  // NULL / -1: there is no in-memory source when reading from a file.
  return loadInternal(inFile, NULL, -1);
}
|
||||
|
||||
// Reads and validates a data-file header that lives in memory at
// "basePointer", where "length" is the number of bytes available there.
// Delegates to loadInternal() with no file, which selects the in-memory path.
CLD2DynamicData::FileHeader* loadHeaderFromRaw(const void* basePointer,
                                               const int length) {
  FILE* const noBackingFile = NULL;
  CLD2DynamicData::FileHeader* const parsed =
      loadInternal(noBackingFile, basePointer, length);
  return parsed;
}
|
||||
|
||||
|
||||
// Reads one 4-byte value into header->field and advances bytesRead.
// Expands against names that must be in scope at the point of use:
//   sourceIsFile - true to fread() from inFile, false to copy from memory
//   inFile       - the FILE* used when sourceIsFile is true
//   basePointer  - start of the in-memory data used when sourceIsFile is false
//   bytesRead    - running byte offset; a failed fread() adds nothing to it
//   header       - pointer to the struct that owns "field"
// Wrapped in do { } while (0) so it behaves as a single statement (safe inside
// an unbraced if/else). No bounds checking is done on the in-memory path; the
// caller must ensure at least bytesRead + 4 bytes are available.
#define CLD2_READINT(field) \
  do { \
    if (sourceIsFile) { \
      bytesRead += 4 * fread(&(header->field), 4, 1, inFile); \
    } else { \
      memcpy(&(header->field), \
             static_cast<const char*>(basePointer) + bytesRead, 4); \
      bytesRead += 4; \
    } \
  } while (0)
|
||||
// Releases everything acquired while parsing a header that is being rejected:
// the table-header array, the header itself and (if present) the input file.
// Always returns NULL so callers can write "return rejectHeader(...)".
static CLD2DynamicData::FileHeader* rejectHeader(
    CLD2DynamicData::FileHeader* header,
    CLD2DynamicData::TableHeader* tableHeaders,
    FILE* inFile) {
  delete[] tableHeaders;  // allocated with new[]; a NULL pointer is a no-op
  delete header;
  if (inFile != NULL) {
    fclose(inFile);
  }
  return NULL;
}

// Parses and validates a data header from exactly one of two sources:
//   inFile != NULL : the header is read from the file; basePointer and length
//                    are ignored. This function takes ownership of inFile and
//                    closes it on every path.
//   inFile == NULL : the header is copied from the "length" bytes starting at
//                    basePointer.
// Returns a newly allocated FileHeader whose tableHeaders member points to a
// new[]-allocated array (the caller releases both), or NULL when the marker is
// wrong, the data is truncated, the table count is implausible, or the
// header/total sizes do not match what was actually found.
CLD2DynamicData::FileHeader* loadInternal(FILE* inFile, const void* basePointer, const int length) {
  const bool sourceIsFile = (inFile != NULL);

  // Sizes implied by the reads below: the marker, 20 fixed 4-byte fields, and
  // 10 4-byte fields for every encoded table.
  const int markerBytes = static_cast<int>(CLD2DynamicData::DATA_FILE_MARKER_LENGTH);
  const int fixedHeaderBytes = markerBytes + 20 * 4;
  const int tableHeaderBytes = 10 * 4;

  // The in-memory path copies without bounds checks, so verify up front that
  // the fixed part of the header fits in the supplied region.
  if (!sourceIsFile && (basePointer == NULL || length < fixedHeaderBytes)) {
    std::cerr << "Malformed header: data too short!" << std::endl;
    return NULL;
  }

  int bytesRead = 0;
  CLD2DynamicData::FileHeader* header = new CLD2DynamicData::FileHeader;

  // TODO: force null-terminate char* strings for safety
  if (sourceIsFile) {
    bytesRead += fread(header->sanityString, 1, CLD2DynamicData::DATA_FILE_MARKER_LENGTH, inFile);
  } else {
    memcpy(header->sanityString, basePointer, CLD2DynamicData::DATA_FILE_MARKER_LENGTH);
    bytesRead += CLD2DynamicData::DATA_FILE_MARKER_LENGTH;
  }

  // A short read leaves part of sanityString unset, so treat it as a bad
  // marker instead of comparing uninitialized bytes.
  if (bytesRead != markerBytes ||
      !CLD2DynamicData::mem_compare(
          header->sanityString,
          CLD2DynamicData::DATA_FILE_MARKER,
          CLD2DynamicData::DATA_FILE_MARKER_LENGTH)) {
    std::cerr << "Malformed header: bad file marker!" << std::endl;
    return rejectHeader(header, NULL, inFile);
  }

  CLD2_READINT(totalFileSizeBytes);
  CLD2_READINT(utf8PropObj_state0);
  CLD2_READINT(utf8PropObj_state0_size);
  CLD2_READINT(utf8PropObj_total_size);
  CLD2_READINT(utf8PropObj_max_expand);
  CLD2_READINT(utf8PropObj_entry_shift);
  CLD2_READINT(utf8PropObj_bytes_per_entry);
  CLD2_READINT(utf8PropObj_losub);
  CLD2_READINT(utf8PropObj_hiadd);
  CLD2_READINT(startOf_utf8PropObj_state_table);
  CLD2_READINT(lengthOf_utf8PropObj_state_table);
  CLD2_READINT(startOf_utf8PropObj_remap_base);
  CLD2_READINT(lengthOf_utf8PropObj_remap_base);
  CLD2_READINT(startOf_utf8PropObj_remap_string);
  CLD2_READINT(lengthOf_utf8PropObj_remap_string);
  CLD2_READINT(startOf_utf8PropObj_fast_state);
  CLD2_READINT(lengthOf_utf8PropObj_fast_state);
  CLD2_READINT(startOf_kAvgDeltaOctaScore);
  CLD2_READINT(lengthOf_kAvgDeltaOctaScore);
  CLD2_READINT(numTablesEncoded);

  // A failed fread() adds nothing to bytesRead and leaves its field unset;
  // stop here rather than trust an uninitialized table count.
  if (bytesRead != fixedHeaderBytes) {
    std::cerr << "Malformed header: truncated data!" << std::endl;
    return rejectHeader(header, NULL, inFile);
  }

  // The table headers must fit in what remains of the data: the supplied
  // region for the in-memory path, or the size the file claims for itself
  // (which is compared against the real file size further down).
  const int numTables = header->numTablesEncoded;
  const int totalBytes = sourceIsFile ? static_cast<int>(header->totalFileSizeBytes) : length;
  if (numTables < 0 || numTables > (totalBytes - bytesRead) / tableHeaderBytes) {
    std::cerr << "Malformed header: bad table count!" << std::endl;
    return rejectHeader(header, NULL, inFile);
  }

  CLD2DynamicData::TableHeader* tableHeaders = new CLD2DynamicData::TableHeader[numTables];
  header->tableHeaders = tableHeaders;
  for (int x = 0; x < numTables; x++) {
    // CLD2_READINT writes through a variable named "header"; shadow the file
    // header with the current table header so the macro can be reused.
    CLD2DynamicData::TableHeader* header = &(tableHeaders[x]);
    CLD2_READINT(kCLDTableSizeOne);
    CLD2_READINT(kCLDTableSize);
    CLD2_READINT(kCLDTableKeyMask);
    CLD2_READINT(kCLDTableBuildDate);
    CLD2_READINT(startOf_kCLDTable);
    CLD2_READINT(lengthOf_kCLDTable);
    CLD2_READINT(startOf_kCLDTableInd);
    CLD2_READINT(lengthOf_kCLDTableInd);
    CLD2_READINT(startOf_kRecognizedLangScripts);
    CLD2_READINT(lengthOf_kRecognizedLangScripts);
  }

  // Confirm header size is correct.
  int expectedHeaderSize = CLD2DynamicData::calculateHeaderSize(header->numTablesEncoded);
  if (expectedHeaderSize != bytesRead) {
    std::cerr << "Header size mismatch! Expected " << expectedHeaderSize << ", but read " << bytesRead << std::endl;
    return rejectHeader(header, tableHeaders, inFile);
  }

  int actualSize = 0;
  if (sourceIsFile) {
    // Confirm file size is correct.
    fseek(inFile, 0, SEEK_END);
    actualSize = static_cast<int>(ftell(inFile));
  } else {
    actualSize = length;
  }

  if (actualSize != header->totalFileSizeBytes) {
    std::cerr << "File size mismatch! Expected " << header->totalFileSizeBytes << ", but found " << actualSize << std::endl;
    return rejectHeader(header, tableHeaders, inFile);
  }

  if (sourceIsFile) {
    fclose(inFile);
  }
  return header;
}
|
||||
|
||||
void unloadData(CLD2::ScoringTables** scoringTables, void** mmapAddress, int* mmapLength) {
|
||||
// Releases data obtained from loadDataFile(): frees the scoring tables via
// unloadDataRaw() and then unmaps the region described by *mmapAddress and
// *mmapLength. On return *mmapAddress is NULL and *mmapLength is 0.
void unloadDataFile(CLD2::ScoringTables** scoringTables,
                    void** mmapAddress, int* mmapLength) {
  // Free the heap-side structures before the mapping goes away.
  CLD2DynamicDataLoader::unloadDataRaw(scoringTables);

  void* const mappedRegion = *mmapAddress;
  const int mappedBytes = *mmapLength;
  munmap(mappedRegion, mappedBytes);

  // Clear the caller's bookkeeping so the stale mapping cannot be reused.
  *mmapLength = 0;
  *mmapAddress = NULL;
}
|
||||
|
||||
// Frees the heap objects behind *scoringTables and sets *scoringTables to
// NULL. The memory that the tables point into is not touched here; unmapping
// belongs to unloadDataFile() (or to whoever owns an externally managed
// region). Does nothing when there is nothing to unload.
void unloadDataRaw(CLD2::ScoringTables** scoringTables) {
  if (scoringTables == NULL || *scoringTables == NULL) {
    return;
  }
  // unigram_obj is allocated with malloc() by the loader, so it must be
  // released with free(), not delete.
  free(const_cast<CLD2::UTF8PropObj*>((*scoringTables)->unigram_obj));
  (*scoringTables)->unigram_obj = NULL;
  // tableSummaries[0] from loadDataFile. That array is created with new[], so
  // it is released with delete[] (assumes unigram_compat_obj is the array's
  // first element, as the note above states - TODO confirm).
  delete[] (*scoringTables)->unigram_compat_obj;
  (*scoringTables)->unigram_compat_obj = NULL;
  delete(*scoringTables);
  *scoringTables = NULL;
}
|
||||
|
||||
CLD2::ScoringTables* loadDataFile(const char* fileName, void** mmapAddressOut, int* mmapLengthOut) {
|
||||
CLD2DynamicData::FileHeader* fileHeader = loadHeader(fileName);
|
||||
if (fileHeader == NULL) {
|
||||
// Loads scoring tables from "fileName" through a private, read-only mmap.
// On success returns the newly built tables and stores the mapping's address
// and length in *mmapAddressOut / *mmapLengthOut so the caller can later
// release everything with unloadDataFile(). Returns NULL if the header is
// invalid (out-parameters untouched) or if the file cannot be opened or mapped
// (out-parameters set to NULL / 0).
CLD2::ScoringTables* loadDataFile(const char* fileName,
                                  void** mmapAddressOut, int* mmapLengthOut) {
  CLD2DynamicData::FileHeader* header = loadHeaderFromFile(fileName);
  if (header == NULL) {
    return NULL;
  }

  // Initialize the memory map
  void* mapped = MAP_FAILED;
  const int inFileHandle = open(fileName, O_RDONLY);
  if (inFileHandle >= 0) {
    mapped = mmap(NULL, header->totalFileSizeBytes,
                  PROT_READ, MAP_PRIVATE, inFileHandle, 0);
    // The mapping stays valid after the descriptor is closed.
    close(inFileHandle);
  }
  if (mapped == MAP_FAILED) {
    std::cerr << "Unable to mmap data file: " << fileName << std::endl;
    // loadDataInternal() would normally release the header; do it here since
    // it is never reached.
    delete[] header->tableHeaders;
    delete header;
    *mmapAddressOut = NULL;
    *mmapLengthOut = 0;
    return NULL;
  }

  // Record the map address. This allows callers to unmap
  *mmapAddressOut = mapped;
  *mmapLengthOut = header->totalFileSizeBytes;

  return loadDataInternal(header, mapped, header->totalFileSizeBytes);
}
|
||||
|
||||
// Loads scoring tables from "length" bytes of data-file content located at
// "basePointer". The memory remains owned by the caller and must stay valid
// while the returned tables are in use. Returns NULL when the header found at
// basePointer is rejected by loadHeaderFromRaw().
CLD2::ScoringTables* loadDataRaw(const void* basePointer, const int length) {
  CLD2DynamicData::FileHeader* header = loadHeaderFromRaw(basePointer, length);
  if (header == NULL) {
    // loadDataInternal() dereferences the header, so stop here.
    return NULL;
  }
  return loadDataInternal(header, basePointer, length);
}
|
||||
|
||||
CLD2::ScoringTables* loadDataInternal(CLD2DynamicData::FileHeader* header, const void* basePointer, const int length) {
|
||||
// 1. UTF8 Object
|
||||
const CLD2::uint8* state_table = static_cast<const CLD2::uint8*>(mapped) +
|
||||
fileHeader->startOf_utf8PropObj_state_table;
|
||||
const CLD2::uint8* state_table = static_cast<const CLD2::uint8*>(basePointer) +
|
||||
header->startOf_utf8PropObj_state_table;
|
||||
// FIXME: Unsafe to rely on this since RemapEntry is not a bit-packed structure
|
||||
const CLD2::RemapEntry* remap_base =
|
||||
reinterpret_cast<const CLD2::RemapEntry*>(
|
||||
static_cast<const CLD2::uint8*>(mapped) +
|
||||
fileHeader->startOf_utf8PropObj_remap_base);
|
||||
const CLD2::uint8* remap_string = static_cast<const CLD2::uint8*>(mapped) +
|
||||
fileHeader->startOf_utf8PropObj_remap_string;
|
||||
static_cast<const CLD2::uint8*>(basePointer) +
|
||||
header->startOf_utf8PropObj_remap_base);
|
||||
const CLD2::uint8* remap_string = static_cast<const CLD2::uint8*>(basePointer) +
|
||||
header->startOf_utf8PropObj_remap_string;
|
||||
const CLD2::uint8* fast_state =
|
||||
fileHeader->startOf_utf8PropObj_fast_state == 0 ? 0 :
|
||||
static_cast<const CLD2::uint8*>(mapped) +
|
||||
fileHeader->startOf_utf8PropObj_fast_state;
|
||||
header->startOf_utf8PropObj_fast_state == 0 ? 0 :
|
||||
static_cast<const CLD2::uint8*>(basePointer) +
|
||||
header->startOf_utf8PropObj_fast_state;
|
||||
|
||||
// Populate intermediate object. Horrible casting required because the struct
|
||||
// is all read-only integers, and doesn't have a constructor. Yikes.
|
||||
// TODO: It might actually be less horrible to memcpy the data in <shudder>
|
||||
const CLD2::UTF8PropObj* unigram_obj = reinterpret_cast<CLD2::UTF8PropObj*>(malloc(sizeof(CLD2::UTF8PropObj)));
|
||||
*const_cast<CLD2::uint32*>(&unigram_obj->state0) = fileHeader->utf8PropObj_state0;
|
||||
*const_cast<CLD2::uint32*>(&unigram_obj->state0_size) = fileHeader->utf8PropObj_state0_size;
|
||||
*const_cast<CLD2::uint32*>(&unigram_obj->total_size) = fileHeader->utf8PropObj_total_size;
|
||||
*const_cast<int*>(&unigram_obj->max_expand) = fileHeader->utf8PropObj_max_expand;
|
||||
*const_cast<int*>(&unigram_obj->entry_shift) = fileHeader->utf8PropObj_entry_shift;
|
||||
*const_cast<int*>(&unigram_obj->bytes_per_entry) = fileHeader->utf8PropObj_bytes_per_entry;
|
||||
*const_cast<CLD2::uint32*>(&unigram_obj->losub) = fileHeader->utf8PropObj_losub;
|
||||
*const_cast<CLD2::uint32*>(&unigram_obj->hiadd) = fileHeader->utf8PropObj_hiadd;
|
||||
*const_cast<CLD2::uint32*>(&unigram_obj->state0) = header->utf8PropObj_state0;
|
||||
*const_cast<CLD2::uint32*>(&unigram_obj->state0_size) = header->utf8PropObj_state0_size;
|
||||
*const_cast<CLD2::uint32*>(&unigram_obj->total_size) = header->utf8PropObj_total_size;
|
||||
*const_cast<int*>(&unigram_obj->max_expand) = header->utf8PropObj_max_expand;
|
||||
*const_cast<int*>(&unigram_obj->entry_shift) = header->utf8PropObj_entry_shift;
|
||||
*const_cast<int*>(&unigram_obj->bytes_per_entry) = header->utf8PropObj_bytes_per_entry;
|
||||
*const_cast<CLD2::uint32*>(&unigram_obj->losub) = header->utf8PropObj_losub;
|
||||
*const_cast<CLD2::uint32*>(&unigram_obj->hiadd) = header->utf8PropObj_hiadd;
|
||||
*const_cast<const CLD2::uint8**>(&unigram_obj->state_table) = state_table;
|
||||
*const_cast<const CLD2::RemapEntry**>(&unigram_obj->remap_base) = remap_base;
|
||||
*const_cast<const CLD2::uint8**>(&unigram_obj->remap_string) = remap_string;
|
||||
@@ -168,22 +215,22 @@ CLD2::ScoringTables* loadDataFile(const char* fileName, void** mmapAddressOut, i
|
||||
|
||||
// 2. kAvgDeltaOctaScore array
|
||||
const short* read_kAvgDeltaOctaScore = reinterpret_cast<const short*>(
|
||||
static_cast<const CLD2::uint8*>(mapped) +
|
||||
fileHeader->startOf_kAvgDeltaOctaScore);
|
||||
static_cast<const CLD2::uint8*>(basePointer) +
|
||||
header->startOf_kAvgDeltaOctaScore);
|
||||
|
||||
// 3. Each table
|
||||
CLD2::CLD2TableSummary* tableSummaries = new CLD2::CLD2TableSummary[fileHeader->numTablesEncoded];
|
||||
for (int x=0; x<fileHeader->numTablesEncoded; x++) {
|
||||
CLD2::CLD2TableSummary* tableSummaries = new CLD2::CLD2TableSummary[header->numTablesEncoded];
|
||||
for (int x=0; x<header->numTablesEncoded; x++) {
|
||||
CLD2::CLD2TableSummary &summary = tableSummaries[x];
|
||||
CLD2DynamicData::TableHeader& tHeader = fileHeader->tableHeaders[x];
|
||||
CLD2DynamicData::TableHeader& tHeader = header->tableHeaders[x];
|
||||
const CLD2::IndirectProbBucket4* kCLDTable =
|
||||
reinterpret_cast<const CLD2::IndirectProbBucket4*>(
|
||||
static_cast<CLD2::uint8*>(mapped) + tHeader.startOf_kCLDTable);
|
||||
static_cast<const CLD2::uint8*>(basePointer) + tHeader.startOf_kCLDTable);
|
||||
const CLD2::uint32* kCLDTableInd =
|
||||
reinterpret_cast<const CLD2::uint32*>(
|
||||
static_cast<CLD2::uint8*>(mapped) + tHeader.startOf_kCLDTableInd);
|
||||
static_cast<const CLD2::uint8*>(basePointer) + tHeader.startOf_kCLDTableInd);
|
||||
const char* kRecognizedLangScripts =
|
||||
static_cast<const char*>(mapped) + tHeader.startOf_kRecognizedLangScripts;
|
||||
static_cast<const char*>(basePointer) + tHeader.startOf_kRecognizedLangScripts;
|
||||
|
||||
summary.kCLDTable = kCLDTable;
|
||||
summary.kCLDTableInd = kCLDTableInd;
|
||||
@@ -205,8 +252,9 @@ CLD2::ScoringTables* loadDataFile(const char* fileName, void** mmapAddressOut, i
|
||||
result->deltaocta_obj = &tableSummaries[5];
|
||||
result->distinctocta_obj = &tableSummaries[6];
|
||||
result->kExpectedScore = read_kAvgDeltaOctaScore;
|
||||
delete fileHeader->tableHeaders;
|
||||
delete fileHeader;
|
||||
delete header->tableHeaders;
|
||||
delete header;
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace CLD2DynamicDataLoader
|
||||
|
@@ -23,7 +23,17 @@ namespace CLD2DynamicDataLoader {
|
||||
// Read a header from the specified file and return it.
|
||||
// The header returned is dynamically allocated; you must 'delete' the array
|
||||
// of TableHeaders as well as the returned FileHeader* when done.
|
||||
CLD2DynamicData::FileHeader* loadHeader(const char* fileName);
|
||||
CLD2DynamicData::FileHeader* loadHeaderFromFile(const char* fileName);
|
||||
|
||||
// Read a header from the specified area of raw memory and return it.
|
||||
// The header returned is dynamically allocated; you must 'delete' the array
|
||||
// of TableHeaders as well as the returned FileHeader* when done.
|
||||
CLD2DynamicData::FileHeader* loadHeaderFromRaw(
|
||||
const void* basePointer, const int length);
|
||||
|
||||
// Not for public consumption.
|
||||
CLD2DynamicData::FileHeader* loadInternal(
|
||||
FILE* inFile, const void* basePointer, const int length);
|
||||
|
||||
// Load data directly into a ScoringTables structure using a private, read-only
|
||||
// mmap and return the newly-allocated structure.
|
||||
@@ -31,10 +41,20 @@ CLD2DynamicData::FileHeader* loadHeader(const char* fileName);
|
||||
// address of the mmap()'d block will be written here.
|
||||
// The out-parameter "mmapLengthOut" is a pointer to an int; the length of the
|
||||
// mmap()'d block will be written here.
|
||||
// It is up to the caller to delete
|
||||
// It is up to the caller to delete the data at a later time using
|
||||
// unloadDataFile(...).
|
||||
CLD2::ScoringTables* loadDataFile(const char* fileName,
|
||||
void** mmapAddressOut, int* mmapLengthOut);
|
||||
|
||||
// Load data directly into a ScoringTables structure from an arbitrary region
|
||||
// of memory, which is assumed to be a pointer to an mmap-ed region of memory
|
||||
// backed by a valid data file that could alternatively be read (if access
|
||||
// were allowed or desired) using loadDataFile(...).
|
||||
CLD2::ScoringTables* loadDataRaw(const void* basePointer, const int length);
|
||||
|
||||
// Not for public consumption.
|
||||
CLD2::ScoringTables* loadDataInternal(CLD2DynamicData::FileHeader* header, const void* basePointer, const int length);
|
||||
|
||||
// Given pointers to the data from a previous invocation of loadDataFile,
|
||||
// unloads the data safely - freeing and deleting any malloc'd/new'd objects.
|
||||
// When this method returns, the mmap has been deleted, as have all the scoring
|
||||
@@ -45,8 +65,14 @@ CLD2::ScoringTables* loadDataFile(const char* fileName,
|
||||
// This is the only safe way to unload data that was previously loaded, as there
|
||||
// is an unfortunate mixture of new and malloc involved in building the
|
||||
// in-memory representation of the data.
|
||||
void unloadData(CLD2::ScoringTables** scoringTables,
|
||||
void unloadDataFile(CLD2::ScoringTables** scoringTables,
|
||||
void** mmapAddress, int* mmapLength);
|
||||
|
||||
// Given a pointer to the data from a previous invocation of loadDataRaw,
|
||||
// unloads the data safely just like unloadDataFile does. This method doesn't
|
||||
// deal with mmaps, since it is assumed that the memory for the data is managed
|
||||
// external to this library.
|
||||
void unloadDataRaw(CLD2::ScoringTables** scoringTables);
|
||||
|
||||
} // End namespace CLD2DynamicDataLoader
|
||||
#endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_EXTRACTOR_H_
|
||||
|
@@ -128,7 +128,7 @@ Usage:\n\
|
||||
static_cast<const CLD2::ScoringTables*>(&realData),
|
||||
fileName);
|
||||
} else if (mode == 3) { // head
|
||||
CLD2DynamicData::FileHeader* header = CLD2DynamicDataLoader::loadHeader(fileName);
|
||||
CLD2DynamicData::FileHeader* header = CLD2DynamicDataLoader::loadHeaderFromFile(fileName);
|
||||
if (header == NULL) {
|
||||
std::cerr << "Cannot read header from file: " << fileName << std::endl;
|
||||
return -1;
|
||||
@@ -150,7 +150,7 @@ Usage:\n\
|
||||
bool result = CLD2DynamicData::verify(
|
||||
static_cast<const CLD2::ScoringTables*>(&realData),
|
||||
static_cast<const CLD2::ScoringTables*>(loadedData));
|
||||
CLD2DynamicDataLoader::unloadData(&loadedData, &mmapAddress, &mmapLength);
|
||||
CLD2DynamicDataLoader::unloadDataFile(&loadedData, &mmapAddress, &mmapLength);
|
||||
if (loadedData != NULL || mmapAddress != NULL || mmapLength != 0) {
|
||||
std::cerr << "Warning: failed to clean up memory for ScoringTables." << std::endl;
|
||||
}
|
||||
|
@@ -22,6 +22,9 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <fcntl.h>
|
||||
#include <fstream>
|
||||
#include <sys/mman.h>
|
||||
|
||||
#include "../public/compact_lang_det.h"
|
||||
#include "../public/encodings.h"
|
||||
@@ -265,19 +268,19 @@ int RunTests (int flags, bool get_vector) {
|
||||
fprintf(stdout, "[DYNAMIC] Test running in dynamic data mode!\n");
|
||||
bool dataLoaded = CLD2::isDataLoaded();
|
||||
if (dataLoaded) {
|
||||
fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true prior to loading data!\n");
|
||||
fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true prior to loading data from file!\n");
|
||||
any_fail = true;
|
||||
}
|
||||
fprintf(stdout, "[DYNAMIC] Attempting translation prior to loading data\n");
|
||||
any_fail |= !OneTest(flags, get_vector, UNKNOWN_LANGUAGE, kTeststr_en, strlen(kTeststr_en));
|
||||
fprintf(stdout, "[DYNAMIC] Loading data from: %s\n", data_file);
|
||||
CLD2::loadData(data_file);
|
||||
CLD2::loadDataFromFile(data_file);
|
||||
dataLoaded = CLD2::isDataLoaded();
|
||||
if (!dataLoaded) {
|
||||
fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned false after loading data!\n");
|
||||
fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned false after loading data from file!\n");
|
||||
any_fail = true;
|
||||
}
|
||||
fprintf(stdout, "[DYNAMIC] Data loaded, normal tests commencing\n");
|
||||
fprintf(stdout, "[DYNAMIC] Data loaded, file-based tests commencing\n");
|
||||
#endif
|
||||
|
||||
int i = 0;
|
||||
@@ -291,15 +294,60 @@ int RunTests (int flags, bool get_vector) {
|
||||
}
|
||||
|
||||
#ifdef CLD2_DYNAMIC_MODE
|
||||
fprintf(stdout, "[DYNAMIC] Normal tests complete, attempting to unload data\n");
|
||||
fprintf(stdout, "[DYNAMIC] File-based tests complete, attempting to unload file data\n");
|
||||
CLD2::unloadData();
|
||||
dataLoaded = CLD2::isDataLoaded();
|
||||
if (dataLoaded) {
|
||||
fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true after unloading data!\n");
|
||||
fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true after unloading file data!\n");
|
||||
any_fail = true;
|
||||
}
|
||||
fprintf(stdout, "[DYNAMIC] Attempting translation after unloading data\n");
|
||||
any_fail |= !OneTest(flags, get_vector, UNKNOWN_LANGUAGE, kTeststr_en, strlen(kTeststr_en));
|
||||
|
||||
// Now, run the whole thing again, but this time mmap the file's contents
|
||||
// and hit the external-mmap code.
|
||||
fprintf(stdout, "[DYNAMIC] mmaping data for external-mmap test.\n");
|
||||
FILE* inFile = fopen(data_file, "r");
|
||||
fseek(inFile, 0, SEEK_END);
|
||||
const int actualSize = ftell(inFile);
|
||||
fclose(inFile);
|
||||
|
||||
int inFileHandle = open(data_file, O_RDONLY);
|
||||
void* mapped = mmap(NULL, actualSize,
|
||||
PROT_READ, MAP_PRIVATE, inFileHandle, 0);
|
||||
close(inFileHandle);
|
||||
|
||||
fprintf(stdout, "[DYNAMIC] mmap'ed successfully, attempting data load.\n");
|
||||
CLD2::loadDataFromRawAddress(mapped, actualSize);
|
||||
dataLoaded = CLD2::isDataLoaded();
|
||||
if (!dataLoaded) {
|
||||
fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned false after loading data from mmap!\n");
|
||||
any_fail = true;
|
||||
}
|
||||
|
||||
// Reset and run the tests again
|
||||
fprintf(stdout, "[DYNAMIC] Data loaded, mmap-based tests commencing\n");
|
||||
i = 0;
|
||||
while (kTestPair[i].text != NULL) {
|
||||
Language lang_expected = kTestPair[i].lang;
|
||||
const char* buffer = kTestPair[i].text;
|
||||
int buffer_length = strlen(buffer);
|
||||
bool ok = OneTest(flags, get_vector, lang_expected, buffer, buffer_length);
|
||||
any_fail |= (!ok);
|
||||
++i;
|
||||
}
|
||||
|
||||
fprintf(stdout, "[DYNAMIC] Mmap-based tests complete, attempting to unload data\n");
|
||||
CLD2::unloadData();
|
||||
dataLoaded = CLD2::isDataLoaded();
|
||||
if (dataLoaded) {
|
||||
fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true after unloading mmap data!\n");
|
||||
any_fail = true;
|
||||
}
|
||||
fprintf(stdout, "[DYNAMIC] Attempting translation after unloading map data\n");
|
||||
any_fail |= !OneTest(flags, get_vector, UNKNOWN_LANGUAGE, kTeststr_en, strlen(kTeststr_en));
|
||||
|
||||
fprintf(stdout, "[DYNAMIC] All dynamic-mode tests complete\n");
|
||||
#endif
|
||||
|
||||
if (any_fail) {
|
||||
|
@@ -70,7 +70,9 @@ extern const short kAvgDeltaOctaScore[];
|
||||
#ifdef CLD2_DYNAMIC_MODE
|
||||
// CLD2_DYNAMIC_MODE is defined:
|
||||
// Data will be read from an mmap opened at runtime.
|
||||
static ScoringTables kScoringtables = {
|
||||
|
||||
// Convenience for nulling things out completely at any point.
|
||||
static ScoringTables NULL_TABLES = {
|
||||
NULL, //&cld_generated_CjkUni_obj,
|
||||
NULL, //&kCjkCompat_obj,
|
||||
NULL, //&kCjkDeltaBi_obj,
|
||||
@@ -81,27 +83,45 @@ extern const short kAvgDeltaOctaScore[];
|
||||
NULL, //&kDistinctOcta_obj,
|
||||
NULL, //kAvgDeltaOctaScore,
|
||||
};
|
||||
static ScoringTables kScoringtables = NULL_TABLES; // copy constructed
|
||||
static bool dynamicDataLoaded = false;
|
||||
static bool dataSourceIsFile = false;
|
||||
static ScoringTables* dynamicTables = NULL;
|
||||
static void* mmapAddress = NULL;
|
||||
static int mmapLength = 0;
|
||||
|
||||
bool isDataLoaded() { return dynamicDataLoaded; }
|
||||
|
||||
void loadData(const char* fileName) {
|
||||
void loadDataFromFile(const char* fileName) {
|
||||
if (isDataLoaded()) {
|
||||
unloadData();
|
||||
}
|
||||
dynamicTables = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength);
|
||||
kScoringtables = *dynamicTables;
|
||||
dataSourceIsFile = true;
|
||||
dynamicDataLoaded = true;
|
||||
};
|
||||
|
||||
void loadDataFromRawAddress(const void* rawAddress, const int length) {
|
||||
if (isDataLoaded()) {
|
||||
unloadData();
|
||||
}
|
||||
dynamicTables = CLD2DynamicDataLoader::loadDataRaw(rawAddress, length);
|
||||
kScoringtables = *dynamicTables;
|
||||
dataSourceIsFile = false;
|
||||
dynamicDataLoaded = true;
|
||||
}
|
||||
|
||||
// Unloads whatever loadDataFromFile() or loadDataFromRawAddress() installed.
// Data that came from a file also has its mmap released; data that came from a
// raw address leaves the caller's memory alone. Does nothing if no data is
// loaded.
void unloadData() {
  if (!dynamicDataLoaded) return;
  // Release exactly once, through the routine that matches how the data was
  // loaded; unloading will null all the pointers out.
  if (dataSourceIsFile) {
    CLD2DynamicDataLoader::unloadDataFile(&dynamicTables, &mmapAddress, &mmapLength);
  } else {
    CLD2DynamicDataLoader::unloadDataRaw(&dynamicTables);
  }
  dynamicDataLoaded = false;
  dataSourceIsFile = false; // vacuous
  kScoringtables = NULL_TABLES; // Housekeeping: null all pointers
}
|
||||
#else
|
||||
// This initializes kScoringtables.quadgram_obj etc.
|
||||
|
@@ -245,7 +245,7 @@ int main(int argc, char** argv) {
|
||||
return -1;
|
||||
}
|
||||
fprintf(stdout, "Loading data from: %s\n", data_file);
|
||||
CLD2::loadData(data_file);
|
||||
CLD2::loadDataFromFile(data_file);
|
||||
fprintf(stdout, "Data loaded, test commencing\n");
|
||||
#endif
|
||||
|
||||
|
@@ -300,17 +300,35 @@ void DumpResultChunkVector(FILE* f, const char* src,
|
||||
// If compiled with dynamic mode, load data from the specified file location.
|
||||
// If other data has already been loaded, it is discarded and the data is read
|
||||
// in from the specified file location again (even if the file has not changed).
|
||||
// WARNING: Before calling this method, language detection will always fail
|
||||
// and will always return the unknown language.
|
||||
void loadData(const char* fileName);
|
||||
// If data needs to be loaded in a context where direct access to the file
|
||||
// system is either undesirable or impossible, use loadDataFromRawAddress
|
||||
// instead to read the data from an arbitrary region in memory (such as a
|
||||
// mmap-ed file).
|
||||
// WARNING: Before calling one of the provided "loadData" methods, language
|
||||
// detection will always fail and will always return the unknown language.
|
||||
void loadDataFromFile(const char* fileName);
|
||||
|
||||
// If compiled with dynamic mode, unload the previously-loaded data.
|
||||
// If compiled with dynamic mode, load data from the specified location in
|
||||
// memory.
|
||||
// This method is provided as an alternative to loadDataFromFile() for use cases
|
||||
// where the loading process may not have direct access to the file system,
|
||||
// e.g., where the direct process knows the pointer to an mmap region in system
|
||||
// memory where the data file's contents have been loaded.
|
||||
// If other data has already been loaded, it is discarded and the data is read
|
||||
// in from the specified location again (even if it has not changed).
|
||||
// WARNING: Before calling one of the provided "loadData" methods, language
|
||||
// detection will always fail and will always return the unknown language.
|
||||
void loadDataFromRawAddress(const void* rawAddress, const int length);
|
||||
|
||||
// If compiled with dynamic mode, unload the data that was previously loaded
|
||||
// via loadDataFromFile() or loadDataFromRawAddress().
|
||||
// WARNING: After calling this method, language detection will no longer work
|
||||
// and will always return the unknown language.
|
||||
void unloadData();
|
||||
|
||||
// Returns true if and only if data has been loaded via a call to loadData(...)
|
||||
// and has not been subsequently unladed via a call to unloadDate().
|
||||
// Returns true if and only if data has been loaded via a call to
|
||||
// loadDataFromFile(...) or loadDataFromRawAddress(...) and has not been
|
||||
// subsequently unloaded via a call to unloadData().
|
||||
bool isDataLoaded();
|
||||
|
||||
#endif // #ifdef CLD2_DYNAMIC_MODE
|
||||
|
Reference in New Issue
Block a user