From a57e2d13c31bfffa3c6918997c856532d9c8f41a Mon Sep 17 00:00:00 2001 From: "andrewhayden@google.com" Date: Tue, 4 Mar 2014 10:21:09 +0000 Subject: [PATCH] Allow externally-managed mmaps to be used for loading data. For more info: https://code.google.com/p/cld2/issues/detail?id=7 git-svn-id: https://cld2.googlecode.com/svn/trunk@153 b252ecd4-b096-bf77-eb8e-91563289f87e --- internal/cld2_dynamic_data_loader.cc | 222 ++++++++++++++++----------- internal/cld2_dynamic_data_loader.h | 32 +++- internal/cld2_dynamic_data_tool.cc | 4 +- internal/cld2_unittest.cc | 60 +++++++- internal/compact_lang_det_impl.cc | 28 +++- internal/compact_lang_det_test.cc | 2 +- public/compact_lang_det.h | 30 +++- 7 files changed, 269 insertions(+), 109 deletions(-) diff --git a/internal/cld2_dynamic_data_loader.cc b/internal/cld2_dynamic_data_loader.cc index a6087c5..beb9395 100644 --- a/internal/cld2_dynamic_data_loader.cc +++ b/internal/cld2_dynamic_data_loader.cc @@ -31,136 +31,183 @@ namespace CLD2DynamicDataLoader { static int DEBUG=0; -CLD2DynamicData::FileHeader* loadHeader(const char* fileName) { - // TODO: force null-terminate char* strings for safety +CLD2DynamicData::FileHeader* loadHeaderFromFile(const char* fileName) { FILE* inFile = fopen(fileName, "r"); if (inFile == NULL) { return NULL; } + return loadInternal(inFile, NULL, -1); +} +CLD2DynamicData::FileHeader* loadHeaderFromRaw(const void* basePointer, + const int length) { + return loadInternal(NULL, basePointer, length); +} + + +#define CLD2_READINT(field) \ + if (sourceIsFile) {\ + bytesRead += 4 * fread(&(header->field), 4, 1, inFile);\ + } else {\ + memcpy(&(header->field), (((char*)(basePointer)) + bytesRead), 4);\ + bytesRead += 4;\ + } +CLD2DynamicData::FileHeader* loadInternal(FILE* inFile, const void* basePointer, const int length) { + const bool sourceIsFile = (inFile != NULL); int bytesRead = 0; - CLD2DynamicData::FileHeader* fileHeader = new CLD2DynamicData::FileHeader; - bytesRead += fread(fileHeader->sanityString, 1, CLD2DynamicData::DATA_FILE_MARKER_LENGTH, inFile); - if (!CLD2DynamicData::mem_compare(fileHeader->sanityString, CLD2DynamicData::DATA_FILE_MARKER, CLD2DynamicData::DATA_FILE_MARKER_LENGTH)) { + CLD2DynamicData::FileHeader* header = new CLD2DynamicData::FileHeader; + + // TODO: force null-terminate char* strings for safety + if (sourceIsFile) { + bytesRead += fread(header->sanityString, 1, CLD2DynamicData::DATA_FILE_MARKER_LENGTH, inFile); + } else { + memcpy(header->sanityString, basePointer, CLD2DynamicData::DATA_FILE_MARKER_LENGTH); + bytesRead += CLD2DynamicData::DATA_FILE_MARKER_LENGTH; + } + + if (!CLD2DynamicData::mem_compare( + header->sanityString, + CLD2DynamicData::DATA_FILE_MARKER, + CLD2DynamicData::DATA_FILE_MARKER_LENGTH)) { std::cerr << "Malformed header: bad file marker!" << std::endl; - delete fileHeader; + delete header; return NULL; } - bytesRead += 4 * fread(&(fileHeader->totalFileSizeBytes), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->utf8PropObj_state0), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->utf8PropObj_state0_size), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->utf8PropObj_total_size), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->utf8PropObj_max_expand), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->utf8PropObj_entry_shift), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->utf8PropObj_bytes_per_entry), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->utf8PropObj_losub), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->utf8PropObj_hiadd), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->startOf_utf8PropObj_state_table), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->lengthOf_utf8PropObj_state_table), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->startOf_utf8PropObj_remap_base), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->lengthOf_utf8PropObj_remap_base), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->startOf_utf8PropObj_remap_string), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->lengthOf_utf8PropObj_remap_string), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->startOf_utf8PropObj_fast_state), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->lengthOf_utf8PropObj_fast_state), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->startOf_kAvgDeltaOctaScore), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->lengthOf_kAvgDeltaOctaScore), 4, 1, inFile); - bytesRead += 4 * fread(&(fileHeader->numTablesEncoded), 4, 1, inFile); + CLD2_READINT(totalFileSizeBytes); + CLD2_READINT(utf8PropObj_state0); + CLD2_READINT(utf8PropObj_state0_size); + CLD2_READINT(utf8PropObj_total_size); + CLD2_READINT(utf8PropObj_max_expand); + CLD2_READINT(utf8PropObj_entry_shift); + CLD2_READINT(utf8PropObj_bytes_per_entry); + CLD2_READINT(utf8PropObj_losub); + CLD2_READINT(utf8PropObj_hiadd); + CLD2_READINT(startOf_utf8PropObj_state_table); + CLD2_READINT(lengthOf_utf8PropObj_state_table); + CLD2_READINT(startOf_utf8PropObj_remap_base); + CLD2_READINT(lengthOf_utf8PropObj_remap_base); + CLD2_READINT(startOf_utf8PropObj_remap_string); + CLD2_READINT(lengthOf_utf8PropObj_remap_string); + CLD2_READINT(startOf_utf8PropObj_fast_state); + CLD2_READINT(lengthOf_utf8PropObj_fast_state); + CLD2_READINT(startOf_kAvgDeltaOctaScore); + CLD2_READINT(lengthOf_kAvgDeltaOctaScore); + CLD2_READINT(numTablesEncoded); - CLD2DynamicData::TableHeader* tableHeaders = new CLD2DynamicData::TableHeader[fileHeader->numTablesEncoded]; - fileHeader->tableHeaders = tableHeaders; - for (int x=0; xnumTablesEncoded; x++) { - CLD2DynamicData::TableHeader &tHeader = fileHeader->tableHeaders[x]; - bytesRead += 4 * fread(&(tHeader.kCLDTableSizeOne), 4, 1, inFile); - bytesRead += 4 * fread(&(tHeader.kCLDTableSize), 4, 1, inFile); - bytesRead += 4 * fread(&(tHeader.kCLDTableKeyMask), 4, 1, inFile); - bytesRead += 4 * fread(&(tHeader.kCLDTableBuildDate), 4, 1, inFile); - bytesRead += 4 * fread(&(tHeader.startOf_kCLDTable), 4, 1, inFile); - bytesRead += 4 * fread(&(tHeader.lengthOf_kCLDTable), 4, 1, inFile); - bytesRead += 4 * fread(&(tHeader.startOf_kCLDTableInd), 4, 1, inFile); - bytesRead += 4 * fread(&(tHeader.lengthOf_kCLDTableInd), 4, 1, inFile); - bytesRead += 4 * fread(&(tHeader.startOf_kRecognizedLangScripts), 4, 1, inFile); - bytesRead += 4 * fread(&(tHeader.lengthOf_kRecognizedLangScripts), 4, 1, inFile); + CLD2DynamicData::TableHeader* tableHeaders = new CLD2DynamicData::TableHeader[header->numTablesEncoded]; + header->tableHeaders = tableHeaders; + for (int x=0; xnumTablesEncoded; x++) { + CLD2DynamicData::TableHeader *header = &(tableHeaders[x]); + CLD2_READINT(kCLDTableSizeOne); + CLD2_READINT(kCLDTableSize); + CLD2_READINT(kCLDTableKeyMask); + CLD2_READINT(kCLDTableBuildDate); + CLD2_READINT(startOf_kCLDTable); + CLD2_READINT(lengthOf_kCLDTable); + CLD2_READINT(startOf_kCLDTableInd); + CLD2_READINT(lengthOf_kCLDTableInd); + CLD2_READINT(startOf_kRecognizedLangScripts); + CLD2_READINT(lengthOf_kRecognizedLangScripts); } // Confirm header size is correct. - int expectedHeaderSize = CLD2DynamicData::calculateHeaderSize(fileHeader->numTablesEncoded); + int expectedHeaderSize = CLD2DynamicData::calculateHeaderSize(header->numTablesEncoded); if (expectedHeaderSize != bytesRead) { std::cerr << "Header size mismatch! Expected " << expectedHeaderSize << ", but read " << bytesRead << std::endl; - delete fileHeader; + delete header; delete tableHeaders; return NULL; } - // Confirm file size is correct. - fseek(inFile, 0, SEEK_END); - int actualSize = ftell(inFile); - fclose(inFile); + int actualSize = 0; + if (sourceIsFile) { + // Confirm file size is correct. + fseek(inFile, 0, SEEK_END); + actualSize = ftell(inFile); + fclose(inFile); + } else { + actualSize = length; + } - if (actualSize != fileHeader->totalFileSizeBytes) { - std::cerr << "File size mismatch! Expected " << fileHeader->totalFileSizeBytes << ", but found " << actualSize << std::endl; - delete fileHeader; + if (actualSize != header->totalFileSizeBytes) { + std::cerr << "File size mismatch! Expected " << header->totalFileSizeBytes << ", but found " << actualSize << std::endl; + delete header; delete tableHeaders; return NULL; } - return fileHeader; + return header; } -void unloadData(CLD2::ScoringTables** scoringTables, void** mmapAddress, int* mmapLength) { +void unloadDataFile(CLD2::ScoringTables** scoringTables, + void** mmapAddress, int* mmapLength) { + CLD2DynamicDataLoader::unloadDataRaw(scoringTables); + munmap(*mmapAddress, *mmapLength); + *mmapAddress = NULL; + *mmapLength = 0; +} + +void unloadDataRaw(CLD2::ScoringTables** scoringTables) { free(const_cast((*scoringTables)->unigram_obj)); (*scoringTables)->unigram_obj = NULL; delete((*scoringTables)->unigram_compat_obj); // tableSummaries[0] from loadDataFile (*scoringTables)->unigram_compat_obj = NULL; delete(*scoringTables); *scoringTables = NULL; - munmap(*mmapAddress, *mmapLength); - *mmapAddress = NULL; - *mmapLength = 0; } -CLD2::ScoringTables* loadDataFile(const char* fileName, void** mmapAddressOut, int* mmapLengthOut) { - CLD2DynamicData::FileHeader* fileHeader = loadHeader(fileName); - if (fileHeader == NULL) { +CLD2::ScoringTables* loadDataFile(const char* fileName, + void** mmapAddressOut, int* mmapLengthOut) { + CLD2DynamicData::FileHeader* header = loadHeaderFromFile(fileName); + if (header == NULL) { return NULL; } // Initialize the memory map int inFileHandle = open(fileName, O_RDONLY); - void* mapped = mmap(NULL, fileHeader->totalFileSizeBytes, + void* mapped = mmap(NULL, header->totalFileSizeBytes, PROT_READ, MAP_PRIVATE, inFileHandle, 0); // Record the map address. This allows callers to unmap *mmapAddressOut=mapped; - *mmapLengthOut=fileHeader->totalFileSizeBytes; + *mmapLengthOut=header->totalFileSizeBytes; close(inFileHandle); + return loadDataInternal(header, mapped, header->totalFileSizeBytes); +} + +CLD2::ScoringTables* loadDataRaw(const void* basePointer, const int length) { + CLD2DynamicData::FileHeader* header = loadHeaderFromRaw(basePointer, length); + return loadDataInternal(header, basePointer, length); +} + +CLD2::ScoringTables* loadDataInternal(CLD2DynamicData::FileHeader* header, const void* basePointer, const int length) { // 1. UTF8 Object - const CLD2::uint8* state_table = static_cast(mapped) + - fileHeader->startOf_utf8PropObj_state_table; + const CLD2::uint8* state_table = static_cast(basePointer) + + header->startOf_utf8PropObj_state_table; // FIXME: Unsafe to rely on this since RemapEntry is not a bit-packed structure const CLD2::RemapEntry* remap_base = reinterpret_cast( - static_cast(mapped) + - fileHeader->startOf_utf8PropObj_remap_base); - const CLD2::uint8* remap_string = static_cast(mapped) + - fileHeader->startOf_utf8PropObj_remap_string; + static_cast(basePointer) + + header->startOf_utf8PropObj_remap_base); + const CLD2::uint8* remap_string = static_cast(basePointer) + + header->startOf_utf8PropObj_remap_string; const CLD2::uint8* fast_state = - fileHeader->startOf_utf8PropObj_fast_state == 0 ? 0 : - static_cast(mapped) + - fileHeader->startOf_utf8PropObj_fast_state; + header->startOf_utf8PropObj_fast_state == 0 ? 0 : + static_cast(basePointer) + + header->startOf_utf8PropObj_fast_state; // Populate intermediate object. Horrible casting required because the struct // is all read-only integers, and doesn't have a constructor. Yikes. // TODO: It might actually be less horrible to memcpy the data in const CLD2::UTF8PropObj* unigram_obj = reinterpret_cast(malloc(sizeof(CLD2::UTF8PropObj))); - *const_cast(&unigram_obj->state0) = fileHeader->utf8PropObj_state0; - *const_cast(&unigram_obj->state0_size) = fileHeader->utf8PropObj_state0_size; - *const_cast(&unigram_obj->total_size) = fileHeader->utf8PropObj_total_size; - *const_cast(&unigram_obj->max_expand) = fileHeader->utf8PropObj_max_expand; - *const_cast(&unigram_obj->entry_shift) = fileHeader->utf8PropObj_entry_shift; - *const_cast(&unigram_obj->bytes_per_entry) = fileHeader->utf8PropObj_bytes_per_entry; - *const_cast(&unigram_obj->losub) = fileHeader->utf8PropObj_losub; - *const_cast(&unigram_obj->hiadd) = fileHeader->utf8PropObj_hiadd; + *const_cast(&unigram_obj->state0) = header->utf8PropObj_state0; + *const_cast(&unigram_obj->state0_size) = header->utf8PropObj_state0_size; + *const_cast(&unigram_obj->total_size) = header->utf8PropObj_total_size; + *const_cast(&unigram_obj->max_expand) = header->utf8PropObj_max_expand; + *const_cast(&unigram_obj->entry_shift) = header->utf8PropObj_entry_shift; + *const_cast(&unigram_obj->bytes_per_entry) = header->utf8PropObj_bytes_per_entry; + *const_cast(&unigram_obj->losub) = header->utf8PropObj_losub; + *const_cast(&unigram_obj->hiadd) = header->utf8PropObj_hiadd; *const_cast(&unigram_obj->state_table) = state_table; *const_cast(&unigram_obj->remap_base) = remap_base; *const_cast(&unigram_obj->remap_string) = remap_string; @@ -168,22 +215,22 @@ CLD2::ScoringTables* loadDataFile(const char* fileName, void** mmapAddressOut, i // 2. kAvgDeltaOctaScore array const short* read_kAvgDeltaOctaScore = reinterpret_cast( - static_cast(mapped) + - fileHeader->startOf_kAvgDeltaOctaScore); + static_cast(basePointer) + + header->startOf_kAvgDeltaOctaScore); // 3. Each table - CLD2::CLD2TableSummary* tableSummaries = new CLD2::CLD2TableSummary[fileHeader->numTablesEncoded]; - for (int x=0; xnumTablesEncoded; x++) { + CLD2::CLD2TableSummary* tableSummaries = new CLD2::CLD2TableSummary[header->numTablesEncoded]; + for (int x=0; xnumTablesEncoded; x++) { CLD2::CLD2TableSummary &summary = tableSummaries[x]; - CLD2DynamicData::TableHeader& tHeader = fileHeader->tableHeaders[x]; + CLD2DynamicData::TableHeader& tHeader = header->tableHeaders[x]; const CLD2::IndirectProbBucket4* kCLDTable = reinterpret_cast( - static_cast(mapped) + tHeader.startOf_kCLDTable); + static_cast(basePointer) + tHeader.startOf_kCLDTable); const CLD2::uint32* kCLDTableInd = reinterpret_cast( - static_cast(mapped) + tHeader.startOf_kCLDTableInd); + static_cast(basePointer) + tHeader.startOf_kCLDTableInd); const char* kRecognizedLangScripts = - static_cast(mapped) + tHeader.startOf_kRecognizedLangScripts; + static_cast(basePointer) + tHeader.startOf_kRecognizedLangScripts; summary.kCLDTable = kCLDTable; summary.kCLDTableInd = kCLDTableInd; @@ -205,8 +252,9 @@ CLD2::ScoringTables* loadDataFile(const char* fileName, void** mmapAddressOut, i result->deltaocta_obj = &tableSummaries[5]; result->distinctocta_obj = &tableSummaries[6]; result->kExpectedScore = read_kAvgDeltaOctaScore; - delete fileHeader->tableHeaders; - delete fileHeader; + delete header->tableHeaders; + delete header; return result; } -} + +} // namespace CLD2DynamicDataLoader diff --git a/internal/cld2_dynamic_data_loader.h b/internal/cld2_dynamic_data_loader.h index 9beba0f..7724e14 100644 --- a/internal/cld2_dynamic_data_loader.h +++ b/internal/cld2_dynamic_data_loader.h @@ -23,7 +23,17 @@ namespace CLD2DynamicDataLoader { // Read a header from the specified file and return it. // The header returned is dynamically allocated; you must 'delete' the array // of TableHeaders as well as the returned FileHeader* when done. -CLD2DynamicData::FileHeader* loadHeader(const char* fileName); +CLD2DynamicData::FileHeader* loadHeaderFromFile(const char* fileName); + +// Read a header from the specified area of raw memory and return it. +// The header returned is dynamically allocated; you must 'delete' the array +// of TableHeaders as well as the returned FileHeader* when done. +CLD2DynamicData::FileHeader* loadHeaderFromRaw( + const void* basePointer, const int length); + +// Not for public consumption. +CLD2DynamicData::FileHeader* loadInternal( + FILE* inFile, const void* basePointer, const int length); // Load data directly into a ScoringTables structure using a private, read-only // mmap and return the newly-allocated structure. @@ -31,10 +41,20 @@ CLD2DynamicData::FileHeader* loadHeader(const char* fileName); // address of the mmap()'d block will be written here. // The out-parameter "mmapLengthOut" is a pointer to an int; the length of the // mmap()'d block will be written here. -// It is up to the caller to delete +// It is up to the caller to delete the data at a later time using +// unloadData(...). CLD2::ScoringTables* loadDataFile(const char* fileName, void** mmapAddressOut, int* mmapLengthOut); +// Load data directly into a ScoringTables structure from an arbitrary region +// of memory, which is assumed to be a pointer to an mmap-ed region of memory +// backed by a valid data file that could alternatively be read (if access +// were allowed or desired) using loadDataFile(...). +CLD2::ScoringTables* loadDataRaw(const void* basePointer, const int length); + +// Not for public consumption. +CLD2::ScoringTables* loadDataInternal(CLD2DynamicData::FileHeader* header, const void* basePointer, const int length); + // Given pointers to the data from a previous invocation of loadDataFile, // unloads the data safely - freeing and deleting any malloc'd/new'd objects. // When this method returns, the mmap has been deleted, as have all the scoring @@ -45,8 +65,14 @@ CLD2::ScoringTables* loadDataFile(const char* fileName, // This is the only safe way to unload data that was previously loaded, as there // is an unfortunate mixture of new and malloc involved in building the // in-memory represtation of the data. -void unloadData(CLD2::ScoringTables** scoringTables, +void unloadDataFile(CLD2::ScoringTables** scoringTables, void** mmapAddress, int* mmapLength); +// Given a pointer to the data from a previous invocation of loadDataRaw, +// unloads the data safely just like unloadDataFile does. This method doesn't +// deal with mmaps, since it is assumed that the memory for the data is managed +// external to this library. +void unloadDataRaw(CLD2::ScoringTables** scoringTables); + } // End namespace CLD2DynamicDataExtractor #endif // CLD2_INTERNAL_CLD2_DYNAMIC_DATA_EXTRACTOR_H_ diff --git a/internal/cld2_dynamic_data_tool.cc b/internal/cld2_dynamic_data_tool.cc index 18f8cd8..de97f88 100644 --- a/internal/cld2_dynamic_data_tool.cc +++ b/internal/cld2_dynamic_data_tool.cc @@ -128,7 +128,7 @@ Usage:\n\ static_cast(&realData), fileName); } else if (mode == 3) { // head - CLD2DynamicData::FileHeader* header = CLD2DynamicDataLoader::loadHeader(fileName); + CLD2DynamicData::FileHeader* header = CLD2DynamicDataLoader::loadHeaderFromFile(fileName); if (header == NULL) { std::cerr << "Cannot read header from file: " << fileName << std::endl; return -1; @@ -150,7 +150,7 @@ Usage:\n\ bool result = CLD2DynamicData::verify( static_cast(&realData), static_cast(loadedData)); - CLD2DynamicDataLoader::unloadData(&loadedData, &mmapAddress, &mmapLength); + CLD2DynamicDataLoader::unloadDataFile(&loadedData, &mmapAddress, &mmapLength); if (loadedData != NULL || mmapAddress != NULL || mmapLength != 0) { std::cerr << "Warning: failed to clean up memory for ScoringTables." << std::endl; } diff --git a/internal/cld2_unittest.cc b/internal/cld2_unittest.cc index 570c44d..57766c1 100644 --- a/internal/cld2_unittest.cc +++ b/internal/cld2_unittest.cc @@ -22,6 +22,9 @@ #include #include +#include +#include +#include #include "../public/compact_lang_det.h" #include "../public/encodings.h" @@ -265,19 +268,19 @@ int RunTests (int flags, bool get_vector) { fprintf(stdout, "[DYNAMIC] Test running in dynamic data mode!\n"); bool dataLoaded = CLD2::isDataLoaded(); if (dataLoaded) { - fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true prior to loading data!\n"); + fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true prior to loading data from file!\n"); any_fail = true; } fprintf(stdout, "[DYNAMIC] Attempting translation prior to loading data\n"); any_fail |= !OneTest(flags, get_vector, UNKNOWN_LANGUAGE, kTeststr_en, strlen(kTeststr_en)); fprintf(stdout, "[DYNAMIC] Loading data from: %s\n", data_file); - CLD2::loadData(data_file); + CLD2::loadDataFromFile(data_file); dataLoaded = CLD2::isDataLoaded(); if (!dataLoaded) { - fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned false after loading data!\n"); + fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned false after loading data from file!\n"); any_fail = true; } - fprintf(stdout, "[DYNAMIC] Data loaded, normal tests commencing\n"); + fprintf(stdout, "[DYNAMIC] Data loaded, file-based tests commencing\n"); #endif int i = 0; @@ -291,15 +294,60 @@ int RunTests (int flags, bool get_vector) { } #ifdef CLD2_DYNAMIC_MODE - fprintf(stdout, "[DYNAMIC] Normal tests complete, attempting to unload data\n"); + fprintf(stdout, "[DYNAMIC] File-based tests complete, attempting to unload file data\n"); CLD2::unloadData(); dataLoaded = CLD2::isDataLoaded(); if (dataLoaded) { - fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true after unloading data!\n"); + fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true after unloading file data!\n"); any_fail = true; } fprintf(stdout, "[DYNAMIC] Attempting translation after unloading data\n"); any_fail |= !OneTest(flags, get_vector, UNKNOWN_LANGUAGE, kTeststr_en, strlen(kTeststr_en)); + + // Now, run the whole thing again, but this time mmap the file's contents + // and hit the external-mmap code. + fprintf(stdout, "[DYNAMIC] mmaping data for external-mmap test.\n"); + FILE* inFile = fopen(data_file, "r"); + fseek(inFile, 0, SEEK_END); + const int actualSize = ftell(inFile); + fclose(inFile); + + int inFileHandle = open(data_file, O_RDONLY); + void* mapped = mmap(NULL, actualSize, + PROT_READ, MAP_PRIVATE, inFileHandle, 0); + close(inFileHandle); + + fprintf(stdout, "[DYNAMIC] mmap'ed successfully, attempting data load.\n"); + CLD2::loadDataFromRawAddress(mapped, actualSize); + dataLoaded = CLD2::isDataLoaded(); + if (!dataLoaded) { + fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned false after loading data from mmap!\n"); + any_fail = true; + } + + // Reset and run the tests again + fprintf(stdout, "[DYNAMIC] Data loaded, mmap-based tests commencing\n"); + i = 0; + while (kTestPair[i].text != NULL) { + Language lang_expected = kTestPair[i].lang; + const char* buffer = kTestPair[i].text; + int buffer_length = strlen(buffer); + bool ok = OneTest(flags, get_vector, lang_expected, buffer, buffer_length); + any_fail |= (!ok); + ++i; + } + + fprintf(stdout, "[DYNAMIC] Mmap-based tests complete, attempting to unload data\n"); + CLD2::unloadData(); + dataLoaded = CLD2::isDataLoaded(); + if (dataLoaded) { + fprintf(stderr, "[DYNAMIC] *** Error: CLD2::isDataLoaded() returned true after unloading mmap data!\n"); + any_fail = true; + } + fprintf(stdout, "[DYNAMIC] Attempting translation after unloading map data\n"); + any_fail |= !OneTest(flags, get_vector, UNKNOWN_LANGUAGE, kTeststr_en, strlen(kTeststr_en)); + + fprintf(stdout, "[DYNAMIC] All dynamic-mode tests complete\n"); #endif if (any_fail) { diff --git a/internal/compact_lang_det_impl.cc b/internal/compact_lang_det_impl.cc index e01fdce..bb22b89 100644 --- a/internal/compact_lang_det_impl.cc +++ b/internal/compact_lang_det_impl.cc @@ -70,7 +70,9 @@ extern const short kAvgDeltaOctaScore[]; #ifdef CLD2_DYNAMIC_MODE // CLD2_DYNAMIC_MODE is defined: // Data will be read from an mmap opened at runtime. - static ScoringTables kScoringtables = { + + // Convenience for nulling things out completely at any point. + static ScoringTables NULL_TABLES = { NULL, //&cld_generated_CjkUni_obj, NULL, //&kCjkCompat_obj, NULL, //&kCjkDeltaBi_obj, @@ -81,27 +83,45 @@ extern const short kAvgDeltaOctaScore[]; NULL, //&kDistinctOcta_obj, NULL, //kAvgDeltaOctaScore, }; + static ScoringTables kScoringtables = NULL_TABLES; // copy constructed static bool dynamicDataLoaded = false; + static bool dataSourceIsFile = false; static ScoringTables* dynamicTables = NULL; static void* mmapAddress = NULL; static int mmapLength = 0; bool isDataLoaded() { return dynamicDataLoaded; } - void loadData(const char* fileName) { + void loadDataFromFile(const char* fileName) { if (isDataLoaded()) { unloadData(); } dynamicTables = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength); kScoringtables = *dynamicTables; + dataSourceIsFile = true; dynamicDataLoaded = true; }; + void loadDataFromRawAddress(const void* rawAddress, const int length) { + if (isDataLoaded()) { + unloadData(); + } + dynamicTables = CLD2DynamicDataLoader::loadDataRaw(rawAddress, length); + kScoringtables = *dynamicTables; + dataSourceIsFile = false; + dynamicDataLoaded = true; + } + void unloadData() { if (!dynamicDataLoaded) return; + if (dataSourceIsFile) { + CLD2DynamicDataLoader::unloadDataFile(&dynamicTables, &mmapAddress, &mmapLength); + } else { + CLD2DynamicDataLoader::unloadDataRaw(&dynamicTables); + } dynamicDataLoaded = false; - // unloading will null all the pointers out. - CLD2DynamicDataLoader::unloadData(&dynamicTables, &mmapAddress, &mmapLength); + dataSourceIsFile = false; // vacuous + kScoringtables = NULL_TABLES; // Housekeeping: null all pointers } #else // This initializes kScoringtables.quadgram_obj etc. diff --git a/internal/compact_lang_det_test.cc b/internal/compact_lang_det_test.cc index f2fb1a7..6793b05 100644 --- a/internal/compact_lang_det_test.cc +++ b/internal/compact_lang_det_test.cc @@ -245,7 +245,7 @@ int main(int argc, char** argv) { return -1; } fprintf(stdout, "Loading data from: %s\n", data_file); - CLD2::loadData(data_file); + CLD2::loadDataFromFile(data_file); fprintf(stdout, "Data loaded, test commencing\n"); #endif diff --git a/public/compact_lang_det.h b/public/compact_lang_det.h index da59abd..b0b1a84 100644 --- a/public/compact_lang_det.h +++ b/public/compact_lang_det.h @@ -300,17 +300,35 @@ void DumpResultChunkVector(FILE* f, const char* src, // If compiled with dynamic mode, load data from the specified file location. // If other data has already been loaded, it is discarded and the data is read // in from the specified file location again (even if the file has not changed). -// WARNING: Before calling this method, language detection will always fail -// and will always return the unknown language. -void loadData(const char* fileName); +// If data needs to be loaded in a context where direct access to the file +// system is either undesireable or impossible, use loadDataFromRawAddress +// instead to read the data from an arbitrary region in memory (such as a +// mmap-ed file). +// WARNING: Before calling one of the provided "loadData" methods, language +// detection will always fail and will always return the unknown language. +void loadDataFromFile(const char* fileName); -// If compiled with dynamic mode, unload the previously-loaded data. +// If compiled with dynamic mode, load data from the specified location in +// memory. +// This method is provided as an alternative to loadDataFromFile() for use cases +// where the loading process may not have direct access to the file system, +// e.g., where the direct process knows the pointer to an mmap region in system +// memory where the data file's contents have been loaded. +// If other data has already been loaded, it is discarded and the data is read +// in from the specified location again (even if it has not changed). +// WARNING: Before calling one of the provided "loadData" methods, language +// detection will always fail and will always return the unknown language. +void loadDataFromRawAddress(const void* rawAddress, const int length); + +// If compiled with dynamic mode, unload the data that was previously loaded +// via loadDataFromFile() or loadDataFromRawAddress(). // WARNING: After calling this method, language detection will no longer work // and will always return the unknown language. void unloadData(); -// Returns true if and only if data has been loaded via a call to loadData(...) -// and has not been subsequently unladed via a call to unloadDate(). +// Returns true if and only if data has been loaded via a call to +// loadDataFromFile(...) or loadDataFromRawAddress(...) and has not been +// subsequently unladed via a call to unloadData(). bool isDataLoaded(); #endif // #ifdef CLD2_DYNAMIC_MODE