cld2/public/compact_lang_det.h

// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// Author: dsites@google.com (Dick Sites)
//

// NOTE:
// Baybayin (ancient script of the Philippines) is detected as TAGALOG.
// Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
// HAITIAN_CREOLE is detected as such.
// NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
// PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
// ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN.
// BOSNIAN is not detected as such, but likely scores as Croatian or Serbian.
// MONTENEGRIN is not detected as such, but likely scores as Serbian.
// CROATIAN is detected in the Latin script
// SERBIAN is detected in the Cyrillic and Latin scripts
// Zhuang is detected in the Latin script only.
//
// The languages X_PIG_LATIN and X_KLINGON are detected in the
//  extended calls ExtDetectLanguageSummary().
//
// UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
//  is high enough. This happens with non-text input such as the bytes of a
//  JPEG, and also with text in languages outside the training set.
//
// The following languages are to be detected in multiple scripts:
//  AZERBAIJANI (Latin, Cyrillic*, Arabic*)
//  BURMESE (Latin, Myanmar)
//  HAUSA (Latin, Arabic)
//  KASHMIRI (Arabic, Devanagari)
//  KAZAKH (Latin, Cyrillic, Arabic)
//  KURDISH (Latin, Arabic)
//  KYRGYZ (Cyrillic, Arabic)
//  LIMBU (Devanagari, Limbu)
//  MONGOLIAN (Cyrillic, Mongolian)
//  SANSKRIT (Latin, Devanagari)
//  SINDHI (Arabic, Devanagari)
//  TAGALOG (Latin, Tagalog)
//  TAJIK (Cyrillic, Arabic*)
//  TATAR (Latin, Cyrillic, Arabic)
//  TURKMEN (Latin, Cyrillic, Arabic)
//  UIGHUR (Latin, Cyrillic, Arabic)
//  UZBEK (Latin, Cyrillic, Arabic)
//
// * Due to a shortage of training text, AZERBAIJANI is not currently detected
//   in Arabic or Cyrillic scripts, nor TAJIK in Arabic script.
//

#ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
#define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_

#include <stdint.h>
#include <stdio.h>                        // For FILE
#include <vector>
#include "../internal/integral_types.h"   // For uint8 etc.
#include "../internal/lang_script.h"      // For Language

namespace CLD2 {

// NOTE: If you cannot prove that the input text is valid UTF-8 by design
// because it went through a known-good conversion program, call one of the
// *CheckUTF8 routines. For example, never trust raw user-supplied bytes. It is
// especially important to do a UTF8-to-UTF8 conversion on raw bytes that claim
// to be UTF-8, using a converter that guarantees to produce valid UTF-8,
// turning other byte sequences into the Unicode replacement character U+FFFD
// (deleting or turning into space or question-mark can create security holes).

  // Scan interchange-valid UTF-8 bytes and detect most likely language,
  // or set of languages.
  //
  // Design goals:
  //   Skip over big stretches of HTML tags
  //   Able to return ranges of different languages
  //   Relatively small tables and relatively fast processing
  //   Thread safe
  //
  // For HTML documents, tags are skipped, along with <script> ... </script>
  // and <style> ... </style> sequences, and entities are expanded.
  //
  // We distinguish between bytes of the raw input buffer and bytes of non-tag
  // text letters. Since tags can be over 50% of the bytes of an HTML Page,
  // and are nearly all seven-bit ASCII English, we prefer to distinguish
  // language mixture fractions based on just the non-tag text.
  //
  // Inputs: text and text_length
  //  Code skips HTML tags and expands HTML entities, unless
  //  is_plain_text is true
  // Outputs:
  //  language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
  //  percent3 is an array of the text percentages 0..100 of the top 3 languages
  //  text_bytes is the amount of non-tag/letters-only text found
  //  is_reliable set true if the returned Language is some amount more
  //   probable than the second-best Language. Calculation is a complex function
  //   of the length of the text and the different-script runs of text.
  // Return value: the most likely Language for the majority of the input text
  //  Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
  //  defaults to ENGLISH.
  //
  // The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for
  // backwards compatibility with a different detector.
  //
  // The third version may return UNKNOWN_LANGUAGE, and also returns extended
  // language codes from lang_script.h
  //


  // Hints about the input, collected into one struct so that they can be
  // passed in as a single initialized struct instead of individual arguments.
  // Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known.
  //
  // Pass in hints whenever possible; doing so improves detection accuracy. The
  // set of passed-in hints is all information that is external to the text
  // itself.
  //
  // The content_language_hint is intended to come from an HTTP header
  // Content-Language: field, the tld_hint from the hostname of a URL, the
  // encoding_hint from an encoding detector applied to the input
  // document, and the language_hint from any other context you might have.
  // The lang= tags inside an HTML document will be picked up as hints
  // by code within the compact language detector.

  typedef struct {
    const char* content_language_hint;      // "mi,en" boosts Maori and English
    const char* tld_hint;                   // "id" boosts Indonesian
    int encoding_hint;                      // SJS boosts Japanese
    Language language_hint;                 // ITALIAN boosts it
  } CLDHints;

  // 0x7fffffff == 2^31 - 1, the largest positive 32-bit signed value.
  // NOTE(review): presumably the upper bound on ResultChunk::bytes, which is
  // declared as int32 -- confirm against the implementation.
  static const int32 kMaxResultChunkBytes = 0x7fffffff;

  // One per-language piece of the input buffer. A vector of these is used for
  // returning the per-language pieces of the input buffer.
  // Unreliable and too-short pieces are mapped to UNKNOWN_LANGUAGE.
  //
  // Note: this was initially over-optimized to fit into 8 bytes,
  // causing too much work to deal with greater than 16-bit byte lengths.
  typedef struct {
    int offset;                 // Starting byte offset in original buffer
    int32 bytes;                // Number of bytes in chunk
    uint16 lang1;               // Top lang, as full Language. Apply
                                //  static_cast<Language>() to this short value.
    uint16 pad;                 // Make multiple of 4 bytes
  } ResultChunk;
  // The vector type used to return the chunks found in one input buffer.
  typedef std::vector<ResultChunk> ResultChunkVector;


  // These initial simple versions all cascade through the full-blown last
  // version which it would be better for you to use directly because you will
  // get better results passing in any available hints.

  // Scan interchange-valid UTF-8 bytes and detect most likely language.
  // If the input is in fact not valid UTF-8, this returns immediately with
  // the result value UNKNOWN_LANGUAGE and is_reliable set to false.
  //
  // Inputs:
  //  buffer and buffer_length give the bytes to scan.
  //  Code skips HTML tags and expands HTML entities, unless
  //  is_plain_text is true.
  // Outputs:
  //  is_reliable is set true if the returned Language is some amount more
  //   probable than the second-best Language.
  //  valid_prefix_bytes is set, in all cases, to the number of leading
  //   bytes that are valid UTF-8. If this is < buffer_length, there is invalid
  //   input starting at the following byte.
  // Return value: the most likely Language for the majority of the input text.
  Language DetectLanguageCheckUTF8(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          bool* is_reliable,
                          int* valid_prefix_bytes);

  // Use this one ONLY if you can prove that the input text is valid UTF-8 by
  // design because it went through a known-good conversion program.
  // Scan interchange-valid UTF-8 bytes and detect most likely language.
  //
  // Inputs:
  //  buffer and buffer_length give the bytes to scan.
  //  Code skips HTML tags and expands HTML entities, unless
  //  is_plain_text is true.
  // Outputs:
  //  is_reliable is set true if the returned Language is some amount more
  //   probable than the second-best Language.
  // Return value: the most likely Language for the majority of the input text.
  Language DetectLanguage(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          bool* is_reliable);

  // Use this one ONLY if you can prove that the input text is valid UTF-8 by
  // design because it went through a known-good conversion program.
  // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
  //
  // Inputs:
  //  buffer and buffer_length give the bytes to scan.
  //  Code skips HTML tags and expands HTML entities, unless
  //  is_plain_text is true.
  // Outputs:
  //  language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
  //  percent3 is an array of the text percentages 0..100 of the top 3 languages
  //  text_bytes is the amount of non-tag/letters-only text found
  //  is_reliable is set true if the returned Language is some amount more
  //   probable than the second-best Language
  // Return value: language3[0] is usually also the return value
  Language DetectLanguageSummary(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          Language* language3,
                          int* percent3,
                          int* text_bytes,
                          bool* is_reliable);

  // Use this one ONLY if you can prove that the input text is valid UTF-8 by
  // design because it went through a known-good conversion program.
  // Same as the DetectLanguageSummary() overload that takes no hints, with
  // hints supplied as individual arguments.
  // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
  //
  // Inputs:
  //  buffer and buffer_length give the bytes to scan.
  //  Code skips HTML tags and expands HTML entities, unless
  //  is_plain_text is true.
  //  tld_hint, encoding_hint and language_hint are information external to
  //   the text itself; see the per-argument comments below.
  // Outputs:
  //  language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
  //  percent3 is an array of the text percentages 0..100 of the top 3 languages
  //  text_bytes is the amount of non-tag/letters-only text found
  //  is_reliable is set true if the returned Language is some amount more
  //   probable than the second-best Language
  // Return value: language3[0] is usually also the return value
  Language DetectLanguageSummary(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          const char* tld_hint,       // "id" boosts Indonesian
                          int encoding_hint,          // SJS boosts Japanese
                          Language language_hint,     // ITALIAN boosts it
                          Language* language3,
                          int* percent3,
                          int* text_bytes,
                          bool* is_reliable);

  // Use this one ONLY if you can prove that the input text is valid UTF-8 by
  // design because it went through a known-good conversion program.
  // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
  // languages.
  //
  // Extended languages are additional interface languages and Unicode
  // single-language scripts, from lang_script.h
  //
  // Inputs:
  //  buffer and buffer_length give the bytes to scan.
  //  Code skips HTML tags and expands HTML entities, unless
  //  is_plain_text is true.
  // Outputs:
  //  language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
  //  percent3 is an array of the text percentages 0..100 of the top 3 languages
  //  text_bytes is the amount of non-tag/letters-only text found
  //  is_reliable is set true if the returned Language is some amount more
  //   probable than the second-best Language
  // Return value: language3[0] is usually also the return value
  Language ExtDetectLanguageSummary(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          Language* language3,
                          int* percent3,
                          int* text_bytes,
                          bool* is_reliable);

  // Use this one ONLY if you can prove that the input text is valid UTF-8 by
  // design because it went through a known-good conversion program.
  // Same as the ExtDetectLanguageSummary() overload that takes no hints, with
  // hints supplied as individual arguments.
  // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
  // languages.
  //
  // Extended languages are additional Google interface languages and Unicode
  // single-language scripts, from lang_script.h
  //
  // Inputs:
  //  buffer and buffer_length give the bytes to scan.
  //  Code skips HTML tags and expands HTML entities, unless
  //  is_plain_text is true.
  //  tld_hint, encoding_hint and language_hint are information external to
  //   the text itself; see the per-argument comments below.
  // Outputs:
  //  language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
  //  percent3 is an array of the text percentages 0..100 of the top 3 languages
  //  text_bytes is the amount of non-tag/letters-only text found
  //  is_reliable is set true if the returned Language is some amount more
  //   probable than the second-best Language
  // Return value: language3[0] is usually also the return value
  Language ExtDetectLanguageSummary(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          const char* tld_hint,       // "id" boosts Indonesian
                          int encoding_hint,          // SJS boosts Japanese
                          Language language_hint,     // ITALIAN boosts it
                          Language* language3,
                          int* percent3,
                          int* text_bytes,
                          bool* is_reliable);

  // Use this one ONLY if you can prove that the input text is valid UTF-8 by
  // design because it went through a known-good conversion program.
  // Same as the ExtDetectLanguageSummary() overload that takes individual
  // hints, and also returns 3 internal language scores as a ratio to
  // normal score for real text in that language. Scores close to 1.0 indicate
  // normal text, while scores far away from 1.0 indicate badly-skewed text or
  // gibberish
  //
  // Inputs:
  //  buffer and buffer_length give the bytes to scan.
  //  Code skips HTML tags and expands HTML entities, unless
  //  is_plain_text is true.
  //  tld_hint, encoding_hint and language_hint are information external to
  //   the text itself; see the per-argument comments below.
  // Outputs:
  //  language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
  //  percent3 is an array of the text percentages 0..100 of the top 3 languages
  //  normalized_score3 receives the 3 score ratios described above
  //  text_bytes is the amount of non-tag/letters-only text found
  //  is_reliable is set true if the returned Language is some amount more
  //   probable than the second-best Language
  //
  Language ExtDetectLanguageSummary(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          const char* tld_hint,       // "id" boosts Indonesian
                          int encoding_hint,          // SJS boosts Japanese
                          Language language_hint,     // ITALIAN boosts it
                          Language* language3,
                          int* percent3,
                          double* normalized_score3,
                          int* text_bytes,
                          bool* is_reliable);


  // Use this one.
  //
  // Hints are collected into a struct (cld_hints).
  // Flags are passed in (normally zero); see the kCLDFlag* constants.
  //
  // Also returns 3 internal language scores as a ratio to
  // normal score for real text in that language. Scores close to 1.0 indicate
  // normal text, while scores far away from 1.0 indicate badly-skewed text or
  // gibberish
  //
  // Returns a vector of chunks in different languages, so that caller may
  // spell-check, translate, or otherwise process different parts of the input
  // buffer in language-dependent ways.
  //
  // If the input is in fact not valid UTF-8, this returns immediately with
  // the result value UNKNOWN_LANGUAGE and is_reliable set to false.
  //
  // In all cases, valid_prefix_bytes will be set to the number of leading
  // bytes that are valid UTF-8. If this is < buffer_length, there is invalid
  // input starting at the following byte.
  //
  // Other outputs:
  //  language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
  //  percent3 is an array of the text percentages 0..100 of the top 3 languages
  //  normalized_score3 receives the 3 score ratios described above
  //  resultchunkvector receives the per-language chunks described above
  //  text_bytes is the amount of non-tag/letters-only text found
  Language ExtDetectLanguageSummaryCheckUTF8(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          const CLDHints* cld_hints,
                          int flags,
                          Language* language3,
                          int* percent3,
                          double* normalized_score3,
                          ResultChunkVector* resultchunkvector,
                          int* text_bytes,
                          bool* is_reliable,
                          int* valid_prefix_bytes);

  // Use this one ONLY if you can prove that the input text is valid UTF-8 by
  // design because it went through a known-good conversion program.
  //
  // Hints are collected into a struct (cld_hints).
  // Flags are passed in (normally zero); see the kCLDFlag* constants.
  //
  // Also returns 3 internal language scores as a ratio to
  // normal score for real text in that language. Scores close to 1.0 indicate
  // normal text, while scores far away from 1.0 indicate badly-skewed text or
  // gibberish
  //
  // Returns a vector of chunks in different languages, so that caller may
  // spell-check, translate, or otherwise process different parts of the input
  // buffer in language-dependent ways.
  //
  // Other outputs:
  //  language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
  //  percent3 is an array of the text percentages 0..100 of the top 3 languages
  //  normalized_score3 receives the 3 score ratios described above
  //  resultchunkvector receives the per-language chunks described above
  //  text_bytes is the amount of non-tag/letters-only text found
  //  is_reliable is set true if the returned Language is some amount more
  //   probable than the second-best Language
  //
  Language ExtDetectLanguageSummary(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          const CLDHints* cld_hints,
                          int flags,
                          Language* language3,
                          int* percent3,
                          double* normalized_score3,
                          ResultChunkVector* resultchunkvector,
                          int* text_bytes,
                          bool* is_reliable);

  // Return the version text string.
  // The string has the form "code_version - data_build_date".
  // NOTE(review): ownership/lifetime of the returned pointer is not stated
  // here; presumably it must not be freed by the caller -- confirm.
  const char* DetectLanguageVersion();


  // Public use flags, debug output controls.
  // Each value is a distinct single bit, for use in the `flags` argument of
  // the detection calls that take one (normally zero).
  static const int kCLDFlagScoreAsQuads = 0x0100;  // Force Greek, etc. => quads
  static const int kCLDFlagHtml =         0x0200;  // Debug HTML => stderr
  static const int kCLDFlagCr =           0x0400;  // <cr> per chunk if HTML
  static const int kCLDFlagVerbose =      0x0800;  // More debug HTML => stderr
  static const int kCLDFlagQuiet =        0x1000;  // Less debug HTML => stderr
  static const int kCLDFlagEcho =         0x2000;  // Echo input => stderr
  static const int kCLDFlagBestEffort =   0x4000;  // Give best-effort answer,
                                                   // even on short text


/***

Flag meanings:
 kCLDFlagScoreAsQuads
   Normally, several languages are detected solely by their Unicode script.
   Combined with appropriate lookup tables, this flag forces them instead
   to be detected via quadgrams. This can be a useful refinement when looking
   for meaningful text in these languages, instead of just character sets.
   The default tables do not support this use.
 kCLDFlagHtml
   For each detection call, write an HTML file to stderr, showing the text
   chunks and their detected languages.
 kCLDFlagCr
   In that HTML file, force a new line for each chunk.
 kCLDFlagVerbose
   In that HTML file, show every lookup entry.
 kCLDFlagQuiet
   In that HTML file, suppress most of the output detail.
 kCLDFlagEcho
   Echo every input buffer to stderr.
 kCLDFlagBestEffort
   Give best-effort answer, instead of UNKNOWN_LANGUAGE. May be useful for
   short text if the caller prefers an approximate answer over none.

***/

// Debug output: Print the resultchunkvector to file f.
// NOTE(review): src is presumably the original input buffer that the chunk
// offsets refer to -- confirm against the implementation.
void DumpResultChunkVector(FILE* f, const char* src,
                           ResultChunkVector* resultchunkvector);

// If compiled with dynamic mode, load data from the specified file location
// (fileName).
// If other data has already been loaded, it is discarded and the data is read
// in from the specified file location again (even if the file has not changed).
// If data needs to be loaded in a context where direct access to the file
// system is either undesirable or impossible, use loadDataFromRawAddress
// instead to read the data from an arbitrary region in memory (such as a
// mmap-ed file).
// WARNING: Before calling one of the provided "loadData" methods, language
// detection will always fail and will always return the unknown language.
// If not compiled with dynamic mode, this method does nothing.
void loadDataFromFile(const char* fileName);

// If compiled with dynamic mode, load data from the specified location in
// memory: rawAddress is the start of the region and length is its size.
// This method is provided as an alternative to loadDataFromFile() for use cases
// where the loading process may not have direct access to the file system,
// e.g., where the process knows the pointer to an mmap region in system
// memory where the data file's contents have been loaded.
// If other data has already been loaded, it is discarded and the data is read
// in from the specified location again (even if it has not changed).
// WARNING: Before calling one of the provided "loadData" methods, language
// detection will always fail and will always return the unknown language.
// If not compiled with dynamic mode, this method does nothing.
void loadDataFromRawAddress(const void* rawAddress, const uint32_t length);

// If compiled with dynamic mode, unload the data that was previously loaded
// via loadDataFromFile() or loadDataFromRawAddress().
// WARNING: After calling this method, language detection will no longer work
// and will always return the unknown language, until data is loaded again
// with one of the "loadData" methods.
// If not compiled with dynamic mode, this method does nothing.
void unloadData();

// Returns true if and only if data has been loaded via a call to
// loadDataFromFile(...) or loadDataFromRawAddress(...) and has not been
// subsequently unloaded via a call to unloadData().
// If not compiled with dynamic mode, this method always returns true (because
// data has been statically linked).
bool isDataLoaded();

// Returns true if and only if compiled with dynamic mode, otherwise returns
// false. Callers can use this to check at runtime whether CLD2 data needs to
// be dynamically initialized (see loadDataFromFile() and
// loadDataFromRawAddress()), instead of relying on the CLD2_DYNAMIC_MODE
// define.
bool isDataDynamic();

};      // End namespace CLD2

#endif  // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_