git-svn-id: https://cld2.googlecode.com/svn/trunk@11 b252ecd4-b096-bf77-eb8e-91563289f87e
96 lines
3.0 KiB
C++
96 lines
3.0 KiB
C++
// Copyright 2013 Google Inc. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
//
|
|
// Author: dsites@google.com (Dick Sites)
|
|
//
|
|
|
|
#ifndef I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_HINT_CODE_H__
|
|
#define I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_HINT_CODE_H__
|
|
|
|
|
|
#include <string>
|
|
#include "integral_types.h"
|
|
#include "lang_script.h"
|
|
#include "../public/encodings.h"
|
|
|
|
namespace CLD2 {
|
|
|
|
// Packed <Language, weight>, weight in [-32..31] (powers of 2**1.6 ~=3.03)
|
|
// Full language in bottom 10 bits, weight in top 6 bits
|
|
typedef int16 OneCLDLangPrior;
|
|
|
|
const int kMaxOneCLDLangPrior = 14;
|
|
typedef struct {
|
|
int32 n;
|
|
OneCLDLangPrior prior[kMaxOneCLDLangPrior];
|
|
} CLDLangPriors;
|
|
|
|
// Reading exposed here; setting hidden in .cc
|
|
inline int GetCLDPriorWeight(OneCLDLangPrior olp) {
|
|
return olp >> 10;
|
|
}
|
|
// Returns the Language field of a packed prior: the low 10 bits of olp,
// converted to the Language enum type.
inline Language GetCLDPriorLang(OneCLDLangPrior olp) {
  const int lang_bits = olp & 0x3ff;  // keep only the bottom 10 bits
  return static_cast<Language>(lang_bits);
}
|
|
|
|
// Returns the entry count stored in *lps (its n field).
// The parameter is pointer-to-const because the struct is only read; this lets
// code that holds a const CLDLangPriors* call the accessor, while callers
// passing a non-const pointer keep working unchanged.
// lps must not be null; it is dereferenced unconditionally.
inline int32 GetCLDLangPriorCount(const CLDLangPriors* lps) {
  return lps->n;
}
|
|
|
|
// Resets *lps to hold zero priors by clearing its count.
// Only the n field is written; the prior[] slots are left as they were.
// lps must not be null; it is dereferenced unconditionally.
inline void InitCLDLangPriors(CLDLangPriors* lps) {
  CLDLangPriors& priors = *lps;
  priors.n = 0;
}
|
|
|
|
// Trim language priors to no more than max_entries, keeping largest abs weights
|
|
void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps);
|
|
|
|
// Returns langtags with each language tag trimmed to its canonical form.
// langtags is expected to be the output of GetLangTagsFromHtml(), which is
// already lowercased.
std::string TrimCLDLangTagsHint(const std::string& langtags);
|
|
|
|
// Add hints to vector of langpriors
|
|
// Input is from GetLangTagsFromHtml(), already lowercased
|
|
void SetCLDLangTagsHint(const std::string& langtags, CLDLangPriors* langpriors);
|
|
|
|
// Add hints to vector of langpriors
|
|
// Input is from HTTP content-language
|
|
void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors);
|
|
|
|
// Add hints to vector of langpriors
|
|
// Input is from GetTLD(), already lowercased
|
|
void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors);
|
|
|
|
// Add hints to vector of langpriors
|
|
// Input is from DetectEncoding()
|
|
void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors);
|
|
|
|
// Add hints to vector of langpriors
|
|
// Input is from random source
|
|
void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors);
|
|
|
|
// Make printable string of priors
|
|
std::string DumpCLDLangPriors(const CLDLangPriors* langpriors);
|
|
|
|
|
|
// Get language tag hints from HTML body
|
|
// Normalize: remove spaces and make lowercase comma list
|
|
std::string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len,
|
|
int32 max_scan_bytes);
|
|
|
|
} // End namespace CLD2
|
|
|
|
#endif // I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_HINT_CODE_H__
|
|
|