Initial CLD2 source upload.
git-svn-id: https://cld2.googlecode.com/svn/trunk@3 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
110
internal/getonescriptspan.h
Normal file
110
internal/getonescriptspan.h
Normal file
@@ -0,0 +1,110 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
|
||||
#ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
|
||||
#define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
|
||||
|
||||
#include "integral_types.h"
|
||||
#include "langspan.h"
|
||||
#include "offsetmap.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
static const int kMaxScriptBuffer = 40960;
|
||||
static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
|
||||
static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
|
||||
static const int kWithinScriptTail = 32; // Stop at word space in last
|
||||
// N bytes of script buffer
|
||||
|
||||
|
||||
static inline bool IsContinuationByte(char c) {
|
||||
return static_cast<signed char>(c) < -64;
|
||||
}
|
||||
|
||||
// Gets lscript number for letters; always returns
|
||||
// 0 (common script) for non-letters
|
||||
int GetUTF8LetterScriptNum(const char* src);
|
||||
|
||||
// Update src pointer to point to next quadgram, +2..+5
|
||||
// Looks at src[0..4]
|
||||
const char* AdvanceQuad(const char* src);
|
||||
|
||||
|
||||
class ScriptScanner {
|
||||
public:
|
||||
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
|
||||
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
|
||||
bool any_text, bool any_script);
|
||||
~ScriptScanner();
|
||||
|
||||
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
||||
bool GetOneScriptSpan(LangSpan* span);
|
||||
|
||||
// Force Latin and Cyrillic scripts to be lowercase
|
||||
void LowerScriptSpan(LangSpan* span);
|
||||
|
||||
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
||||
// Force Latin and Cyrillic scripts to be lowercase
|
||||
bool GetOneScriptSpanLower(LangSpan* span);
|
||||
|
||||
// Copy next run of non-tag characters to buffer [NUL terminated]
|
||||
// This just removes tags and removes entities
|
||||
// Buffer has leading space
|
||||
bool GetOneTextSpan(LangSpan* span);
|
||||
|
||||
// Maps byte offset in most recent GetOneScriptSpan/Lower
|
||||
// span->text [0..text_bytes] into an additional byte offset from
|
||||
// span->offset, to get back to corresponding text in the original
|
||||
// input buffer.
|
||||
// text_offset must be the first byte
|
||||
// of a UTF-8 character, or just beyond the last character. Normally this
|
||||
// routine is called with the first byte of an interesting range and
|
||||
// again with the first byte of the following range.
|
||||
int MapBack(int text_offset);
|
||||
|
||||
const char* GetBufferStart() {return start_byte_;};
|
||||
|
||||
private:
|
||||
// Skip over tags and non-letters
|
||||
int SkipToFrontOfSpan(const char* src, int len, int* script);
|
||||
|
||||
const char* start_byte_; // Starting byte of buffer to scan
|
||||
const char* next_byte_; // First unscanned byte
|
||||
const char* next_byte_limit_; // Last byte + 1
|
||||
int byte_length_; // Bytes left: next_byte_limit_ - next_byte_
|
||||
|
||||
bool is_plain_text_; // true fo text, false for HTML
|
||||
char* script_buffer_; // Holds text with expanded entities
|
||||
char* script_buffer_lower_; // Holds lowercased text
|
||||
bool letters_marks_only_; // To distinguish scriptspan of one
|
||||
// letters/marks vs. any mixture of text
|
||||
bool one_script_only_; // To distinguish scriptspan of one
|
||||
// script vs. any mixture of scripts
|
||||
int exit_state_; // For tag parser kTagParseTbl_0, based
|
||||
// on letters_marks_only_
|
||||
public :
|
||||
// Expose for debugging
|
||||
OffsetMap map2original_; // map from script_buffer_ to buffer
|
||||
OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_
|
||||
};
|
||||
|
||||
} // namespace CLD2
|
||||
|
||||
#endif // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
|
||||
|
Reference in New Issue
Block a user