Initial CLD2 source upload.
git-svn-id: https://cld2.googlecode.com/svn/trunk@3 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
437
internal/cldutil_shared.cc
Normal file
437
internal/cldutil_shared.cc
Normal file
@@ -0,0 +1,437 @@
|
||||
// Copyright 2013 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// Author: dsites@google.com (Dick Sites)
|
||||
//
|
||||
|
||||
#include "cldutil_shared.h"
|
||||
#include <string>
|
||||
|
||||
#include "cld2tablesummary.h"
|
||||
#include "integral_types.h"
|
||||
#include "port.h"
|
||||
#include "utf8statetable.h"
|
||||
|
||||
namespace CLD2 {
|
||||
|
||||
// Runtime routines for hashing, looking up, and scoring
|
||||
// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
|
||||
// Unigrams and bigrams are for CJK languages only, including simplified/
|
||||
// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
|
||||
// Zhuang Han characters. Surrounding spaces are not considered.
|
||||
// Quadgrams and octagrams for for non-CJK and include two bits indicating
|
||||
// preceding and trailing spaces (word boundaries).
|
||||
|
||||
|
||||
// Indicator bits for leading/trailing space around quad/octagram
|
||||
// NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
|
||||
// 1-, 2-, or 3-bytes each.
|
||||
static const uint32 kPreSpaceIndicator = 0x00004444;
|
||||
static const uint32 kPostSpaceIndicator = 0x44440000;
|
||||
|
||||
// Little-endian masks for 0..24 bytes picked up as uint32's
|
||||
static const uint32 kWordMask0[4] = {
|
||||
0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
|
||||
};
|
||||
|
||||
static const int kMinCJKUTF8CharBytes = 3;
|
||||
|
||||
static const int kMinGramCount = 3;
|
||||
static const int kMaxGramCount = 16;
|
||||
|
||||
static const int UTFmax = 4; // Max number of bytes in a UTF-8 character
|
||||
|
||||
|
||||
// Routines to access a hash table of <key:wordhash, value:probs> pairs
|
||||
// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
|
||||
// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
|
||||
// bucket subscript.
|
||||
// Probs is a packed: three languages plus a subscript for probability table
|
||||
// Buckets have all the keys together, then all the values.Key array never
|
||||
// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
|
||||
// Match case may sometimes take an additional cache miss on value access.
|
||||
//
|
||||
// Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64
|
||||
// byte buckets with single cache miss.
|
||||
// Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// Design principles for these hash functions
|
||||
// - Few operations
|
||||
// - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
|
||||
// Latin script expect 1- and 2-byte mixtures.
|
||||
// - Last byte of each character has about 5 bits of information
|
||||
// - Spread good bits around so they can interact in at least two ways
|
||||
// with other characters
|
||||
// - Use add for additional mixing thorugh carries
|
||||
|
||||
// CJK Three-byte bigram
|
||||
// ....dddd..cccccc..bbbbbb....aaaa
|
||||
// ..................ffffff..eeeeee
|
||||
// make
|
||||
// ....dddd..cccccc..bbbbbb....aaaa
|
||||
// 000....dddd..cccccc..bbbbbb....a
|
||||
// ..................ffffff..eeeeee
|
||||
// ffffff..eeeeee000000000000000000
|
||||
//
|
||||
// CJK Four-byte bigram
|
||||
// ..dddddd..cccccc....bbbb....aaaa
|
||||
// ..hhhhhh..gggggg....ffff....eeee
|
||||
// make
|
||||
// ..dddddd..cccccc....bbbb....aaaa
|
||||
// 000..dddddd..cccccc....bbbb....a
|
||||
// ..hhhhhh..gggggg....ffff....eeee
|
||||
// ..ffff....eeee000000000000000000
|
||||
|
||||
// BIGRAM
|
||||
// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
|
||||
// OVERSHOOTS up to 3 bytes
|
||||
// For runtime use of tables
|
||||
// Does X86 unaligned loads
|
||||
uint32 BiHashV2(const char* word_ptr, int bytecount) {
|
||||
if (bytecount == 0) {return 0;}
|
||||
const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
|
||||
uint32 word0, word1;
|
||||
if (bytecount <= 4) {
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
return word0;
|
||||
}
|
||||
// Else do 8 bytes
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32);
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];
|
||||
word1 = word1 ^ (word1 << 18);
|
||||
return word0 + word1;
|
||||
}
|
||||
|
||||
//
|
||||
// Ascii-7 One-byte chars
|
||||
// ...ddddd...ccccc...bbbbb...aaaaa
|
||||
// make
|
||||
// ...ddddd...ccccc...bbbbb...aaaaa
|
||||
// 000...ddddd...ccccc...bbbbb...aa
|
||||
//
|
||||
// Latin 1- and 2-byte chars
|
||||
// ...ddddd...ccccc...bbbbb...aaaaa
|
||||
// ...................fffff...eeeee
|
||||
// make
|
||||
// ...ddddd...ccccc...bbbbb...aaaaa
|
||||
// 000...ddddd...ccccc...bbbbb...aa
|
||||
// ...................fffff...eeeee
|
||||
// ...............fffff...eeeee0000
|
||||
//
|
||||
// Non-CJK Two-byte chars
|
||||
// ...ddddd...........bbbbb........
|
||||
// ...hhhhh...........fffff........
|
||||
// make
|
||||
// ...ddddd...........bbbbb........
|
||||
// 000...ddddd...........bbbbb.....
|
||||
// ...hhhhh...........fffff........
|
||||
// hhhh...........fffff........0000
|
||||
//
|
||||
// Non-CJK Three-byte chars
|
||||
// ...........ccccc................
|
||||
// ...................fffff........
|
||||
// ...lllll...................iiiii
|
||||
// make
|
||||
// ...........ccccc................
|
||||
// 000...........ccccc.............
|
||||
// ...................fffff........
|
||||
// ...............fffff........0000
|
||||
// ...lllll...................iiiii
|
||||
// .lllll...................iiiii00
|
||||
//
|
||||
|
||||
// QUADGRAM
|
||||
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
|
||||
// OVERSHOOTS up to 3 bytes
|
||||
// For runtime use of tables
|
||||
// Does X86 unaligned loads
|
||||
uint32 QuadHashV2Mix(const char* word_ptr, int bytecount, uint32 prepost) {
|
||||
const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
|
||||
uint32 word0, word1, word2;
|
||||
if (bytecount <= 4) {
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
return word0 ^ prepost;
|
||||
} else if (bytecount <= 8) {
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32);
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];
|
||||
word1 = word1 ^ (word1 << 4);
|
||||
return (word0 ^ prepost) + word1;
|
||||
}
|
||||
// else do 12 bytes
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32);
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
|
||||
word1 = word1 ^ (word1 << 4);
|
||||
word2 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3];
|
||||
word2 = word2 ^ (word2 << 2);
|
||||
return (word0 ^ prepost) + word1 + word2;
|
||||
}
|
||||
|
||||
|
||||
// QUADGRAM wrapper with surrounding spaces
|
||||
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
|
||||
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
||||
// For runtime use of tables
|
||||
uint32 QuadHashV2(const char* word_ptr, int bytecount) {
|
||||
if (bytecount == 0) {return 0;}
|
||||
uint32 prepost = 0;
|
||||
if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
|
||||
if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
|
||||
return QuadHashV2Mix(word_ptr, bytecount, prepost);
|
||||
}
|
||||
|
||||
// QUADGRAM wrapper with surrounding underscores (offline use)
|
||||
// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
|
||||
// OVERSHOOTS up to 3 bytes
|
||||
// For offline construction of tables
|
||||
uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount) {
|
||||
if (bytecount == 0) {return 0;}
|
||||
const char* local_word_ptr = word_ptr;
|
||||
int local_bytecount = bytecount;
|
||||
uint32 prepost = 0;
|
||||
if (local_word_ptr[0] == '_') {
|
||||
prepost |= kPreSpaceIndicator;
|
||||
++local_word_ptr;
|
||||
--local_bytecount;
|
||||
}
|
||||
if (local_word_ptr[local_bytecount - 1] == '_') {
|
||||
prepost |= kPostSpaceIndicator;
|
||||
--local_bytecount;
|
||||
}
|
||||
return QuadHashV2Mix(local_word_ptr, local_bytecount, prepost);
|
||||
}
|
||||
|
||||
|
||||
// OCTAGRAM
|
||||
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
|
||||
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
||||
//
|
||||
// The low 32 bits follow the pattern from above, tuned to different scripts
|
||||
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
|
||||
// For runtime use of tables V3
|
||||
// Does X86 unaligned loads
|
||||
uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
|
||||
const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
|
||||
uint64 word0;
|
||||
uint64 word1;
|
||||
uint64 sum;
|
||||
|
||||
if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
|
||||
if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
|
||||
switch ((bytecount - 1) >> 2) {
|
||||
case 0: // 1..4 bytes
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];
|
||||
sum = word0;
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
break;
|
||||
case 1: // 5..8 bytes
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32);
|
||||
sum = word0;
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 << 4);
|
||||
word0 += word1;
|
||||
break;
|
||||
case 2: // 9..12 bytes
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32);
|
||||
sum = word0;
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 << 4);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3];
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 << 2);
|
||||
word0 += word1;
|
||||
break;
|
||||
case 3: // 13..16 bytes
|
||||
word0 =UNALIGNED_LOAD32(word_ptr32);
|
||||
sum = word0;
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 << 4);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 2);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 << 2);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 3) & kWordMask0[bytecount & 3];
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 >> 8);
|
||||
word0 += word1;
|
||||
break;
|
||||
case 4: // 17..20 bytes
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32);
|
||||
sum = word0;
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 << 4);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 2);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 << 2);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 3);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 >> 8);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 4) & kWordMask0[bytecount & 3];
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 >> 4);
|
||||
word0 += word1;
|
||||
break;
|
||||
default: // 21..24 bytes and higher (ignores beyond 24)
|
||||
word0 = UNALIGNED_LOAD32(word_ptr32);
|
||||
sum = word0;
|
||||
word0 = word0 ^ (word0 >> 3);
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 << 4);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 2);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 << 2);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 3);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 >> 8);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 4);
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 >> 4);
|
||||
word0 += word1;
|
||||
word1 = UNALIGNED_LOAD32(word_ptr32 + 5) & kWordMask0[bytecount & 3];
|
||||
sum += word1;
|
||||
word1 = word1 ^ (word1 >> 6);
|
||||
word0 += word1;
|
||||
break;
|
||||
}
|
||||
|
||||
sum += (sum >> 17); // extra 1-bit shift for bytes 2 & 3
|
||||
sum += (sum >> 9); // extra 1-bit shift for bytes 1 & 3
|
||||
sum = (sum & 0xff) << 32;
|
||||
return (word0 ^ prepost) + sum;
|
||||
}
|
||||
|
||||
// OCTAGRAM wrapper with surrounding spaces
|
||||
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
|
||||
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
||||
//
|
||||
// The low 32 bits follow the pattern from above, tuned to different scripts
|
||||
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
|
||||
// For runtime use of tables V3
|
||||
uint64 OctaHash40(const char* word_ptr, int bytecount) {
|
||||
if (bytecount == 0) {return 0;}
|
||||
uint64 prepost = 0;
|
||||
if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
|
||||
if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
|
||||
return OctaHash40Mix(word_ptr, bytecount, prepost);
|
||||
}
|
||||
|
||||
|
||||
// OCTAGRAM wrapper with surrounding underscores (offline use)
|
||||
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
|
||||
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
||||
//
|
||||
// The low 32 bits follow the pattern from above, tuned to different scripts
|
||||
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
|
||||
// For offline construction of tables
|
||||
uint64 OctaHash40underscore(const char* word_ptr, int bytecount) {
|
||||
if (bytecount == 0) {return 0;}
|
||||
const char* local_word_ptr = word_ptr;
|
||||
int local_bytecount = bytecount;
|
||||
uint64 prepost = 0;
|
||||
if (local_word_ptr[0] == '_') {
|
||||
prepost |= kPreSpaceIndicator;
|
||||
++local_word_ptr;
|
||||
--local_bytecount;
|
||||
}
|
||||
if (local_word_ptr[local_bytecount - 1] == '_') {
|
||||
prepost |= kPostSpaceIndicator;
|
||||
--local_bytecount;
|
||||
}
|
||||
return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);
|
||||
}
|
||||
|
||||
// Hash a consecutive pair of tokens/words A B
|
||||
// Old: hash is B - A, which gives too many false hits on one-char diffs
|
||||
// Now: rotate(A,13) + B
|
||||
uint64 PairHash(uint64 worda_hash, uint64 wordb_hash) {
|
||||
return ((worda_hash >> 13) | (worda_hash << (64 - 13))) + wordb_hash;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------//
|
||||
// Finding groups of 1/2/4/8 letters //
|
||||
//----------------------------------------------------------------------------//
|
||||
|
||||
// src points to a letter. Find the byte length of a unigram starting there.
|
||||
int UniLen(const char* src) {
|
||||
const char* src_end = src;
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
return src_end - src;
|
||||
}
|
||||
|
||||
// src points to a letter. Find the byte length of a bigram starting there.
|
||||
int BiLen(const char* src) {
|
||||
const char* src_end = src;
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
return src_end - src;
|
||||
}
|
||||
|
||||
// src points to a letter. Find the byte length of a quadgram starting there.
|
||||
int QuadLen(const char* src) {
|
||||
const char* src_end = src;
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
||||
return src_end - src;
|
||||
}
|
||||
|
||||
// src points to a letter. Find the byte length of an octagram starting there.
|
||||
int OctaLen(const char* src) {
|
||||
const char* src_end = src;
|
||||
int charcount = 0;
|
||||
while (src_end[0] != ' ') {
|
||||
src_end += UTF8OneCharLen(src);
|
||||
++charcount;
|
||||
if (charcount == 8) {break;}
|
||||
}
|
||||
return src_end - src;
|
||||
}
|
||||
|
||||
} // End namespace CLD2
|
||||
|
||||
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user