Initial CLD2 source upload.

git-svn-id: https://cld2.googlecode.com/svn/trunk@3 b252ecd4-b096-bf77-eb8e-91563289f87e
2013-06-24 23:52:45 +00:00
parent 84f0cff75d
commit af2750714b
65 changed files with 136486 additions and 0 deletions
--- a/internal/cldutil_shared.cc
+++ b/internal/cldutil_shared.cc
@@ -0,0 +1,437 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//
+// Author: dsites@google.com (Dick Sites)
+//
+
+#include "cldutil_shared.h"
+#include <string>
+
+#include "cld2tablesummary.h"
+#include "integral_types.h"
+#include "port.h"
+#include "utf8statetable.h"
+
+namespace CLD2 {
+
+// Runtime routines for hashing, looking up, and scoring
+// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
+// Unigrams and bigrams are for CJK languages only, including simplified/
+// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
+// Zhuang Han characters. Surrounding spaces are not considered.
+// Quadgrams and octagrams for for non-CJK and include two bits indicating
+// preceding and trailing spaces (word boundaries).
+
+
+// Indicator bits for leading/trailing space around quad/octagram
+// NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
+// 1-, 2-, or 3-bytes each.
+static const uint32 kPreSpaceIndicator =  0x00004444;
+static const uint32 kPostSpaceIndicator = 0x44440000;
+
+// Little-endian masks for 0..24 bytes picked up as uint32's
+static const uint32 kWordMask0[4] = {
+  0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
+};
+
+static const int kMinCJKUTF8CharBytes = 3;
+
+static const int kMinGramCount = 3;
+static const int kMaxGramCount = 16;
+
+static const int UTFmax = 4;        // Max number of bytes in a UTF-8 character
+
+
+// Routines to access a hash table of <key:wordhash, value:probs> pairs
+// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
+// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
+// bucket subscript.
+// Probs is a packed: three languages plus a subscript for probability table
+// Buckets have all the keys together, then all the values.Key array never
+// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
+// Match case may sometimes take an additional cache miss on value access.
+//
+// Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64
+// byte buckets with single cache miss.
+// Or 2-byte key and 6-byte value, allowing 5 languages instead  of three.
+
+
+//----------------------------------------------------------------------------//
+// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores      //
+//----------------------------------------------------------------------------//
+
+// Design principles for these hash functions
+// - Few operations
+// - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
+//   Latin script expect 1- and 2-byte mixtures.
+// - Last byte of each character has about 5 bits of information
+// - Spread good bits around so they can interact in at least two ways
+//   with other characters
+// - Use add for additional mixing thorugh carries
+
+// CJK Three-byte bigram
+//   ....dddd..cccccc..bbbbbb....aaaa
+//   ..................ffffff..eeeeee
+// make
+//   ....dddd..cccccc..bbbbbb....aaaa
+//   000....dddd..cccccc..bbbbbb....a
+//   ..................ffffff..eeeeee
+//   ffffff..eeeeee000000000000000000
+//
+// CJK Four-byte bigram
+//   ..dddddd..cccccc....bbbb....aaaa
+//   ..hhhhhh..gggggg....ffff....eeee
+// make
+//   ..dddddd..cccccc....bbbb....aaaa
+//   000..dddddd..cccccc....bbbb....a
+//   ..hhhhhh..gggggg....ffff....eeee
+//   ..ffff....eeee000000000000000000
+
+// BIGRAM
+// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
+// OVERSHOOTS up to 3 bytes
+// For runtime use of tables
+// Does X86 unaligned loads
+uint32 BiHashV2(const char* word_ptr, int bytecount) {
+  if (bytecount == 0) {return 0;}
+  const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
+  uint32 word0, word1;
+  if (bytecount <= 4) {
+    word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];
+    word0 = word0 ^ (word0 >> 3);
+    return word0;
+  }
+  // Else do 8 bytes
+  word0 = UNALIGNED_LOAD32(word_ptr32);
+  word0 = word0 ^ (word0 >> 3);
+  word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];
+  word1 = word1 ^ (word1 << 18);
+  return word0 + word1;
+}
+
+//
+// Ascii-7 One-byte chars
+//   ...ddddd...ccccc...bbbbb...aaaaa
+// make
+//   ...ddddd...ccccc...bbbbb...aaaaa
+//   000...ddddd...ccccc...bbbbb...aa
+//
+// Latin 1- and 2-byte chars
+//   ...ddddd...ccccc...bbbbb...aaaaa
+//   ...................fffff...eeeee
+// make
+//   ...ddddd...ccccc...bbbbb...aaaaa
+//   000...ddddd...ccccc...bbbbb...aa
+//   ...................fffff...eeeee
+//   ...............fffff...eeeee0000
+//
+// Non-CJK Two-byte chars
+//   ...ddddd...........bbbbb........
+//   ...hhhhh...........fffff........
+// make
+//   ...ddddd...........bbbbb........
+//   000...ddddd...........bbbbb.....
+//   ...hhhhh...........fffff........
+//   hhhh...........fffff........0000
+//
+// Non-CJK Three-byte chars
+//   ...........ccccc................
+//   ...................fffff........
+//   ...lllll...................iiiii
+// make
+//   ...........ccccc................
+//   000...........ccccc.............
+//   ...................fffff........
+//   ...............fffff........0000
+//   ...lllll...................iiiii
+//   .lllll...................iiiii00
+//
+
+// QUADGRAM
+// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
+// OVERSHOOTS up to 3 bytes
+// For runtime use of tables
+// Does X86 unaligned loads
+uint32 QuadHashV2Mix(const char* word_ptr, int bytecount, uint32 prepost) {
+  const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
+  uint32 word0, word1, word2;
+  if (bytecount <= 4) {
+    word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];
+    word0 = word0 ^ (word0 >> 3);
+    return word0 ^ prepost;
+  } else if (bytecount <= 8) {
+    word0 = UNALIGNED_LOAD32(word_ptr32);
+    word0 = word0 ^ (word0 >> 3);
+    word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];
+    word1 = word1 ^ (word1 << 4);
+    return (word0 ^ prepost) + word1;
+  }
+  // else do 12 bytes
+  word0 = UNALIGNED_LOAD32(word_ptr32);
+  word0 = word0 ^ (word0 >> 3);
+  word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
+  word1 = word1 ^ (word1 << 4);
+  word2 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3];
+  word2 = word2 ^ (word2 << 2);
+  return (word0 ^ prepost) + word1 + word2;
+}
+
+
+// QUADGRAM wrapper with surrounding spaces
+// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
+// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
+// For runtime use of tables
+uint32 QuadHashV2(const char* word_ptr, int bytecount) {
+  if (bytecount == 0) {return 0;}
+  uint32 prepost = 0;
+  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
+  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
+  return QuadHashV2Mix(word_ptr, bytecount, prepost);
+}
+
+// QUADGRAM wrapper with surrounding underscores (offline use)
+// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
+// OVERSHOOTS up to 3 bytes
+// For offline construction of tables
+uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount) {
+  if (bytecount == 0) {return 0;}
+  const char* local_word_ptr = word_ptr;
+  int local_bytecount = bytecount;
+  uint32 prepost = 0;
+  if (local_word_ptr[0] == '_') {
+    prepost |= kPreSpaceIndicator;
+    ++local_word_ptr;
+    --local_bytecount;
+  }
+  if (local_word_ptr[local_bytecount - 1] == '_') {
+    prepost |= kPostSpaceIndicator;
+    --local_bytecount;
+  }
+  return QuadHashV2Mix(local_word_ptr, local_bytecount, prepost);
+}
+
+
+// OCTAGRAM
+// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
+// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
+//
+// The low 32 bits follow the pattern from above, tuned to different scripts
+// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
+// For runtime use of tables V3
+// Does X86 unaligned loads
+uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
+  const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
+  uint64 word0;
+  uint64 word1;
+  uint64 sum;
+
+  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
+  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
+  switch ((bytecount - 1) >> 2) {
+  case 0:       // 1..4 bytes
+    word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];
+    sum = word0;
+    word0 = word0 ^ (word0 >> 3);
+    break;
+  case 1:       // 5..8 bytes
+    word0 = UNALIGNED_LOAD32(word_ptr32);
+    sum = word0;
+    word0 = word0 ^ (word0 >> 3);
+    word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];
+    sum += word1;
+    word1 = word1 ^ (word1 << 4);
+    word0 += word1;
+    break;
+  case 2:       // 9..12 bytes
+    word0 = UNALIGNED_LOAD32(word_ptr32);
+    sum = word0;
+    word0 = word0 ^ (word0 >> 3);
+    word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
+    sum += word1;
+    word1 = word1 ^ (word1 << 4);
+    word0 += word1;
+    word1 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3];
+    sum += word1;
+    word1 = word1 ^ (word1 << 2);
+    word0 += word1;
+    break;
+  case 3:       // 13..16 bytes
+    word0 =UNALIGNED_LOAD32(word_ptr32);
+    sum = word0;
+    word0 = word0 ^ (word0 >> 3);
+    word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
+    sum += word1;
+    word1 = word1 ^ (word1 << 4);
+    word0 += word1;
+    word1 = UNALIGNED_LOAD32(word_ptr32 + 2);
+    sum += word1;
+    word1 = word1 ^ (word1 << 2);
+    word0 += word1;
+    word1 = UNALIGNED_LOAD32(word_ptr32 + 3) & kWordMask0[bytecount & 3];
+    sum += word1;
+    word1 = word1 ^ (word1 >> 8);
+    word0 += word1;
+    break;
+  case 4:       // 17..20 bytes
+    word0 = UNALIGNED_LOAD32(word_ptr32);
+    sum = word0;
+    word0 = word0 ^ (word0 >> 3);
+    word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
+    sum += word1;
+    word1 = word1 ^ (word1 << 4);
+    word0 += word1;
+    word1 = UNALIGNED_LOAD32(word_ptr32 + 2);
+    sum += word1;
+    word1 = word1 ^ (word1 << 2);
+    word0 += word1;
+    word1 = UNALIGNED_LOAD32(word_ptr32 + 3);
+    sum += word1;
+    word1 = word1 ^ (word1 >> 8);
+    word0 += word1;
+    word1 = UNALIGNED_LOAD32(word_ptr32 + 4) & kWordMask0[bytecount & 3];
+    sum += word1;
+    word1 = word1 ^ (word1 >> 4);
+    word0 += word1;
+    break;
+  default:      // 21..24 bytes and higher (ignores beyond 24)
+    word0 = UNALIGNED_LOAD32(word_ptr32);
+    sum = word0;
+    word0 = word0 ^ (word0 >> 3);
+    word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
+    sum += word1;
+    word1 = word1 ^ (word1 << 4);
+    word0 += word1;
+    word1 = UNALIGNED_LOAD32(word_ptr32 + 2);
+    sum += word1;
+    word1 = word1 ^ (word1 << 2);
+    word0 += word1;
+    word1 = UNALIGNED_LOAD32(word_ptr32 + 3);
+    sum += word1;
+    word1 = word1 ^ (word1 >> 8);
+    word0 += word1;
+    word1 = UNALIGNED_LOAD32(word_ptr32 + 4);
+    sum += word1;
+    word1 = word1 ^ (word1 >> 4);
+    word0 += word1;
+    word1 = UNALIGNED_LOAD32(word_ptr32 + 5) & kWordMask0[bytecount & 3];
+    sum += word1;
+    word1 = word1 ^ (word1 >> 6);
+    word0 += word1;
+    break;
+  }
+
+  sum += (sum >> 17);             // extra 1-bit shift for bytes 2 & 3
+  sum += (sum >> 9);              // extra 1-bit shift for bytes 1 & 3
+  sum = (sum & 0xff) << 32;
+  return (word0 ^ prepost) + sum;
+}
+
+// OCTAGRAM wrapper with surrounding spaces
+// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
+// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
+//
+// The low 32 bits follow the pattern from above, tuned to different scripts
+// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
+// For runtime use of tables V3
+uint64 OctaHash40(const char* word_ptr, int bytecount) {
+  if (bytecount == 0) {return 0;}
+  uint64 prepost = 0;
+  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
+  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
+  return OctaHash40Mix(word_ptr, bytecount, prepost);
+}
+
+
+// OCTAGRAM wrapper with surrounding underscores (offline use)
+// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
+// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
+//
+// The low 32 bits follow the pattern from above, tuned to different scripts
+// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
+// For offline construction of tables
+uint64 OctaHash40underscore(const char* word_ptr, int bytecount) {
+  if (bytecount == 0) {return 0;}
+  const char* local_word_ptr = word_ptr;
+  int local_bytecount = bytecount;
+  uint64 prepost = 0;
+  if (local_word_ptr[0] == '_') {
+    prepost |= kPreSpaceIndicator;
+    ++local_word_ptr;
+    --local_bytecount;
+  }
+  if (local_word_ptr[local_bytecount - 1] == '_') {
+    prepost |= kPostSpaceIndicator;
+    --local_bytecount;
+  }
+  return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);
+}
+
+// Hash a consecutive pair of tokens/words A B
+// Old: hash is B - A, which gives too many false hits on one-char diffs
+// Now: rotate(A,13) + B
+uint64 PairHash(uint64 worda_hash, uint64 wordb_hash) {
+   return ((worda_hash >> 13) | (worda_hash << (64 - 13))) + wordb_hash;
+}
+
+
+
+
+//----------------------------------------------------------------------------//
+// Finding groups of 1/2/4/8 letters                                          //
+//----------------------------------------------------------------------------//
+
+// src points to a letter. Find the byte length of a unigram starting there.
+int UniLen(const char* src) {
+  const char* src_end = src;
+  src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
+  return src_end - src;
+}
+
+// src points to a letter. Find the byte length of a bigram starting there.
+int BiLen(const char* src) {
+  const char* src_end = src;
+  src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
+  src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
+  return src_end - src;
+}
+
+// src points to a letter. Find the byte length of a quadgram starting there.
+int QuadLen(const char* src) {
+  const char* src_end = src;
+  src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
+  src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
+  src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
+  src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
+  return src_end - src;
+}
+
+// src points to a letter. Find the byte length of an octagram starting there.
+int OctaLen(const char* src) {
+  const char* src_end = src;
+  int charcount = 0;
+  while (src_end[0] != ' ') {
+    src_end += UTF8OneCharLen(src);
+    ++charcount;
+    if (charcount == 8) {break;}
+  }
+  return src_end - src;
+}
+
+}       // End namespace CLD2
+
+
+
+
+