cld2/internal/getonescriptspan.cc

// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// Author: dsites@google.com (Dick Sites)
//


#include "getonescriptspan.h"
#include <sys/time.h>                               // for gettimeofday
#include <string.h>

#include "fixunicodevalue.h"
#include "lang_script.h"
#include "port.h"
#include "utf8statetable.h"

#include "utf8prop_lettermarkscriptnum.h"
#include "utf8repl_lettermarklower.h"
#include "utf8scannot_lettermarkspecial.h"


namespace CLD2 {

// HTML entity name table, defined in generated_entities.cc.
// Entries are in alphabetical order by name so that LookupEntity() can
// binary-search them; kNameToEntitySize is passed as the search upper bound.
extern const int kNameToEntitySize;
extern const CharIntPair kNameToEntity[];

// Word-boundary rounding parameters.
// NOTE(review): neither constant is referenced in the part of this file
// visible here; the descriptions below restate the original notes.
static const int kMaxUpToWordBoundary = 50;       // a span shorter than this is
                                                  // made longer, else shorter
static const int kMaxAdvanceToWordBoundary = 10;  // move at most this many bytes
                                                  // to reach a word boundary, in
                                                  // the direction chosen above

// Byte-indexed flag table: nonzero only for the three HTML-significant
// ASCII characters '&' (0x26), '<' (0x3C) and '>' (0x3E).
// All nonzero entries fall in the 0x20..0x3F row group, which is why
// IsSpecial() can pre-filter on (c & 0xe0) == 0x20 before indexing.
static const char kSpecialSymbol[256] = {       // true for < > &
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};


// Character categories. Each value is a column index into a row of
// kTagParseTbl_0 (20 columns per row). The short names exist only to keep
// the kCharToSub initializer readable and are #undef'd right after it.
#define LT 0      // <
#define GT 1      // >
#define EX 2      // !
#define HY 3      // -
#define QU 4      // "
#define AP 5      // '
#define SL 6      // /
#define S_ 7      // S or s
#define C_ 8      // C or c
#define R_ 9      // R or r
#define I_ 10     // I or i
#define P_ 11     // P or p
#define T_ 12     // T or t
#define Y_ 13     // Y or y
#define L_ 14     // L or l
#define E_ 15     // E or e
#define CR 16     // <cr> or <lf>
#define NL 17     // non-letter: ASCII whitespace, digit, punctuation
#define PL 18     // possible letter, incl. &
#define xx 19     // <unused>

// Map byte to one of ~20 interesting categories for cheap tag parsing.
// Upper- and lower-case ASCII letters map to the same category, so tag-name
// matching (SCRIPT, STYLE) is case-insensitive. Bytes 0x80..0xBF are
// classified NL and bytes 0xC0..0xFF are classified PL.
static const uint8 kCharToSub[256] = {
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,

  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,

  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,

  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
};

#undef LT
#undef GT
#undef EX
#undef HY
#undef QU
#undef AP
#undef SL
#undef S_
#undef C_
#undef R_
#undef I_
#undef P_
#undef T_
#undef Y_
#undef L_
#undef E_
#undef CR
#undef NL
#undef PL
#undef xx

// Readable names for the two exit states inside the table initializer;
// both are #undef'd right after the table.
#define OK 0      // state [0]: normal exit
#define X_ 1      // state [1]: error exit


// Largest state number that terminates a scan, for each scanning mode.
// ScanToPossibleLetter() stops as soon as the next state is <= this value.
static const int kMaxExitStateLettersMarksOnly = 1;   // exit on [0] or [1]
static const int kMaxExitStateAllText = 2;            // also exit on [2]


// State machine to do cheap parse of non-letter strings incl. tags
// advances <tag>
//          |    |
// advances <tag> ... </tag>  for <script> <style>
//          |               |
// advances <!-- ... <tag> ... -->
//          |                     |
// advances <tag
//          ||  (0)
// advances <tag <tag2>
//          ||  (0)
//
// We start in state [0] at a non-letter and make at least one transition
// When scanning for just letters, arriving back at state [0] or [1] exits
//   the state machine.
// When scanning for any non-tag text, arriving at state [2] also exits
//
// Layout: one row of 20 entries per state; the column is the category from
// kCharToSub and the entry is the next state. Row N starts at index N * 20.
static const uint8 kTagParseTbl_0[] = {
// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK    exit state
  X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error exit state
   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*   [exit state]
  X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
  X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
  X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
   6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
   6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
   6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
  10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
  11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
  X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
  X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
  X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
  20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
  19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
  19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 21,21,19,X_, // [21] <SCRIPT .*</ allow SP CR LF
  19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
  19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
  19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
  19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
  19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
  19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT

// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
  X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
  33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
  32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
  32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 34,34,32,X_, // [34] <STYLE .*</ allow SP CR LF
  32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
  32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
};

#undef OK
#undef X_

// UTF-8 / rune limits. In the visible part of this file only runetochar()
// uses them (Runemax and Runeerror for its range check).
enum
{
  UTFmax        = 4,            // maximum bytes per rune
  Runesync      = 0x80,         // cannot represent part of a UTF sequence (<)
  Runeself      = 0x80,         // rune and UTF sequences are the same (<)
  Runeerror     = 0xFFFD,       // decoding error in UTF
  Runemax       = 0x10FFFF,     // maximum rune value
};

// Debugging. Not thread safe: the result lives in one shared static buffer
// that is overwritten by every call.
static char gDisplayPiece[32];
// UTF-8 sequence length indexed by the top four bits of the first byte.
const unsigned char gCharlen[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 2,2,3,4};

// Copies up to 8 UTF-8 characters from next_byte_[0..byte_length_) into the
// static debug buffer, HTML-escaping < > & ' ", and returns a pointer to the
// NUL-terminated result. Copying stops early when a character would not fit
// in the remaining input or when the buffer is nearly full.
char* DisplayPiece(const char* next_byte_, int byte_length_) {
  const int kBufSize = static_cast<int>(sizeof(gDisplayPiece));
  int k = 0;    // byte count
  int n = 0;    // character count
  for (int i = 0; i < byte_length_; ++i) {
    char c = next_byte_[i];
    if ((c & 0xc0) != 0x80) {
      // Beginning of a UTF-8 character
      int charlen = gCharlen[static_cast<unsigned char>(c) >> 4];
      if (i + charlen > byte_length_) {break;} // Not enough room for full char
      if (k >= (kBufSize - 7)) {break;}        // Not necessarily enough room
      if (n >= 8) {break;}                     // Enough characters already
      ++n;
    } else if (k >= (kBufSize - 1)) {
      // Continuation byte. In well-formed input the room check at the start
      // of the character already covers it, but a run of stray continuation
      // bytes would otherwise write past the end of the buffer.
      break;
    }
    if (c == '<') {
      memcpy(&gDisplayPiece[k], "&lt;", 4); k += 4;
    } else if (c == '>') {
      memcpy(&gDisplayPiece[k], "&gt;", 4); k += 4;
    } else if (c == '&') {
      memcpy(&gDisplayPiece[k], "&amp;", 5); k += 5;
    } else if (c == '\'') {
      memcpy(&gDisplayPiece[k], "&apos;", 6); k += 6;
    } else if (c == '"') {
      memcpy(&gDisplayPiece[k], "&quot;", 6); k += 6;
    } else {
      gDisplayPiece[k++] = c;
    }
  }
  gDisplayPiece[k] = '\0';      // k <= kBufSize - 1 here
  return gDisplayPiece;
}


// runetochar encodes the single rune pointed to by rune as UTF-8, writing
// at most UTFmax bytes at str, and returns the number of bytes written.
// No NUL terminator is written. A value above Runemax is encoded as
// Runeerror instead.
int runetochar(char *str, const char32 *rune) {
  // Hold the value as unsigned for the range comparisons below.
  unsigned long cp = *rune;

  if (cp > 0x07FF) {
    // Three or four bytes; first replace out-of-range values.
    if (cp > Runemax) {
      cp = Runeerror;
    }
    if (cp > 0xFFFF) {
      // 4 bytes: 10000-10FFFF
      str[0] = 0xF0 | (cp >> 18);
      str[1] = 0x80 | ((cp >> 12) & 0x3F);
      str[2] = 0x80 | ((cp >> 6) & 0x3F);
      str[3] = 0x80 | (cp & 0x3F);
      return 4;
    }
    // 3 bytes: 0800-FFFF
    str[0] = 0xE0 | (cp >> 12);
    str[1] = 0x80 | ((cp >> 6) & 0x3F);
    str[2] = 0x80 | (cp & 0x3F);
    return 3;
  }

  if (cp > 0x7F) {
    // 2 bytes: 0080-07FF
    str[0] = 0xC0 | (cp >> 6);
    str[1] = 0x80 | (cp & 0x3F);
    return 2;
  }

  // 1 byte: 00-7F
  str[0] = cp;
  return 1;
}


// Convert a gettimeofday() result to a single 64-bit microsecond count:
// seconds scaled by one million plus the microsecond remainder.
// Integer (not floating-point) arithmetic is used throughout.
static inline uint64 Microseconds(const struct timeval& t) {
  const unsigned long long kUsecPerSecond = 1000000ULL;
  return t.tv_sec * kUsecPerSecond + t.tv_usec;
}


// Useful for converting an entity to an ascii value.
// entity_name points at the name without the leading '&' or trailing ';'
// and need not be NUL-terminated; entity_len is its length in bytes.
// RETURNS the value stored for that name in kNameToEntity, or -1 if the
// name is not found or the length is out of range.
int LookupEntity(const char* entity_name, int entity_len) {
  // Make a C string for BinarySearch.
  char temp[16];
  // All real entities are shorter than the buffer. A negative length must be
  // rejected too, or it would reach memcpy as a huge unsigned size.
  if ((entity_len < 0) || (entity_len >= static_cast<int>(sizeof(temp)))) {
    return -1;
  }
  memcpy(temp, entity_name, entity_len);
  temp[entity_len] = '\0';
  int match = BinarySearch(temp, 0, kNameToEntitySize, kNameToEntity);
  if (match >= 0) {return kNameToEntity[match].i;}
  return -1;
}

// True for the ASCII decimal digits '0'..'9' only (locale-independent).
bool ascii_isdigit(char c) {
  // One unsigned comparison covers both ends of the range: anything below
  // '0' wraps to a large value.
  return static_cast<unsigned int>(c - '0') <= 9u;
}
// True for ASCII hexadecimal digits: '0'..'9', 'a'..'f', 'A'..'F'.
bool ascii_isxdigit(char c) {
  const bool is_decimal = ('0' <= c) && (c <= '9');
  // Setting bit 0x20 folds 'A'..'F' onto 'a'..'f' and leaves lower case alone.
  const int folded = c | 0x20;
  const bool is_hex_letter = ('a' <= folded) && (folded <= 'f');
  return is_decimal || is_hex_letter;
}
// True for ASCII letters and decimal digits: '0'..'9', 'a'..'z', 'A'..'Z'.
bool ascii_isalnum(char c) {
  const bool is_decimal = ('0' <= c) && (c <= '9');
  // Setting bit 0x20 folds 'A'..'Z' onto 'a'..'z' and leaves lower case alone.
  const int folded = c | 0x20;
  const bool is_letter = ('a' <= folded) && (folded <= 'z');
  return is_decimal || is_letter;
}
// Returns the numeric value (0..15) of an ASCII hexadecimal digit in either
// case. Any other character yields 0, so callers that must tell "0" from
// "not a hex digit" need to check with ascii_isxdigit() first.
int hex_digit_to_int(char c) {
  const int decimal = c - '0';
  if ((0 <= decimal) && (decimal <= 9)) {
    return decimal;
  }
  // Fold upper case onto lower case, then measure the distance from 'a'.
  const int alpha = (c | 0x20) - 'a';
  return ((0 <= alpha) && (alpha <= 5)) ? (alpha + 10) : 0;
}

// Parses a run of decimal digits in [nptr, limit).
// Leading zeros are skipped. If no nonzero digit follows, returns -1 and
// leaves *endptr at the original nptr. Otherwise *endptr is set just past the
// last digit and the value is returned after FixUnicodeValue(); digit runs
// that fail the range test below return the replacement character 0xFFFD.
static int32 strto32_base10(const char* nptr, const char* limit,
                            const char **endptr) {
  *endptr = nptr;

  const char* first = nptr;
  for (; (first < limit) && (*first == '0'); ++first) {
    // skip leading zeros
  }
  if ((first == limit) || !ascii_isdigit(*first)) {
    return -1;
  }

  // *first is known to be a digit, so the run is at least one byte long.
  const char* stop = first;
  do {
    ++stop;
  } while ((stop < limit) && ascii_isdigit(*stop));
  *endptr = stop;

  const int num_digits = stop - first;
  // kint32max == 2147483647.
  // Note: a run of exactly 9 digits fails this test and takes the 0xFFFD path.
  const bool in_range =
      (num_digits < 9) ||
      ((num_digits == 10) && (memcmp(first, "2147483647", 10) <= 0));
  if (!in_range) {
    // Overflow: can't fit in an int32;
    // returns the replacement character 0xFFFD.
    return 0xFFFD;
  }

  int value = 0;
  for (const char* p = first; p < stop; ++p) {
    value = (value * 10) + (*p - '0');
  }
  // Overflow past the last valid unicode codepoint
  // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
  return FixUnicodeValue(value);
}

// Parses a run of hexadecimal digits in [nptr, limit).
// Leading zeros are skipped. If no nonzero hex digit follows, returns -1 and
// leaves *endptr at the original nptr. Otherwise *endptr is set just past the
// last hex digit and the value is returned after FixUnicodeValue(); runs too
// large for a positive int32 return the replacement character 0xFFFD.
static int32 strto32_base16(const char* nptr, const char* limit,
                            const char **endptr) {
  *endptr = nptr;

  const char* first = nptr;
  for (; (first < limit) && (*first == '0'); ++first) {
    // skip leading zeros
  }
  if ((first == limit) || !ascii_isxdigit(*first)) {
    return -1;
  }

  // *first is known to be a hex digit, so the run is at least one byte long.
  const char* stop = first;
  do {
    ++stop;
  } while ((stop < limit) && ascii_isxdigit(*stop));
  *endptr = stop;

  const int num_xdigits = stop - first;
  // kint32max == 0x7FFFFFFF.
  const bool in_range =
      (num_xdigits < 8) || ((num_xdigits == 8) && (first[0] < '8'));
  if (!in_range) {
    // Overflow: can't fit in an int32;
    // returns the replacement character 0xFFFD.
    return 0xFFFD;
  }

  int value = 0;
  for (const char* p = first; p < stop; ++p) {
    value = (value << 4) + hex_digit_to_int(*p);
  }
  // Overflow past the last valid unicode codepoint
  // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
  return FixUnicodeValue(value);
}

// Unescape the entity that starts at src, looking at no more than srcn bytes.
// RETURNS the entity's value and SETS *src_consumed to the number of source
// bytes it occupies, including the '&' and a terminating ';' if present.
// If src isn't a valid entity, just consume the & (*src_consumed = 1) and
// RETURN -1.  If src doesn't point to & -- which it should -- or srcn is not
// positive, set *src_consumed to 0 and RETURN -1.
int ReadEntity(const char* src, int srcn, int* src_consumed) {
  const char* const srcend = src + srcn;

  if (srcn <= 0 || *src != '&') {      // input should start with an ampersand
    *src_consumed = 0;
    return -1;
  }
  *src_consumed = 1;                   // we'll get the & at least

  // The standards are a bit unclear on when an entity ends.  Certainly a ";"
  // ends one, but spaces probably do too.  We follow the lead of both IE and
  // Netscape, which as far as we can tell end numeric entities (1st case below)
  // at any non-digit, and end character entities (2nd case) at any non-alnum.
  const char* entstart, *entend;  // where the entity starts and ends
  entstart = src + 1;             // read past the &
  int entval;                     // UCS2 value of the entity
  // Only look at *entstart if it is inside the input: when the '&' is the
  // very last byte, entstart == srcend and must not be dereferenced.
  if ( entstart < srcend && *entstart == '#' ) {   // -- 1st case: numeric
    if ( entstart + 2 >= srcend ) {
      return -1;                  // no way a legitimate number could fit
    } else if ( entstart[1] == 'x' || entstart[1] == 'X' ) {   // hex numeric
      entval = strto32_base16(entstart + 2, srcend, &entend);
    } else {                                  // decimal numeric entity
      entval = strto32_base10(entstart+1, srcend, &entend);
    }
    if (entval == -1 || entend > srcend) {
      return -1;                 // not entirely correct, but close enough
    }
  } else {                       // -- 2nd case: character entity
    for (entend = entstart;
         entend < srcend && ascii_isalnum(*entend);
         ++entend ) {
      // entity consists of alphanumeric chars
    }
    entval = LookupEntity(entstart, entend - entstart);
    if (entval < 0) {
      return -1;  // not a legal entity name
    }
    // Now we do a strange-seeming IE6-compatibility check: if entval is
    // >= 256, it *must* be followed by a semicolon or it's not considered
    // an entity.  The problem is lots of the newfangled entity names, like
    // "lang", also occur in URL CGI arguments: "/search?q=test&lang=en".
    // When these links are written in HTML, it would be really bad if the
    // "&lang" were treated as an entity, which is what the spec says
    // *should* happen (even when the HTML is inside an "A HREF" tag!)
    // IE ignores the spec for these new, high-value entities, so we do too.
    if ( entval >= 256 && !(entend < srcend && *entend == ';') ) {
      return -1;                 // make non-;-terminated entity illegal
    }
  }

  // Finally, figure out how much src was consumed
  if ( entend < srcend && *entend == ';' ) {
    entend++;                    // standard says ; terminator is special
  }
  *src_consumed = entend - src;
  return entval;
}


// Src points to '&'
// Writes entity value to dst. Returns take(src), put(dst) byte counts
void EntityToBuffer(const char* src, int len, char* dst,
                    int* tlen, int* plen) {
  char32 entval = ReadEntity(src, len, tlen);

  // ReadEntity does this already: entval = FixUnicodeValue(entval);

  // Convert UTF-32 to UTF-8
  if (entval > 0) {
    *plen = runetochar(dst, &entval);
  } else {
    // Illegal entity; ignore the '&'
    *tlen = 1;
    *plen = 0;
  }
}

// Returns true if character is < > or &, none of which are letters.
// Only bytes 0x20..0x3F can be special, so that range is tested first and
// kSpecialSymbol is consulted only for bytes inside it.
bool inline IsSpecial(char c) {
  const bool in_punctuation_rows = ((c & 0xe0) == 0x20);
  return in_punctuation_rows && kSpecialSymbol[static_cast<uint8>(c)];
}

// Quick Skip to next letter or < > & or to end of string (eos)
// Always return is_letter for eos
int ScanToLetterOrSpecial(const char* src, int len) {
  int bytes_consumed;
  StringPiece str(src, len);
  UTF8GenericScan(&utf8scannot_lettermarkspecial_obj, str, &bytes_consumed);
  return bytes_consumed;
}


// src points to non-letter, such as tag-opening '<'
// Return length from here to next possible letter
// On eos or another < before >, return 1
// advances <tag>
//          |    |
// advances <tag> ... </tag>  for <script> <style>
//          |               |
// advances <!-- ... <tag> ... -->
//          |                     |
// advances <tag
//          ||  (1)
// advances <tag <tag2>
//          ||  (1)
int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) {
  const uint8* src = reinterpret_cast<const uint8*>(isrc);
  const uint8* srclimit = src + len;
  const uint8* tagParseTbl = kTagParseTbl_0;
  int e = 0;
  while (src < srclimit) {
    e = tagParseTbl[kCharToSub[*src++]];
    if (e <= max_exit_state) {
      // We overshot by one byte
      --src;
      break;
    }
    tagParseTbl = &kTagParseTbl_0[e * 20];
  }

  if (src >= srclimit) {
    // We fell off the end of the text.
    // It looks like the most common case for this is a truncated file, not
    // mismatched angle brackets. So we pretend that the last char was '>'
    return len;
  }

  // OK to be in state 0 or state 2 at exit
  if ((e != 0) && (e != 2)) {
    // Error, '<' followed by '<'
    // We want to back up to first <, then advance by one byte past it
    int offset = src - reinterpret_cast<const uint8*>(isrc);

    // Backscan to first '<' and return enough length to just get past it
    --offset;   // back up over the second '<', which caused us to stop
    while ((0 < offset) && (isrc[offset] != '<')) {
      // Find the first '<', which is unmatched
      --offset;
    }
    // skip to just beyond first '<'
    return offset + 1;
  }

  return src - reinterpret_cast<const uint8*>(isrc);
}


// Basic constructor: scans buffer[0..buffer_length) for spans of letters and
// marks in a single script (letters_marks_only_ and one_script_only_ are
// both true). is_plain_text disables special handling of < > & in the scan
// routines. The input buffer is only pointed to, not copied.
ScriptScanner::ScriptScanner(const char* buffer,
                             int buffer_length,
                             bool is_plain_text)
  : start_byte_(buffer),
  next_byte_(buffer),
  next_byte_limit_(buffer + buffer_length),
  byte_length_(buffer_length),
  is_plain_text_(is_plain_text),
  letters_marks_only_(true),
  one_script_only_(true),
  exit_state_(kMaxExitStateLettersMarksOnly) {
    // Output buffers are heap-allocated here and released in the destructor.
    script_buffer_ = new char[kMaxScriptBuffer];
    script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
    map2original_.Clear();    // map from script_buffer_ to buffer
    map2uplow_.Clear();       // map from script_buffer_lower_ to script_buffer_
}

// Extended version to allow spans of any non-tag text and spans of mixed script
// any_text:   when true, letters_marks_only_ is false and the tag scanner
//             also exits on state [2] (kMaxExitStateAllText).
// any_script: when true, one_script_only_ is false.
// The input buffer is only pointed to, not copied.
ScriptScanner::ScriptScanner(const char* buffer,
                             int buffer_length,
                             bool is_plain_text,
                             bool any_text,
                             bool any_script)
  : start_byte_(buffer),
  next_byte_(buffer),
  next_byte_limit_(buffer + buffer_length),
  byte_length_(buffer_length),
  is_plain_text_(is_plain_text),
  letters_marks_only_(!any_text),
  one_script_only_(!any_script),
  exit_state_(any_text ? kMaxExitStateAllText : kMaxExitStateLettersMarksOnly) {
    // Output buffers are heap-allocated here and released in the destructor.
    script_buffer_ = new char[kMaxScriptBuffer];
    script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
    map2original_.Clear();    // map from script_buffer_ to buffer
    map2uplow_.Clear();       // map from script_buffer_lower_ to script_buffer_
}


// Releases the two output buffers allocated with new[] in the constructors.
// The input buffer passed to the constructor is not owned and is not freed.
ScriptScanner::~ScriptScanner() {
  delete[] script_buffer_;
  delete[] script_buffer_lower_;
}


// Get to the first real non-tag letter or entity that is a letter.
// Scans src[0..len), skipping non-letters, tags and entities that do not
// expand to a letter (tags/entities only when !is_plain_text_).
// Sets *script to the script number of the letter found, or to the
// non-letter value 0 if there is none.
// Returns the byte offset of that letter, or len if no more letters.
int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
  int sc = UNKNOWN_ULSCRIPT;
  int skip = 0;
  int tlen, plen;

  // Do run of non-letters (tag | &NL | NL)*
  tlen = 0;
  while (skip < len) {
    // Do fast scan to next interesting byte
    skip += ScanToLetterOrSpecial(src + skip, len - skip);

    // Check for no more letters/specials
    if (skip >= len) {
      // All done
      *script = sc;
      return len;
    }

    // We are at a letter, nonletter, tag, or entity
    if (IsSpecial(src[skip]) && !is_plain_text_) {
      if (src[skip] == '<') {
        // Beginning of tag; skip to end and go around again
        tlen = ScanToPossibleLetter(src + skip, len - skip,
                                    exit_state_);
        sc = 0;
      } else if (src[skip] == '>') {
        // Unexpected end of tag; skip it and go around again
        tlen = 1;         // Over the >
        sc = 0;
      } else if (src[skip] == '&') {
        // Expand entity, no advance
        char temp[4];
        EntityToBuffer(src + skip, len - skip,
                       temp, &tlen, &plen);
        // An illegal entity writes nothing to temp (plen == 0); classifying
        // temp then would read uninitialized bytes and could report a
        // spurious letter. Treat it as a non-letter, like the tag cases.
        sc = (plen > 0) ? GetUTF8LetterScriptNum(temp) : 0;
      }
    } else {
      // Update 1..4 bytes
      tlen = UTF8OneCharLen(src + skip);
      sc = GetUTF8LetterScriptNum(src + skip);
    }
    if (sc != 0) {break;}           // Letter found
    skip += tlen;                   // Else advance
  }

  // A truncated multi-byte character at the very end can advance skip past
  // len; never report more bytes than the caller supplied.
  if (skip > len) {skip = len;}

  *script = sc;
  return skip;
}


// For ASCII-only tag names.
// Compares uplow against c ignoring the case of uplow; c itself must
// already be lower case for a match to be possible.
inline bool EqCase(char uplow, char c) {
  const int folded = uplow | 0x20;    // force the ASCII lower-case bit
  return folded == c;
}

// For ASCII-only tag names.
// Returns true for any char value below 0x40 (space, <, >, digits, etc.),
// i.e. anything that cannot continue an ASCII tag name.
inline bool NeqLetter(char c) {
  return !(c >= 0x40);
}

// For ASCII-only tag names.
// Returns true for space and \n only; \r and tab are not counted.
inline bool WS(char c) {
  switch (c) {
    case ' ':
    case '\n':
      return true;
    default:
      return false;
  }
}

// Canonical line break: GetOneTextSpan maps both '\r' and '\n' in the
// input to this single value before copying.
static const char LF = '\n';


// The naive loop scans from next_byte_ to script_buffer_ until full.
// But this can leave an awkward hard-to-identify short fragment at the
// end of the input. We would prefer to make the next-to-last fragment
// shorter and the last fragment longer.

// Copy next run of non-tag characters to buffer [NUL terminated]
// This just replaces tags with space or \n and removes entities.
// Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences
// including \r or \n are replaced by \n. All other tags and skipped text
// are replaced with ASCII space.
//
// Buffer ALWAYS has leading space and trailing space space space NUL
//
// Fills in *span (text, text_bytes, offset, truncated; ulscript and lang are
// set to UNKNOWN) and advances next_byte_ / byte_length_ past the input
// consumed. Returns false if there is no input left, true otherwise.
bool ScriptScanner::GetOneTextSpan(LangSpan* span) {
  span->text = script_buffer_;
  span->text_bytes = 0;
  span->offset = next_byte_ - start_byte_;
  span->ulscript = UNKNOWN_ULSCRIPT;
  span->lang = UNKNOWN_LANGUAGE;
  span->truncated = false;

  int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
  if ((kMaxScriptBytes <= byte_length_) &&
      (byte_length_ < (2 * kMaxScriptBytes))) {
    // Try to split the last two fragments in half
    put_soft_limit = byte_length_ / 2;
  }

  script_buffer_[0] = ' ';  // Always a space at front of output
  script_buffer_[1] = '\0';
  int take = 0;
  int put = 1;              // Start after the initial space
  int tlen, plen;

  if (byte_length_ <= 0) {
    return false;          // No more text to be found
  }

  // Go over alternating spans of text and tags,
  // copying letters to buffer with single spaces for each run of non-letters
  bool last_byte_was_space = false;
  while (take < byte_length_) {
    char c = next_byte_[take];
    if (c == '\r') {c = LF;}      // Canonical CR or LF
    if (c == '\n') {c = LF;}      // Canonical CR or LF

    if (IsSpecial(c) && !is_plain_text_) {
      if (c == '<') {
        // Replace tag with space
        c = ' ';                      // for almost-full test below
        // or if <p> <br> <tr>, replace with \n
        if (take < (byte_length_ - 3)) {
          if (EqCase(next_byte_[take + 1], 'p') &&
              NeqLetter(next_byte_[take + 2])) {
            c = LF;
          }
          if (EqCase(next_byte_[take + 1], 'b') &&
              EqCase(next_byte_[take + 2], 'r') &&
              NeqLetter(next_byte_[take + 3])) {
            c = LF;
          }
          if (EqCase(next_byte_[take + 1], 't') &&
              EqCase(next_byte_[take + 2], 'r') &&
              NeqLetter(next_byte_[take + 3])) {
            c = LF;
          }
        }
        // Beginning of tag; skip to end and go around again.
        // The scan stops ON the byte that ended the tag, so add one to step
        // over it.
        tlen = 1 + ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
                                    exit_state_);
        // For an unterminated tag the scan already returns every remaining
        // byte; the extra one must not carry take past the end of the input.
        if (tlen > (byte_length_ - take)) {
          tlen = byte_length_ - take;
        }
        // Copy one byte, compressing spaces
        if (!last_byte_was_space || !WS(c)) {
          script_buffer_[put++] = c;      // Advance dest
          last_byte_was_space = WS(c);
        }
      } else if (c == '>') {
        // Unexpected end of tag; copy it and go around again
        tlen = 1;         // Over the >
        script_buffer_[put++] = c;    // Advance dest
      } else if (c == '&') {
        // Expand entity, no advance
        EntityToBuffer(next_byte_ + take, byte_length_ - take,
                       script_buffer_ + put, &tlen, &plen);
        put += plen;                  // Advance dest
      }
      take += tlen;                   // Advance source
    } else {
      // Copy one byte, compressing spaces
      if (!last_byte_was_space || !WS(c)) {
        script_buffer_[put++] = c;      // Advance dest
        last_byte_was_space = WS(c);
      }
      ++take;                         // Advance source
    }

    if (WS(c) &&
        (put >= put_soft_limit)) {
      // Buffer is almost full
      span->truncated = true;
      break;
    }
    if (put >= kMaxScriptBytes) {
      // Buffer is completely full
      span->truncated = true;
      break;
    }
  }

  // Almost done. Back up to a character boundary if needed.
  // Only inspect next_byte_[take] while it is still inside the input; when
  // everything was consumed there is no partial character to undo.
  while ((0 < take) && (take < byte_length_) &&
         ((next_byte_[take] & 0xc0) == 0x80)) {
    // Back up over continuation byte
    --take;
    --put;
  }

  // Update input position
  next_byte_ += take;
  byte_length_ -= take;

  // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
  //                          kMaxScriptBytes |   | put
  script_buffer_[put + 0] = ' ';
  script_buffer_[put + 1] = ' ';
  script_buffer_[put + 2] = ' ';
  script_buffer_[put + 3] = '\0';

  span->text_bytes = put;       // Does not include the last four chars above
  return true;
}


// Copy next run of same-script non-tag letters to buffer [NUL terminated]
// Buffer ALWAYS has leading space and trailing space space space NUL
//
// When letters_marks_only_ is false this simply forwards to GetOneTextSpan.
// Otherwise:
//   span->text       is set to script_buffer_
//   span->offset     is the position of next_byte_ relative to start_byte_ on
//                    entry (i.e. before any leading non-letters are skipped)
//   span->ulscript   is the script reported by SkipToFrontOfSpan
//   span->lang       is left as UNKNOWN_LANGUAGE
//   span->truncated  is set when the copy stopped because the buffer reached
//                    kMaxScriptBytes or put_soft_limit
//   span->text_bytes is the number of bytes put in the buffer, including the
//                    leading space but not the four trailing pad bytes
// next_byte_ / byte_length_ are advanced past the consumed input, and
// map2original_ is rebuilt to map buffer offsets back to input offsets.
//
// Returns false when no input remains after skipping to the front of the
// span, true otherwise.
bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
  if (!letters_marks_only_) {
    // Return non-tag text, including punctuation and digits
    return GetOneTextSpan(span);
  }

  // Start from an empty span with unknown script/language
  span->text = script_buffer_;
  span->text_bytes = 0;
  span->offset = next_byte_ - start_byte_;
  span->ulscript = UNKNOWN_ULSCRIPT;
  span->lang = UNKNOWN_LANGUAGE;
  span->truncated = false;

  // struct timeval script_start, script_mid, script_end;

  // Soft limit: once put reaches this, stop at the next run of non-letters
  int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
  if ((kMaxScriptBytes <= byte_length_) &&
      (byte_length_ < (2 * kMaxScriptBytes))) {
    // Try to split the last two fragments in half
    put_soft_limit = byte_length_ / 2;
  }


  int spanscript;           // The script of this span
  int sc = UNKNOWN_ULSCRIPT;  // The script of next character
  int tlen = 0;             // Bytes taken from the input for current item
  int plen = 0;             // Bytes put into the buffer for current item

  script_buffer_[0] = ' ';  // Always a space at front of output
  script_buffer_[1] = '\0';
  int take = 0;             // Input bytes consumed, relative to next_byte_
  int put = 1;              // Start after the initial space

  // Build offsets from span->text back to start_byte_ + span->offset
  // This mapping reflects deletion of non-letters, expansion of
  // entities, etc.
  map2original_.Clear();
  map2original_.Delete(span->offset);   // So that MapBack(0) gives offset

  // gettimeofday(&script_start, NULL);
  // Get to the first real non-tag letter or entity that is a letter
  int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
  next_byte_ += skip;
  byte_length_ -= skip;

  // Account for the leading space in the buffer: the skipped input bytes map
  // to that one output byte (a one-byte skip is a plain one-for-one copy)
  if (skip != 1) {
    map2original_.Delete(skip);
    map2original_.Insert(1);
  } else {
    map2original_.Copy(1);
  }
  if (byte_length_ <= 0) {
    map2original_.Reset();
    return false;               // No more letters to be found
  }

  // gettimeofday(&script_mid, NULL);

  // There is at least one letter, so we know the script for this span
  span->ulscript = (ULScript)spanscript;


  // Go over alternating spans of same-script letters and non-letters,
  // copying letters to buffer with single spaces for each run of non-letters
  while (take < byte_length_) {
    // Copy run of letters in same script (&LS | LS)*
    // NOTE(review): letter_count is incremented below but not otherwise read
    // in this function.
    int letter_count = 0;              // Keep track of word length
    bool need_break = false;

    while (take < byte_length_) {
      // We are at a letter, nonletter, tag, or entity
      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
        if (next_byte_[take] == '<') {
          // Beginning of tag; leave it for the non-letter loop below
          sc = 0;
          break;
        } else if (next_byte_[take] == '>') {
          // Unexpected end of tag; leave it for the non-letter loop below
          sc = 0;
          break;
        } else if (next_byte_[take] == '&') {
          // Copy entity, no advance
          // The expansion is written at script_buffer_ + put; tlen/plen
          // receive the source and destination lengths. It is only kept if
          // put is advanced below.
          EntityToBuffer(next_byte_ + take, byte_length_ - take,
                         script_buffer_ + put, &tlen, &plen);
          sc = GetUTF8LetterScriptNum(script_buffer_ + put);
        }
      } else {
        // Real letter, safely copy up to 4 bytes, increment by 1..4
        // Will update by 1..4 bytes at Advance, below
        tlen = plen = UTF8OneCharLen(next_byte_ + take);
        if (take < (byte_length_ - 3)) {
          // X86 fast case, does unaligned load/store
          // Always moves four bytes; only plen of them are kept because put
          // advances by plen, so any extra bytes are overwritten later.
          UNALIGNED_STORE32(script_buffer_ + put,
                            UNALIGNED_LOAD32(next_byte_ + take));

        } else {
          // Slow case, happens 1-3 times per input document
          // Within 3 bytes of the end of input, so copy exactly plen bytes
          memcpy(script_buffer_ + put, next_byte_ + take, plen);
        }
        sc = GetUTF8LetterScriptNum(next_byte_ + take);
      }

      // Allow continue across a single letter in a different script:
      // A B D = three scripts, c = common script, i = inherited script,
      // - = don't care, ( = take position before the += below
      //  AAA(A-    continue
      //
      //  AAA(BA    continue
      //  AAA(BB    break
      //  AAA(Bc    continue (breaks after B)
      //  AAA(BD    break
      //  AAA(Bi    break
      //
      //  AAA(c-    break
      //
      //  AAA(i-    continue
      //

      if ((sc != spanscript) && (sc != ULScript_Inherited)) {
        // Might need to break this script span
        if (sc == ULScript_Common) {
          need_break = true;
        } else {
          // Look at next following character, ignoring entity as Common
          // NOTE(review): this lookahead reads at next_byte_ + take + tlen
          // without comparing against byte_length_; presumably the input is
          // padded or NUL-terminated past its end -- confirm with callers.
          int sc2 = GetUTF8LetterScriptNum(next_byte_ + take + tlen);
          if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
            // We found a non-trivial change of script
            // Only break on it when restricted to one script per span
            if (one_script_only_) {
              need_break = true;
            }
          }
        }
      }
      if (need_break) {break;}  // Non-letter or letter in wrong script

      take += tlen;                   // Advance
      put += plen;                    // Advance

      // Update the offset map to reflect take/put lengths
      if (tlen == plen) {
        map2original_.Copy(tlen);
      } else if (tlen < plen) {
        // Output is longer than input: copy, then insert the extra bytes
        map2original_.Copy(tlen);
        map2original_.Insert(plen - tlen);
      } else {    // plen < tlen
        // Output is shorter than input: copy, then delete the extra bytes
        map2original_.Copy(plen);
        map2original_.Delete(tlen - plen);
      }

      ++letter_count;
      if (put >= kMaxScriptBytes) {
        // Buffer is full
        span->truncated = true;
        break;
      }
    }     // End while letters

    // Do run of non-letters (tag | &NL | NL)*
    // Nothing is added to put in this loop; every consumed input byte is
    // recorded as a deletion in the offset map.
    while (take < byte_length_) {
      // Do fast scan to next interesting byte
      tlen = ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
      take += tlen;
      map2original_.Delete(tlen);
      if (take >= byte_length_) {break;}    // Might have scanned to end

      // We are at a letter, nonletter, tag, or entity
      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
        if (next_byte_[take] == '<') {
          // Beginning of tag; skip to end and go around again
          tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
                                      exit_state_);
          sc = 0;
        } else if (next_byte_[take] == '>') {
          // Unexpected end of tag; skip it and go around again
          tlen = 1;         // Over the >
          sc = 0;
        } else if (next_byte_[take] == '&') {
          // Expand entity, no advance
          // Expanded at script_buffer_ + put only to classify it; put is not
          // advanced here.
          EntityToBuffer(next_byte_ + take, byte_length_ - take,
                         script_buffer_ + put, &tlen, &plen);
          sc = GetUTF8LetterScriptNum(script_buffer_ + put);
        }
      } else {
        // Update 1..4
        tlen = UTF8OneCharLen(next_byte_ + take);
        sc = GetUTF8LetterScriptNum(next_byte_ + take);
      }
      if (sc != 0) {break;}           // Letter found
      take += tlen;                   // Else advance
      map2original_.Delete(tlen);
    }     // End while not-letters

    // The whole run of non-letters becomes a single space in the output
    script_buffer_[put++] = ' ';
    map2original_.Insert(1);

    // Letter in wrong script ?
    if ((sc != spanscript) && (sc != ULScript_Inherited)) {break;}
    if (put >= put_soft_limit) {
      // Buffer is almost full
      span->truncated = true;
      break;
    }
  }

  // Almost done. Back up to a character boundary if needed
  // (bytes of the form 10xxxxxx are UTF-8 continuation bytes)
  while ((0 < take) && (take < byte_length_) &&
         ((next_byte_[take] & 0xc0) == 0x80)) {
    // Back up over continuation byte
    --take;
    --put;
  }

  // Update input position
  next_byte_ += take;
  byte_length_ -= take;

  // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
  //                          kMaxScriptBytes |   | put
  script_buffer_[put + 0] = ' ';
  script_buffer_[put + 1] = ' ';
  script_buffer_[put + 2] = ' ';
  script_buffer_[put + 3] = '\0';
  map2original_.Insert(4);      // The four pad bytes have no source bytes
  map2original_.Reset();

  span->text_bytes = put;       // Does not include the last four chars above
  return true;
}

// Lowercase the text of a span, for every script.
// The set of scripts that have case (Unicode 6.2.0: ARMENIAN COPTIC CYRILLIC
// DESERET GEORGIAN GLAGOLITIC GREEK LATIN) changes with each version of
// Unicode, so the span is always run through the lowercasing table instead of
// testing span->ulscript.
//
// On entry span->text / span->text_bytes describe the text to lowercase,
// which must be followed by at least three readable pad bytes.
// On exit span->text points at script_buffer_lower_, span->text_bytes is the
// lowercased length (pad bytes excluded), and map2uplow_ maps offsets in
// script_buffer_lower_ back to offsets in the original span text.
void ScriptScanner::LowerScriptSpan(LangSpan* span) {
  // Lowercasing is done here, after entity expansion; doing it sooner might
  // miss lowercasing an entity such as &Aacute;
  map2uplow_.Clear();

  // Full Unicode lowercase of the entire buffer, including three of the four
  // pad bytes off the end. The fourth pad byte, 0x00, is not
  // interchange-valid, so it is left out of the input and written explicitly
  // after the replace.
  int consumed = 0;
  int filled = 0;
  int changed = 0;
  StringPiece istr(span->text, span->text_bytes + 3);
  StringPiece ostr(script_buffer_lower_, kMaxScriptLowerBuffer);

  UTF8GenericReplace(&utf8repl_lettermarklower_obj,
                     istr, ostr, is_plain_text_,
                     &consumed, &filled, &changed, &map2uplow_);
  script_buffer_lower_[filled] = '\0';

  span->text = script_buffer_lower_;
  span->text_bytes = filled - 3;    // Exclude the three pad bytes again
  map2uplow_.Reset();
}

// Copy next run of same-script non-tag letters to buffer [NUL terminated],
// then lowercase it.
// On success the buffer ALWAYS has leading space and trailing
// space space space NUL.
// Returns the result of GetOneScriptSpan. When that is false the span is
// left exactly as GetOneScriptSpan left it and no lowercasing is attempted.
bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) {
  const bool ok = GetOneScriptSpan(span);
  if (ok) {
    // Only lowercase a span that was actually produced. LowerScriptSpan
    // reads span->text_bytes + 3 bytes and then sets text_bytes to
    // filled - 3; on a failed span the pad bytes were never written, so that
    // would read stale buffer contents and could yield a negative length.
    LowerScriptSpan(span);
  }
  return ok;
}


// Translate a byte offset within the text of the most recent
// GetOneScriptSpan / GetOneScriptSpanLower result (span->text
// [0..text_bytes]) into an additional byte offset from span->offset, i.e.
// back to the corresponding position in the original input buffer.
// text_offset must address the first byte of a UTF-8 character, or the
// position just beyond the last character. The usual pattern is to call this
// once with the first byte of an interesting range and once more with the
// first byte of the range that follows it.
int ScriptScanner::MapBack(int text_offset) {
  // First undo the lowercasing map, then the map back to the original input.
  const int unlowered_offset = map2uplow_.MapBack(text_offset);
  return map2original_.MapBack(unlowered_offset);
}


// Gets lscript number for letters; always returns
//   0 (common script) for non-letters
int GetUTF8LetterScriptNum(const char* src) {
  int srclen = UTF8OneCharLen(src);
  const uint8* usrc = reinterpret_cast<const uint8*>(src);
  return UTF8GenericPropertyTwoByte(&utf8prop_lettermarkscriptnum_obj,
                                    &usrc, &srclen);
}

}  // namespace CLD2