new ku-Latn text, bad UTF-8, besteffort flag, comments

git-svn-id: https://cld2.googlecode.com/svn/trunk@174 b252ecd4-b096-bf77-eb8e-91563289f87e
This commit is contained in:
dsites@google.com
2014-10-28 20:32:54 +00:00
parent 92a3e24c4e
commit b312a68c4a
3 changed files with 82 additions and 40 deletions

View File

@@ -22,6 +22,7 @@
#include "cldutil.h"
#include "debug.h"
#include "lang_script.h"
#include <stdint.h>
#include <stdio.h>
@@ -378,7 +379,7 @@ uint16 NextChunkLang(const SummaryBuffer* summarybuffer, int i) {
//
// We go out of our way to minimize the variation in the ResultChunkVector,
// so that the caller has fewer but more meaningful spans in different
// lanaguges, for the likely purpose of translation or spell-check.
// languages, for the likely purpose of translation or spell-check.
//
// The language of each chunk is lang1, but it might be unreliable for
// either of two reasons: its score is relatively too close to the score of
@@ -436,6 +437,7 @@ void SummaryBufferToVector(ScriptScanner* scanner, const char* text,
if (n >= n_limit) {n = 0;} // New boundary not found within range
// Also back up exactly one leading punctuation character if it is one of ' " # @
// 'random', "quotes", #hashtags, @handles
if (n < n_limit) {
unsigned char c = us[-n - 1];
if ((c == '\'') || (c == '"') || (c == '#') || (c == '@')) {++n;}