diff --git a/internal/compact_lang_det_test.cc b/internal/compact_lang_det_test.cc
index 6793b05..5f3a580 100644
--- a/internal/compact_lang_det_test.cc
+++ b/internal/compact_lang_det_test.cc
@@ -42,18 +42,16 @@ typedef int32 Encoding;
static const Encoding UNKNOWN_ENCODING = 0;
-#ifndef CLD2_DYNAMIC_MODE
- // Linker supplies the right tables; see ScoringTables compact_lang_det_impl.cc
- // These are here JUST for printing versions
- extern const UTF8PropObj cld_generated_CjkUni_obj;
- extern const CLD2TableSummary kCjkDeltaBi_obj;
- extern const CLD2TableSummary kDistinctBiTable_obj;
- extern const CLD2TableSummary kQuad_obj;
- extern const CLD2TableSummary kDeltaOcta_obj;
- extern const CLD2TableSummary kDistinctOcta_obj;
- extern const CLD2TableSummary kOcta2_obj;
- extern const short kAvgDeltaOctaScore[];
-#endif
+// The linker supplies the right tables; see ScoringTables in compact_lang_det_impl.cc.
+// They are declared here ONLY so their build dates and sizes can be printed.
+extern const UTF8PropObj cld_generated_CjkUni_obj;
+extern const CLD2TableSummary kCjkDeltaBi_obj;
+extern const CLD2TableSummary kDistinctBiTable_obj;
+extern const CLD2TableSummary kQuad_obj;
+extern const CLD2TableSummary kDeltaOcta_obj;
+extern const CLD2TableSummary kDistinctOcta_obj;
+extern const CLD2TableSummary kOcta2_obj;
+extern const short kAvgDeltaOctaScore[];
bool FLAGS_cld_version = false;
bool FLAGS_cld_html = true;
@@ -203,7 +201,6 @@ void DumpLanguages(Language summary_lang,
int main(int argc, char** argv) {
if (FLAGS_cld_version) {
-#ifndef CLD2_DYNAMIC_MODE
printf("%s %4dKB uni build date, bytes\n",
"........",
cld_generated_CjkUni_obj.total_size >> 10);
@@ -219,14 +216,12 @@ int main(int argc, char** argv) {
kDeltaOcta_obj.kCLDTableBuildDate,
(kDeltaOcta_obj.kCLDTableSize *
sizeof(IndirectProbBucket4)) >> 10);
-#else
- printf("FLAGS_cld_version doesn't work with dynamic data mode\n");
-#endif
exit(0);
} // End FLAGS_cld_version
+
int flags = 0;
bool get_vector = false;
- const char* data_file = NULL;
+ bool do_line = false;
const char* fname = NULL;
for (int i = 1; i < argc; ++i) {
if (argv[i][0] != '-') {fname = argv[i];}
@@ -236,24 +231,18 @@ int main(int argc, char** argv) {
if (strcmp(argv[i], "--verbose") == 0) {flags |= kCLDFlagVerbose;}
if (strcmp(argv[i], "--echo") == 0) {flags |= kCLDFlagEcho;}
if (strcmp(argv[i], "--vector") == 0) {get_vector = true;}
- if (strcmp(argv[i], "--data-file") == 0) { data_file = argv[++i];}
+ if (strcmp(argv[i], "--line") == 0) {do_line = true;}
}
-#ifdef CLD2_DYNAMIC_MODE
- if (data_file == NULL) {
- fprintf(stderr, "When running in dynamic mode, you must specify --data-file [FILE]\n");
- return -1;
- }
- fprintf(stdout, "Loading data from: %s\n", data_file);
- CLD2::loadDataFromFile(data_file);
- fprintf(stdout, "Data loaded, test commencing\n");
-#endif
-
FILE* fin;
if (fname == NULL) {
fin = stdin;
} else {
- fin = fopen(fname, "rb");
+ if (do_line) {
+ fin = fopen(fname, "r");
+ } else {
+ fin = fopen(fname, "rb");
+ }
if (fin == NULL) {
fprintf(stderr, "%s did not open\n", fname);
exit(0);
@@ -272,6 +261,51 @@ int main(int argc, char** argv) {
char* buffer = new char[10000000]; // Max 10MB of input for this test program
struct timeval news, newe;
+ // Full-blown flag-bit and hints interface
+ bool allow_extended_lang = true;
+ Language plus_one = UNKNOWN_LANGUAGE;
+ bool ignore_7bit = false;
+
+ if (do_line) {
+ while (Readline(fin, buffer)) {
+ if (IsComment(buffer)) {continue;}
+
+ // Detect language one line at a time
+ Language summary_lang = UNKNOWN_LANGUAGE;
+
+ Language language3[3];
+ int percent3[3];
+ double normalized_score3[3];
+ ResultChunkVector resultchunkvector;
+ bool is_plain_text = FLAGS_plain;
+ int text_bytes;
+
+ CLDHints cldhints = {NULL, tldhint, enchint, langhint};
+
+ summary_lang = CLD2::DetectLanguageSummaryV2(
+ buffer,
+ strlen(buffer),
+ is_plain_text,
+ &cldhints,
+ allow_extended_lang,
+ flags,
+ plus_one,
+ language3,
+ percent3,
+ normalized_score3,
+ get_vector ? &resultchunkvector : NULL,
+ &text_bytes,
+ &is_reliable);
+ printf("%s%s %d%% %s\n",
+ LanguageName(language3[0]),
+ is_reliable ? "" : "*",
+ percent3[0],
+ buffer);
+ }
+ fclose(fin);
+ delete[] buffer;
+ return 0;
+ }
if ((flags & kCLDFlagHtml) != 0) {
// Begin HTML file
@@ -287,16 +321,11 @@ int main(int argc, char** argv) {
fprintf(stderr, "file = %s
\n", fname ? fname : "stdin");
}
- // Full-blown flag-bit and hints interface
- bool allow_extended_lang = true;
- Language plus_one = UNKNOWN_LANGUAGE;
-
+ // Read entire file (at most the 10000000 bytes that buffer can hold)
int n = fread(buffer, 1, 10000000, fin);
- bool ignore_7bit = false;
-
-
- // Detect language
+
+ // Detect languages in entire file
Language summary_lang = UNKNOWN_LANGUAGE;
Language language3[3];
diff --git a/internal/scoreonescriptspan.cc b/internal/scoreonescriptspan.cc
index 887e78b..11ccaea 100644
--- a/internal/scoreonescriptspan.cc
+++ b/internal/scoreonescriptspan.cc
@@ -22,6 +22,7 @@
#include "cldutil.h"
#include "debug.h"
#include "lang_script.h"
+#include
#include
@@ -378,7 +379,7 @@ uint16 NextChunkLang(const SummaryBuffer* summarybuffer, int i) {
//
// We go out of our way to minimize the variation in the ResultChunkVector,
// so that the caller has fewer but more meaningful spans in different
-// lanaguges, for the likely purpose of translation or spell-check.
+// languages, for the likely purpose of translation or spell-check.
//
// The language of each chunk is lang1, but it might be unreliable for
// either of two reasons: its score is relatively too close to the score of
@@ -436,6 +437,7 @@ void SummaryBufferToVector(ScriptScanner* scanner, const char* text,
if (n >= n_limit) {n = 0;} // New boundary not found within range
// Also back up exactly one leading punctuation character if '"#@
+ // e.g. 'random', "quotes", #hashtags, @handles
if (n < n_limit) {
unsigned char c = us[-n - 1];
if ((c == '\'') || (c == '"') || (c == '#') || (c == '@')) {++n;}
diff --git a/internal/unittest_data.h b/internal/unittest_data.h
index 9f9a3b0..0cdafc5 100644
--- a/internal/unittest_data.h
+++ b/internal/unittest_data.h
@@ -130,7 +130,10 @@ const char* kTeststr_ks_Arab = " ژماں سرابن منز گرٲن چھِہ
const char* kTeststr_ks_Deva = "नमस्ते शारदे देवि काश्मिरपुर्वासिनि त्वामहम प्रार्थये देवि विद्य दानम च देहि मे कॉशुर लेख॒नुक सारिव॒य खॊत॒ आसान तरीक॒ छु यि देवनागरी टाइपराइटर इस्तिमाल करुन. अथ मंज़ छि कॉशुर लेख॒न॒चि सारॆय मात्रायि. अमि अलाव॒ हॆकिव तॊह्य् यिम॒ यूनिकोड एडिटर ति वरतॉविथ मगर कॉशिरि मात्रायि लेख॒नस गछ़ि हना दिकथ: अक्षरमालाछु अख मुफ़्त त॒ सॅहॅल सोफ्टवेर यॆमि स॒त्य् युनिकोड देवनागरी मंज़ ITRANS scheme स॒त्य् छु यिवान लेख॒न॒. वुछिव: सहायता. अथ स॒त्य् जुडिथ जालपृष्ठ (वेबपेज) (सॉरी अँग्रीज़ी पॉठ्य)";
const char* kTeststr_ku_Arab = " بۆ به ڕێوه بردنی نامه ی که دێتن ڕاسته وخۆ ڕه وان بکه نامه کانی گ مایل بۆ حسابی پۆستێکی تر هێنانی په یوه ندکاره کان له";
-const char* kTeststr_ku_Latn = " be zmaneki ter le inglis werdegeretewe em srvise heshta le cor beta daye wate hest a taqi dekrete u bashtr dekret tewawwzmanekan wernegrawnetewe u ne hemu laperakn ke eme pshtiwan dekayn be teaweti wergerawete nermwalley wergeran teksti new wene nasnatewe";
+// Updated 2014.10.15: new sample drawn from more kmr (Northern Kurdish) text; the old sample is kept below, commented out
+//const char* kTeststr_ku_Latn = " be zmaneki ter le inglis werdegeretewe em srvise heshta le cor beta daye wate hest a taqi dekrete u bashtr dekret tewawwzmanekan wernegrawnetewe u ne hemu laperakn ke eme pshtiwan dekayn be teaweti wergerawete nermwalley wergeran teksti new wene nasnatewe";
+const char* kTeststr_ku_Latn = " Nû pêvajo ya ezmûn ya pêşin di dîtin ku cezayên pêkan bi biryar standin, jûriyên neh zilam û sê jin wê gelektir govanan guhdar bike, bendewarî nav 3-mehan xilas be, ku zilamê Fransî yê 37 salê wê bi berdarî û heta mirinê bi avêtin zindanê.";
+
const char* kTeststr_ky_Arab = " جانا انى تانۇۇ ۇلۇتۇن تانۇۇ قىرعىزدى بئلۉۉ دەگەندىك اچىق ايتساق ماناستى تاانىعاندىق ۅزۉڭدۉ تاانىعاندىق بۉگۉن تەما جۉكتۅمۅ ق ى رع ى ز ت ى ل ى";
const char* kTeststr_ky_Cyrl = " агай эле оболу мен садыбакас аганын өзү менен эмес эмгектери менен тааныштым жылдары ташкенде өзбекстан илимдер академиясынын баяны";
const char* kTeststr_la_Latn = " a deo qui enim nocendi causa mentiri solet si iam consulendi causa mentiatur multum profecit sed aliud est quod per se ipsum laudabile proponitur aliud quod in deterioris comparatione praeponitur aliter enim gratulamur cum sanus est homo aliter cum melius";
@@ -262,6 +265,9 @@ const char* kTeststr_fr_en_Latn =
"Pour une aide rapide et effective, veuiller trouver votre aide dans le menu ci-dessus."
"Motoring events began soon after the construction of the first successful gasoline-fueled automobiles. The quick brown fox jumped over the lazy dog";
+// Simple English with bad UTF-8: 40 valid bytes, then the invalid byte pair 0xC0 0xA9, then valid text again
+const char* kTeststr_en_Latn_bad_UTF8 = "Forty good bytes followed by bad UTF-8:'\xC0\xA9' and then good again.";
+
// This can be used to cross-check the build date of the main quadgram table
const char* kTeststr_version = "qpdbmrmxyzptlkuuddlrlrbas las les qpdbmrmxyzptlkuuddlrlrbas el la qpdbmrmxyzptlkuuddlrlrbas";
@@ -375,7 +381,9 @@ const char* kTeststr_ks_Arab = " \xDA\x98\xD9\x85\xD8\xA7\xDA\xBA \xD8\xB3\xD8\x
const char* kTeststr_ks_Deva =
"\xE0\xA4\xA8\xE0\xA4\xAE\xE0\xA4\xB8\xE0\xA5\x8D\xE0\xA4\xA4\xE0\xA5\x87 \xE0\xA4\xB6\xE0\xA4\xBE\xE0\xA4\xB0\xE0\xA4\xA6\xE0\xA5\x87 \xE0\xA4\xA6\xE0\xA5\x87\xE0\xA4\xB5\xE0\xA4\xBF \xE0\xA4\x95\xE0\xA4\xBE\xE0\xA4\xB6\xE0\xA5\x8D\xE0\xA4\xAE\xE0\xA4\xBF\xE0\xA4\xB0\xE0\xA4\xAA\xE0\xA5\x81\xE0\xA4\xB0\xE0\xA5\x8D\xE0\xA4\xB5\xE0\xA4\xBE\xE0\xA4\xB8\xE0\xA4\xBF\xE0\xA4\xA8\xE0\xA4\xBF \xE0\xA4\xA4\xE0\xA5\x8D\xE0\xA4\xB5\xE0\xA4\xBE\xE0\xA4\xAE\xE0\xA4\xB9\xE0\xA4\xAE \xE0\xA4\xAA\xE0\xA5\x8D\xE0\xA4\xB0\xE0\xA4\xBE\xE0\xA4\xB0\xE0\xA5\x8D\xE0\xA4\xA5\xE0\xA4\xAF\xE0\xA5\x87 \xE0\xA4\xA6\xE0\xA5\x87\xE0\xA4\xB5\xE0\xA4\xBF \xE0\xA4\xB5\xE0\xA4\xBF\xE0\xA4\xA6\xE0\xA5\x8D\xE0\xA4\xAF \xE0\xA4\xA6\xE0\xA4\xBE\xE0\xA4\xA8\xE0\xA4\xAE \xE0\xA4\x9A \xE0\xA4\xA6\xE0\xA5\x87\xE0\xA4\xB9\xE0\xA4\xBF \xE0\xA4\xAE\xE0\xA5\x87 \xE0\xA4\x95\xE0\xA5\x89\xE0\xA4\xB6\xE0\xA5\x81\xE0\xA4\xB0 \xE0\xA4\xB2\xE0\xA5\x87\xE0\xA4\x96\xE0\xA5\x92\xE0\xA4\xA8\xE0\xA5\x81\xE0\xA4\x95 \xE0\xA4\xB8\xE0\xA4\xBE\xE0\xA4\xB0\xE0\xA4\xBF\xE0\xA4\xB5\xE0\xA5\x92\xE0\xA4\xAF \xE0\xA4\x96\xE0\xA5\x8A\xE0\xA4\xA4\xE0\xA5\x92 \xE0\xA4\x86\xE0\xA4\xB8\xE0\xA4\xBE\xE0\xA4\xA8 \xE0\xA4\xA4\xE0\xA4\xB0\xE0\xA5\x80\xE0\xA4\x95\xE0\xA5\x92 \xE0\xA4\x9B\xE0\xA5\x81 \xE0\xA4\xAF\xE0\xA4\xBF \xE0\xA4\xA6\xE0\xA5\x87\xE0\xA4\xB5\xE0\xA4\xA8\xE0\xA4\xBE\xE0\xA4\x97\xE0\xA4\xB0\xE0\xA5\x80 \xE0\xA4\x9F\xE0\xA4\xBE\xE0\xA4\x87\xE0\xA4\xAA\xE0\xA4\xB0\xE0\xA4\xBE\xE0\xA4\x87\xE0\xA4\x9F\xE0\xA4\xB0 \xE0\xA4\x87\xE0\xA4\xB8\xE0\xA5\x8D\xE0\xA4\xA4\xE0\xA4\xBF\xE0\xA4\xAE\xE0\xA4\xBE\xE0\xA4\xB2 \xE0\xA4\x95\xE0\xA4\xB0\xE0\xA5\x81\xE0\xA4\xA8. 
\xE0\xA4\x85\xE0\xA4\xA5 \xE0\xA4\xAE\xE0\xA4\x82\xE0\xA4\x9C\xE0\xA4\xBC \xE0\xA4\x9B\xE0\xA4\xBF \xE0\xA4\x95\xE0\xA5\x89\xE0\xA4\xB6\xE0\xA5\x81\xE0\xA4\xB0 \xE0\xA4\xB2\xE0\xA5\x87\xE0\xA4\x96\xE0\xA5\x92\xE0\xA4\xA8\xE0\xA5\x92\xE0\xA4\x9A\xE0\xA4\xBF \xE0\xA4\xB8\xE0\xA4\xBE\xE0\xA4\xB0\xE0\xA5\x86\xE0\xA4\xAF \xE0\xA4\xAE\xE0\xA4\xBE\xE0\xA4\xA4\xE0\xA5\x8D\xE0\xA4\xB0\xE0\xA4\xBE\xE0\xA4\xAF\xE0\xA4\xBF. \xE0\xA4\x85\xE0\xA4\xAE\xE0\xA4\xBF \xE0\xA4\x85\xE0\xA4\xB2\xE0\xA4\xBE\xE0\xA4\xB5\xE0\xA5\x92 \xE0\xA4\xB9\xE0\xA5\x86\xE0\xA4\x95\xE0\xA4\xBF\xE0\xA4\xB5 \xE0\xA4\xA4\xE0\xA5\x8A\xE0\xA4\xB9\xE0\xA5\x8D\xE0\xA4\xAF\xE0\xA5\x8D \xE0\xA4\xAF\xE0\xA4\xBF\xE0\xA4\xAE\xE0\xA5\x92 \xE0\xA4\xAF\xE0\xA5\x82\xE0\xA4\xA8\xE0\xA4\xBF\xE0\xA4\x95\xE0\xA5\x8B\xE0\xA4\xA1 \xE0\xA4\x8F\xE0\xA4\xA1\xE0\xA4\xBF\xE0\xA4\x9F\xE0\xA4\xB0 \xE0\xA4\xA4\xE0\xA4\xBF \xE0\xA4\xB5\xE0\xA4\xB0\xE0\xA4\xA4\xE0\xA5\x89\xE0\xA4\xB5\xE0\xA4\xBF\xE0\xA4\xA5 \xE0\xA4\xAE\xE0\xA4\x97\xE0\xA4\xB0 \xE0\xA4\x95\xE0\xA5\x89\xE0\xA4\xB6\xE0\xA4\xBF\xE0\xA4\xB0\xE0\xA4\xBF \xE0\xA4\xAE\xE0\xA4\xBE\xE0\xA4\xA4\xE0\xA5\x8D\xE0\xA4\xB0\xE0\xA4\xBE\xE0\xA4\xAF\xE0\xA4\xBF \xE0\xA4\xB2\xE0\xA5\x87\xE0\xA4\x96\xE0\xA5\x92\xE0\xA4\xA8\xE0\xA4\xB8 \xE0\xA4\x97\xE0\xA4\x9B\xE0\xA4\xBC\xE0\xA4\xBF \xE0\xA4\xB9\xE0\xA4\xA8\xE0\xA4\xBE \xE0\xA4\xA6\xE0\xA4\xBF\xE0\xA4\x95\xE0\xA4\xA5: \xE0\xA4\x85\xE0\xA4\x95\xE0\xA5\x8D\xE0\xA4\xB7\xE0\xA4\xB0\xE0\xA4\xAE\xE0\xA4\xBE\xE0\xA4\xB2\xE0\xA4\xBE\xE0\xA4\x9B\xE0\xA5\x81 \xE0\xA4\x85\xE0\xA4\x96 \xE0\xA4\xAE\xE0\xA5\x81\xE0\xA4\xAB\xE0\xA4\xBC\xE0\xA5\x8D\xE0\xA4\xA4 \xE0\xA4\xA4\xE0\xA5\x92 \xE0\xA4\xB8\xE0\xA5\x85\xE0\xA4\xB9\xE0\xA5\x85\xE0\xA4\xB2 \xE0\xA4\xB8\xE0\xA5\x8B\xE0\xA4\xAB\xE0\xA5\x8D\xE0\xA4\x9F\xE0\xA4\xB5\xE0\xA5\x87\xE0\xA4\xB0 \xE0\xA4\xAF\xE0\xA5\x86\xE0\xA4\xAE\xE0\xA4\xBF \xE0\xA4\xB8\xE0\xA5\x92\xE0\xA4\xA4\xE0\xA5\x8D\xE0\xA4\xAF\xE0\xA5\x8D \xE0\xA4\xAF\xE0\xA5\x81\xE0\xA4\xA8\xE0\xA4\xBF\xE0\xA4\x95\xE0\xA5\x8B\xE0\xA4\xA1 
\xE0\xA4\xA6\xE0\xA5\x87\xE0\xA4\xB5\xE0\xA4\xA8\xE0\xA4\xBE\xE0\xA4\x97\xE0\xA4\xB0\xE0\xA5\x80 \xE0\xA4\xAE\xE0\xA4\x82\xE0\xA4\x9C\xE0\xA4\xBC ITRANS scheme \xE0\xA4\xB8\xE0\xA5\x92\xE0\xA4\xA4\xE0\xA5\x8D\xE0\xA4\xAF\xE0\xA5\x8D \xE0\xA4\x9B\xE0\xA5\x81 \xE0\xA4\xAF\xE0\xA4\xBF\xE0\xA4\xB5\xE0\xA4\xBE\xE0\xA4\xA8\xE0\xA4\xB2\xE0\xA5\x87\xE0\xA4\x96\xE0\xA5\x92\xE0\xA4\xA8\xE0\xA5\x92. \xE0\xA4\xB5\xE0\xA5\x81\xE0\xA4\x9B\xE0\xA4\xBF\xE0\xA4\xB5: \xE0\xA4\xB8\xE0\xA4\xB9\xE0\xA4\xBE\xE0\xA4\xAF\xE0\xA4\xA4\xE0\xA4\xBE. \xE0\xA4\x85\xE0\xA4\xA5 \xE0\xA4\xB8\xE0\xA5\x92\xE0\xA4\xA4\xE0\xA5\x8D\xE0\xA4\xAF\xE0\xA5\x8D \xE0\xA4\x9C\xE0\xA5\x81\xE0\xA4\xA1\xE0\xA4\xBF\xE0\xA4\xA5 \xE0\xA4\x9C\xE0\xA4\xBE\xE0\xA4\xB2\xE0\xA4\xAA\xE0\xA5\x83\xE0\xA4\xB7\xE0\xA5\x8D\xE0\xA4\xA0 (\xE0\xA4\xB5\xE0\xA5\x87\xE0\xA4\xAC\xE0\xA4\xAA\xE0\xA5\x87\xE0\xA4\x9C) (\xE0\xA4\xB8\xE0\xA5\x89\xE0\xA4\xB0\xE0\xA5\x80 \xE0\xA4\x85\xE0\xA4\x81\xE0\xA4\x97\xE0\xA5\x8D\xE0\xA4\xB0\xE0\xA5\x80\xE0\xA4\x9C\xE0\xA4\xBC\xE0\xA5\x80 \xE0\xA4\xAA\xE0\xA5\x89\xE0\xA4\xA0\xE0\xA5\x8D\xE0\xA4\xAF)";
const char* kTeststr_ku_Arab = " \xD8\xA8\xDB\x86 \xD8\xA8\xD9\x87 \xDA\x95\xDB\x8E\xD9\x88\xD9\x87 \xD8\xA8\xD8\xB1\xD8\xAF\xD9\x86\xDB\x8C \xD9\x86\xD8\xA7\xD9\x85\xD9\x87 \xDB\x8C \xDA\xA9\xD9\x87 \xD8\xAF\xDB\x8E\xD8\xAA\xD9\x86 \xDA\x95\xD8\xA7\xD8\xB3\xD8\xAA\xD9\x87 \xD9\x88\xD8\xAE\xDB\x86 \xDA\x95\xD9\x87 \xD9\x88\xD8\xA7\xD9\x86 \xD8\xA8\xDA\xA9\xD9\x87 \xD9\x86\xD8\xA7\xD9\x85\xD9\x87 \xDA\xA9\xD8\xA7\xD9\x86\xDB\x8C \xDA\xAF \xD9\x85\xD8\xA7\xDB\x8C\xD9\x84 \xD8\xA8\xDB\x86 \xD8\xAD\xD8\xB3\xD8\xA7\xD8\xA8\xDB\x8C \xD9\xBE\xDB\x86\xD8\xB3\xD8\xAA\xDB\x8E\xDA\xA9\xDB\x8C \xD8\xAA\xD8\xB1 \xD9\x87\xDB\x8E\xD9\x86\xD8\xA7\xD9\x86\xDB\x8C \xD9\xBE\xD9\x87 \xDB\x8C\xD9\x88\xD9\x87 \xD9\x86\xD8\xAF\xDA\xA9\xD8\xA7\xD8\xB1\xD9\x87 \xDA\xA9\xD8\xA7\xD9\x86 \xD9\x84\xD9\x87";
-const char* kTeststr_ku_Latn = " be zmaneki ter le inglis werdegeretewe em srvise heshta le cor beta daye wate hest a taqi dekrete u bashtr dekret tewawwzmanekan wernegrawnetewe u ne hemu laperakn ke eme pshtiwan dekayn be teaweti wergerawete nermwalley wergeran teksti new wene nasnatewe";
+//const char* kTeststr_ku_Latn = " be zmaneki ter le inglis werdegeretewe em srvise heshta le cor beta daye wate hest a taqi dekrete u bashtr dekret tewawwzmanekan wernegrawnetewe u ne hemu laperakn ke eme pshtiwan dekayn be teaweti wergerawete nermwalley wergeran teksti new wene nasnatewe";
+const char* kTeststr_ku_Latn = " N\xC3\xBB p\xC3\xAAvajo ya ezm\xC3\xBBn ya p\xC3\xAA\xC5\x9Fin di d\xC3\xAEtin ku cezay\xC3\xAAn p\xC3\xAAkan bi biryar standin, j\xC3\xBBriy\xC3\xAAn neh zilam \xC3\xBB s\xC3\xAA jin w\xC3\xAA gelektir govanan guhdar bike, bendewar\xC3\xAE nav 3-mehan xilas be, ku zilam\xC3\xAA Frans\xC3\xAE y\xC3\xAA 37 sal\xC3\xAA w\xC3\xAA bi berdar\xC3\xAE \xC3\xBB heta mirin\xC3\xAA bi av\xC3\xAAtin zindan\xC3\xAA.";
+
const char* kTeststr_ky_Arab = " \xD8\xAC\xD8\xA7\xD9\x86\xD8\xA7 \xD8\xA7\xD9\x86\xD9\x89 \xD8\xAA\xD8\xA7\xD9\x86\xDB\x87\xDB\x87 \xDB\x87\xD9\x84\xDB\x87\xD8\xAA\xDB\x87\xD9\x86 \xD8\xAA\xD8\xA7\xD9\x86\xDB\x87\xDB\x87 \xD9\x82\xD9\x89\xD8\xB1\xD8\xB9\xD9\x89\xD8\xB2\xD8\xAF\xD9\x89 \xD8\xA8\xD8\xA6\xD9\x84\xDB\x89\xDB\x89 \xD8\xAF\xDB\x95\xDA\xAF\xDB\x95\xD9\x86\xD8\xAF\xD9\x89\xD9\x83 \xD8\xA7\xDA\x86\xD9\x89\xD9\x82 \xD8\xA7\xD9\x8A\xD8\xAA\xD8\xB3\xD8\xA7\xD9\x82 \xD9\x85\xD8\xA7\xD9\x86\xD8\xA7\xD8\xB3\xD8\xAA\xD9\x89 \xD8\xAA\xD8\xA7\xD8\xA7\xD9\x86\xD9\x89\xD8\xB9\xD8\xA7\xD9\x86\xD8\xAF\xD9\x89\xD9\x82 \xDB\x85\xD8\xB2\xDB\x89\xDA\xAD\xD8\xAF\xDB\x89 \xD8\xAA\xD8\xA7\xD8\xA7\xD9\x86\xD9\x89\xD8\xB9\xD8\xA7\xD9\x86\xD8\xAF\xD9\x89\xD9\x82 \xD8\xA8\xDB\x89\xDA\xAF\xDB\x89\xD9\x86 \xD8\xAA\xDB\x95\xD9\x85\xD8\xA7 \xD8\xAC\xDB\x89\xD9\x83\xD8\xAA\xDB\x85\xD9\x85\xDB\x85 \xD9\x82 \xD9\x89 \xD8\xB1\xD8\xB9 \xD9\x89 \xD8\xB2 \xD8\xAA \xD9\x89 \xD9\x84 \xD9\x89";
const char* kTeststr_ky_Cyrl = " \xD0\xB0\xD0\xB3\xD0\xB0\xD0\xB9 \xD1\x8D\xD0\xBB\xD0\xB5 \xD0\xBE\xD0\xB1\xD0\xBE\xD0\xBB\xD1\x83 \xD0\xBC\xD0\xB5\xD0\xBD \xD1\x81\xD0\xB0\xD0\xB4\xD1\x8B\xD0\xB1\xD0\xB0\xD0\xBA\xD0\xB0\xD1\x81 \xD0\xB0\xD0\xB3\xD0\xB0\xD0\xBD\xD1\x8B\xD0\xBD \xD3\xA9\xD0\xB7\xD2\xAF \xD0\xBC\xD0\xB5\xD0\xBD\xD0\xB5\xD0\xBD \xD1\x8D\xD0\xBC\xD0\xB5\xD1\x81 \xD1\x8D\xD0\xBC\xD0\xB3\xD0\xB5\xD0\xBA\xD1\x82\xD0\xB5\xD1\x80\xD0\xB8 \xD0\xBC\xD0\xB5\xD0\xBD\xD0\xB5\xD0\xBD \xD1\x82\xD0\xB0\xD0\xB0\xD0\xBD\xD1\x8B\xD1\x88\xD1\x82\xD1\x8B\xD0\xBC \xD0\xB6\xD1\x8B\xD0\xBB\xD0\xB4\xD0\xB0\xD1\x80\xD1\x8B \xD1\x82\xD0\xB0\xD1\x88\xD0\xBA\xD0\xB5\xD0\xBD\xD0\xB4\xD0\xB5 \xD3\xA9\xD0\xB7\xD0\xB1\xD0\xB5\xD0\xBA\xD1\x81\xD1\x82\xD0\xB0\xD0\xBD \xD0\xB8\xD0\xBB\xD0\xB8\xD0\xBC\xD0\xB4\xD0\xB5\xD1\x80 \xD0\xB0\xD0\xBA\xD0\xB0\xD0\xB4\xD0\xB5\xD0\xBC\xD0\xB8\xD1\x8F\xD1\x81\xD1\x8B\xD0\xBD\xD1\x8B\xD0\xBD \xD0\xB1\xD0\xB0\xD1\x8F\xD0\xBD\xD1\x8B";
const char* kTeststr_la_Latn = " a deo qui enim nocendi causa mentiri solet si iam consulendi causa mentiatur multum profecit sed aliud est quod per se ipsum laudabile proponitur aliud quod in deterioris comparatione praeponitur aliter enim gratulamur cum sanus est homo aliter cum melius";
@@ -507,6 +515,9 @@ const char* kTeststr_fr_en_Latn =
"Pour une aide rapide et effective, veuiller trouver votre aide dans le menu ci-dessus."
"Motoring events began soon after the construction of the first successful gasoline-fueled automobiles. The quick brown fox jumped over the lazy dog";
+// Simple English with bad UTF-8: 40 valid bytes, then the invalid byte pair 0xC0 0xA9, then valid text again
+const char* kTeststr_en_Latn_bad_UTF8 = "Forty good bytes followed by bad UTF-8:'\xC0\xA9' and then good again.";
+
// This can be used to cross-check the build date of the main quadgram table
const char* kTeststr_version = "qpdbmrmxyzptlkuuddlrlrbas las les qpdbmrmxyzptlkuuddlrlrbas el la qpdbmrmxyzptlkuuddlrlrbas";