stdex
Additional custom algorithms, or algorithms not covered by Standard C++
Loading...
Searching...
No Matches
langid.hpp
1/*
2 SPDX-License-Identifier: MIT
3 Copyright © 2024 Amebis
4*/
5
6#pragma once
7
8#include "compat.hpp"
9#include "string.hpp"
10#include "unicode.hpp"
11#ifdef _WIN32
12#include "windows.h"
13#endif
14#include <stddef.h>
15#include <stdint.h>
16#include <map>
17#include <string>
18
19#if defined(__GNUC__)
20#pragma GCC diagnostic push
21#pragma GCC diagnostic ignored "-Wexit-time-destructors"
22#endif
23
24namespace stdex
25{
26#ifdef _WIN32
27 using langid = LANGID;
28#else
29 using langid = uint16_t;
30#endif
31
32 constexpr langid langid_unknown = 127;
33
41 inline langid langid_from_rfc1766(_In_z_ const char* rfc1766)
42 {
43 struct stricmp_less
44 {
45 bool operator()(_In_z_ const char* str1, _In_z_ const char* str2) const
46 {
47 stdex_assert(str1);
48 stdex_assert(str2);
49 size_t i;
50 for (i = 0; ; ++i) {
51 auto a = stdex::tolower(str1[i]);
52 auto b = stdex::tolower(str2[i]);
53 auto a_end = !a || stdex::ispunct(a);
54 auto b_end = !b || stdex::ispunct(b);
55 if (a_end && b_end) return false;
56 if (b_end || a > b) return false;
57 if (a_end || a < b) return true;
58 }
59 }
60 };
61 struct language_mapping
62 {
63 langid id;
64 std::map<const char*, langid, stricmp_less> sublanguages;
65 };
66 static const std::map<const char*, language_mapping, stricmp_less> languages = {
67 {"af", {1078, {}}}, // Afrikaans
68 {"ar", {0x01, // Arabic
69 {
70 {"ae", static_cast<langid>(14337)}, // Arabic(U.A.E.)
71 {"bh", static_cast<langid>(15361)}, // Arabic(Bahrain)
72 {"dz", static_cast<langid>(5121)}, // Arabic(Algeria)
73 {"eg", static_cast<langid>(3073)}, // Arabic(Egypt)
74 {"iq", static_cast<langid>(2049)}, // Arabic(Iraq)
75 {"jo", static_cast<langid>(11265)}, // Arabic(Jordan)
76 {"kw", static_cast<langid>(13313)}, // Arabic(Kuwait)
77 {"lb", static_cast<langid>(12289)}, // Arabic(Lebanon)
78 {"ly", static_cast<langid>(4097)}, // Arabic(Libya)
79 {"ma", static_cast<langid>(6145)}, // Arabic(Morocco)
80 {"om", static_cast<langid>(8193)}, // Arabic(Oman)
81 {"qa", static_cast<langid>(16385)}, // Arabic(Qatar)
82 {"sa", static_cast<langid>(1025)}, // Arabic(Saudi Arabia)
83 {"sy", static_cast<langid>(10241)}, // Arabic(Syria)
84 {"tn", static_cast<langid>(7169)}, // Arabic(Tunisia)
85 {"ye", static_cast<langid>(9217)}, // Arabic(Yemen)
86 }}},
87 {"be", {1059, {}}}, // Belarusian
88 {"bg", {1026, {}}}, // Bulgarian
89 {"ca", {1027, {}}}, // Catalan
90 {"cs", {1029, {}}}, // Czech
91 {"da", {1030, {}}}, // Danish
92 {"de", {0x07, // German
93 {
94 {"at", static_cast<langid>(3079)}, // German(Austrian)
95 {"ch", static_cast<langid>(2055)}, // German(Swiss)
96 {"de", static_cast<langid>(1031)}, // German(Germany)
97 {"li", static_cast<langid>(5127)}, // German(Liechtenstein)
98 {"lu", static_cast<langid>(4103)}, // German(Luxembourg)
99 }}},
100 {"el", {1032, {}}}, // Greek
101 {"en", {0x09, // English
102 {
103 {"au", static_cast<langid>(3081)}, // English(Australian)
104 {"bz", static_cast<langid>(10249)}, // English(Belize)
105 {"ca", static_cast<langid>(4105)}, // English(Canadian)
106 {"ca", static_cast<langid>(9225)}, // English(Caribbean)
107 {"gb", static_cast<langid>(2057)}, // English(British)
108 {"ie", static_cast<langid>(6153)}, // English(Ireland)
109 {"jm", static_cast<langid>(8201)}, // English(Jamaica)
110 {"nz", static_cast<langid>(5129)}, // English(New Zealand)
111 {"tt", static_cast<langid>(11273)}, // English(Trinidad)
112 {"us", static_cast<langid>(1033)}, // English(United States)
113 {"za", static_cast<langid>(7177)}, // English(South Africa)
114 }}},
115 {"es", {0x0a, // Spanish
116 {
117 {"ar", static_cast<langid>(11274)}, // Spanish(Argentina)
118 {"bo", static_cast<langid>(16394)}, // Spanish(Bolivia)
119 {"cl", static_cast<langid>(13322)}, // Spanish(Chile)
120 {"co", static_cast<langid>(9226)}, // Spanish(Colombia)
121 {"cr", static_cast<langid>(5130)}, // Spanish(Costa Rica)
122 {"do", static_cast<langid>(7178)}, // Spanish(Dominican Republic)
123 {"ec", static_cast<langid>(12298)}, // Spanish(Ecuador)
124 {"es", static_cast<langid>(1034)}, // Spanish(Spain)
125 {"gt", static_cast<langid>(4106)}, // Spanish(Guatemala)
126 {"hn", static_cast<langid>(18442)}, // Spanish(Honduras)
127 {"mx", static_cast<langid>(2058)}, // Spanish(Mexican)
128 {"ni", static_cast<langid>(19466)}, // Spanish(Nicaragua)
129 {"pa", static_cast<langid>(6154)}, // Spanish(Panama)
130 {"pe", static_cast<langid>(10250)}, // Spanish(Peru)
131 {"pr", static_cast<langid>(20490)}, // Spanish(Puerto Rico)
132 {"py", static_cast<langid>(15370)}, // Spanish(Paraguay)
133 {"sv", static_cast<langid>(17418)}, // Spanish(El Salvador)
134 {"uy", static_cast<langid>(14346)}, // Spanish(Uruguay)
135 {"ve", static_cast<langid>(8202)}, // Spanish(Venezuela)
136 }}},
137 {"et", {1061, {}}}, // Estonian
138 {"eu", {1069, {}}}, // Basque
139 {"fa", {1065, {}}}, // Farsi
140 {"fi", {1035, {}}}, // Finnish
141 {"fo", {1080, {}}}, // Faeroese
142 {"fr", {0x0c, // French
143 {
144 {"be", static_cast<langid>(2060)}, // French(Belgian)
145 {"ca", static_cast<langid>(3084)}, // French(Canadian)
146 {"ch", static_cast<langid>(4108)}, // French(Swiss)
147 {"fr", static_cast<langid>(1036)}, // French(Luxembourg)
148 {"lu", static_cast<langid>(5132)}, // French(Luxembourg)
149 }}},
150 {"gd", {1084, {}}}, // Gaelic(Scots)
151 {"he", {1037, {}}}, // Hebrew
152 {"hi", {1081, {}}}, // Hindi
153 {"hr", {1050, {}}}, // Croatian
154 {"hu", {1038, {}}}, // Hungarian
155 {"in", {1057, {}}}, // Indonesian
156 {"is", {1039, {}}}, // Icelandic
157 {"it", {0x10, // Italian
158 {
159 {"ch", static_cast<langid>(2064)}, // Italian(Swiss)
160 {"it", static_cast<langid>(1040)}, // Italian(Italy)
161 }}},
162 {"ja", {1041, {}}}, // Japanese
163 {"ji", {1085, {}}}, // Yiddish
164 {"ko", {0x12, // Korean
165 {
166 {"johab", static_cast<langid>(2066)}, // Korean(Johab)
167 {"kr", static_cast<langid>(1042)}, // Korean(Korea)
168 }}},
169 {"lt", {1063, {}}}, // Lithuanian
170 {"lv", {1062, {}}}, // Latvian
171 {"mk", {1071, {}}}, // Macedonian (FYROM)
172 {"ms", {1086, {}}}, // Malaysian
173 {"mt", {1082, {}}}, // Maltese
174 {"nl", {0x13, // Dutch
175 {
176 {"be", static_cast<langid>(2067)}, // Dutch(Belgian)
177 {"nl", static_cast<langid>(1043)}, // Dutch(Netherland)
178 }}},
179 {"no", {0x14, // Norwegian
180 {
181 {"bokmaal", static_cast<langid>(1044)}, // Norwegian(Bokmaal)
182 {"nynorsk", static_cast<langid>(2068)}, // Norwegian(Nynorsk)
183 }}},
184 {"pl", {1045, {}}}, // Polish
185 {"pt", {0x16, // Portuguese
186 {
187 {"br", static_cast<langid>(1046)}, // Portuguese(Brazil)
188 {"pt", static_cast<langid>(2070)}, // Portuguese(Portugal)
189 }}},
190 {"rm", {1047, {}}}, // Rhaeto-Romanic
191 {"ro", {0x18, // Romanian
192 {
193 {"mo", static_cast<langid>(2072)}, // Romanian(Moldavia)
194 {"ro", static_cast<langid>(1048)}, // Romanian(Romania)
195 }}},
196 {"ru", {0x19, // Russian
197 {
198 {"mo", static_cast<langid>(2073)}, // Russian(Moldavia)
199 {"ru", static_cast<langid>(1049)}, // Russian(Russia)
200 }}},
201 {"sb", {1070, {}}}, // Sorbian
202 {"sk", {1051, {}}}, // Slovak
203 {"sl", {1060, {}}}, // Slovenian
204 {"sq", {1052, {}}}, // Albanian
205 {"sr", {0x1a, // Serbian
206 {
207 {"cyrillic", static_cast<langid>(3098)}, // Serbian(Cyrillic)
208 {"latin", static_cast<langid>(2074)}, // Serbian(Latin)
209 }}},
210 {"sv", {0x1d, // Swedish
211 {
212 {"fi", static_cast<langid>(2077)}, // Swedish(Finland)
213 {"se", static_cast<langid>(1053)}, // Swedish(Sweden)
214 }}},
215 {"sx", {1072, {}}}, // Sutu
216 {"sz", {1083, {}}}, // Sami(Lappish)
217 {"th", {1054, {}}}, // Thai
218 {"tn", {1074, {}}}, // Tswana
219 {"tr", {1055, {}}}, // Turkish
220 {"ts", {1073, {}}}, // Tsonga
221 {"uk", {1058, {}}}, // Ukrainian
222 {"ur", {1056, {}}}, // Urdu
223 {"ve", {1075, {}}}, // Venda
224 {"vi", {1066, {}}}, // Vietnamese
225 {"xh", {1076, {}}}, // Xhosa
226 {"zh", {0x04, // Chinese
227 {
228 {"cn", static_cast<langid>(2052)}, // Chinese(PRC)
229 {"hk", static_cast<langid>(3076)}, // Chinese(Hong Kong)
230 {"sg", static_cast<langid>(4100)}, // Chinese(Singapore)
231 {"tw", static_cast<langid>(1028)}, // Chinese(Taiwan)
232 }}},
233 {"zu", {1077, {}}}, // Zulu
234 };
235
236 if (auto el = languages.find(rfc1766); el != languages.end()) {
237 if (!el->second.sublanguages.empty()) {
238 if (auto n = stdex::strlen(el->first); ispunct(rfc1766[n])) {
239 n++;
240 if (auto el_sub = el->second.sublanguages.find(&rfc1766[n]); el_sub != el->second.sublanguages.end())
241 return el_sub->second;
242 }
243 }
244 return el->second.id;
245 }
246 return langid_unknown;
247 }
248}
249
250#if defined(__GNUC__)
251#pragma GCC diagnostic pop
252#endif