stdex: add langid

Signed-off-by: Simon Rozman <simon@rozman.si>
This commit is contained in:
Simon Rozman 2024-08-19 11:20:42 +02:00
parent 98e8756808
commit 7b21a3983c
5 changed files with 332 additions and 7 deletions

View File

@ -28,6 +28,7 @@
F421D48B2B75177E004ECBB0 /* unicode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F4C07F572AB08E690044EDC0 /* unicode.cpp */; }; F421D48B2B75177E004ECBB0 /* unicode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F4C07F572AB08E690044EDC0 /* unicode.cpp */; };
F421D48C2B751780004ECBB0 /* watchdog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F4CCA3B62B73B912007B857B /* watchdog.cpp */; }; F421D48C2B751780004ECBB0 /* watchdog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F4CCA3B62B73B912007B857B /* watchdog.cpp */; };
F421D48D2B751783004ECBB0 /* zlib.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F421D46B2B750BFD004ECBB0 /* zlib.cpp */; }; F421D48D2B751783004ECBB0 /* zlib.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F421D46B2B750BFD004ECBB0 /* zlib.cpp */; };
F4481A1A2C73427600CED93B /* langid.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F4481A192C73427600CED93B /* langid.cpp */; };
F4C07F522AB059580044EDC0 /* pch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F4C07F512AB059580044EDC0 /* pch.cpp */; }; F4C07F522AB059580044EDC0 /* pch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F4C07F512AB059580044EDC0 /* pch.cpp */; };
F4C07F552AB05B5B0044EDC0 /* main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F4C07F542AB05B5B0044EDC0 /* main.cpp */; }; F4C07F552AB05B5B0044EDC0 /* main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F4C07F542AB05B5B0044EDC0 /* main.cpp */; };
/* End PBXBuildFile section */ /* End PBXBuildFile section */
@ -79,6 +80,7 @@
F421D47A2B750EAE004ECBB0 /* inftrees.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = inftrees.c; sourceTree = "<group>"; }; F421D47A2B750EAE004ECBB0 /* inftrees.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = inftrees.c; sourceTree = "<group>"; };
F421D47B2B750EAE004ECBB0 /* uncompr.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = uncompr.c; sourceTree = "<group>"; }; F421D47B2B750EAE004ECBB0 /* uncompr.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = uncompr.c; sourceTree = "<group>"; };
F437AA902AC1BB64001E2230 /* hash.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = hash.cpp; sourceTree = "<group>"; }; F437AA902AC1BB64001E2230 /* hash.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = hash.cpp; sourceTree = "<group>"; };
F4481A192C73427600CED93B /* langid.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = langid.cpp; sourceTree = "<group>"; };
F4B7FBDC2AAF49BC00C6BE9F /* UnitTests */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = UnitTests; sourceTree = BUILT_PRODUCTS_DIR; }; F4B7FBDC2AAF49BC00C6BE9F /* UnitTests */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = UnitTests; sourceTree = BUILT_PRODUCTS_DIR; };
F4C07F4E2AB059300044EDC0 /* math.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = math.cpp; sourceTree = "<group>"; }; F4C07F4E2AB059300044EDC0 /* math.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = math.cpp; sourceTree = "<group>"; };
F4C07F502AB059580044EDC0 /* pch.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pch.hpp; sourceTree = "<group>"; }; F4C07F502AB059580044EDC0 /* pch.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pch.hpp; sourceTree = "<group>"; };
@ -157,6 +159,7 @@
children = ( children = (
F4C07F532AB05A240044EDC0 /* compat.hpp */, F4C07F532AB05A240044EDC0 /* compat.hpp */,
F437AA902AC1BB64001E2230 /* hash.cpp */, F437AA902AC1BB64001E2230 /* hash.cpp */,
F4481A192C73427600CED93B /* langid.cpp */,
F4C07F542AB05B5B0044EDC0 /* main.cpp */, F4C07F542AB05B5B0044EDC0 /* main.cpp */,
F4C07F4E2AB059300044EDC0 /* math.cpp */, F4C07F4E2AB059300044EDC0 /* math.cpp */,
F4C07F562AB08E690044EDC0 /* parser.cpp */, F4C07F562AB08E690044EDC0 /* parser.cpp */,
@ -252,6 +255,7 @@
F421D4802B750EAE004ECBB0 /* trees.c in Sources */, F421D4802B750EAE004ECBB0 /* trees.c in Sources */,
F421D48D2B751783004ECBB0 /* zlib.cpp in Sources */, F421D48D2B751783004ECBB0 /* zlib.cpp in Sources */,
F421D47D2B750EAE004ECBB0 /* deflate.c in Sources */, F421D47D2B750EAE004ECBB0 /* deflate.c in Sources */,
F4481A1A2C73427600CED93B /* langid.cpp in Sources */,
F421D4702B750E0F004ECBB0 /* adler32.c in Sources */, F421D4702B750E0F004ECBB0 /* adler32.c in Sources */,
F421D4822B750EAE004ECBB0 /* uncompr.c in Sources */, F421D4822B750EAE004ECBB0 /* uncompr.c in Sources */,
F421D4742B750E21004ECBB0 /* crc32.c in Sources */, F421D4742B750E21004ECBB0 /* crc32.c in Sources */,

35
UnitTests/langid.cpp Normal file
View File

@ -0,0 +1,35 @@
/*
SPDX-License-Identifier: MIT
Copyright © 2024 Amebis
*/
#include "pch.hpp"
using namespace std;
#ifdef _WIN32
using namespace Microsoft::VisualStudio::CppUnitTestFramework;
#endif
namespace UnitTests
{
void langid::from_rfc1766()
{
Assert::AreEqual<stdex::langid>(9, stdex::langid_from_rfc1766("en"));
Assert::AreEqual<stdex::langid>(1033, stdex::langid_from_rfc1766("en.US"));
Assert::AreEqual<stdex::langid>(1033, stdex::langid_from_rfc1766("en_US"));
Assert::AreEqual<stdex::langid>(2057, stdex::langid_from_rfc1766("en.GB"));
Assert::AreEqual<stdex::langid>(2057, stdex::langid_from_rfc1766("en_GB"));
Assert::AreEqual<stdex::langid>(9, stdex::langid_from_rfc1766("EN"));
Assert::AreEqual<stdex::langid>(1033, stdex::langid_from_rfc1766("EN.US"));
Assert::AreEqual<stdex::langid>(1033, stdex::langid_from_rfc1766("EN_US"));
Assert::AreEqual<stdex::langid>(2057, stdex::langid_from_rfc1766("EN.GB"));
Assert::AreEqual<stdex::langid>(2057, stdex::langid_from_rfc1766("EN_GB"));
Assert::AreEqual<stdex::langid>(1060, stdex::langid_from_rfc1766("sl"));
Assert::AreEqual<stdex::langid>(1060, stdex::langid_from_rfc1766("sl.SI"));
Assert::AreEqual<stdex::langid>(1060, stdex::langid_from_rfc1766("sl_SI"));
Assert::AreEqual<stdex::langid>(1060, stdex::langid_from_rfc1766("SL"));
Assert::AreEqual<stdex::langid>(1060, stdex::langid_from_rfc1766("SL.SI"));
Assert::AreEqual<stdex::langid>(1060, stdex::langid_from_rfc1766("SL_SI"));
}
}

View File

@ -12,24 +12,25 @@ int main(int, const char *[])
UnitTests::hash::crc32(); UnitTests::hash::crc32();
UnitTests::hash::md5(); UnitTests::hash::md5();
UnitTests::hash::sha1(); UnitTests::hash::sha1();
UnitTests::math::mul(); UnitTests::langid::from_rfc1766();
UnitTests::math::add(); UnitTests::math::add();
UnitTests::parser::wtest(); UnitTests::math::mul();
UnitTests::parser::sgml_test();
UnitTests::parser::http_test(); UnitTests::parser::http_test();
UnitTests::parser::sgml_test();
UnitTests::parser::wtest();
UnitTests::pool::test(); UnitTests::pool::test();
UnitTests::ring::test(); UnitTests::ring::test();
UnitTests::sgml::sgml2str(); UnitTests::sgml::sgml2str();
UnitTests::sgml::str2sgml(); UnitTests::sgml::str2sgml();
UnitTests::stream::async(); UnitTests::stream::async();
UnitTests::stream::replicator();
UnitTests::stream::open_close();
UnitTests::stream::file_stat(); UnitTests::stream::file_stat();
UnitTests::stream::open_close();
UnitTests::stream::replicator();
UnitTests::string::sprintf(); UnitTests::string::sprintf();
UnitTests::unicode::str2wstr();
UnitTests::unicode::wstr2str();
UnitTests::unicode::charset_encoder(); UnitTests::unicode::charset_encoder();
UnitTests::unicode::normalize(); UnitTests::unicode::normalize();
UnitTests::unicode::str2wstr();
UnitTests::unicode::wstr2str();
UnitTests::watchdog::test(); UnitTests::watchdog::test();
UnitTests::zlib::test(); UnitTests::zlib::test();
std::cout << "PASS\n"; std::cout << "PASS\n";

View File

@ -14,6 +14,7 @@
#include <stdex/html.hpp> #include <stdex/html.hpp>
#include <stdex/idrec.hpp> #include <stdex/idrec.hpp>
#include <stdex/interval.hpp> #include <stdex/interval.hpp>
#include <stdex/langid.hpp>
#include <stdex/locale.hpp> #include <stdex/locale.hpp>
#include <stdex/mapping.hpp> #include <stdex/mapping.hpp>
#include <stdex/math.hpp> #include <stdex/math.hpp>
@ -54,6 +55,12 @@ namespace UnitTests
TEST_METHOD(sha1); TEST_METHOD(sha1);
}; };
/// Unit tests for <stdex/langid.hpp>
TEST_CLASS(langid)
{
public:
/// Tests stdex::langid_from_rfc1766()
TEST_METHOD(from_rfc1766);
};
TEST_CLASS(math) TEST_CLASS(math)
{ {
public: public:

278
include/stdex/langid.hpp Normal file
View File

@ -0,0 +1,278 @@
/*
SPDX-License-Identifier: MIT
Copyright © 2024 Amebis
*/
#pragma once
#include "compat.hpp"
#include "string.hpp"
#include "unicode.hpp"
#ifdef _WIN32
#include "windows.h"
#endif
#include <stddef.h>
#include <stdint.h>
#include <map>
#include <string>
#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wexit-time-destructors"
#endif
namespace stdex
{
#ifdef _WIN32
/// Language identifier; on Windows this is the native `LANGID` type
using langid = LANGID;
#else
/// Language identifier; a 16-bit unsigned integer on non-Windows platforms
using langid = uint16_t;
#endif
/// Language identifier `langid_from_rfc1766()` is documented to return when no matching language is found
constexpr langid langid_unknown = 127;
#ifdef _WIN32
///
/// Parses language name and returns matching language code
///
/// The name is converted from UTF-8 to UTF-16 and resolved by the operating
/// system using `LocaleNameToLCID()`.
///
/// \param[in] rfc1766  Language name in RFC1766 syntax (UTF-8 encoded, zero-terminated)
///
/// \returns Language code or `langid_unknown` if match not found
///
inline langid langid_from_rfc1766(_In_z_ const char *rfc1766)
{
	// LocaleNameToLCID() reports failure by returning 0. Map that to the
	// documented "not found" value rather than leaking 0 to the caller.
	const auto lcid = LocaleNameToLCID(str2wstr(rfc1766, charset_id::utf8).c_str(), 0);
	return lcid ? LANGIDFROMLCID(lcid) : langid_unknown;
}
///
/// Parses language name and returns matching language code
///
/// The name is resolved by the operating system using `LocaleNameToLCID()`.
///
/// \param[in] rfc1766  Language name in RFC1766 syntax (UTF-16 encoded, zero-terminated)
///
/// \returns Language code or `langid_unknown` if match not found
///
inline langid langid_from_rfc1766(_In_z_ const wchar_t *rfc1766)
{
	// LocaleNameToLCID() reports failure by returning 0. Map that to the
	// documented "not found" value rather than leaking 0 to the caller.
	const auto lcid = LocaleNameToLCID(rfc1766, 0);
	return lcid ? LANGIDFROMLCID(lcid) : langid_unknown;
}
#else
///
/// Parses language name and returns matching language code
///
/// The name is split into a language part and an optional sublanguage part.
/// Each part ends at the first zero terminator or punctuation character (as
/// classified by `stdex::ispunct`), so "en-US", "en_US" and "en.US" are all
/// treated alike. Parts are compared case-insensitively against a built-in table.
///
/// When the language is known but the sublanguage is missing or unknown, the
/// language's own code is returned.
///
/// \param[in] rfc1766  Language name in RFC1766 syntax (zero-terminated)
///
/// \returns Language code or `langid_unknown` if match not found
///
inline langid langid_from_rfc1766(_In_z_ const char *rfc1766)
{
	///
	/// Case-insensitive "less than" for name parts
	///
	/// Only the leading part of each string, up to the first zero terminator or
	/// punctuation character, takes part in the comparison.
	///
	struct stricmp_less
	{
		bool operator()(_In_z_ const char *str1, _In_z_ const char *str2) const
		{
			stdex_assert(str1);
			stdex_assert(str2);
			for (size_t i = 0; ; ++i) {
				auto a = stdex::tolower(str1[i]);
				auto b = stdex::tolower(str2[i]);
				auto a_end = !a || stdex::ispunct(a);
				auto b_end = !b || stdex::ispunct(b);
				// End-of-part must be tested before the character values are
				// compared: a terminating punctuation character may have any
				// code (e.g. '~' is greater than every letter) and must never
				// be ordered against the other string's characters.
				if (b_end) return false; // str2 ended: str1 is equal or longer
				if (a_end) return true;  // str1 is a proper prefix of str2
				if (a != b) return a < b;
			}
		}
	};
	struct language_mapping
	{
		langid id;                                                 ///< Language ID
		std::map<const char *, langid, stricmp_less> sublanguages; ///< Sublanguages
	};
	static const std::map<const char *, language_mapping, stricmp_less> languages = {
		{"af", {1078, {}}}, // Afrikaans
		{"ar", {0x01, // Arabic
			{
				{"ae", 14337}, // Arabic(U.A.E.)
				{"bh", 15361}, // Arabic(Bahrain)
				{"dz", 5121}, // Arabic(Algeria)
				{"eg", 3073}, // Arabic(Egypt)
				{"iq", 2049}, // Arabic(Iraq)
				{"jo", 11265}, // Arabic(Jordan)
				{"kw", 13313}, // Arabic(Kuwait)
				{"lb", 12289}, // Arabic(Lebanon)
				{"ly", 4097}, // Arabic(Libya)
				{"ma", 6145}, // Arabic(Morocco)
				{"om", 8193}, // Arabic(Oman)
				{"qa", 16385}, // Arabic(Qatar)
				{"sa", 1025}, // Arabic(Saudi Arabia)
				{"sy", 10241}, // Arabic(Syria)
				{"tn", 7169}, // Arabic(Tunisia)
				{"ye", 9217}, // Arabic(Yemen)
			}}},
		{"be", {1059, {}}}, // Belarusian
		{"bg", {1026, {}}}, // Bulgarian
		{"ca", {1027, {}}}, // Catalan
		{"cs", {1029, {}}}, // Czech
		{"da", {1030, {}}}, // Danish
		{"de", {0x07, // German
			{
				{"at", 3079}, // German(Austrian)
				{"ch", 2055}, // German(Swiss)
				{"de", 1031}, // German(Germany)
				{"li", 5127}, // German(Liechtenstein)
				{"lu", 4103}, // German(Luxembourg)
			}}},
		{"el", {1032, {}}}, // Greek
		{"en", {0x09, // English
			{
				{"029", 9225}, // English(Caribbean), current Windows locale name "en-029"
				{"au", 3081}, // English(Australian)
				{"bz", 10249}, // English(Belize)
				{"ca", 4105}, // English(Canadian)
				{"cb", 9225}, // English(Caribbean), legacy name "en-CB"
				{"gb", 2057}, // English(British)
				{"ie", 6153}, // English(Ireland)
				{"jm", 8201}, // English(Jamaica)
				{"nz", 5129}, // English(New Zealand)
				{"tt", 11273}, // English(Trinidad)
				{"us", 1033}, // English(United States)
				{"za", 7177}, // English(South Africa)
			}}},
		{"es", {0x0a, // Spanish
			{
				{"ar", 11274}, // Spanish(Argentina)
				{"bo", 16394}, // Spanish(Bolivia)
				{"cl", 13322}, // Spanish(Chile)
				{"co", 9226}, // Spanish(Colombia)
				{"cr", 5130}, // Spanish(Costa Rica)
				{"do", 7178}, // Spanish(Dominican Republic)
				{"ec", 12298}, // Spanish(Ecuador)
				{"es", 1034}, // Spanish(Spain)
				{"gt", 4106}, // Spanish(Guatemala)
				{"hn", 18442}, // Spanish(Honduras)
				{"mx", 2058}, // Spanish(Mexican)
				{"ni", 19466}, // Spanish(Nicaragua)
				{"pa", 6154}, // Spanish(Panama)
				{"pe", 10250}, // Spanish(Peru)
				{"pr", 20490}, // Spanish(Puerto Rico)
				{"py", 15370}, // Spanish(Paraguay)
				{"sv", 17418}, // Spanish(El Salvador)
				{"uy", 14346}, // Spanish(Uruguay)
				{"ve", 8202}, // Spanish(Venezuela)
			}}},
		{"et", {1061, {}}}, // Estonian
		{"eu", {1069, {}}}, // Basque
		{"fa", {1065, {}}}, // Farsi
		{"fi", {1035, {}}}, // Finnish
		{"fo", {1080, {}}}, // Faeroese
		{"fr", {0x0c, // French
			{
				{"be", 2060}, // French(Belgian)
				{"ca", 3084}, // French(Canadian)
				{"ch", 4108}, // French(Swiss)
				{"fr", 1036}, // French(France)
				{"lu", 5132}, // French(Luxembourg)
			}}},
		{"gd", {1084, {}}}, // Gaelic(Scots)
		{"he", {1037, {}}}, // Hebrew
		{"hi", {1081, {}}}, // Hindi
		{"hr", {1050, {}}}, // Croatian
		{"hu", {1038, {}}}, // Hungarian
		{"in", {1057, {}}}, // Indonesian
		{"is", {1039, {}}}, // Icelandic
		{"it", {0x10, // Italian
			{
				{"ch", 2064}, // Italian(Swiss)
				{"it", 1040}, // Italian(Italy)
			}}},
		{"ja", {1041, {}}}, // Japanese
		{"ji", {1085, {}}}, // Yiddish
		{"ko", {0x12, // Korean
			{
				{"johab", 2066}, // Korean(Johab)
				{"kr", 1042}, // Korean(Korea)
			}}},
		{"lt", {1063, {}}}, // Lithuanian
		{"lv", {1062, {}}}, // Latvian
		{"mk", {1071, {}}}, // Macedonian (FYROM)
		{"ms", {1086, {}}}, // Malaysian
		{"mt", {1082, {}}}, // Maltese
		{"nl", {0x13, // Dutch
			{
				{"be", 2067}, // Dutch(Belgian)
				{"nl", 1043}, // Dutch(Netherland)
			}}},
		{"no", {0x14, // Norwegian
			{
				{"bokmaal", 1044}, // Norwegian(Bokmaal)
				{"nynorsk", 2068}, // Norwegian(Nynorsk)
			}}},
		{"pl", {1045, {}}}, // Polish
		{"pt", {0x16, // Portuguese
			{
				{"br", 1046}, // Portuguese(Brazil)
				{"pt", 2070}, // Portuguese(Portugal)
			}}},
		{"rm", {1047, {}}}, // Rhaeto-Romanic
		{"ro", {0x18, // Romanian
			{
				{"md", 2072}, // Romanian(Moldavia), ISO 3166 country code
				{"mo", 2072}, // Romanian(Moldavia), legacy code
				{"ro", 1048}, // Romanian(Romania)
			}}},
		{"ru", {0x19, // Russian
			{
				{"md", 2073}, // Russian(Moldavia), ISO 3166 country code
				{"mo", 2073}, // Russian(Moldavia), legacy code
				{"ru", 1049}, // Russian(Russia)
			}}},
		{"sb", {1070, {}}}, // Sorbian
		{"sk", {1051, {}}}, // Slovak
		{"sl", {1060, {}}}, // Slovenian
		{"sq", {1052, {}}}, // Albanian
		{"sr", {0x1a, // Serbian
			{
				{"cyrillic", 3098}, // Serbian(Cyrillic)
				{"latin", 2074}, // Serbian(Latin)
			}}},
		{"sv", {0x1d, // Swedish
			{
				{"fi", 2077}, // Swedish(Finland)
				{"se", 1053}, // Swedish(Sweden)
			}}},
		{"sx", {1072, {}}}, // Sutu
		{"sz", {1083, {}}}, // Sami(Lappish)
		{"th", {1054, {}}}, // Thai
		{"tn", {1074, {}}}, // Tswana
		{"tr", {1055, {}}}, // Turkish
		{"ts", {1073, {}}}, // Tsonga
		{"uk", {1058, {}}}, // Ukrainian
		{"ur", {1056, {}}}, // Urdu
		{"ve", {1075, {}}}, // Venda
		{"vi", {1066, {}}}, // Vietnamese
		{"xh", {1076, {}}}, // Xhosa
		{"zh", {0x04, // Chinese
			{
				{"cn", 2052}, // Chinese(PRC)
				{"hk", 3076}, // Chinese(Hong Kong)
				{"sg", 4100}, // Chinese(Singapore)
				{"tw", 1028}, // Chinese(Taiwan)
			}}},
		{"zu", {1077, {}}}, // Zulu
	};
	if (auto el = languages.find(rfc1766); el != languages.end()) {
		if (!el->second.sublanguages.empty()) {
			// A match means the language part of rfc1766 is exactly as long as
			// the table key, so rfc1766[n] is either the zero terminator or the
			// separator that introduces the sublanguage part.
			if (auto n = stdex::strlen(el->first); ispunct(rfc1766[n])) {
				n++;
				if (auto el_sub = el->second.sublanguages.find(&rfc1766[n]); el_sub != el->second.sublanguages.end())
					return el_sub->second;
			}
		}
		// No, or unknown, sublanguage: fall back to the language's own code.
		return el->second.id;
	}
	return langid_unknown;
}
#endif
}
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif