diff --git a/UnitTests/UnitTests.xcodeproj/project.pbxproj b/UnitTests/UnitTests.xcodeproj/project.pbxproj index 895194a78..b7086f2da 100644 --- a/UnitTests/UnitTests.xcodeproj/project.pbxproj +++ b/UnitTests/UnitTests.xcodeproj/project.pbxproj @@ -28,6 +28,7 @@ F421D48B2B75177E004ECBB0 /* unicode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F4C07F572AB08E690044EDC0 /* unicode.cpp */; }; F421D48C2B751780004ECBB0 /* watchdog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F4CCA3B62B73B912007B857B /* watchdog.cpp */; }; F421D48D2B751783004ECBB0 /* zlib.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F421D46B2B750BFD004ECBB0 /* zlib.cpp */; }; + F4481A1A2C73427600CED93B /* langid.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F4481A192C73427600CED93B /* langid.cpp */; }; F4C07F522AB059580044EDC0 /* pch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F4C07F512AB059580044EDC0 /* pch.cpp */; }; F4C07F552AB05B5B0044EDC0 /* main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F4C07F542AB05B5B0044EDC0 /* main.cpp */; }; /* End PBXBuildFile section */ @@ -79,6 +80,7 @@ F421D47A2B750EAE004ECBB0 /* inftrees.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = inftrees.c; sourceTree = ""; }; F421D47B2B750EAE004ECBB0 /* uncompr.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = uncompr.c; sourceTree = ""; }; F437AA902AC1BB64001E2230 /* hash.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = hash.cpp; sourceTree = ""; }; + F4481A192C73427600CED93B /* langid.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = langid.cpp; sourceTree = ""; }; F4B7FBDC2AAF49BC00C6BE9F /* UnitTests */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = UnitTests; sourceTree = BUILT_PRODUCTS_DIR; }; F4C07F4E2AB059300044EDC0 /* math.cpp */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = math.cpp; sourceTree = ""; }; F4C07F502AB059580044EDC0 /* pch.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pch.hpp; sourceTree = ""; }; @@ -157,6 +159,7 @@ children = ( F4C07F532AB05A240044EDC0 /* compat.hpp */, F437AA902AC1BB64001E2230 /* hash.cpp */, + F4481A192C73427600CED93B /* langid.cpp */, F4C07F542AB05B5B0044EDC0 /* main.cpp */, F4C07F4E2AB059300044EDC0 /* math.cpp */, F4C07F562AB08E690044EDC0 /* parser.cpp */, @@ -252,6 +255,7 @@ F421D4802B750EAE004ECBB0 /* trees.c in Sources */, F421D48D2B751783004ECBB0 /* zlib.cpp in Sources */, F421D47D2B750EAE004ECBB0 /* deflate.c in Sources */, + F4481A1A2C73427600CED93B /* langid.cpp in Sources */, F421D4702B750E0F004ECBB0 /* adler32.c in Sources */, F421D4822B750EAE004ECBB0 /* uncompr.c in Sources */, F421D4742B750E21004ECBB0 /* crc32.c in Sources */, diff --git a/UnitTests/langid.cpp b/UnitTests/langid.cpp new file mode 100644 index 000000000..9cd906241 --- /dev/null +++ b/UnitTests/langid.cpp @@ -0,0 +1,35 @@ +/* + SPDX-License-Identifier: MIT + Copyright © 2024 Amebis +*/ + +#include "pch.hpp" + +using namespace std; +#ifdef _WIN32 +using namespace Microsoft::VisualStudio::CppUnitTestFramework; +#endif + +namespace UnitTests +{ + void langid::from_rfc1766() + { + Assert::AreEqual(9, stdex::langid_from_rfc1766("en")); + Assert::AreEqual(1033, stdex::langid_from_rfc1766("en.US")); + Assert::AreEqual(1033, stdex::langid_from_rfc1766("en_US")); + Assert::AreEqual(2057, stdex::langid_from_rfc1766("en.GB")); + Assert::AreEqual(2057, stdex::langid_from_rfc1766("en_GB")); + Assert::AreEqual(9, stdex::langid_from_rfc1766("EN")); + Assert::AreEqual(1033, stdex::langid_from_rfc1766("EN.US")); + Assert::AreEqual(1033, stdex::langid_from_rfc1766("EN_US")); + Assert::AreEqual(2057, stdex::langid_from_rfc1766("EN.GB")); + Assert::AreEqual(2057, stdex::langid_from_rfc1766("EN_GB")); + + Assert::AreEqual(1060, 
stdex::langid_from_rfc1766("sl")); + Assert::AreEqual(1060, stdex::langid_from_rfc1766("sl.SI")); + Assert::AreEqual(1060, stdex::langid_from_rfc1766("sl_SI")); + Assert::AreEqual(1060, stdex::langid_from_rfc1766("SL")); + Assert::AreEqual(1060, stdex::langid_from_rfc1766("SL.SI")); + Assert::AreEqual(1060, stdex::langid_from_rfc1766("SL_SI")); + } +} diff --git a/UnitTests/main.cpp b/UnitTests/main.cpp index 36ebca5f6..05c781896 100644 --- a/UnitTests/main.cpp +++ b/UnitTests/main.cpp @@ -12,24 +12,25 @@ int main(int, const char *[]) UnitTests::hash::crc32(); UnitTests::hash::md5(); UnitTests::hash::sha1(); - UnitTests::math::mul(); + UnitTests::langid::from_rfc1766(); UnitTests::math::add(); - UnitTests::parser::wtest(); - UnitTests::parser::sgml_test(); + UnitTests::math::mul(); UnitTests::parser::http_test(); + UnitTests::parser::sgml_test(); + UnitTests::parser::wtest(); UnitTests::pool::test(); UnitTests::ring::test(); UnitTests::sgml::sgml2str(); UnitTests::sgml::str2sgml(); UnitTests::stream::async(); - UnitTests::stream::replicator(); - UnitTests::stream::open_close(); UnitTests::stream::file_stat(); + UnitTests::stream::open_close(); + UnitTests::stream::replicator(); UnitTests::string::sprintf(); - UnitTests::unicode::str2wstr(); - UnitTests::unicode::wstr2str(); UnitTests::unicode::charset_encoder(); UnitTests::unicode::normalize(); + UnitTests::unicode::str2wstr(); + UnitTests::unicode::wstr2str(); UnitTests::watchdog::test(); UnitTests::zlib::test(); std::cout << "PASS\n"; diff --git a/UnitTests/pch.hpp b/UnitTests/pch.hpp index b64491f8a..73d03b090 100644 --- a/UnitTests/pch.hpp +++ b/UnitTests/pch.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,12 @@ namespace UnitTests TEST_METHOD(sha1); }; + TEST_CLASS(langid) + { + public: + TEST_METHOD(from_rfc1766); + }; + TEST_CLASS(math) { public: diff --git a/include/stdex/langid.hpp b/include/stdex/langid.hpp new file mode 100644 index 000000000..b66080f4a 
--- /dev/null
+++ b/include/stdex/langid.hpp
@@ -0,0 +1,303 @@
+/*
+	SPDX-License-Identifier: MIT
+	Copyright © 2024 Amebis
+*/
+
+#pragma once
+
+#include "compat.hpp"
+#include "string.hpp"
+#include "unicode.hpp"
+#ifdef _WIN32
+#include "windows.h"
+#endif
+#include <stdint.h>
+#include <map>
+#include <memory>
+#include <string>
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wexit-time-destructors"
+#endif
+
+namespace stdex
+{
+	///
+	/// Language identifier: `LANGID` on Windows, `uint16_t` elsewhere
+	///
+#ifdef _WIN32
+	using langid = LANGID;
+#else
+	using langid = uint16_t;
+#endif
+
+	///
+	/// Language code returned when a language name cannot be matched
+	///
+	constexpr langid langid_unknown = 127;
+
+#ifdef _WIN32
+	///
+	/// Parses language name and returns matching language code
+	///
+	/// The name is converted from UTF-8 and resolved by `LocaleNameToLCID()`.
+	///
+	/// \param[in] rfc1766  Language name in RFC1766 syntax
+	///
+	/// \returns Language code or `langid_unknown` if match not found
+	///
+	inline langid langid_from_rfc1766(_In_z_ const char *rfc1766)
+	{
+		// LocaleNameToLCID() returns 0 on failure; report that as langid_unknown, as documented.
+		const LCID lcid = LocaleNameToLCID(str2wstr(rfc1766, charset_id::utf8).c_str(), 0);
+		return lcid ? LANGIDFROMLCID(lcid) : langid_unknown;
+	}
+
+	///
+	/// Parses language name and returns matching language code
+	///
+	/// The name is resolved by `LocaleNameToLCID()`.
+	///
+	/// \param[in] rfc1766  Language name in RFC1766 syntax
+	///
+	/// \returns Language code or `langid_unknown` if match not found
+	///
+	inline langid langid_from_rfc1766(_In_z_ const wchar_t *rfc1766)
+	{
+		// LocaleNameToLCID() returns 0 on failure; report that as langid_unknown, as documented.
+		const LCID lcid = LocaleNameToLCID(rfc1766, 0);
+		return lcid ? LANGIDFROMLCID(lcid) : langid_unknown;
+	}
+#else
+	///
+	/// Parses language name and returns matching language code
+	///
+	/// The language and the optional sublanguage are matched case-insensitively against a
+	/// built-in table. Any character classified by `stdex::ispunct()` (`-`, `_`, `.`, ...) ends
+	/// the language part and separates it from the sublanguage. When the sublanguage is
+	/// missing or not listed, the code of the language alone is returned.
+	///
+	/// \param[in] rfc1766  Language name in RFC1766 syntax
+	///
+	/// \returns Language code or `langid_unknown` if match not found
+	///
+	inline langid langid_from_rfc1766(_In_z_ const char *rfc1766)
+	{
+		// Case-insensitive "less" for tags; the string end or the first punctuation character terminates a tag.
+		struct stricmp_less
+		{
+			bool operator()(_In_z_ const char *str1, _In_z_ const char *str2) const
+			{
+				stdex_assert(str1);
+				stdex_assert(str2);
+				for (size_t i = 0; ; ++i) {
+					auto a = stdex::tolower(str1[i]);
+					auto b = stdex::tolower(str2[i]);
+					auto a_end = !a || stdex::ispunct(a);
+					auto b_end = !b || stdex::ispunct(b);
+					// Decide on end-of-tag before comparing character values: a separator such as '~' or '|'
+					// has a higher code than any letter and must not make the shorter tag compare as greater.
+					if (a_end || b_end)
+						return a_end && !b_end;
+					if (a != b)
+						return a < b;
+				}
+			}
+		};
+		struct language_mapping
+		{
+			langid id; ///< Language ID
+			std::map<const char *, langid, stricmp_less> sublanguages; ///< Sublanguages
+		};
+		static const std::map<const char *, language_mapping, stricmp_less> languages = {
+			{"af", {1078, {}}}, // Afrikaans
+			{"ar", {0x01, // Arabic
+			{
+				{"ae", 14337}, // Arabic(U.A.E.)
+				{"bh", 15361}, // Arabic(Bahrain)
+				{"dz", 5121}, // Arabic(Algeria)
+				{"eg", 3073}, // Arabic(Egypt)
+				{"iq", 2049}, // Arabic(Iraq)
+				{"jo", 11265}, // Arabic(Jordan)
+				{"kw", 13313}, // Arabic(Kuwait)
+				{"lb", 12289}, // Arabic(Lebanon)
+				{"ly", 4097}, // Arabic(Libya)
+				{"ma", 6145}, // Arabic(Morocco)
+				{"om", 8193}, // Arabic(Oman)
+				{"qa", 16385}, // Arabic(Qatar)
+				{"sa", 1025}, // Arabic(Saudi Arabia)
+				{"sy", 10241}, // Arabic(Syria)
+				{"tn", 7169}, // Arabic(Tunisia)
+				{"ye", 9217}, // Arabic(Yemen)
+			}}},
+			{"be", {1059, {}}}, // Belarusian
+			{"bg", {1026, {}}}, // Bulgarian
+			{"ca", {1027, {}}}, // Catalan
+			{"cs", {1029, {}}}, // Czech
+			{"da", {1030, {}}}, // Danish
+			{"de", {0x07, // German
+			{
+				{"at", 3079}, // German(Austrian)
+				{"ch", 2055}, // German(Swiss)
+				{"de", 1031}, // German(Germany)
+				{"li", 5127}, // German(Liechtenstein)
+				{"lu", 4103}, // German(Luxembourg)
+			}}},
+			{"el", {1032, {}}}, // Greek
+			{"en", {0x09, // English
+			{
+				{"029", 9225}, // English(Caribbean)
+				{"au", 3081}, // English(Australian)
+				{"bz", 10249}, // English(Belize)
+				{"ca", 4105}, // English(Canadian)
+				{"cb", 9225}, // English(Caribbean), legacy name
+				{"gb", 2057}, // English(British)
+				{"ie", 6153}, // English(Ireland)
+				{"jm", 8201}, // English(Jamaica)
+				{"nz", 5129}, // English(New Zealand)
+				{"tt", 11273}, // English(Trinidad)
+				{"us", 1033}, // English(United States)
+				{"za", 7177}, // English(South Africa)
+			}}},
+			{"es", {0x0a, // Spanish
+			{
+				{"ar", 11274}, // Spanish(Argentina)
+				{"bo", 16394}, // Spanish(Bolivia)
+				{"cl", 13322}, // Spanish(Chile)
+				{"co", 9226}, // Spanish(Colombia)
+				{"cr", 5130}, // Spanish(Costa Rica)
+				{"do", 7178}, // Spanish(Dominican Republic)
+				{"ec", 12298}, // Spanish(Ecuador)
+				{"es", 1034}, // Spanish(Spain)
+				{"gt", 4106}, // Spanish(Guatemala)
+				{"hn", 18442}, // Spanish(Honduras)
+				{"mx", 2058}, // Spanish(Mexican)
+				{"ni", 19466}, // Spanish(Nicaragua)
+				{"pa", 6154}, // Spanish(Panama)
+				{"pe", 10250}, // Spanish(Peru)
+				{"pr", 20490}, // Spanish(Puerto Rico)
+				{"py", 15370}, // Spanish(Paraguay)
+				{"sv", 17418}, // Spanish(El Salvador)
+				{"uy", 14346}, // Spanish(Uruguay)
+				{"ve", 8202}, // Spanish(Venezuela)
+			}}},
+			{"et", {1061, {}}}, // Estonian
+			{"eu", {1069, {}}}, // Basque
+			{"fa", {1065, {}}}, // Farsi
+			{"fi", {1035, {}}}, // Finnish
+			{"fo", {1080, {}}}, // Faeroese
+			{"fr", {0x0c, // French
+			{
+				{"be", 2060}, // French(Belgian)
+				{"ca", 3084}, // French(Canadian)
+				{"ch", 4108}, // French(Swiss)
+				{"fr", 1036}, // French(France)
+				{"lu", 5132}, // French(Luxembourg)
+			}}},
+			{"gd", {1084, {}}}, // Gaelic(Scots)
+			{"he", {1037, {}}}, // Hebrew
+			{"hi", {1081, {}}}, // Hindi
+			{"hr", {1050, {}}}, // Croatian
+			{"hu", {1038, {}}}, // Hungarian
+			{"in", {1057, {}}}, // Indonesian
+			{"is", {1039, {}}}, // Icelandic
+			{"it", {0x10, // Italian
+			{
+				{"ch", 2064}, // Italian(Swiss)
+				{"it", 1040}, // Italian(Italy)
+			}}},
+			{"ja", {1041, {}}}, // Japanese
+			{"ji", {1085, {}}}, // Yiddish
+			{"ko", {0x12, // Korean
+			{
+				{"johab", 2066}, // Korean(Johab)
+				{"kr", 1042}, // Korean(Korea)
+			}}},
+			{"lt", {1063, {}}}, // Lithuanian
+			{"lv", {1062, {}}}, // Latvian
+			{"mk", {1071, {}}}, // Macedonian (FYROM)
+			{"ms", {1086, {}}}, // Malaysian
+			{"mt", {1082, {}}}, // Maltese
+			{"nl", {0x13, // Dutch
+			{
+				{"be", 2067}, // Dutch(Belgian)
+				{"nl", 1043}, // Dutch(Netherland)
+			}}},
+			{"no", {0x14, // Norwegian
+			{
+				{"bokmaal", 1044}, // Norwegian(Bokmaal)
+				{"nynorsk", 2068}, // Norwegian(Nynorsk)
+			}}},
+			{"pl", {1045, {}}}, // Polish
+			{"pt", {0x16, // Portuguese
+			{
+				{"br", 1046}, // Portuguese(Brazil)
+				{"pt", 2070}, // Portuguese(Portugal)
+			}}},
+			{"rm", {1047, {}}}, // Rhaeto-Romanic
+			{"ro", {0x18, // Romanian
+			{
+				{"mo", 2072}, // Romanian(Moldavia)
+				{"ro", 1048}, // Romanian(Romania)
+			}}},
+			{"ru", {0x19, // Russian
+			{
+				{"mo", 2073}, // Russian(Moldavia)
+				{"ru", 1049}, // Russian(Russia)
+			}}},
+			{"sb", {1070, {}}}, // Sorbian
+			{"sk", {1051, {}}}, // Slovak
+			{"sl", {1060, {}}}, // Slovenian
+			{"sq", {1052, {}}}, // Albanian
+			{"sr", {0x1a, // Serbian
+			{
+				{"cyrillic", 3098}, // Serbian(Cyrillic)
+				{"latin", 2074}, // Serbian(Latin)
+			}}},
+			{"sv", {0x1d, // Swedish
+			{
+				{"fi", 2077}, // Swedish(Finland)
+				{"se", 1053}, // Swedish(Sweden)
+			}}},
+			{"sx", {1072, {}}}, // Sutu
+			{"sz", {1083, {}}}, // Sami(Lappish)
+			{"th", {1054, {}}}, // Thai
+			{"tn", {1074, {}}}, // Tswana
+			{"tr", {1055, {}}}, // Turkish
+			{"ts", {1073, {}}}, // Tsonga
+			{"uk", {1058, {}}}, // Ukrainian
+			{"ur", {1056, {}}}, // Urdu
+			{"ve", {1075, {}}}, // Venda
+			{"vi", {1066, {}}}, // Vietnamese
+			{"xh", {1076, {}}}, // Xhosa
+			{"zh", {0x04, // Chinese
+			{
+				{"cn", 2052}, // Chinese(PRC)
+				{"hk", 3076}, // Chinese(Hong Kong)
+				{"sg", 4100}, // Chinese(Singapore)
+				{"tw", 1028}, // Chinese(Taiwan)
+			}}},
+			{"zu", {1077, {}}}, // Zulu
+		};
+
+		if (auto el = languages.find(rfc1766); el != languages.end()) {
+			if (!el->second.sublanguages.empty()) {
+				// The language part of rfc1766 is exactly as long as the matched key, so n indexes its separator (or terminator).
+				if (auto n = stdex::strlen(el->first); stdex::ispunct(rfc1766[n])) {
+					n++;
+					if (auto el_sub = el->second.sublanguages.find(&rfc1766[n]); el_sub != el->second.sublanguages.end())
+						return el_sub->second;
+				}
+			}
+			// No sublanguage given, or not a listed one: fall back to the language alone.
+			return el->second.id;
+		}
+		return langid_unknown;
+	}
+#endif
+}
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif