Add SGML↔Unicode conversion
Signed-off-by: Simon Rozman <simon@rozman.si>
This commit is contained in:
parent
9ce8e6bff9
commit
b43b853235
5
UnitTests/.gitignore
vendored
Normal file
5
UnitTests/.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
/.vs
|
||||
/*.user
|
||||
/Debug
|
||||
/Release
|
||||
/x64
|
31
UnitTests/UnitTests.sln
Normal file
31
UnitTests/UnitTests.sln
Normal file
@ -0,0 +1,31 @@
|
||||
|
||||
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||
# Visual Studio Version 16
|
||||
VisualStudioVersion = 16.0.32126.315
|
||||
MinimumVisualStudioVersion = 10.0.40219.1
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnitTests", "UnitTests.vcxproj", "{9AFC377D-C32D-4D42-82C2-09FC818020A2}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Win32 = Debug|Win32
|
||||
Debug|x64 = Debug|x64
|
||||
Release|Win32 = Release|Win32
|
||||
Release|x64 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||
{9AFC377D-C32D-4D42-82C2-09FC818020A2}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{9AFC377D-C32D-4D42-82C2-09FC818020A2}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{9AFC377D-C32D-4D42-82C2-09FC818020A2}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{9AFC377D-C32D-4D42-82C2-09FC818020A2}.Debug|x64.Build.0 = Debug|x64
|
||||
{9AFC377D-C32D-4D42-82C2-09FC818020A2}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{9AFC377D-C32D-4D42-82C2-09FC818020A2}.Release|Win32.Build.0 = Release|Win32
|
||||
{9AFC377D-C32D-4D42-82C2-09FC818020A2}.Release|x64.ActiveCfg = Release|x64
|
||||
{9AFC377D-C32D-4D42-82C2-09FC818020A2}.Release|x64.Build.0 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
EndGlobalSection
|
||||
GlobalSection(ExtensibilityGlobals) = postSolution
|
||||
SolutionGuid = {BBDB843D-98C3-46EF-BDE8-0E80FD851852}
|
||||
EndGlobalSection
|
||||
EndGlobal
|
129
UnitTests/UnitTests.vcxproj
Normal file
129
UnitTests/UnitTests.vcxproj
Normal file
@ -0,0 +1,129 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<VCProjectVersion>16.0</VCProjectVersion>
|
||||
<ProjectGuid>{9AFC377D-C32D-4D42-82C2-09FC818020A2}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>UnitTests</RootNamespace>
|
||||
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
|
||||
<ProjectSubType>NativeUnitTestProject</ProjectSubType>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Label="Configuration">
|
||||
<ConfigurationType>DynamicLibrary</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<PlatformToolset>$(DefaultPlatformToolset)</PlatformToolset>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
<UseOfMfc>false</UseOfMfc>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)'=='Debug'" Label="Configuration">
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)'=='Release'" Label="Configuration">
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration" />
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="Shared">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup>
|
||||
<RunCodeAnalysis>true</RunCodeAnalysis>
|
||||
<CodeAnalysisRuleSet>NativeRecommendedRules.ruleset</CodeAnalysisRuleSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)'=='Release'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup>
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>Use</PrecompiledHeader>
|
||||
<WarningLevel>Level4</WarningLevel>
|
||||
<EnablePREfast>true</EnablePREfast>
|
||||
<SDLCheck>true</SDLCheck>
|
||||
<AdditionalIncludeDirectories>..\include;$(VCInstallDir)UnitTest\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
|
||||
<UseFullPaths>true</UseFullPaths>
|
||||
<PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Windows</SubSystem>
|
||||
<AdditionalLibraryDirectories>$(VCInstallDir)UnitTest\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>Advapi32.lib;Shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
|
||||
<ClCompile>
|
||||
<PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
|
||||
<ClCompile>
|
||||
<PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Platform)'=='Win32'">
|
||||
<ClCompile>
|
||||
<PreprocessorDefinitions>WIN32;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
|
||||
<ItemGroup>
|
||||
<ClCompile Include="pch.cpp">
|
||||
<PrecompiledHeader>Create</PrecompiledHeader>
|
||||
</ClCompile>
|
||||
<ClCompile Include="sgml.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="pch.h" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
30
UnitTests/UnitTests.vcxproj.filters
Normal file
30
UnitTests/UnitTests.vcxproj.filters
Normal file
@ -0,0 +1,30 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup>
|
||||
<Filter Include="Source Files">
|
||||
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
|
||||
<Extensions>cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
|
||||
</Filter>
|
||||
<Filter Include="Header Files">
|
||||
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
|
||||
<Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd</Extensions>
|
||||
</Filter>
|
||||
<Filter Include="Resource Files">
|
||||
<UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
|
||||
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
|
||||
</Filter>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="pch.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="sgml.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="pch.h">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
</Project>
|
6
UnitTests/pch.cpp
Normal file
6
UnitTests/pch.cpp
Normal file
@ -0,0 +1,6 @@
|
||||
/*
|
||||
SPDX-License-Identifier: MIT
|
||||
Copyright © 2023 Amebis
|
||||
*/
|
||||
|
||||
#include "pch.h"
|
25
UnitTests/pch.h
Normal file
25
UnitTests/pch.h
Normal file
@ -0,0 +1,25 @@
|
||||
/*
|
||||
SPDX-License-Identifier: MIT
|
||||
Copyright © 2023 Amebis
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#define SECURITY_WIN32
|
||||
#define _WINSOCKAPI_ // Prevent inclusion of winsock.h in windows.h
|
||||
|
||||
#include <stdex/base64.h>
|
||||
#include <stdex/errno.h>
|
||||
#include <stdex/exception.h>
|
||||
#include <stdex/hex.h>
|
||||
#include <stdex/idrec.h>
|
||||
#include <stdex/interval.h>
|
||||
#include <stdex/mapping.h>
|
||||
//#include <stdex/parser.h>
|
||||
#include <stdex/progress.h>
|
||||
#include <stdex/sal.h>
|
||||
#include <stdex/sgml.h>
|
||||
#include <stdex/string.h>
|
||||
#include <stdex/vector_queue.h>
|
||||
|
||||
#include <CppUnitTest.h>
|
59
UnitTests/sgml.cpp
Normal file
59
UnitTests/sgml.cpp
Normal file
@ -0,0 +1,59 @@
|
||||
/*
|
||||
SPDX-License-Identifier: MIT
|
||||
Copyright © 2023 Amebis
|
||||
*/
|
||||
|
||||
#include "pch.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace Microsoft::VisualStudio::CppUnitTestFramework;
|
||||
|
||||
namespace UnitTests
|
||||
{
|
||||
TEST_CLASS(sgml)
|
||||
{
|
||||
public:
|
||||
TEST_METHOD(sgml2str)
|
||||
{
|
||||
Assert::AreEqual(L"This is a test.", stdex::sgml2str("This is a test.", (size_t)-1).c_str());
|
||||
Assert::AreEqual(L"Th\u00ed\u0161 i\u22c5 a te\u0073\u0304t.&unknown;😀😅", stdex::sgml2str("Thíš i⋅ a te&smacr;t.&unknown;😀😅", (size_t)-1).c_str());
|
||||
Assert::AreEqual(L"This", stdex::sgml2str("This is a test.", 4).c_str());
|
||||
Assert::AreEqual(L"T\u0068\u0301", stdex::sgml2str("T&hacute;is is a test.", 9).c_str());
|
||||
Assert::AreEqual(L"T&hac", stdex::sgml2str("T&hacute;is is a test.", 5).c_str());
|
||||
Assert::AreEqual(L"The "quoted" & text.", stdex::sgml2str("The "quoted" & text.", (size_t)-1, stdex::sgml_c).c_str());
|
||||
|
||||
stdex::mapping_vector<size_t> map;
|
||||
constexpr size_t i = 0;
|
||||
constexpr size_t j = 0;
|
||||
stdex::sgml2str("Thíš i⋅ a te&smacr;t.&unknown;😀😅", (size_t)-1, 0, stdex::mapping<size_t>(i, j), &map);
|
||||
Assert::IsTrue(stdex::mapping_vector<size_t>{
|
||||
{ i + 2, j + 2 },
|
||||
{ i + 10, j + 3 },
|
||||
{ i + 10, j + 3 },
|
||||
{ i + 18, j + 4 },
|
||||
{ i + 20, j + 6 },
|
||||
{ i + 26, j + 7 },
|
||||
{ i + 27, j + 8 },
|
||||
{ i + 32, j + 9 },
|
||||
{ i + 35, j + 12 },
|
||||
{ i + 42, j + 14 },
|
||||
{ i + 53, j + 25 },
|
||||
{ i + 62, j + 27 },
|
||||
{ i + 62, j + 27 },
|
||||
{ i + 71, j + 29 },
|
||||
} == map);
|
||||
}
|
||||
|
||||
TEST_METHOD(str2sgml)
|
||||
{
|
||||
Assert::AreEqual("This is a test.", stdex::str2sgml(L"This is a test.", (size_t)-1).c_str());
|
||||
Assert::AreEqual("Thíš i⋅ a te&smacr;t.&unknown;😀😅", stdex::str2sgml(L"Th\u00ed\u0161 i\u22c5 a te\u0073\u0304t.&unknown;😀😅", (size_t)-1).c_str());
|
||||
Assert::AreEqual("This", stdex::str2sgml(L"This is a test.", 4).c_str());
|
||||
Assert::AreEqual("te&smacr;", stdex::str2sgml(L"te\u0073\u0304t", 4).c_str());
|
||||
Assert::AreEqual("tes", stdex::str2sgml(L"te\u0073\u0304t", 3).c_str());
|
||||
Assert::AreEqual("⌘‰͢", stdex::str2sgml(L"⌘‰͢", (size_t)-1).c_str());
|
||||
Assert::AreEqual("$\"<>&", stdex::str2sgml(L"$\"<>&", (size_t)-1).c_str());
|
||||
Assert::AreEqual("$"<>&", stdex::str2sgml(L"$\"<>&", (size_t)-1, stdex::sgml_c).c_str());
|
||||
}
|
||||
};
|
||||
}
|
31
include/stdex/mapping.h
Normal file
31
include/stdex/mapping.h
Normal file
@ -0,0 +1,31 @@
|
||||
/*
|
||||
SPDX-License-Identifier: MIT
|
||||
Copyright © 2023 Amebis
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "sal.h"
|
||||
#include <vector>
|
||||
|
||||
namespace stdex
|
||||
{
|
||||
///
|
||||
/// Maps index in source string to index in destination string
|
||||
///
|
||||
template <class T>
|
||||
struct mapping {
|
||||
T from; // index in source string
|
||||
T to; // index in destination string
|
||||
|
||||
inline mapping() : from(0), to(0) {}
|
||||
inline mapping(_In_ T x) : from(x), to(x) {}
|
||||
inline mapping(_In_ T _from, _In_ T _to) : from(_from), to(_to) {}
|
||||
|
||||
friend bool operator ==(_In_ stdex::mapping<T> const& a, _In_ stdex::mapping<T> const& b) noexcept { return a.from == b.from && a.to == b.to; }
|
||||
friend bool operator !=(_In_ stdex::mapping<T> const& a, _In_ stdex::mapping<T> const& b) noexcept { return !(a == b); }
|
||||
};
|
||||
|
||||
template <class T, class _Alloc = std::allocator<mapping<T>>>
|
||||
using mapping_vector = std::vector<mapping<T>, _Alloc>;
|
||||
}
|
308
include/stdex/sgml.h
Normal file
308
include/stdex/sgml.h
Normal file
@ -0,0 +1,308 @@
|
||||
/*
|
||||
SPDX-License-Identifier: MIT
|
||||
Copyright © 2023 Amebis
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "mapping.h"
|
||||
#include "sal.h"
|
||||
#include "sgml_unicode.h"
|
||||
#include "string.h"
|
||||
#include <assert.h>
|
||||
|
||||
namespace stdex
|
||||
{
|
||||
/// \cond internal
|
||||
template <class T>
|
||||
inline const wchar_t* sgml2uni(_In_reads_or_z_(count) const T* entity, _In_ size_t count)
|
||||
{
|
||||
assert(entity && count);
|
||||
assert(count < 2 || entity[0] != '#'); // No numeric entities
|
||||
|
||||
for (size_t i = 0, j = _countof(sgml_unicode); i < j; ) {
|
||||
size_t m = (i + j) / 2;
|
||||
if (sgml_unicode[m].sgml[0] < entity[0])
|
||||
i = m + 1;
|
||||
else if (sgml_unicode[m].sgml[0] > entity[0])
|
||||
j = m;
|
||||
else {
|
||||
auto r = strncmp<char, T>(sgml_unicode[m].sgml + 1, _countof(sgml_unicode[0].sgml) - 1, entity + 1, count - 1);
|
||||
if (r < 0)
|
||||
i = m + 1;
|
||||
else if (r > 0)
|
||||
j = m;
|
||||
else {
|
||||
for (; i < m && strncmp<char, T>(sgml_unicode[m - 1].sgml, _countof(sgml_unicode[0].sgml), entity, count) == 0; m--);
|
||||
return sgml_unicode[m].unicode;
|
||||
}
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline const T* sgmlend(
|
||||
_In_reads_or_z_(count) const T* str,
|
||||
_In_ size_t count)
|
||||
{
|
||||
assert(str || !count);
|
||||
for (size_t i = 0; i < count; i++) {
|
||||
if (str[i] == ';')
|
||||
return str + i;
|
||||
if (!str[i] || str[i] == '&' || isspace(str[i]))
|
||||
break;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
/// \endcond
|
||||
|
||||
// Bitwise flags selecting character classes by their SGML entity names.
// sgml2str() takes them as "skip" (matching entities are left unconverted);
// str2sgml() takes them as "what" (matching ASCII characters are forced into entities).
constexpr int sgml_full      = static_cast<int>(0x80000000u); ///< str2sgml(): disable the plain 7-bit ASCII pass-through
constexpr int sgml_quot      = 1 << 0;                        ///< Double quote (")
constexpr int sgml_apos      = 1 << 1;                        ///< Apostrophe (')
constexpr int sgml_quot_apos = sgml_quot | sgml_apos;         ///< Both quote characters
constexpr int sgml_amp       = 1 << 2;                        ///< Ampersand (&); str2sgml() encodes it regardless of this flag
constexpr int sgml_lt_gt     = 1 << 3;                        ///< Less-than and greater-than (< >)
constexpr int sgml_bsol      = 1 << 4;                        ///< Backslash
constexpr int sgml_dollar    = 1 << 5;                        ///< Dollar sign ($)
constexpr int sgml_percnt    = 1 << 6;                        ///< Percent sign (%)
constexpr int sgml_commat    = 1 << 7;                        ///< Commercial at (@)
constexpr int sgml_num       = 1 << 8;                        ///< Number sign (#)
constexpr int sgml_lpar_rpar = 1 << 9;                        ///< Parentheses ( )
constexpr int sgml_lcub_rcub = 1 << 10;                       ///< Curly brackets { }
constexpr int sgml_lsqb_rsqb = 1 << 11;                       ///< Square brackets [ ]

// Ready-made flag combinations.
constexpr int sgml_sgml      = sgml_amp | sgml_lt_gt;
constexpr int sgml_ml_attrib = sgml_amp | sgml_quot_apos;
constexpr int sgml_c         = sgml_amp | sgml_bsol | sgml_quot_apos;
// constexpr int sgml_ajt_lemma = sgml_amp | sgml_quot | sgml_dollar | sgml_percnt;
// constexpr int sgml_ajt_form = sgml_ajt_lemma;
// constexpr int sgml_kolos = sgml_amp | sgml_quot | sgml_dollar | sgml_percnt | sgml_lt_gt | sgml_bsol/* | sgml_commat | sgml_num*/ | sgml_lpar_rpar | sgml_lcub_rcub | sgml_lsqb_rsqb;
|
||||
|
||||
///
|
||||
/// Convert SGML string to Unicode string (UTF-16 on Windows)
|
||||
///
|
||||
/// \param[in] src SGML string
|
||||
/// \param[in] count_src SGML string character count limit
|
||||
/// \param[in] skip Bitwise flag of stdex::sgml_* constants that list SGML entities to skip converting
|
||||
/// \param[in] offset Logical starting offset of source and destination strings. Unused when map parameter is nullptr.
|
||||
/// \param[out] map The vector to append index mapping between source and destination string to.
|
||||
///
|
||||
/// \return Unicode string
|
||||
///
|
||||
template <class T>
|
||||
inline std::wstring sgml2str(
|
||||
_In_reads_or_z_(count_src) const T* src, _In_ size_t count_src,
|
||||
_In_ int skip = 0,
|
||||
_In_ const mapping<size_t>& offset = mapping<size_t>(0, 0),
|
||||
_Inout_opt_ mapping_vector<size_t>* map = nullptr)
|
||||
{
|
||||
assert(src || !count_src);
|
||||
|
||||
const bool
|
||||
skip_quot = (skip & sgml_quot) == 0,
|
||||
skip_apos = (skip & sgml_apos) == 0,
|
||||
skip_amp = (skip & sgml_amp) == 0,
|
||||
skip_lt_gt = (skip & sgml_lt_gt) == 0,
|
||||
skip_bsol = (skip & sgml_bsol) == 0,
|
||||
skip_dollar = (skip & sgml_dollar) == 0,
|
||||
skip_percnt = (skip & sgml_percnt) == 0,
|
||||
skip_commat = (skip & sgml_commat) == 0,
|
||||
skip_num = (skip & sgml_num) == 0,
|
||||
skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0,
|
||||
skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0,
|
||||
skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0;
|
||||
|
||||
count_src = strnlen(src, count_src);
|
||||
std::wstring dst;
|
||||
dst.reserve(count_src);
|
||||
for (size_t i = 0; i < count_src;) {
|
||||
if (src[i] == '&') {
|
||||
auto end = sgmlend(src + i + 1, count_src - i - 1);
|
||||
if (end) {
|
||||
const wchar_t* entity_w;
|
||||
wchar_t chr[3];
|
||||
size_t n = end - src - i - 1;
|
||||
if (n >= 2 && src[i + 1] == '#') {
|
||||
uint32_t unicode;
|
||||
if (src[i + 2] == 'x' || src[i + 2] == 'X')
|
||||
unicode = strtou32(src + i + 3, n - 2, nullptr, 16);
|
||||
else
|
||||
unicode = strtou32(src + i + 2, n - 1, nullptr, 10);
|
||||
#ifdef _WIN32
|
||||
if (unicode < 0x10000) {
|
||||
chr[0] = (wchar_t)unicode;
|
||||
chr[1] = 0;
|
||||
} else {
|
||||
ucs4_to_surrogate_pair(chr, unicode);
|
||||
chr[2] = 0;
|
||||
}
|
||||
#else
|
||||
chr[0] = (wchar_t)unicode;
|
||||
chr[1] = 0;
|
||||
#endif
|
||||
entity_w = chr;
|
||||
}
|
||||
else
|
||||
entity_w = sgml2uni(src + i + 1, n);
|
||||
|
||||
if (entity_w &&
|
||||
(skip_quot || (entity_w[0] != L'"')) &&
|
||||
(skip_apos || (entity_w[0] != L'\'')) &&
|
||||
(skip_amp || (entity_w[0] != L'&')) &&
|
||||
(skip_lt_gt || (entity_w[0] != L'<' && entity_w[0] != L'>')) &&
|
||||
(skip_bsol || (entity_w[0] != L'\\')) &&
|
||||
(skip_dollar || (entity_w[0] != L'$')) &&
|
||||
(skip_percnt || (entity_w[0] != L'%')) &&
|
||||
(skip_commat || (entity_w[0] != L'@')) &&
|
||||
(skip_num || (entity_w[0] != L'#')) &&
|
||||
(skip_lpar_rpar || (entity_w[0] != L'(' && entity_w[0] != L')')) &&
|
||||
(skip_lcub_rcub || (entity_w[0] != L'{' && entity_w[0] != L'}')) &&
|
||||
(skip_lsqb_rsqb || (entity_w[0] != L'[' && entity_w[0] != L']')))
|
||||
{
|
||||
if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
|
||||
dst.append(entity_w);
|
||||
i = end - src + 1;
|
||||
if (map) map->push_back(mapping<size_t>(offset.from + i, offset.to + dst.size()));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
dst.append(1, src[i++]);
|
||||
}
|
||||
return dst;
|
||||
}
|
||||
|
||||
/// \cond internal
|
||||
// Looks up the SGML entity name for a Unicode character sequence of count code units.
// The search runs over unicode_sgml, an array of indexes into sgml_unicode, and
// returns the entity name (without '&' and ';'), or nullptr when no entity matches.
// NOTE(review): binary search implies unicode_sgml orders the records by their Unicode string - confirm in sgml_unicode.h.
inline const char* chr2sgml(_In_reads_or_z_(count) const wchar_t* entity, _In_ size_t count)
{
    assert(entity && count);

    const size_t tail_capacity = _countof(sgml_unicode[0].unicode) - 1;
    const wchar_t wanted = entity[0];
    size_t lo = 0;
    size_t hi = _countof(unicode_sgml);
    while (lo < hi) {
        size_t mid = (lo + hi) / 2;

        // Compare the first code unit directly before resorting to a string comparison.
        const wchar_t first = sgml_unicode[unicode_sgml[mid]].unicode[0];
        if (first < wanted) {
            lo = mid + 1;
            continue;
        }
        if (first > wanted) {
            hi = mid;
            continue;
        }

        const auto order = strncmp(sgml_unicode[unicode_sgml[mid]].unicode + 1, tail_capacity, entity + 1, count - 1);
        if (order < 0) {
            lo = mid + 1;
            continue;
        }
        if (order > 0) {
            hi = mid;
            continue;
        }

        // Found. Step back to the first of any preceding records with the same Unicode string.
        while (lo < mid &&
            sgml_unicode[unicode_sgml[mid - 1]].unicode[0] == wanted &&
            strncmp(sgml_unicode[unicode_sgml[mid - 1]].unicode + 1, tail_capacity, entity + 1, count - 1) == 0)
        {
            --mid;
        }
        return sgml_unicode[unicode_sgml[mid]].sgml;
    }
    return nullptr;
}
|
||||
/// \endcond
|
||||
|
||||
///
|
||||
/// Convert Unicode string (UTF-16 on Windows) to SGML string
|
||||
///
|
||||
/// \param[in] src Unicode string
|
||||
/// \param[in] count_src Unicode string character count limit
|
||||
/// \param[in] what Bitwise flag of stdex::sgml_* constants that force extra characters otherwise not converted to SGML
|
||||
///
|
||||
/// \return SGML string
|
||||
///
|
||||
inline std::string str2sgml(
|
||||
_In_reads_or_z_(count_src) const wchar_t* src,
|
||||
_In_ size_t count_src,
|
||||
_In_ size_t what = 0)
|
||||
{
|
||||
assert(src || !count_src);
|
||||
|
||||
const bool
|
||||
do_ascii = (what & sgml_full) == 0,
|
||||
do_quot = (what & sgml_quot) == 0,
|
||||
do_apos = (what & sgml_apos) == 0,
|
||||
do_lt_gt = (what & sgml_lt_gt) == 0,
|
||||
do_bsol = (what & sgml_bsol) == 0,
|
||||
do_dollar = (what & sgml_dollar) == 0,
|
||||
do_percnt = (what & sgml_percnt) == 0,
|
||||
do_commat = (what & sgml_commat) == 0,
|
||||
do_num = (what & sgml_num) == 0,
|
||||
do_lpar_rpar = (what & sgml_lpar_rpar) == 0,
|
||||
do_lcub_rcub = (what & sgml_lcub_rcub) == 0,
|
||||
do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0;
|
||||
|
||||
count_src = wcsnlen(src, count_src);
|
||||
std::string dst;
|
||||
dst.reserve(count_src);
|
||||
for (size_t i = 0; i < count_src;) {
|
||||
size_t n = glyphlen(src + i, count_src - i);
|
||||
if (n == 1 &&
|
||||
do_ascii && (unsigned int)src[i] < 128 &&
|
||||
src[i] != L'&' &&
|
||||
(do_quot || (src[i] != L'"')) &&
|
||||
(do_apos || (src[i] != L'\'')) &&
|
||||
(do_lt_gt || (src[i] != L'<' && src[i] != L'>')) &&
|
||||
(do_bsol || (src[i] != L'\\')) &&
|
||||
(do_dollar || (src[i] != L'$')) &&
|
||||
(do_percnt || (src[i] != L'%')) &&
|
||||
(do_commat || (src[i] != L'@')) &&
|
||||
(do_num || (src[i] != L'#')) &&
|
||||
(do_lpar_rpar || (src[i] != L'(' && src[i] != L')')) &&
|
||||
(do_lcub_rcub || (src[i] != L'{' && src[i] != L'}')) &&
|
||||
(do_lsqb_rsqb || (src[i] != L'[' && src[i] != L']')))
|
||||
{
|
||||
// 7-bit ASCII and no desire to encode it as an SGML entity.
|
||||
dst.append(1, (char)src[i++]);
|
||||
}
|
||||
else {
|
||||
const char* entity = chr2sgml(src + i, n);
|
||||
if (entity) {
|
||||
dst.append(1, '&');
|
||||
dst.append(entity);
|
||||
dst.append(1, ';');
|
||||
i += n;
|
||||
}
|
||||
else if (n == 1) {
|
||||
// Trivial character (1 code unit, 1 glyph), no entity available.
|
||||
if ((unsigned int)src[i] < 128)
|
||||
dst.append(1, (char)src[i++]);
|
||||
else {
|
||||
char tmp[3 + 8 + 1 + 1];
|
||||
snprintf(tmp, _countof(tmp), "&#x%x;", src[i++]);
|
||||
dst.append(tmp);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Non-trivial character. Decompose.
|
||||
const size_t end = i + n;
|
||||
while (i < end) {
|
||||
if ((entity = chr2sgml(src + i, 1)) != nullptr) {
|
||||
dst.append(1, '&');
|
||||
dst.append(entity);
|
||||
dst.append(1, ';');
|
||||
i++;
|
||||
}
|
||||
else if ((unsigned int)src[i] < 128)
|
||||
dst.append(1, (char)src[i++]);
|
||||
else {
|
||||
uint32_t unicode;
|
||||
#ifdef _WIN32
|
||||
if (i + 1 < end && is_surrogate_pair(src + i)) {
|
||||
unicode = surrogate_pair_to_ucs4(src + i);
|
||||
i += 2;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
unicode = src[i++];
|
||||
}
|
||||
char tmp[3 + 8 + 1 + 1];
|
||||
snprintf(tmp, _countof(tmp), "&#x%x;", unicode);
|
||||
dst.append(tmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return dst;
|
||||
}
|
||||
}
|
3092
include/stdex/sgml_unicode.h
Normal file
3092
include/stdex/sgml_unicode.h
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user