/* SPDX-License-Identifier: MIT Copyright © 2023-2024 Amebis */ #pragma once #include "compat.hpp" #include "mapping.hpp" #include "sgml_unicode.hpp" #include "string.hpp" #include #include #include #include namespace stdex { /// \cond internal template const wchar_t* sgml2uni(_In_reads_or_z_(count) const T* entity, _In_ size_t count) { _Assume_(entity && count); _Assume_(count < 2 || entity[0] != '#'); // No numeric entities for (size_t i = 0, j = _countof(sgml_unicode); i < j; ) { size_t m = (i + j) / 2; if (sgml_unicode[m].sgml[0] < entity[0]) i = m + 1; else if (sgml_unicode[m].sgml[0] > entity[0]) j = m; else { auto r = strncmp(sgml_unicode[m].sgml + 1, _countof(sgml_unicode[0].sgml) - 1, entity + 1, count - 1); if (r < 0) i = m + 1; else if (r > 0) j = m; else { for (; i < m && strncmp(sgml_unicode[m - 1].sgml, _countof(sgml_unicode[0].sgml), entity, count) == 0; m--); return sgml_unicode[m].unicode; } } } return nullptr; } template const T* sgmlend( _In_reads_or_z_opt_(count) const T* str, _In_ size_t count) { _Assume_(str || !count); for (size_t i = 0; i < count; i++) { if (str[i] == ';') return str + i; if (!str[i] || str[i] == '&' || isspace(str[i])) break; } return nullptr; } /// \endcond constexpr int sgml_full = 0x80000000; constexpr int sgml_quot = 0x00000001; constexpr int sgml_apos = 0x00000002; constexpr int sgml_quot_apos = sgml_quot | sgml_apos; constexpr int sgml_amp = 0x00000004; constexpr int sgml_lt_gt = 0x00000008; constexpr int sgml_bsol = 0x00000010; constexpr int sgml_dollar = 0x00000020; constexpr int sgml_percnt = 0x00000040; constexpr int sgml_commat = 0x00000080; constexpr int sgml_num = 0x00000100; constexpr int sgml_lpar_rpar = 0x00000200; constexpr int sgml_lcub_rcub = 0x00000400; constexpr int sgml_lsqb_rsqb = 0x00000800; constexpr int sgml_sgml = sgml_amp | sgml_lt_gt; constexpr int sgml_ml_attrib = sgml_amp | sgml_quot_apos; constexpr int sgml_c = sgml_amp | sgml_bsol | sgml_quot_apos; // constexpr int sgml_kolos = sgml_amp | sgml_quot | sgml_dollar | sgml_percnt | sgml_lt_gt | sgml_bsol/* | sgml_commat | sgml_num*/ | sgml_lpar_rpar | sgml_lcub_rcub | sgml_lsqb_rsqb; /// /// Checks SGML string for error /// /// \param[in] src SGML string /// \param[in] count_src SGML string character count limit /// \param[in] what Bitwise flag of stdex::sgml_* constants that force extra checks. Currently, only stdex::sgml_full is used, which enforces 7-bit/ASCII checking. /// /// \return Index of error; or stdex::npos if no error detected. /// template size_t sgmlerr( _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src, _In_ int what = 0) { _Assume_(src || !count_src); const bool do_ascii = (what & sgml_full) == 0; for (size_t i = 0; i < count_src && src[i];) { if (src[i] == '&') { auto end = sgmlend(src + i + 1, count_src - i - 1); if (end) { const wchar_t* entity_w; wchar_t chr[3]; size_t n = end - src - i - 1; if (n >= 2 && src[i + 1] == '#') { uint32_t unicode; if (src[i + 2] == 'x' || src[i + 2] == 'X') unicode = strtou32(src + i + 3, n - 2, nullptr, 16); else unicode = strtou32(src + i + 2, n - 1, nullptr, 10); #ifdef _WIN32 if (unicode < 0x10000) { chr[0] = (wchar_t)unicode; chr[1] = 0; } else { ucs4_to_surrogate_pair(chr, unicode); chr[2] = 0; } #else chr[0] = (wchar_t)unicode; chr[1] = 0; #endif entity_w = chr; } else entity_w = sgml2uni(src + i + 1, n); if (entity_w) { i = end - src + 1; continue; } // Unknown entity. return i; } // Unterminated entity. return i; } if (do_ascii && !is7bit(src[i])) { // Non-ASCII character return i; } i++; } return npos; } /// /// Convert SGML string to Unicode (UTF-16 on Windows) and append to string /// /// \param[in,out] dst String to append Unicode to /// \param[in] src SGML string /// \param[in] count_src SGML string character count limit /// \param[in] skip Bitwise flag of stdex::sgml_* constants that list SGML entities to skip converting /// \param[in] offset Logical starting offset of source and destination strings. Unused when map parameter is nullptr. /// \param[in,out] map The vector to append index mapping between source and destination string to. /// template , class AX_to = std::allocator> void sgml2strcat( _Inout_ std::basic_string& dst, _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src, _In_ int skip = 0, _In_ const mapping& offset = mapping(0, 0), _Inout_opt_ mapping_vector* map = nullptr) { _Assume_(src || !count_src); const bool skip_quot = (skip & sgml_quot) == 0, skip_apos = (skip & sgml_apos) == 0, skip_amp = (skip & sgml_amp) == 0, skip_lt_gt = (skip & sgml_lt_gt) == 0, skip_bsol = (skip & sgml_bsol) == 0, skip_dollar = (skip & sgml_dollar) == 0, skip_percnt = (skip & sgml_percnt) == 0, skip_commat = (skip & sgml_commat) == 0, skip_num = (skip & sgml_num) == 0, skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0, skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0, skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0; count_src = strnlen(src, count_src); dst.reserve(dst.size() + count_src); for (size_t i = 0; i < count_src;) { if (src[i] == '&') { auto end = sgmlend(src + i + 1, count_src - i - 1); if (end) { const wchar_t* entity_w; wchar_t chr[3]; size_t n = end - src - i - 1; if (n >= 2 && src[i + 1] == '#') { uint32_t unicode; if (src[i + 2] == 'x' || src[i + 2] == 'X') unicode = strtou32(src + i + 3, n - 2, nullptr, 16); else unicode = strtou32(src + i + 2, n - 1, nullptr, 10); #ifdef _WIN32 if (unicode < 0x10000) { chr[0] = (wchar_t)unicode; chr[1] = 0; } else { ucs4_to_surrogate_pair(chr, unicode); chr[2] = 0; } #else chr[0] = (wchar_t)unicode; chr[1] = 0; #endif entity_w = chr; } else entity_w = sgml2uni(src + i + 1, n); if (entity_w && (skip_quot || (entity_w[0] != L'"')) && (skip_apos || (entity_w[0] != L'\'')) && (skip_amp || (entity_w[0] != L'&')) && (skip_lt_gt || (entity_w[0] != L'<' && entity_w[0] != L'>')) && (skip_bsol || (entity_w[0] != L'\\')) && (skip_dollar || (entity_w[0] != L'$')) && (skip_percnt || (entity_w[0] != L'%')) && (skip_commat || (entity_w[0] != L'@')) && (skip_num || (entity_w[0] != L'#')) && (skip_lpar_rpar || (entity_w[0] != L'(' && entity_w[0] != L')')) && (skip_lcub_rcub || (entity_w[0] != L'{' && entity_w[0] != L'}')) && (skip_lsqb_rsqb || (entity_w[0] != L'[' && entity_w[0] != L']'))) { if (map) map->push_back(mapping(offset.from + i, offset.to + dst.size())); dst.append(entity_w); i = end - src + 1; if (map) map->push_back(mapping(offset.from + i, offset.to + dst.size())); continue; } } } dst.append(1, src[i++]); } } /// /// Convert SGML string to Unicode (UTF-16 on Windows) and append to string /// /// \param[in,out] dst String to append Unicode to /// \param[in] src SGML string /// \param[in] skip Bitwise flag of stdex::sgml_* constants that list SGML entities to skip converting /// \param[in] offset Logical starting offset of source and destination strings. Unused when map parameter is nullptr. /// \param[in,out] map The vector to append index mapping between source and destination string to. /// template , class AX_to = std::allocator, class TR_from = std::char_traits, class AX_from = std::allocator> void sgml2strcat( _Inout_ std::basic_string& dst, _In_ const std::basic_string& src, _In_ int skip = 0, _In_ const mapping& offset = mapping(0, 0), _Inout_opt_ mapping_vector* map = nullptr) { sgml2strcat(dst, src.data(), src.size(), skip, offset, map); } /// /// Convert SGML string to Unicode (UTF-16 on Windows) and append to string /// /// \param[in,out] dst String to append Unicode to /// \param[in] count_dst Unicode string character count limit. Function throws std::invalid_argument if there is not enough space in Unicode string (including space for zero-terminator). /// \param[in] src SGML string /// \param[in] count_src SGML string character count limit /// \param[in] skip Bitwise flag of stdex::sgml_* constants that list SGML entities to skip converting /// \param[in] offset Logical starting offset of source and destination strings. Unused when map parameter is nullptr. /// \param[in,out] map The vector to append index mapping between source and destination string to. /// /// \return Final length of SGML string in code points excluding zero-terminator /// template size_t sgml2strcat( _Inout_cap_(count_dst) wchar_t* dst, _In_ size_t count_dst, _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src, _In_ int skip = 0, _In_ const mapping& offset = mapping(0, 0), _Inout_opt_ mapping_vector* map = nullptr) { _Assume_(dst || !count_dst); _Assume_(src || !count_src); static const std::invalid_argument buffer_overrun("buffer overrun"); const bool skip_quot = (skip & sgml_quot) == 0, skip_apos = (skip & sgml_apos) == 0, skip_amp = (skip & sgml_amp) == 0, skip_lt_gt = (skip & sgml_lt_gt) == 0, skip_bsol = (skip & sgml_bsol) == 0, skip_dollar = (skip & sgml_dollar) == 0, skip_percnt = (skip & sgml_percnt) == 0, skip_commat = (skip & sgml_commat) == 0, skip_num = (skip & sgml_num) == 0, skip_lpar_rpar = (skip & sgml_lpar_rpar) == 0, skip_lcub_rcub = (skip & sgml_lcub_rcub) == 0, skip_lsqb_rsqb = (skip & sgml_lsqb_rsqb) == 0; size_t j = strnlen(dst, count_dst); count_src = strnlen(src, count_src); for (size_t i = 0; i < count_src;) { if (src[i] == '&') { auto end = sgmlend(src + i + 1, count_src - i - 1); if (end) { const wchar_t* entity_w; wchar_t chr[3]; size_t n = end - src - i - 1; if (n >= 2 && src[i + 1] == '#') { uint32_t unicode; if (src[i + 2] == 'x' || src[i + 2] == 'X') unicode = strtou32(src + i + 3, n - 2, nullptr, 16); else unicode = strtou32(src + i + 2, n - 1, nullptr, 10); #ifdef _WIN32 if (unicode < 0x10000) { chr[0] = (wchar_t)unicode; chr[1] = 0; } else { ucs4_to_surrogate_pair(chr, unicode); chr[2] = 0; } #else chr[0] = (wchar_t)unicode; chr[1] = 0; #endif entity_w = chr; } else entity_w = sgml2uni(src + i + 1, n); if (entity_w && (skip_quot || (entity_w[0] != L'"')) && (skip_apos || (entity_w[0] != L'\'')) && (skip_amp || (entity_w[0] != L'&')) && (skip_lt_gt || (entity_w[0] != L'<' && entity_w[0] != L'>')) && (skip_bsol || (entity_w[0] != L'\\')) && (skip_dollar || (entity_w[0] != L'$')) && (skip_percnt || (entity_w[0] != L'%')) && (skip_commat || (entity_w[0] != L'@')) && (skip_num || (entity_w[0] != L'#')) && (skip_lpar_rpar || (entity_w[0] != L'(' && entity_w[0] != L')')) && (skip_lcub_rcub || (entity_w[0] != L'{' && entity_w[0] != L'}')) && (skip_lsqb_rsqb || (entity_w[0] != L'[' && entity_w[0] != L']'))) { if (map) map->push_back(mapping(offset.from + i, offset.to + j)); size_t m = wcslen(entity_w); if (j + m >= count_dst) throw buffer_overrun; memcpy(dst + j, entity_w, m * sizeof(wchar_t)); j += m; i = end - src + 1; if (map) map->push_back(mapping(offset.from + i, offset.to + j)); continue; } } } if (j + 1 >= count_dst) throw buffer_overrun; dst[j++] = src[i++]; } if (j >= count_dst) throw buffer_overrun; dst[j] = 0; return j; } /// /// Convert SGML string to Unicode (UTF-16 on Windows) /// /// \param[in,out] dst String to write Unicode to /// \param[in] src SGML string /// \param[in] count_src SGML string character count limit /// \param[in] skip Bitwise flag of stdex::sgml_* constants that list SGML entities to skip converting /// \param[in] offset Logical starting offset of source and destination strings. Unused when map parameter is nullptr. /// \param[in,out] map The vector to write index mapping between source and destination string to. /// template , class AX_to = std::allocator> void sgml2strcpy( _Inout_ std::basic_string& dst, _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src, _In_ int skip = 0, _In_ const mapping& offset = mapping(0, 0), _Inout_opt_ mapping_vector* map = nullptr) { dst.clear(); if (map) map->clear(); sgml2strcat(dst, src, count_src, skip, offset, map); } /// /// Convert SGML string to Unicode (UTF-16 on Windows) /// /// \param[in,out] dst String to write Unicode to /// \param[in] src SGML string /// \param[in] skip Bitwise flag of stdex::sgml_* constants that list SGML entities to skip converting /// \param[in] offset Logical starting offset of source and destination strings. Unused when map parameter is nullptr. /// \param[in,out] map The vector to write index mapping between source and destination string to. /// template, class AX_to = std::allocator, class TR_from = std::char_traits, class AX_from = std::allocator> void sgml2strcpy( _Inout_ std::basic_string& dst, _In_ const std::basic_string& src, _In_ int skip = 0, _In_ const mapping& offset = mapping(0, 0), _Inout_opt_ mapping_vector* map = nullptr) { sgml2strcpy(dst, src.data(), src.size(), skip, offset, map); } /// /// Convert SGML string to Unicode (UTF-16 on Windows) /// /// \param[in,out] dst String to write Unicode to /// \param[in] count_dst Unicode string character count limit. Function throws std::invalid_argument if there is not enough space in Unicode string (including space for zero-terminator). /// \param[in] src SGML string /// \param[in] count_src SGML string character count limit /// \param[in] skip Bitwise flag of stdex::sgml_* constants that list SGML entities to skip converting /// \param[in] offset Logical starting offset of source and destination strings. Unused when map parameter is nullptr. /// \param[in,out] map The vector to write index mapping between source and destination string to. /// /// \return Final length of SGML string in code points excluding zero-terminator /// template size_t sgml2strcpy( _Inout_cap_(count_dst) wchar_t* dst, _In_ size_t count_dst, _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src, _In_ int skip = 0, _In_ const mapping& offset = mapping(0, 0), _Inout_opt_ mapping_vector* map = nullptr) { _Assume_(dst || !count_dst); if (count_dst) dst[0] = 0; if (map) map->clear(); return sgml2strcat(dst, count_dst, src, count_src, skip, offset, map); } /// /// Convert SGML string to Unicode string (UTF-16 on Windows) /// /// \param[in] src SGML string /// \param[in] count_src SGML string character count limit /// \param[in] skip Bitwise flag of stdex::sgml_* constants that list SGML entities to skip converting /// \param[in] offset Logical starting offset of source and destination strings. Unused when map parameter is nullptr. /// \param[in,out] map The vector to append index mapping between source and destination string to. /// /// \return Unicode string /// template std::wstring sgml2str( _In_reads_or_z_opt_(count_src) const T_from* src, _In_ size_t count_src, _In_ int skip = 0, _In_ const mapping& offset = mapping(0, 0), _Inout_opt_ mapping_vector* map = nullptr) { std::wstring dst; sgml2strcat(dst, src, count_src, skip, offset, map); return dst; } /// /// Convert SGML string to Unicode string (UTF-16 on Windows) /// /// \param[in] src SGML string /// \param[in] skip Bitwise flag of stdex::sgml_* constants that list SGML entities to skip converting /// \param[in] offset Logical starting offset of source and destination strings. Unused when map parameter is nullptr. /// \param[in,out] map The vector to append index mapping between source and destination string to. /// /// \return Unicode string /// template , class AX_from = std::allocator> std::wstring sgml2str( _In_ const std::basic_string& src, _In_ int skip = 0, _In_ const mapping& offset = mapping(0, 0), _Inout_opt_ mapping_vector* map = nullptr) { return sgml2str(src.data(), src.size(), skip, offset, map); } /// \cond internal inline const char* chr2sgml(_In_reads_or_z_(count) const wchar_t* entity, _In_ size_t count) { _Assume_(entity && count); const wchar_t e2 = entity[0]; for (size_t i = 0, j = _countof(unicode_sgml); i < j; ) { size_t m = (i + j) / 2; wchar_t e1 = sgml_unicode[unicode_sgml[m]].unicode[0]; if (e1 < e2) i = m + 1; else if (e1 > e2) j = m; else { auto r = strncmp(sgml_unicode[unicode_sgml[m]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1); if (r < 0) i = m + 1; else if (r > 0) j = m; else { for (; i < m && sgml_unicode[unicode_sgml[m - 1]].unicode[0] == e2 && strncmp(sgml_unicode[unicode_sgml[m - 1]].unicode + 1, _countof(sgml_unicode[0].unicode) - 1, entity + 1, count - 1) == 0; m--); return sgml_unicode[unicode_sgml[m]].sgml; } } } return nullptr; } /// \endcond /// /// Convert Unicode string (UTF-16 on Windows) to SGML and append to string /// /// \param[in,out] dst String to append SGML to /// \param[in] src Unicode string /// \param[in] count_src Unicode string character count limit /// \param[in] what Bitwise flag of stdex::sgml_* constants that force extra characters otherwise not converted to SGML /// template , class AX_to = std::allocator> inline void str2sgmlcat( _Inout_ std::basic_string& dst, _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src, _In_ int what = 0) { _Assume_(src || !count_src); const bool do_ascii = (what & sgml_full) == 0, do_quot = (what & sgml_quot) == 0, do_apos = (what & sgml_apos) == 0, do_lt_gt = (what & sgml_lt_gt) == 0, do_bsol = (what & sgml_bsol) == 0, do_dollar = (what & sgml_dollar) == 0, do_percnt = (what & sgml_percnt) == 0, do_commat = (what & sgml_commat) == 0, do_num = (what & sgml_num) == 0, do_lpar_rpar = (what & sgml_lpar_rpar) == 0, do_lcub_rcub = (what & sgml_lcub_rcub) == 0, do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0; count_src = strnlen(src, count_src); dst.reserve(dst.size() + count_src); for (size_t i = 0; i < count_src;) { size_t n = glyphlen(src + i, count_src - i); if (n == 1 && do_ascii && is7bit(src[i]) && src[i] != L'&' && (do_quot || (src[i] != L'"')) && (do_apos || (src[i] != L'\'')) && (do_lt_gt || (src[i] != L'<' && src[i] != L'>')) && (do_bsol || (src[i] != L'\\')) && (do_dollar || (src[i] != L'$')) && (do_percnt || (src[i] != L'%')) && (do_commat || (src[i] != L'@')) && (do_num || (src[i] != L'#')) && (do_lpar_rpar || (src[i] != L'(' && src[i] != L')')) && (do_lcub_rcub || (src[i] != L'{' && src[i] != L'}')) && (do_lsqb_rsqb || (src[i] != L'[' && src[i] != L']'))) { // 7-bit ASCII and no desire to encode it as an SGML entity. dst.append(1, static_cast(src[i++])); } else { const char* entity = chr2sgml(src + i, n); if (entity) { dst.append(1, '&'); dst.append(entity); dst.append(1, ';'); i += n; } else if (n == 1) { // Trivial character (1 code unit, 1 glyph), no entity available. if (is7bit(src[i])) dst.append(1, static_cast(src[i++])); else { char tmp[3 + 8 + 1 + 1]; snprintf(tmp, _countof(tmp), "&#x%x;", src[i++]); dst.append(tmp); } } else { // Non-trivial character. Decompose. const size_t end = i + n; while (i < end) { if ((entity = chr2sgml(src + i, 1)) != nullptr) { dst.append(1, '&'); dst.append(entity); dst.append(1, ';'); i++; } else if (is7bit(src[i])) dst.append(1, static_cast(src[i++])); else { uint32_t unicode; #ifdef _WIN32 if (i + 1 < end && is_surrogate_pair(src + i)) { unicode = surrogate_pair_to_ucs4(src + i); i += 2; } else #endif { unicode = src[i++]; } char tmp[3 + 8 + 1 + 1]; snprintf(tmp, _countof(tmp), "&#x%x;", unicode); dst.append(tmp); } } } } } } /// /// Convert Unicode string (UTF-16 on Windows) to SGML and append to string /// /// \param[in,out] dst String to append SGML to /// \param[in] src Unicode string /// \param[in] what Bitwise flag of stdex::sgml_* constants that force extra characters otherwise not converted to SGML /// template , class AX_to = std::allocator> void str2sgmlcat( _Inout_ std::basic_string& dst, _In_ const std::basic_string_view> src, _In_ int what = 0) { str2sgmlcat(dst, src.data(), src.size(), what); } /// /// Convert Unicode string (UTF-16 on Windows) to SGML and append to string /// /// \param[in,out] dst String to append SGML to /// \param[in] count_dst SGML string character count limit. Function throws std::invalid_argument if there is not enough space in SGML string (including space for zero-terminator). /// \param[in] src Unicode string /// \param[in] count_src Unicode string character count limit /// \param[in] what Bitwise flag of stdex::sgml_* constants that force extra characters otherwise not converted to SGML /// /// \return Final length of SGML string in code points excluding zero-terminator /// inline size_t str2sgmlcat( _Inout_cap_(count_dst) char* dst, _In_ size_t count_dst, _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src, _In_ int what = 0) { _Assume_(dst || !count_dst); _Assume_(src || !count_src); static const std::invalid_argument buffer_overrun("buffer overrun"); const bool do_ascii = (what & sgml_full) == 0, do_quot = (what & sgml_quot) == 0, do_apos = (what & sgml_apos) == 0, do_lt_gt = (what & sgml_lt_gt) == 0, do_bsol = (what & sgml_bsol) == 0, do_dollar = (what & sgml_dollar) == 0, do_percnt = (what & sgml_percnt) == 0, do_commat = (what & sgml_commat) == 0, do_num = (what & sgml_num) == 0, do_lpar_rpar = (what & sgml_lpar_rpar) == 0, do_lcub_rcub = (what & sgml_lcub_rcub) == 0, do_lsqb_rsqb = (what & sgml_lsqb_rsqb) == 0; size_t j = strnlen(dst, count_dst); count_src = strnlen(src, count_src); for (size_t i = 0; i < count_src;) { size_t n = glyphlen(src + i, count_src - i); if (n == 1 && do_ascii && is7bit(src[i]) && src[i] != L'&' && (do_quot || (src[i] != L'"')) && (do_apos || (src[i] != L'\'')) && (do_lt_gt || (src[i] != L'<' && src[i] != L'>')) && (do_bsol || (src[i] != L'\\')) && (do_dollar || (src[i] != L'$')) && (do_percnt || (src[i] != L'%')) && (do_commat || (src[i] != L'@')) && (do_num || (src[i] != L'#')) && (do_lpar_rpar || (src[i] != L'(' && src[i] != L')')) && (do_lcub_rcub || (src[i] != L'{' && src[i] != L'}')) && (do_lsqb_rsqb || (src[i] != L'[' && src[i] != L']'))) { // 7-bit ASCII and no desire to encode it as an SGML entity. if (j + 1 >= count_dst) throw buffer_overrun; dst[j++] = static_cast(src[i++]); } else { const char* entity = chr2sgml(src + i, n); if (entity) { size_t m = strlen(entity); if (j + m + 2 >= count_dst) throw buffer_overrun; dst[j++] = '&'; memcpy(dst + j, entity, m * sizeof(char)); j += m; dst[j++] = ';'; i += n; } else if (n == 1) { // Trivial character (1 code unit, 1 glyph), no entity available. if (is7bit(src[i])) { if (j + 1 >= count_dst) throw buffer_overrun; dst[j++] = static_cast(src[i++]); } else { char tmp[3 + 8 + 1 + 1]; int m = snprintf(tmp, _countof(tmp), "&#x%x;", src[i++]); _Assume_(m >= 0); if (static_cast(m) >= count_dst) throw buffer_overrun; memcpy(dst + j, tmp, m * sizeof(char)); j += m; } } else { // Non-trivial character. Decompose. const size_t end = i + n; while (i < end) { if ((entity = chr2sgml(src + i, 1)) != nullptr) { size_t m = strlen(entity); if (j + m + 2 >= count_dst) throw buffer_overrun; dst[j++] = '&'; memcpy(dst + j, entity, m * sizeof(char)); j += m; dst[j++] = ';'; i++; } else if (is7bit(src[i])) { if (j + 1 >= count_dst) throw buffer_overrun; dst[j++] = static_cast(src[i++]); } else { uint32_t unicode; #ifdef _WIN32 if (i + 1 < end && is_surrogate_pair(src + i)) { unicode = surrogate_pair_to_ucs4(src + i); i += 2; } else #endif { unicode = src[i++]; } char tmp[3 + 8 + 1 + 1]; int m = snprintf(tmp, _countof(tmp), "&#x%x;", unicode); _Assume_(m >= 0); if (static_cast(m) >= count_dst) throw buffer_overrun; memcpy(dst + j, tmp, m * sizeof(char)); j += m; } } } } } if (j >= count_dst) throw buffer_overrun; dst[j] = 0; return j; } /// /// Convert Unicode string (UTF-16 on Windows) to SGML /// /// \param[in,out] dst String to write SGML to /// \param[in] src Unicode string /// \param[in] count_src Unicode string character count limit /// \param[in] what Bitwise flag of stdex::sgml_* constants that force extra characters otherwise not converted to SGML /// template , class AX_to = std::allocator> inline void str2sgmlcpy( _Inout_ std::basic_string& dst, _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src, _In_ int what = 0) { dst.clear(); str2sgmlcat(dst, src, count_src, what); } /// /// Convert Unicode string (UTF-16 on Windows) to SGML /// /// \param[in,out] dst String to write SGML to /// \param[in] src Unicode string /// \param[in] what Bitwise flag of stdex::sgml_* constants that force extra characters otherwise not converted to SGML /// template , class AX_to = std::allocator> void str2sgmlcpy( _Inout_ std::basic_string& dst, _In_ const std::basic_string_view> src, _In_ int what = 0) { str2sgmlcpy(dst, src.data(), src.size(), what); } /// /// Convert Unicode string (UTF-16 on Windows) to SGML /// /// \param[in,out] dst String to write SGML to /// \param[in] count_dst SGML string character count limit. Function throws std::invalid_argument if there is not enough space in SGML string (including space for zero-terminator). /// \param[in] src Unicode string /// \param[in] count_src Unicode string character count limit /// \param[in] what Bitwise flag of stdex::sgml_* constants that force extra characters otherwise not converted to SGML /// /// \return Final length of SGML string in code points excluding zero-terminator /// inline size_t str2sgmlcpy( _Inout_cap_(count_dst) char* dst, _In_ size_t count_dst, _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src, _In_ int what = 0) { _Assume_(dst || !count_dst); if (count_dst) dst[0] = 0; return str2sgmlcat(dst, count_dst, src, count_src, what); } /// /// Convert Unicode string (UTF-16 on Windows) to SGML string /// /// \param[in] src Unicode string /// \param[in] count_src Unicode string character count limit /// \param[in] what Bitwise flag of stdex::sgml_* constants that force extra characters otherwise not converted to SGML /// /// \return SGML string /// inline std::string str2sgml( _In_reads_or_z_opt_(count_src) const wchar_t* src, _In_ size_t count_src, _In_ int what = 0) { std::string dst; str2sgmlcat(dst, src, count_src, what); return dst; } /// /// Convert Unicode string (UTF-16 on Windows) to SGML string /// /// \param[in] src Unicode string /// \param[in] what Bitwise flag of stdex::sgml_* constants that force extra characters otherwise not converted to SGML /// /// \return SGML string /// inline std::string str2sgml( _In_ const std::basic_string_view> src, _In_ int what = 0) { return str2sgml(src.data(), src.size(), what); } }