diff --git a/docs/changes.txt b/docs/changes.txt index 7a22272a8c..cc5d1829fb 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -11,6 +11,9 @@ INCOMPATIBLE CHANGES SINCE 2.6.x - For all wxInputStreams, Eof() becomes true after an attempt has been made to read _past_ the end of file. - wxCHECK family of macros now must be followed by a semicolon +- wxMBConv::cMB2WC() and cWC2MB() take size of the input buffer and return + length of the converted string in all cases now. + Deprecated methods since 2.6.x and their replacements ----------------------------------------------------- diff --git a/docs/latex/wx/mbconv.tex b/docs/latex/wx/mbconv.tex index 3d5e4a07b4..102186019a 100644 --- a/docs/latex/wx/mbconv.tex +++ b/docs/latex/wx/mbconv.tex @@ -1,14 +1,38 @@ -% -% automatically generated by HelpGen from -% ../include/wx/strconv.h at 25/Mar/00 10:20:56 -% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% Name: mbconv.tex +%% Purpose: wxMBConv documentation +%% Author: Ove Kaaven, Vadim Zeitlin +%% Created: 2000-03-25 +%% RCS-ID: $Id$ +%% Copyright: (c) 2000 Ove Kaaven +%% (c) 2003-2006 Vadim Zeitlin +%% License: wxWindows license +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + \section{\class{wxMBConv}}\label{wxmbconv} This class is the base class of a hierarchy of classes capable of converting -text strings between multibyte (SBCS or DBCS) encodings and Unicode. It is itself -a wrapper around the standard libc mbstowcs() and wcstombs() routines, and has -one predefined instance, {\bf wxConvLibc}. +text strings between multibyte (SBCS or DBCS) encodings and Unicode. + +In the documentation for this and related classes please notice that +\emph{length} of the string refers to the number of characters in the string +not counting the terminating \NUL, if any. While the \emph{size} of the string +is the total number of bytes in the string, including any trailing {\NUL}s. 
+Thus, length of wide character string \texttt{L"foo"} is $3$ while its size can +be either $8$ or $16$ depending on whether \texttt{wchar\_t} is $2$ bytes (as +under Windows) or $4$ (Unix). + +\wxheading{Global variables} + +There are several predefined instances of this class: +\begin{twocollist} +\twocolitem{\textbf{wxConvLibc}}{Uses the standard ANSI C \texttt{mbstowcs()} and +\texttt{wcstombs()} functions to perform the conversions; thus depends on the +current locale.} +\twocolitem{\textbf{wxConvFile}}{The appropriate conversion for the file names, +depends on the system.} +\end{twocollist} \wxheading{Derived from} @@ -35,30 +59,31 @@ Constructor. \membersection{wxMBConv::MB2WC}\label{wxmbconvmb2wc} -\constfunc{virtual size\_t}{MB2WC}{\param{wchar\_t *}{outputBuf}, \param{const char *}{psz}, \param{size\_t }{outputSize}} +\constfunc{virtual size\_t}{MB2WC}{\param{wchar\_t *}{out}, \param{const char *}{in}, \param{size\_t }{outLen}} -Converts from a string {\it psz} in multibyte encoding to Unicode putting the -output into the buffer {\it outputBuf} of the maximum size {\it outputSize} (in wide -characters, not bytes). If {\it outputBuf} is {\tt NULL}, only the length of the -string which would result from the conversion is calculated and returned. -Note that this is the length and not size, i.e. the returned value does -{\bf not} include the trailing NUL. But when the function is called with a -non-{\tt NULL} {\it outputBuf}, the {\it outputSize} parameter should be the size of the buffer -and so it {\bf should} take into account the trailing NUL. +Converts from a string \arg{in} in multibyte encoding to Unicode putting up to +\arg{outLen} characters into the buffer \arg{out}. + +If \arg{out} is \NULL, only the length of the string which would result from +the conversion is calculated and returned. Note that this is the length and not +size, i.e. the returned value does \emph{not} include the trailing \NUL. 
But +when the function is called with a non-\NULL \arg{out} buffer, the \arg{outLen} +parameter should be one more than that length so that the string can be properly \NUL-terminated. \wxheading{Parameters} -\docparam{outputBuf}{the output buffer, may be {\tt NULL} if the caller is only +\docparam{out}{The output buffer, may be \NULL if the caller is only interested in the length of the resulting string} -\docparam{psz}{the {\tt NUL}-terminated input string, cannot be {\tt NULL}} +\docparam{in}{The \NUL-terminated input string, cannot be \NULL} -\docparam{outputSize}{the size of the output buffer (in wide characters, {\bf including} the -NUL) , ignored if {\it outputBuf} is {\tt NULL}} +\docparam{outLen}{The length of the output buffer but \emph{including} +\NUL, ignored if \arg{out} is \NULL} \wxheading{Return value} -The length of the converted string (in wide characters, {\bf excluding} the NUL) +The length of the converted string \emph{excluding} the trailing {\NUL}. + \membersection{wxMBConv::WC2MB}\label{wxmbconvwc2mb} @@ -68,25 +93,54 @@ Converts from Unicode to multibyte encoding. The semantics of this function (including the return value meaning) is the same as for \helpref{MB2WC}{wxmbconvmb2wc}. -Notice that when the function is called with a non-{\tt NULL} buffer, the -{\it n} parameter should be the size of the buffer and so it {\bf should} take +Notice that when the function is called with a non-\NULL buffer, the +{\it n} parameter should be the size of the buffer and so it \emph{should} take into account the trailing NUL, which might take two or four bytes for some -encodings (UTF-16 and UTF-32). +encodings (UTF-16 and UTF-32) and not one. 
+ \membersection{wxMBConv::cMB2WC}\label{wxmbconvcmb2wc} -\constfunc{const wxWCharBuffer}{cMB2WC}{\param{const char* }{psz}} +\constfunc{const wxWCharBuffer}{cMB2WC}{\param{const char *}{in}} + +\constfunc{const wxWCharBuffer}{cMB2WC}{\param{const char *}{in}, \param{size\_t }{inLen}, \param{size\_t }{*outLen}} + +Converts from multibyte encoding to Unicode by calling +\helpref{MB2WC}{wxmbconvmb2wc}, allocating a temporary wxWCharBuffer to hold +the result. + +The first overload takes a \NUL-terminated input string. The second one takes a +string of exactly the specified length and the string may or may not include the +trailing {\NUL}s. If the string is not \NUL-terminated, a temporary +\NUL-terminated copy of it suitable for passing to \helpref{MB2WC}{wxmbconvmb2wc} +is made, so it is more efficient to ensure that the string does have the +appropriate number of \NUL bytes (which is usually $1$ but may be $2$ or $4$ +for UTF-16 or UTF-32), especially for long strings. + +If \arg{outLen} is not-\NULL, it receives the length of the converted +string. -Converts from multibyte encoding to Unicode by calling MB2WC, -allocating a temporary wxWCharBuffer to hold the result. \membersection{wxMBConv::cWC2MB}\label{wxmbconvcwc2mb} -\constfunc{const wxCharBuffer}{cWC2MB}{\param{const wchar\_t* }{psz}} +\constfunc{const wxCharBuffer}{cWC2MB}{\param{const wchar\_t* }{in}} + +\constfunc{const wxCharBuffer}{cWC2MB}{\param{const wchar\_t* }{in}, \param{size\_t }{inLen}, \param{size\_t }{*outLen}} Converts from Unicode to multibyte encoding by calling WC2MB, allocating a temporary wxCharBuffer to hold the result. +The second overload of this function allows converting a string of the given +length \arg{inLen}, whether it is \NUL-terminated or not (for wide character +strings, unlike for the multibyte ones, a single \NUL is always enough). 
+But notice that just as with \helpref{cMB2WC}{wxmbconvcmb2wc}, it is more +efficient to pass an already terminated string to this function as otherwise a +copy is made internally. + +If \arg{outLen} is not-\NULL, it receives the length of the converted +string. + + \membersection{wxMBConv::cMB2WX}\label{wxmbconvcmb2wx} \constfunc{const char*}{cMB2WX}{\param{const char* }{psz}} @@ -99,6 +153,7 @@ it returns the parameter unaltered. If wxChar is wchar\_t, it returns the result in a wxWCharBuffer. The macro wxMB2WXbuf is defined as the correct return type (without const). + \membersection{wxMBConv::cWX2MB}\label{wxmbconvcwx2mb} \constfunc{const char*}{cWX2MB}{\param{const wxChar* }{psz}} @@ -110,6 +165,7 @@ it returns the parameter unaltered. If wxChar is wchar\_t, it returns the result in a wxCharBuffer. The macro wxWX2MBbuf is defined as the correct return type (without const). + \membersection{wxMBConv::cWC2WX}\label{wxmbconvcwc2wx} \constfunc{const wchar\_t*}{cWC2WX}{\param{const wchar\_t* }{psz}} @@ -121,6 +177,7 @@ it returns the parameter unaltered. If wxChar is char, it returns the result in a wxCharBuffer. The macro wxWC2WXbuf is defined as the correct return type (without const). + \membersection{wxMBConv::cWX2WC}\label{wxmbconvcwx2wc} \constfunc{const wchar\_t*}{cWX2WC}{\param{const wxChar* }{psz}} diff --git a/include/wx/strconv.h b/include/wx/strconv.h index d9e89d5e3c..ae701a15e5 100644 --- a/include/wx/strconv.h +++ b/include/wx/strconv.h @@ -35,23 +35,38 @@ class WXDLLIMPEXP_BASE wxMBConv { public: - // the actual conversion takes place here + // The functions doing actual conversion. On success, the return value is + // the length (i.e. the number of characters, not bytes, and not counting + // the trailing L'\0') of the converted string. On failure, (size_t)-1 is + // returned. In the special case when out is NULL the return value is + // the same one but nothing is written to the buffer. 
// - // note that outputSize is the size of the output buffer, not the length of input - // (the latter is always supposed to be NUL-terminated) - virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const = 0; - virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const = 0; + // Note that outLen is the length of the output buffer, not the length of + // the input (which is always supposed to be terminated by one or more + // NULs, as appropriate for the encoding)! + virtual size_t MB2WC(wchar_t *out, const char *in, size_t outLen) const = 0; + virtual size_t WC2MB(char *out, const wchar_t *in, size_t outLen) const = 0; // MB <-> WC - const wxWCharBuffer cMB2WC(const char *psz) const; - const wxCharBuffer cWC2MB(const wchar_t *psz) const; + const wxWCharBuffer cMB2WC(const char *in) const; + const wxCharBuffer cWC2MB(const wchar_t *in) const; - // MB <-> WC for strings with embedded null characters + // Functions converting strings which may contain embedded NULs and don't + // have to be NUL-terminated. // - // pszLen length of the input string - // pOutSize gets the final size of the converted string - const wxWCharBuffer cMB2WC(const char *psz, size_t pszLen, size_t* pOutSize) const; - const wxCharBuffer cWC2MB(const wchar_t *psz, size_t pszLen, size_t* pOutSize) const; + // inLen is the length of the buffer including trailing NUL if any: if the + // last 4 bytes of the buffer are all NULs, these functions are more + // efficient as they avoid copying the string, but otherwise a copy is made + // internally which could be quite bad for (very) long strings. 
+ // + // outLen receives, if not NULL, the length of the converted string or 0 if + // the conversion failed (returning 0 and not -1 in this case makes it + // difficult to distinguish between failed conversion and empty input but + // this is done for backwards compatibility) + const wxWCharBuffer + cMB2WC(const char *in, size_t inLen, size_t *outLen) const; + const wxCharBuffer + cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const; // convenience functions for converting MB or WC to/from wxWin default #if wxUSE_UNICODE @@ -68,6 +83,17 @@ public: // virtual dtor for any base class virtual ~wxMBConv(); + +private: + // this function must return the multibyte representation of L'\0' + // + // on error, nulLen should be set to -1 + virtual const char *GetMBNul(size_t *nulLen) const + { + *nulLen = 1; + + return ""; + } }; // ---------------------------------------------------------------------------- @@ -97,15 +123,28 @@ public: wxConvBrokenFileNames(const wxChar *charset); virtual ~wxConvBrokenFileNames() { delete m_conv; } - virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const; - virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const; + virtual size_t MB2WC(wchar_t *out, const char *in, size_t outLen) const + { + return m_conv->MB2WC(out, in, outLen); + } + + virtual size_t WC2MB(char *out, const wchar_t *in, size_t outLen) const + { + return m_conv->WC2MB(out, in, outLen); + } private: + virtual wxCharBuffer GetMBNul(size_t *nulLen) const + { + return m_conv->GetMBNul(nulLen); + } + + // the conversion object we forward to wxMBConv *m_conv; }; -#endif +#endif // __UNIX__ // ---------------------------------------------------------------------------- // wxMBConvUTF7 (for conversion using UTF7 encoding) @@ -125,7 +164,7 @@ public: class WXDLLIMPEXP_BASE wxMBConvUTF8 : public wxMBConv { public: - enum { + enum { MAP_INVALID_UTF8_NOT = 0, MAP_INVALID_UTF8_TO_PUA = 1, MAP_INVALID_UTF8_TO_OCTAL = 2 @@ 
-134,16 +173,30 @@ public: wxMBConvUTF8(int options = MAP_INVALID_UTF8_NOT) : m_options(options) { } virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const; virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const; - + private: int m_options; }; +// ---------------------------------------------------------------------------- +// wxMBConvUTF16Base: for both LE and BE variants +// ---------------------------------------------------------------------------- + +class WXDLLIMPEXP_BASE wxMBConvUTF16Base : public wxMBConv +{ +private: + virtual const char *GetMBNul(size_t *nulLen) const + { + *nulLen = 2; + return "\0"; + } +}; + // ---------------------------------------------------------------------------- // wxMBConvUTF16LE (for conversion using UTF16 Little Endian encoding) // ---------------------------------------------------------------------------- -class WXDLLIMPEXP_BASE wxMBConvUTF16LE : public wxMBConv +class WXDLLIMPEXP_BASE wxMBConvUTF16LE : public wxMBConvUTF16Base { public: virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const; @@ -154,18 +207,32 @@ public: // wxMBConvUTF16BE (for conversion using UTF16 Big Endian encoding) // ---------------------------------------------------------------------------- -class WXDLLIMPEXP_BASE wxMBConvUTF16BE : public wxMBConv +class WXDLLIMPEXP_BASE wxMBConvUTF16BE : public wxMBConvUTF16Base { public: virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const; virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const; }; +// ---------------------------------------------------------------------------- +// wxMBConvUTF32Base: base class for both LE and BE variants +// ---------------------------------------------------------------------------- + +class WXDLLIMPEXP_BASE wxMBConvUTF32Base : public wxMBConv +{ +private: + virtual const char *GetMBNul(size_t *nulLen) const + { + *nulLen = 4; + return 
"\0\0\0"; + } +}; + // ---------------------------------------------------------------------------- // wxMBConvUTF32LE (for conversion using UTF32 Little Endian encoding) // ---------------------------------------------------------------------------- -class WXDLLIMPEXP_BASE wxMBConvUTF32LE : public wxMBConv +class WXDLLIMPEXP_BASE wxMBConvUTF32LE : public wxMBConvUTF32Base { public: virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const; @@ -176,7 +243,7 @@ public: // wxMBConvUTF32BE (for conversion using UTF32 Big Endian encoding) // ---------------------------------------------------------------------------- -class WXDLLIMPEXP_BASE wxMBConvUTF32BE : public wxMBConv +class WXDLLIMPEXP_BASE wxMBConvUTF32BE : public wxMBConvUTF32Base { public: virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const; @@ -221,6 +288,8 @@ private: // charset string void SetName(const wxChar *charset); + virtual const char *GetMBNul(size_t *nulLen) const; + // note that we can't use wxString here because of compilation // dependencies: we're included from wx/string.h diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 9d8481349f..dfc8a40dac 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -187,129 +187,163 @@ const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const return buf; } -const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const +const wxWCharBuffer +wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const { - wxASSERT(pOutSize != NULL); + // the currently accumulated wide characters + wxWCharBuffer wbuf; - const char* szEnd = szString + nStringLen + 1; - const char* szPos = szString; - const char* szStart = szPos; + // the current length of wbuf + size_t lenBuf = 0; - size_t nActualLength = 0; - size_t nCurrentSize = nStringLen; //try normal size first (should never resize?) 
+ // we need to know the representation of L'\0' for this conversion + size_t nulLen; + const char * const nul = GetMBNul(&nulLen); + if ( nulLen == (size_t)-1 || nulLen == 0 ) + return wxWCharBuffer(); - wxWCharBuffer theBuffer(nCurrentSize); + // make a copy of the input string unless it is already properly + // NUL-terminated + wxCharBuffer bufTmp; - //Convert the string until the length() is reached, continuing the - //loop every time a null character is reached - while(szPos != szEnd) + // now we can compute the input size if we were not given it: notice that + // in this case the string must be properly NUL-terminated, of course, as + // otherwise we have no way of knowing how long it is + if ( inLen == (size_t)-1 ) { - wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true - - //Get the length of the current (sub)string - size_t nLen = MB2WC(NULL, szPos, 0); - - //Invalid conversion? - if( nLen == (size_t)-1 ) + // not the most efficient algorithm but it shouldn't matter as normally + // there are not many NULs in the string and so normally memcmp() + // should stop on the first character + for ( const char *p = in; ; p++ ) { - *pOutSize = 0; - theBuffer.data()[0u] = wxT('\0'); - return theBuffer; + if ( memcmp(p, nul, nulLen) == 0 ) + break; } - - //Increase the actual length (+1 for current null character) - nActualLength += nLen + 1; - - //if buffer too big, realloc the buffer - if (nActualLength > (nCurrentSize+1)) + inLen = p - in + nulLen; + } + else // we already have the size + { + // check if it's not already NUL-terminated too to avoid the copy + if ( inLen < nulLen || memcmp(in + inLen - nulLen, nul, nulLen) != 0 ) { - wxWCharBuffer theNewBuffer(nCurrentSize << 1); - memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t)); - theBuffer = theNewBuffer; - nCurrentSize <<= 1; + // make a copy in order to properly NUL-terminate the string + bufTmp = wxCharBuffer(inLen + nulLen - 1 /* 1 will be added */); + 
memcpy(bufTmp.data(), in, inLen); + memcpy(bufTmp.data() + inLen, nul, nulLen); + } - - //Convert the current (sub)string - if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 ) - { - *pOutSize = 0; - theBuffer.data()[0u] = wxT('\0'); - return theBuffer; - } - - //Increment to next (sub)string - //Note that we have to use strlen instead of nLen here - //because XX2XX gives us the size of the output buffer, - //which is not necessarily the length of the string - szPos += strlen(szPos) + 1; } - //success - return actual length and the buffer - *pOutSize = nActualLength; - return theBuffer; + if ( bufTmp ) + in = bufTmp; + + for ( const char * const inEnd = in + inLen;; ) + { + // try to convert the current chunk if anything left + size_t lenChunk = in < inEnd ? MB2WC(NULL, in, 0) : 0; + if ( lenChunk == 0 ) + { + // nothing left in the input string, conversion succeeded + if ( outLen ) + { + // we shouldn't include the last NUL in the result length + *outLen = lenBuf ? lenBuf - 1 : 0; + } + + return wbuf; + } + + if ( lenChunk == (size_t)-1 ) + break; + + const size_t lenBufNew = lenBuf + lenChunk; + if ( !wbuf.extend(lenBufNew) ) + break; + + lenChunk = MB2WC(wbuf.data() + lenBuf, in, lenChunk + 1 /* for NUL */); + if ( lenChunk == (size_t)-1 ) + break; + + // +1 
for the embedded NUL (if something follows) + lenBuf = lenBufNew + 1; + + // advance the input pointer past the end of this chunk + while ( memcmp(in, nul, nulLen) != 0 ) + in++; + + in += nulLen; // skipping over its terminator as well + } + + // conversion failed + if ( outLen ) + *outLen = 0; + + return wxWCharBuffer(); } -const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const +const wxCharBuffer +wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const { - wxASSERT(pOutSize != NULL); + // the currently accumulated multibyte characters + wxCharBuffer buf; - const wchar_t* szEnd = szString + nStringLen + 1; - const wchar_t* szPos = szString; - const wchar_t* szStart = szPos; + // the current length of buf + size_t lenBuf = 0; - size_t nActualLength = 0; - size_t nCurrentSize = nStringLen << 2; //try * 4 first - - wxCharBuffer theBuffer(nCurrentSize); - - //Convert the string until the length() is reached, continuing the - //loop every time a null character is reached - while(szPos != szEnd) + // make a copy of the input string unless it is already properly + // NUL-terminated + // + // if we don't know its length we have no choice but to assume that it is, + // indeed, properly terminated + wxWCharBuffer bufTmp; + if ( inLen == (size_t)-1 ) { - wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true - - //Get the length of the current (sub)string - size_t nLen = WC2MB(NULL, szPos, 0); - - //Invalid conversion? 
- if( nLen == (size_t)-1 ) - { - *pOutSize = 0; - theBuffer.data()[0u] = wxT('\0'); - return theBuffer; - } - - //Increase the actual length (+1 for current null character) - nActualLength += nLen + 1; - - //if buffer too big, realloc the buffer - if (nActualLength > (nCurrentSize+1)) - { - wxCharBuffer theNewBuffer(nCurrentSize << 1); - memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize); - theBuffer = theNewBuffer; - nCurrentSize <<= 1; - } - - //Convert the current (sub)string - if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 ) - { - *pOutSize = 0; - theBuffer.data()[0u] = wxT('\0'); - return theBuffer; - } - - //Increment to next (sub)string - //Note that we have to use wxWcslen instead of nLen here - //because XX2XX gives us the size of the output buffer, - //which is not necessarily the length of the string - szPos += wxWcslen(szPos) + 1; + inLen = wxWcslen(in) + 1; + } + else if ( inLen != 0 && in[inLen - 1] != L'\0' ) + { + // make a copy in order to properly NUL-terminate the string + bufTmp = wxWCharBuffer(inLen); + memcpy(bufTmp.data(), in, inLen*sizeof(wchar_t)); } - //success - return actual length and the buffer - *pOutSize = nActualLength; - return theBuffer; + if ( bufTmp ) + in = bufTmp; + + for ( const wchar_t * const inEnd = in + inLen;; ) + { + // try to convert the current chunk, if anything left + size_t lenChunk = in < inEnd ? WC2MB(NULL, in, 0) : 0; + if ( lenChunk == 0 ) + { + // nothing left in the input string, conversion succeeded + if ( outLen ) + *outLen = lenBuf ? 
lenBuf - 1 : lenBuf; + + return buf; + } + + if ( lenChunk == (size_t)-1 ) + break; + + const size_t lenBufNew = lenBuf + lenChunk; + if ( !buf.extend(lenBufNew) ) + break; + + lenChunk = WC2MB(buf.data() + lenBuf, in, lenChunk + 1 /* for NUL */); + if ( lenChunk == (size_t)-1 ) + break; + + // chunk successfully converted, go to the next one + in += wxWcslen(in) + 1 /* skip NUL too */; + lenBuf = lenBufNew + 1; + } + + // conversion failed + if ( outLen ) + *outLen = 0; + + return wxCharBuffer(); } // ---------------------------------------------------------------------------- @@ -326,12 +360,12 @@ size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const return wxWC2MB(buf, psz, n); } -#ifdef __UNIX__ - // ---------------------------------------------------------------------------- // wxConvBrokenFileNames // ---------------------------------------------------------------------------- +#ifdef __UNIX__ + wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset) { if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0 @@ -341,23 +375,7 @@ wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset) m_conv = new wxCSConv(charset); } -size_t -wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf, - const char *psz, - size_t outputSize) const -{ - return m_conv->MB2WC( outputBuf, psz, outputSize ); -} - -size_t -wxConvBrokenFileNames::WC2MB(char *outputBuf, - const wchar_t *psz, - size_t outputSize) const -{ - return m_conv->WC2MB( outputBuf, psz, outputSize ); -} - -#endif +#endif // __UNIX__ // ---------------------------------------------------------------------------- // UTF-7 @@ -509,8 +527,6 @@ static const unsigned char utf7encode[128] = size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const { - - size_t len = 0; while (*psz && ((!buf) || (len < n))) @@ -888,19 +904,21 @@ size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const // swap 16bit MB to 16bit String size_t 
wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const { - size_t len=0; + size_t len = 0; - while (*psz && (!buf || len < n)) + while ( *psz && (!buf || len < n) ) { - if (buf) + if ( buf ) { *buf++ = ((char*)psz)[1]; *buf++ = ((char*)psz)[0]; } - len += sizeof(wxUint16); + len += 2; psz++; } - if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0; + + if ( buf && len < n ) + *buf = '\0'; return len; } @@ -1336,6 +1354,8 @@ protected: #endif private: + virtual const char *GetMBNul(size_t *nulLen) const; + // the name (for iconv_open()) of a wide char charset -- if none is // available on this machine, it will remain NULL static wxString ms_wcCharsetName; @@ -1343,6 +1363,10 @@ private: // true if the wide char encoding we use (i.e. ms_wcCharsetName) has // different endian-ness than the native one static bool ms_wcNeedsSwap; + + // NUL representation + size_t m_nulLen; + char m_nulBuf[8]; }; // make the constructor available for unit testing @@ -1362,6 +1386,8 @@ bool wxMBConv_iconv::ms_wcNeedsSwap = false; wxMBConv_iconv::wxMBConv_iconv(const wxChar *name) { + m_nulLen = (size_t)-2; + // iconv operates with chars, not wxChars, but luckily it uses only ASCII // names for the charsets const wxCharBuffer cname(wxString(name).ToAscii()); @@ -1618,6 +1644,27 @@ size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const return res; } +const char *wxMBConv_iconv::GetMBNul(size_t *nulLen) const +{ + if ( m_nulLen == (size_t)-2 ) + { + wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv); + +#if wxUSE_THREADS + // NB: explained in MB2WC + wxMutexLocker lock(self->m_iconvMutex); +#endif + + size_t inLen = 1, + outLen = WXSIZEOF(m_nulBuf); + self->m_nulLen = iconv(w2m, ICONV_CHAR_CAST(L""), &inLen, + &self->m_nulBuf, &outLen); + } + + *nulLen = m_nulLen; + return m_nulBuf; +} + #endif // HAVE_ICONV @@ -1639,19 +1686,22 @@ public: wxMBConv_win32() { m_CodePage = CP_ACP; + m_nulLen = (size_t)-2; } #if wxUSE_FONTMAP 
wxMBConv_win32(const wxChar* name) { m_CodePage = wxCharsetToCodepage(name); + m_nulLen = (size_t)-2; } wxMBConv_win32(wxFontEncoding encoding) { m_CodePage = wxEncodingToCodepage(encoding); + m_nulLen = (size_t)-2; } -#endif +#endif // wxUSE_FONTMAP size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const { @@ -1876,7 +1926,35 @@ private: #endif } + virtual const char *GetMBNul(size_t *nulLen) const + { + if ( m_nulLen == (size_t)-2 ) + { + wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32); + + self->m_nulLen = ::WideCharToMultiByte + ( + m_CodePage, // code page + 0, // no flags + L"", // input string + 1, // translate just NUL + self->m_nulBuf, // output buffer + WXSIZEOF(m_nulBuf), // and its size + NULL, // "replacement" char + NULL // [out] was it used? + ); + + if ( m_nulLen == 0 ) + self->m_nulLen = (size_t)-1; + } + + *nulLen = m_nulLen; + return m_nulBuf; + } + long m_CodePage; + size_t m_nulLen; + char m_nulBuf[8]; }; #endif // wxHAVE_WIN32_MB2WC @@ -2516,6 +2594,27 @@ public: wxFontEncoding m_enc; wxEncodingConverter m2w, w2m; +private: + virtual const char *GetMBNul(size_t *nulLen) const + { + switch ( m_enc ) + { + case wxFONTENCODING_UTF16BE: + case wxFONTENCODING_UTF16LE: + *nulLen = 2; + return "\0"; + + case wxFONTENCODING_UTF32BE: + case wxFONTENCODING_UTF32LE: + *nulLen = 4; + return "\0\0\0"; + + default: + *nulLen = 1; + return ""; + } + } + // were we initialized successfully? 
bool m_ok; @@ -2908,6 +3007,20 @@ size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const return len; } +const char *wxCSConv::GetMBNul(size_t *nulLen) const +{ + CreateConvIfNeeded(); + + if ( m_convReal ) + { + // cast needed just to call private function of m_convReal + return ((wxCSConv *)m_convReal)->GetMBNul(nulLen); + } + + *nulLen = 1; + return ""; +} + // ---------------------------------------------------------------------------- // globals // ---------------------------------------------------------------------------- diff --git a/src/common/string.cpp b/src/common/string.cpp index 5bb461cb17..4e128a124f 100644 --- a/src/common/string.cpp +++ b/src/common/string.cpp @@ -1008,103 +1008,60 @@ int STRINGCLASS::compare(size_t nStart, size_t nLen, // from multibyte string wxString::wxString(const char *psz, wxMBConv& conv, size_t nLength) { - // if nLength != npos, then we have to make a NULL-terminated copy - // of first nLength bytes of psz first because the input buffer to MB2WC - // must always be NULL-terminated: - wxCharBuffer inBuf((const char *)NULL); - if (nLength != npos) - { - wxASSERT( psz != NULL ); - wxCharBuffer tmp(nLength); - memcpy(tmp.data(), psz, nLength); - tmp.data()[nLength] = '\0'; - inBuf = tmp; - psz = inBuf.data(); - } - - // first get the size of the buffer we need - size_t nLen; - if ( psz ) - { - // calculate the needed size ourselves or use the provided one - if (nLength == npos) - nLen = strlen(psz); - else - nLen = nLength; - } - else - { - // nothing to convert - nLen = 0; - } - - // anything to do? 
- if ( (nLen != 0) && (nLen != (size_t)-1) ) + if ( psz && nLength != 0 ) { - //Convert string - size_t nRealSize; - wxWCharBuffer theBuffer = conv.cMB2WC(psz, nLen, &nRealSize); + if ( nLength == npos ) + { + nLength = (size_t)-1; + } + else if ( nLength == length() ) + { + // this is important to avoid copying the string in cMB2WC: we're + // already NUL-terminated so we can pass this NUL with the data + nLength++; + } - //Copy - if (nRealSize) - assign( theBuffer.data() , nRealSize - 1 ); + size_t nLenWide; + wxWCharBuffer wbuf = conv.cMB2WC(psz, nLength, &nLenWide); + + if ( nLenWide ) + assign(wbuf, nLenWide); } } //Convert wxString in Unicode mode to a multi-byte string const wxCharBuffer wxString::mb_str(wxMBConv& conv) const { - size_t dwOutSize; - return conv.cWC2MB(c_str(), length(), &dwOutSize); + return conv.cWC2MB(c_str(), length() + 1 /* size, not length */, NULL); } #else // ANSI #if wxUSE_WCHAR_T + // from wide string wxString::wxString(const wchar_t *pwz, wxMBConv& conv, size_t nLength) { - // if nLength != npos, then we have to make a NULL-terminated copy - // of first nLength chars of psz first because the input buffer to WC2MB - // must always be NULL-terminated: - wxWCharBuffer inBuf((const wchar_t *)NULL); - if (nLength != npos) - { - wxASSERT( pwz != NULL ); - wxWCharBuffer tmp(nLength); - memcpy(tmp.data(), pwz, nLength * sizeof(wchar_t)); - tmp.data()[nLength] = '\0'; - inBuf = tmp; - pwz = inBuf.data(); - } - - // first get the size of the buffer we need - size_t nLen; - if ( pwz ) - { - // calculate the needed size ourselves or use the provided one - if (nLength == npos) - nLen = wxWcslen(pwz); - else - nLen = nLength; - } - else - { - // nothing to convert - nLen = 0; - } - // anything to do? 
- if ( (nLen != 0) && (nLen != (size_t)-1) ) + if ( pwz && nLength != 0 ) { - //Convert string - size_t nRealSize; - wxCharBuffer theBuffer = conv.cWC2MB(pwz, nLen, &nRealSize); + if ( nLength == npos ) + { + nLength = (size_t)-1; + } + else if ( nLength == length() ) + { + // this is important to avoid copying the string in cWC2MB: we're + // already NUL-terminated so we can pass this NUL with the data + nLength++; + } - //Copy - if (nRealSize) - assign( theBuffer.data() , nRealSize - 1 ); + size_t nLenMB; + wxCharBuffer buf = conv.cWC2MB(pwz, nLength, &nLenMB); + + if ( nLenMB ) + assign(buf, nLenMB); } } @@ -1112,8 +1069,7 @@ wxString::wxString(const wchar_t *pwz, wxMBConv& conv, size_t nLength) //mode is not enabled and wxUSE_WCHAR_T is enabled const wxWCharBuffer wxString::wc_str(wxMBConv& conv) const { - size_t dwOutSize; - return conv.cMB2WC(c_str(), length(), &dwOutSize); + return conv.cMB2WC(c_str(), length() + 1 /* size, not length */, NULL); } #endif // wxUSE_WCHAR_T