rewrote UTF-7 to work on streams of data to be compatible with the way wxTextStream uses the converters; also corrected a couple of off-by-one bugs, and the unit tests finally pass now
git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@53889 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
@@ -271,6 +271,7 @@ All:
|
|||||||
- wxString now uses std::[w]string internally by default, meaning that it is
|
- wxString now uses std::[w]string internally by default, meaning that it is
|
||||||
now thread-safe if the standard library provided with your compiler is.
|
now thread-safe if the standard library provided with your compiler is.
|
||||||
- Added wxCmdLineParser::AddUsageText() (Marcin 'Malcom' Malich).
|
- Added wxCmdLineParser::AddUsageText() (Marcin 'Malcom' Malich).
|
||||||
|
- Fix reading/writing UTF-7-encoded text streams.
|
||||||
|
|
||||||
All (Unix):
|
All (Unix):
|
||||||
|
|
||||||
|
@@ -249,10 +249,81 @@ private:
|
|||||||
class WXDLLIMPEXP_BASE wxMBConvUTF7 : public wxMBConv
|
class WXDLLIMPEXP_BASE wxMBConvUTF7 : public wxMBConv
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
|
wxMBConvUTF7() { }
|
||||||
virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
|
|
||||||
|
// compiler-generated copy ctor, assignment operator and dtor are ok
|
||||||
|
// (assuming it's ok to copy the shift state -- not really sure about it)
|
||||||
|
|
||||||
|
virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
|
||||||
|
const char *src, size_t srcLen = wxNO_LEN) const;
|
||||||
|
virtual size_t FromWChar(char *dst, size_t dstLen,
|
||||||
|
const wchar_t *src, size_t srcLen = wxNO_LEN) const;
|
||||||
|
|
||||||
virtual wxMBConv *Clone() const { return new wxMBConvUTF7; }
|
virtual wxMBConv *Clone() const { return new wxMBConvUTF7; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
// UTF-7 decoder/encoder may be in direct mode or in shifted mode after a
|
||||||
|
// '+' (and until the '-' or any other non-base64 character)
|
||||||
|
enum Mode
|
||||||
|
{
|
||||||
|
Direct, // pass through state
|
||||||
|
Shifted // after a '+' (and before '-')
|
||||||
|
};
|
||||||
|
|
||||||
|
// the current decoder state: this is only used by ToWChar() if srcLen
|
||||||
|
// parameter is not wxNO_LEN; when working on entire NUL-terminated
|
||||||
|
// strings we neither update nor use the state
|
||||||
|
class DecoderState
|
||||||
|
{
|
||||||
|
private:
|
||||||
|
// current state: this one is private as we want to enforce the use of
|
||||||
|
// ToDirect/ToShifted() methods below
|
||||||
|
Mode mode;
|
||||||
|
|
||||||
|
public:
|
||||||
|
// the initial state is direct
|
||||||
|
DecoderState() { mode = Direct; }
|
||||||
|
|
||||||
|
// switch to/from shifted mode
|
||||||
|
void ToDirect() { mode = Direct; }
|
||||||
|
void ToShifted() { mode = Shifted; accum = bit = 0; isLSB = false; }
|
||||||
|
|
||||||
|
bool IsDirect() const { return mode == Direct; }
|
||||||
|
bool IsShifted() const { return mode == Shifted; }
|
||||||
|
|
||||||
|
|
||||||
|
// these variables are only used in shifted mode
|
||||||
|
|
||||||
|
unsigned int accum; // accumulator of the bit we've already got
|
||||||
|
unsigned int bit; // the number of bits consumed mod 8
|
||||||
|
unsigned char msb; // the high byte of UTF-16 word
|
||||||
|
bool isLSB; // whether we're decoding LSB or MSB of UTF-16 word
|
||||||
|
};
|
||||||
|
|
||||||
|
DecoderState m_stateDecoder;
|
||||||
|
|
||||||
|
|
||||||
|
// encoder state is simpler as we always receive entire Unicode characters
|
||||||
|
// on input
|
||||||
|
class EncoderState
|
||||||
|
{
|
||||||
|
private:
|
||||||
|
Mode mode;
|
||||||
|
|
||||||
|
public:
|
||||||
|
EncoderState() { mode = Direct; }
|
||||||
|
|
||||||
|
void ToDirect() { mode = Direct; }
|
||||||
|
void ToShifted() { mode = Shifted; accum = bit = 0; }
|
||||||
|
|
||||||
|
bool IsDirect() const { return mode == Direct; }
|
||||||
|
bool IsShifted() const { return mode == Shifted; }
|
||||||
|
|
||||||
|
unsigned int accum;
|
||||||
|
unsigned int bit;
|
||||||
|
};
|
||||||
|
|
||||||
|
EncoderState m_stateEncoder;
|
||||||
};
|
};
|
||||||
|
|
||||||
// ----------------------------------------------------------------------------
|
// ----------------------------------------------------------------------------
|
||||||
|
@@ -291,6 +291,17 @@ public:
|
|||||||
This class converts between the UTF-7 encoding and Unicode.
|
This class converts between the UTF-7 encoding and Unicode.
|
||||||
It has one predefined instance, @b wxConvUTF7.
|
It has one predefined instance, @b wxConvUTF7.
|
||||||
|
|
||||||
|
Notice that, unlike all the other conversion objects, this converter is
|
||||||
|
stateful, i.e. it remembers its state from the last call to its ToWChar()
|
||||||
|
or FromWChar() and assumes it is called on the continuation of the same
|
||||||
|
string when the same method is called again. This assumption is only made
|
||||||
|
if an explicit length is specified as parameter to these functions: if an
|
||||||
|
entire @c NUL terminated string is processed the state doesn't need to be
|
||||||
|
remembered.
|
||||||
|
|
||||||
|
This also means that, unlike the other predefined conversion objects,
|
||||||
|
@b wxConvUTF7 is @em not thread-safe.
|
||||||
|
|
||||||
@library{wxbase}
|
@library{wxbase}
|
||||||
@category{conv}
|
@category{conv}
|
||||||
|
|
||||||
|
@@ -484,6 +484,8 @@ wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
|
|||||||
// ----------------------------------------------------------------------------
|
// ----------------------------------------------------------------------------
|
||||||
|
|
||||||
// Implementation (C) 2004 Fredrik Roubert
|
// Implementation (C) 2004 Fredrik Roubert
|
||||||
|
//
|
||||||
|
// Changes to work in streaming mode (C) 2008 Vadim Zeitlin
|
||||||
|
|
||||||
//
|
//
|
||||||
// BASE64 decoding table
|
// BASE64 decoding table
|
||||||
@@ -524,70 +526,132 @@ static const unsigned char utf7unb64[] =
|
|||||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
||||||
};
|
};
|
||||||
|
|
||||||
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
|
size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
|
||||||
|
const char *src, size_t srcLen) const
|
||||||
{
|
{
|
||||||
|
DecoderState stateOrig,
|
||||||
|
*statePtr;
|
||||||
|
if ( srcLen == wxNO_LEN )
|
||||||
|
{
|
||||||
|
// convert the entire string, up to and including the trailing NUL
|
||||||
|
srcLen = strlen(src) + 1;
|
||||||
|
|
||||||
|
// when working on the entire strings we don't update nor use the shift
|
||||||
|
// state from the previous call
|
||||||
|
statePtr = &stateOrig;
|
||||||
|
}
|
||||||
|
else // when working with partial strings we do use the shift state
|
||||||
|
{
|
||||||
|
statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);
|
||||||
|
|
||||||
|
// also save the old state to be able to rollback to it on error
|
||||||
|
stateOrig = m_stateDecoder;
|
||||||
|
}
|
||||||
|
|
||||||
|
// but to simplify the code below we use this variable in both cases
|
||||||
|
DecoderState& state = *statePtr;
|
||||||
|
|
||||||
|
|
||||||
|
// number of characters [which would have been] written to dst [if it were
|
||||||
|
// not NULL]
|
||||||
size_t len = 0;
|
size_t len = 0;
|
||||||
|
|
||||||
while ( *psz && (!buf || (len < n)) )
|
const char * const srcEnd = src + srcLen;
|
||||||
|
|
||||||
|
while ( (src < srcEnd) && (!dst || (len < dstLen)) )
|
||||||
{
|
{
|
||||||
unsigned char cc = *psz++;
|
const unsigned char cc = *src++;
|
||||||
if (cc != '+')
|
|
||||||
|
if ( state.IsShifted() )
|
||||||
{
|
{
|
||||||
// plain ASCII char
|
const unsigned char dc = utf7unb64[cc];
|
||||||
if (buf)
|
if ( dc == 0xff )
|
||||||
*buf++ = cc;
|
{
|
||||||
len++;
|
// end of encoded part
|
||||||
|
state.ToDirect();
|
||||||
|
|
||||||
|
// re-parse this character normally below unless it's '-' which
|
||||||
|
// is consumed by the decoder
|
||||||
|
if ( cc == '-' )
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
else if (*psz == '-')
|
else // valid encoded character
|
||||||
{
|
{
|
||||||
// encoded plus sign
|
// mini base64 decoder: each character is 6 bits
|
||||||
if (buf)
|
state.bit += 6;
|
||||||
*buf++ = cc;
|
state.accum <<= 6;
|
||||||
|
state.accum += dc;
|
||||||
|
|
||||||
|
if ( state.bit >= 8 )
|
||||||
|
{
|
||||||
|
// got the full byte, consume it
|
||||||
|
state.bit -= 8;
|
||||||
|
unsigned char b = (state.accum >> state.bit) & 0x00ff;
|
||||||
|
|
||||||
|
if ( state.isLSB )
|
||||||
|
{
|
||||||
|
// we've got the full word, output it
|
||||||
|
if ( dst )
|
||||||
|
*dst++ = (state.msb << 8) | b;
|
||||||
len++;
|
len++;
|
||||||
psz++;
|
state.isLSB = false;
|
||||||
}
|
}
|
||||||
else // start of BASE64 encoded string
|
else // MSB
|
||||||
{
|
{
|
||||||
bool lsb, ok;
|
// just store it while we wait for LSB
|
||||||
unsigned int d, l;
|
state.msb = b;
|
||||||
for ( ok = lsb = false, d = 0, l = 0;
|
state.isLSB = true;
|
||||||
(cc = utf7unb64[(unsigned char)*psz]) != 0xff;
|
}
|
||||||
psz++ )
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( state.IsDirect() )
|
||||||
{
|
{
|
||||||
d <<= 6;
|
// start of an encoded segment?
|
||||||
d += cc;
|
if ( cc == '+' )
|
||||||
for (l += 6; l >= 8; lsb = !lsb)
|
|
||||||
{
|
{
|
||||||
unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
|
if ( src == srcEnd )
|
||||||
if (lsb)
|
return wxCONV_FAILED; // can't have '+' at the end
|
||||||
|
|
||||||
|
if ( *src == '-' )
|
||||||
{
|
{
|
||||||
if (buf)
|
// just the encoded plus sign, don't switch to shifted mode
|
||||||
*buf++ |= c;
|
if ( dst )
|
||||||
|
*dst++ = '+';
|
||||||
len++;
|
len++;
|
||||||
ok = true;
|
src++;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (buf)
|
state.ToShifted();
|
||||||
*buf = (wchar_t)(c << 8);
|
}
|
||||||
|
}
|
||||||
|
else // not '+'
|
||||||
|
{
|
||||||
|
// only printable 7 bit ASCII characters (with the exception of
|
||||||
|
// NUL, TAB, CR and LF) can be used directly
|
||||||
|
if ( cc >= 0x7f || (cc < ' ' &&
|
||||||
|
!(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
|
||||||
|
return wxCONV_FAILED;
|
||||||
|
|
||||||
|
if ( dst )
|
||||||
|
*dst++ = cc;
|
||||||
|
len++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( !ok )
|
if ( !len )
|
||||||
{
|
{
|
||||||
// in valid UTF7 we should have valid characters after '+'
|
// as we didn't read any characters we should be called with the same
|
||||||
|
// data (followed by some more new data) again later so don't save our
|
||||||
|
// state
|
||||||
|
state = stateOrig;
|
||||||
|
|
||||||
return wxCONV_FAILED;
|
return wxCONV_FAILED;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (*psz == '-')
|
|
||||||
psz++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( buf && (len < n) )
|
|
||||||
*buf = '\0';
|
|
||||||
|
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -616,7 +680,7 @@ static const unsigned char utf7enb64[] =
|
|||||||
//
|
//
|
||||||
static const unsigned char utf7encode[128] =
|
static const unsigned char utf7encode[128] =
|
||||||
{
|
{
|
||||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
|
0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
|
||||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||||
2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
|
2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
|
||||||
@@ -626,21 +690,72 @@ static const unsigned char utf7encode[128] =
|
|||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
|
||||||
};
|
};
|
||||||
|
|
||||||
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
|
static inline bool wxIsUTF7Direct(wchar_t wc)
|
||||||
{
|
{
|
||||||
|
return wc < 0x80 && utf7encode[wc] < 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
|
||||||
|
const wchar_t *src, size_t srcLen) const
|
||||||
|
{
|
||||||
|
EncoderState stateOrig,
|
||||||
|
*statePtr;
|
||||||
|
if ( srcLen == wxNO_LEN )
|
||||||
|
{
|
||||||
|
// we don't apply the stored state when operating on entire strings at
|
||||||
|
// once
|
||||||
|
statePtr = &stateOrig;
|
||||||
|
|
||||||
|
srcLen = wxWcslen(src) + 1;
|
||||||
|
}
|
||||||
|
else // do use the mode we left the output in previously
|
||||||
|
{
|
||||||
|
stateOrig = m_stateEncoder;
|
||||||
|
statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
|
||||||
|
}
|
||||||
|
|
||||||
|
EncoderState& state = *statePtr;
|
||||||
|
|
||||||
|
|
||||||
size_t len = 0;
|
size_t len = 0;
|
||||||
|
|
||||||
while (*psz && ((!buf) || (len < n)))
|
const wchar_t * const srcEnd = src + srcLen;
|
||||||
|
while ( src < srcEnd && (!dst || len < dstLen) )
|
||||||
{
|
{
|
||||||
wchar_t cc = *psz++;
|
wchar_t cc = *src++;
|
||||||
if (cc < 0x80 && utf7encode[cc] < 1)
|
if ( wxIsUTF7Direct(cc) )
|
||||||
{
|
{
|
||||||
// plain ASCII char
|
if ( state.IsShifted() )
|
||||||
if (buf)
|
{
|
||||||
*buf++ = (char)cc;
|
// pad with zeros the last encoded block if necessary
|
||||||
|
if ( state.bit )
|
||||||
|
{
|
||||||
|
if ( dst )
|
||||||
|
*dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
|
||||||
len++;
|
len++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
state.ToDirect();
|
||||||
|
|
||||||
|
if ( dst )
|
||||||
|
*dst++ = '-';
|
||||||
|
len++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( dst )
|
||||||
|
*dst++ = (char)cc;
|
||||||
|
len++;
|
||||||
|
}
|
||||||
|
else if ( cc == '+' && state.IsDirect() )
|
||||||
|
{
|
||||||
|
if ( dst )
|
||||||
|
{
|
||||||
|
*dst++ = '+';
|
||||||
|
*dst++ = '-';
|
||||||
|
}
|
||||||
|
|
||||||
|
len += 2;
|
||||||
|
}
|
||||||
#ifndef WC_UTF16
|
#ifndef WC_UTF16
|
||||||
else if (((wxUint32)cc) > 0xffff)
|
else if (((wxUint32)cc) > 0xffff)
|
||||||
{
|
{
|
||||||
@@ -650,52 +765,45 @@ size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
|
|||||||
#endif
|
#endif
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (buf)
|
if ( state.IsDirect() )
|
||||||
*buf++ = '+';
|
|
||||||
|
|
||||||
len++;
|
|
||||||
if (cc != '+')
|
|
||||||
{
|
{
|
||||||
|
state.ToShifted();
|
||||||
|
|
||||||
|
if ( dst )
|
||||||
|
*dst++ = '+';
|
||||||
|
len++;
|
||||||
|
}
|
||||||
|
|
||||||
// BASE64 encode string
|
// BASE64 encode string
|
||||||
unsigned int lsb, d, l;
|
for ( ;; )
|
||||||
for (d = 0, l = 0; /*nothing*/; psz++)
|
|
||||||
{
|
{
|
||||||
for (lsb = 0; lsb < 2; lsb ++)
|
for ( unsigned lsb = 0; lsb < 2; lsb++ )
|
||||||
{
|
{
|
||||||
d <<= 8;
|
state.accum <<= 8;
|
||||||
d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
|
state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
|
||||||
|
|
||||||
for (l += 8; l >= 6; )
|
for (state.bit += 8; state.bit >= 6; )
|
||||||
{
|
{
|
||||||
l -= 6;
|
state.bit -= 6;
|
||||||
if (buf)
|
if ( dst )
|
||||||
*buf++ = utf7enb64[(d >> l) % 64];
|
*dst++ = utf7enb64[(state.accum >> state.bit) % 64];
|
||||||
len++;
|
len++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
cc = *psz;
|
if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
|
||||||
if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
src++;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (l != 0)
|
|
||||||
{
|
|
||||||
if (buf)
|
|
||||||
*buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
|
|
||||||
|
|
||||||
len++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (buf)
|
// we need to restore the original encoder state if we were called just to
|
||||||
*buf++ = '-';
|
// calculate the amount of space needed as we will presumably be called
|
||||||
len++;
|
// again to really convert the data now
|
||||||
}
|
if ( !dst )
|
||||||
}
|
state = stateOrig;
|
||||||
|
|
||||||
if (buf && (len < n))
|
|
||||||
*buf = 0;
|
|
||||||
|
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user