rewrote UTF-7 to work on streams of data to be compatible with the way wxTextStream uses the converters; also corrected a couple of off-by-1 bugs, and the unit tests finally pass now

git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@53889 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
2008-06-01 03:08:45 +00:00
parent d4df635acb
commit 9d653e810c
4 changed files with 285 additions and 94 deletions
--- a/docs/changes.txt
+++ b/docs/changes.txt
@@ -271,6 +271,7 @@ All:
 - wxString now uses std::[w]string internally by default, meaning that it is
  now thread-safe if the standard library provided with your compiler is.
 - Added wxCmdLineParser::AddUsageText() (Marcin 'Malcom' Malich).
 - Fix reading/writing UTF-7-encoded text streams.
 All (Unix):
--- a/include/wx/strconv.h
+++ b/include/wx/strconv.h
@@ -249,10 +249,81 @@ private:
 class WXDLLIMPEXP_BASE wxMBConvUTF7 : public wxMBConv
 {
 public:
-    virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
+    wxMBConvUTF7() { }
-    virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
+
    // compiler-generated copy ctor, assignment operator and dtor are ok
    // (assuming it's ok to copy the shift state -- not really sure about it)
    virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
                           const char *src, size_t srcLen = wxNO_LEN) const;
    virtual size_t FromWChar(char *dst, size_t dstLen,
                             const wchar_t *src, size_t srcLen = wxNO_LEN) const;
    virtual wxMBConv *Clone() const { return new wxMBConvUTF7; }
 private:
    // UTF-7 decoder/encoder may be in direct mode or in shifted mode after a
    // '+' (and until the '-' or any other non-base64 character)
    enum Mode
    {
        Direct,     // pass through state
        Shifted     // after a '+' (and before '-')
    };
    // the current decoder state: this is only used by ToWChar() if its srcLen
    // parameter is not wxNO_LEN; when working on entire NUL-terminated
    // strings we neither update nor use the state
    // Shift state of the UTF-7 decoder, together with the partially decoded
    // base64 data which must be kept between successive ToWChar() calls.
    class DecoderState
    {
    private:
        // current state: this one is private as we want to enforce the use of
        // ToDirect/ToShifted() methods below
        Mode mode;
    public:
        // the initial state is direct; note that the data members below are
        // left uninitialized until ToShifted() is called for the first time
        DecoderState() { mode = Direct; }
        // switch to/from shifted mode: entering shifted mode also resets the
        // bit accumulator and restarts decoding with the MSB of the next word
        // (msb itself is not reset, it is assigned before being read)
        void ToDirect() { mode = Direct; }
        void ToShifted() { mode = Shifted; accum = bit = 0; isLSB = false; }
        bool IsDirect() const { return mode == Direct; }
        bool IsShifted() const { return mode == Shifted; }
        // these variables are only used in shifted mode
        unsigned int accum; // accumulator of the bits we've already got
        unsigned int bit;   // count of low-order bits of accum not yet
                            // extracted as a byte
        unsigned char msb;  // the high byte of UTF-16 word
        bool isLSB;         // true if the next decoded byte is the LSB of the
                            // UTF-16 word (its MSB is then stored in msb)
    };
    DecoderState m_stateDecoder;
    // encoder state is simpler as we always receive entire Unicode characters
    // on input
    class EncoderState
    {
    private:
        // current mode, only changed via ToDirect()/ToShifted() below
        Mode mode;
    public:
        // the encoder starts in direct mode; accum and bit are left
        // uninitialized until ToShifted() is called for the first time
        EncoderState() { mode = Direct; }
        // switch to/from shifted mode, entering it resets the accumulator
        void ToDirect() { mode = Direct; }
        void ToShifted() { mode = Shifted; accum = bit = 0; }
        bool IsDirect() const { return mode == Direct; }
        bool IsShifted() const { return mode == Shifted; }
        // these variables are only used in shifted mode
        unsigned int accum; // input bits not yet fully written out as base64
        unsigned int bit;   // count of low-order bits of accum still pending
    };
    EncoderState m_stateEncoder;
 };
 // ----------------------------------------------------------------------------
--- a/interface/strconv.h
+++ b/interface/strconv.h
@@ -291,6 +291,17 @@ public:
    This class converts between the UTF-7 encoding and Unicode.
    It has one predefined instance, @b wxConvUTF7.
    Notice that, unlike all the other conversion objects, this converter is
    stateful, i.e. it remembers its state from the last call to its ToWChar()
    or FromWChar() and assumes it is called on the continuation of the same
    string when the same method is called again. This assumption is only made
    if an explicit length is passed to these functions: when an entire
    @c NUL-terminated string is processed, the state doesn't need to be
    remembered.
    This also means that, unlike the other predefined conversion objects,
    @b wxConvUTF7 is @em not thread-safe.
    @library{wxbase}
    @category{conv}
--- a/src/common/strconv.cpp
+++ b/src/common/strconv.cpp
@@ -484,6 +484,8 @@ wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 // ----------------------------------------------------------------------------
 // Implementation (C) 2004 Fredrik Roubert
 //
 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
 //
 // BASE64 decoding table
@@ -524,70 +526,132 @@ static const unsigned char utf7unb64[] =
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 
 };
-size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
+size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
                             const char *src, size_t srcLen) const
 {
    DecoderState stateOrig,
         *statePtr;
    if ( srcLen == wxNO_LEN )
    {
        // convert the entire string, up to and including the trailing NUL
        srcLen = strlen(src) + 1;
        // when working on entire strings we neither update nor use the shift
        // state from the previous call
        statePtr = &stateOrig;
    }
    else // when working with partial strings we do use the shift state
    {
        statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);
        // also save the old state to be able to rollback to it on error
        stateOrig = m_stateDecoder;
    }
    // but to simplify the code below we use this variable in both cases
    DecoderState& state = *statePtr;
    // number of characters [which would have been] written to dst [if it were
    // not NULL]
    size_t len = 0;
-    while ( *psz && (!buf || (len < n)) )
+    const char * const srcEnd = src + srcLen;
    while ( (src < srcEnd) && (!dst || (len < dstLen)) )
    {
-        unsigned char cc = *psz++;
+        const unsigned char cc = *src++;
-        if (cc != '+')
+
        if ( state.IsShifted() )
        {
-            // plain ASCII char
+            const unsigned char dc = utf7unb64[cc];
-            if (buf)
+            if ( dc == 0xff )
-                *buf++ = cc;
+            {
-            len++;
+                // end of encoded part
                state.ToDirect();
                // re-parse this character normally below unless it's '-' which
                // is consumed by the decoder
                if ( cc == '-' )
                    continue;
            }
-        else if (*psz == '-')
+            else // valid encoded character
            {
-            // encoded plus sign
+                // mini base64 decoder: each character is 6 bits
-            if (buf)
+                state.bit += 6;
-                *buf++ = cc;
+                state.accum <<= 6;
                state.accum += dc;
                if ( state.bit >= 8 )
                {
                    // got the full byte, consume it
                    state.bit -= 8;
                    unsigned char b = (state.accum >> state.bit) & 0x00ff;
                    if ( state.isLSB )
                    {
                        // we've got the full word, output it
                        if ( dst )
                            *dst++ = (state.msb << 8) | b;
                        len++;
-            psz++;
+                        state.isLSB = false;
                    }
-        else // start of BASE64 encoded string
+                    else // MSB
                    {
-            bool lsb, ok;
+                        // just store it while we wait for LSB
-            unsigned int d, l;
+                        state.msb = b;
-            for ( ok = lsb = false, d = 0, l = 0;
+                        state.isLSB = true;
-                  (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
+                    }
-                  psz++ )
+                }
            }
        }
        if ( state.IsDirect() )
        {
-                d <<= 6;
+            // start of an encoded segment?
-                d += cc;
+            if ( cc == '+' )
                for (l += 6; l >= 8; lsb = !lsb)
            {
-                    unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
+                if ( src == srcEnd )
-                    if (lsb)
+                    return wxCONV_FAILED; // can't have '+' at the end
                if ( *src == '-' )
                {
-                        if (buf)
+                    // just the encoded plus sign, don't switch to shifted mode
-                            *buf++ |= c;
+                    if ( dst )
                        *dst++ = '+';
                    len++;
-                        ok = true;
+                    src++;
                }
                else
                {
-                        if (buf)
+                    state.ToShifted();
-                            *buf = (wchar_t)(c << 8);
+                }
            }
            else // not '+'
            {
                // only printable 7 bit ASCII characters (with the exception of
                // NUL, TAB, CR and LF) can be used directly
                if ( cc >= 0x7f || (cc < ' ' &&
                      !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
                    return wxCONV_FAILED;
                if ( dst )
                    *dst++ = cc;
                len++;
            }
        }
    }
-            if ( !ok )
+    if ( !len )
    {
-                // in valid UTF7 we should have valid characters after '+'
+        // as we didn't read any characters we should be called with the same
        // data (followed by some more new data) again later so don't save our
        // state
        state = stateOrig;
        return wxCONV_FAILED;
    }
            if (*psz == '-')
                psz++;
        }
    }
    if ( buf && (len < n) )
        *buf = '\0';
    return len;
 }
@@ -616,7 +680,7 @@ static const unsigned char utf7enb64[] =
 //
 static const unsigned char utf7encode[128] =
 {
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
+    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
@@ -626,21 +690,72 @@ static const unsigned char utf7encode[128] =
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 };
-size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
+static inline bool wxIsUTF7Direct(wchar_t wc)
 {
    // returns true only for characters below 0x80 whose utf7encode[] entry
    // is 0 (the table entries are unsigned, so "< 1" is the same as "== 0")
    //
    // NOTE(review): if wchar_t is a signed type, a negative wc would pass the
    // range check and index before the start of the table -- confirm that
    // callers never pass negative values or compare as unsigned here
    return wc < 0x80 && utf7encode[wc] < 1;
 }
 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
                               const wchar_t *src, size_t srcLen) const
 {
    EncoderState stateOrig,
                *statePtr;
    if ( srcLen == wxNO_LEN )
    {
        // we don't apply the stored state when operating on entire strings at
        // once
        statePtr = &stateOrig;
        srcLen = wxWcslen(src) + 1;
    }
    else // do use the mode we left the output in previously
    {
        stateOrig = m_stateEncoder;
        statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
    }
    EncoderState& state = *statePtr;
    size_t len = 0;
-    while (*psz && ((!buf) || (len < n)))
+    const wchar_t * const srcEnd = src + srcLen;
    while ( src < srcEnd && (!dst || len < dstLen) )
    {
-        wchar_t cc = *psz++;
+        wchar_t cc = *src++;
-        if (cc < 0x80 && utf7encode[cc] < 1)
+        if ( wxIsUTF7Direct(cc) )
        {
-            // plain ASCII char
+            if ( state.IsShifted() )
-            if (buf)
+            {
-                *buf++ = (char)cc;
+                // pad with zeros the last encoded block if necessary
-
+                if ( state.bit )
                {
                    if ( dst )
                        *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
                    len++;
                }
                state.ToDirect();
                if ( dst )
                    *dst++ = '-';
                len++;
            }
            if ( dst )
                *dst++ = (char)cc;
            len++;
        }
        else if ( cc == '+' && state.IsDirect() )
        {
            if ( dst )
            {
                *dst++ = '+';
                *dst++ = '-';
            }
            len += 2;
        }
 #ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
@@ -650,52 +765,45 @@ size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 #endif
        else
        {
-            if (buf)
+            if ( state.IsDirect() )
                *buf++ = '+';
            len++;
            if (cc != '+')
            {
                state.ToShifted();
                if ( dst )
                    *dst++ = '+';
                len++;
            }
            // BASE64 encode string
-                unsigned int lsb, d, l;
+            for ( ;; )
                for (d = 0, l = 0; /*nothing*/; psz++)
            {
-                    for (lsb = 0; lsb < 2; lsb ++)
+                for ( unsigned lsb = 0; lsb < 2; lsb++ )
                {
-                        d <<= 8;
+                    state.accum <<= 8;
-                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
+                    state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
-                        for (l += 8; l >= 6; )
+                    for (state.bit += 8; state.bit >= 6; )
                    {
-                            l -= 6;
+                        state.bit -= 6;
-                            if (buf)
+                        if ( dst )
-                                *buf++ = utf7enb64[(d >> l) % 64];
+                            *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
                        len++;
                    }
                }
-                    cc = *psz;
+                if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                    break;
                src++;
            }
                if (l != 0)
                {
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
                    len++;
        }
    }
-            if (buf)
+    // we need to restore the original encoder state if we were called just to
-                *buf++ = '-';
+    // calculate the amount of space needed as we will presumably be called
-            len++;
+    // again to really convert the data now
-        }
+    if ( !dst )
-    }
+        state = stateOrig;
    if (buf && (len < n))
        *buf = 0;
    return len;
 }