Fix wxURI::Unescape() to work with Unicode strings
Such strings are not really URIs, as they would have been percent-encoded if they were, but we can obtain them from e.g. wxFileSystem::FindFirst(), so handle them correctly here, as this is simpler than checking all the places where Unescape() is called. Add a unit test checking that decoding a URI containing both unescaped and percent-encoded Unicode characters works correctly.
This commit is contained in:
@@ -63,6 +63,7 @@ All:
|
|||||||
|
|
||||||
- Add UTF-8 and ZIP 64 support to wxZip{Input,Output}Stream (Tobias Taschner).
|
- Add UTF-8 and ZIP 64 support to wxZip{Input,Output}Stream (Tobias Taschner).
|
||||||
- Upgrade libpng to 1.6.21 fixing several security bugs (Paul Kulchenko).
|
- Upgrade libpng to 1.6.21 fixing several security bugs (Paul Kulchenko).
|
||||||
|
- Fix handling of Unicode file names in wxFileSystem::FindFirst().
|
||||||
- Add wxStandardPaths::GetUserDir() (Tobias Taschner).
|
- Add wxStandardPaths::GetUserDir() (Tobias Taschner).
|
||||||
- Allow calling wxItemContainer::Add() and similar with std::vector<> argument.
|
- Allow calling wxItemContainer::Add() and similar with std::vector<> argument.
|
||||||
- Add "%z" support to printf()-like functions like wxString::Format() (RIVDSL).
|
- Add "%z" support to printf()-like functions like wxString::Format() (RIVDSL).
|
||||||
|
@@ -137,11 +137,6 @@ protected:
|
|||||||
static bool ParseIPv6address(const char*& uri);
|
static bool ParseIPv6address(const char*& uri);
|
||||||
static bool ParseIPvFuture(const char*& uri);
|
static bool ParseIPvFuture(const char*& uri);
|
||||||
|
|
||||||
// should be called with i pointing to '%', returns the encoded character
|
|
||||||
// following it or -1 if invalid and advances i past it (so that it points
|
|
||||||
// to the last character consumed on return)
|
|
||||||
static int DecodeEscape(wxString::const_iterator& i);
|
|
||||||
|
|
||||||
// append next character pointed to by p to the string in an escaped form
|
// append next character pointed to by p to the string in an escaped form
|
||||||
// and advance p past it
|
// and advance p past it
|
||||||
//
|
//
|
||||||
|
@@ -100,38 +100,32 @@ int wxURI::CharToHex(char c)
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
int wxURI::DecodeEscape(wxString::const_iterator& i)
|
|
||||||
{
|
|
||||||
int hi = CharToHex(*++i);
|
|
||||||
if ( hi == -1 )
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
int lo = CharToHex(*++i);
|
|
||||||
if ( lo == -1 )
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
return (hi << 4) | lo;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* static */
|
/* static */
|
||||||
wxString wxURI::Unescape(const wxString& uri)
|
wxString wxURI::Unescape(const wxString& uri)
|
||||||
{
|
{
|
||||||
|
// URIs can contain escaped 8-bit characters that have to be decoded using
|
||||||
|
// UTF-8 (see RFC 3986), however in our (probably broken...) case we can
|
||||||
|
// also end up with unescaped Unicode characters in the URI string, which
|
||||||
|
// can't be decoded as UTF-8. So what we do here is to encode all Unicode
|
||||||
|
// characters as UTF-8 only to decode them back below. This is obviously
|
||||||
|
// inefficient but there doesn't seem to be anything else to do, other than
|
||||||
|
// not allowing to mix Unicode characters with escapes in the first place,
|
||||||
|
// but this seems to be done in a lot of places, unfortunately.
|
||||||
|
const wxScopedCharBuffer& uriU8(uri.utf8_str());
|
||||||
|
const size_t len = uriU8.length();
|
||||||
|
|
||||||
// the unescaped version can't be longer than the original one
|
// the unescaped version can't be longer than the original one
|
||||||
wxCharBuffer buf(uri.length());
|
wxCharBuffer buf(uriU8.length());
|
||||||
char *p = buf.data();
|
char *p = buf.data();
|
||||||
|
|
||||||
for ( wxString::const_iterator i = uri.begin(); i != uri.end(); ++i, ++p )
|
const char* const end = uriU8.data() + len;
|
||||||
|
for ( const char* s = uriU8.data(); s != end; ++s, ++p )
|
||||||
{
|
{
|
||||||
char c = *i;
|
char c = *s;
|
||||||
if ( c == '%' )
|
if ( c == '%' && s < end - 2 && IsHex(s[1]) && IsHex(s[2]) )
|
||||||
{
|
{
|
||||||
int n = wxURI::DecodeEscape(i);
|
c = (CharToHex(s[1]) << 4) | CharToHex(s[2]);
|
||||||
if ( n == -1 )
|
s += 2;
|
||||||
return wxString();
|
|
||||||
|
|
||||||
wxASSERT_MSG( n >= 0 && n <= 0xff, "unexpected character value" );
|
|
||||||
|
|
||||||
c = static_cast<char>(n);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
*p = c;
|
*p = c;
|
||||||
@@ -139,17 +133,7 @@ wxString wxURI::Unescape(const wxString& uri)
|
|||||||
|
|
||||||
*p = '\0';
|
*p = '\0';
|
||||||
|
|
||||||
// by default assume that the URI is in UTF-8, this is the most common
|
return wxString::FromUTF8(buf);
|
||||||
// practice
|
|
||||||
wxString s = wxString::FromUTF8(buf);
|
|
||||||
if ( s.empty() )
|
|
||||||
{
|
|
||||||
// if it isn't, use latin-1 as a fallback -- at least this always
|
|
||||||
// succeeds
|
|
||||||
s = wxCSConv(wxFONTENCODING_ISO8859_1).cMB2WC(buf);
|
|
||||||
}
|
|
||||||
|
|
||||||
return s;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void wxURI::AppendNextEscaped(wxString& s, const char *& p)
|
void wxURI::AppendNextEscaped(wxString& s, const char *& p)
|
||||||
|
@@ -338,6 +338,15 @@ void URITestCase::Unescaping()
|
|||||||
"\xD1\x87\xD0\xB8\xD1\x81\xD0\xBB\xD0\xBE"
|
"\xD1\x87\xD0\xB8\xD1\x81\xD0\xBB\xD0\xBE"
|
||||||
),
|
),
|
||||||
unescaped );
|
unescaped );
|
||||||
|
|
||||||
|
escaped = L"file://\u043C\u043E\u0439%5C%d1%84%d0%b0%d0%b9%d0%bb";
|
||||||
|
unescaped = wxURI::Unescape(escaped);
|
||||||
|
|
||||||
|
CPPUNIT_ASSERT_EQUAL
|
||||||
|
(
|
||||||
|
L"file://\u043C\u043E\u0439\\\u0444\u0430\u0439\u043B",
|
||||||
|
unescaped
|
||||||
|
);
|
||||||
#endif // wxUSE_UNICODE
|
#endif // wxUSE_UNICODE
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user