Rewrite the Core Foundation conversion to be efficient; double-pass the Mac WC2MB to verify that no invalid characters were converted; set the amount of memory to allocate for the Mac conversion to 32 to prevent needlessly large buffer allocations.

git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@30226 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
Ryan Norton
2004-11-01 18:03:04 +00:00
parent 04fa0af10e
commit 638357a03e

View File

@@ -1567,14 +1567,10 @@ private:
CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding) CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
{ {
CFStringEncoding enc = 0 ; CFStringEncoding enc = kCFStringEncodingInvalidId ;
if ( encoding == wxFONTENCODING_DEFAULT ) if ( encoding == wxFONTENCODING_DEFAULT )
{ {
#if wxUSE_GUI enc = CFStringGetSystemEncoding();
encoding = wxFont::GetDefaultEncoding() ;
#else
encoding = wxLocale::GetSystemEncoding() ;
#endif
} }
else switch( encoding) else switch( encoding)
{ {
@@ -1662,7 +1658,6 @@ CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
case wxFONTENCODING_CP950 : case wxFONTENCODING_CP950 :
enc = kCFStringEncodingDOSChineseTrad; enc = kCFStringEncodingDOSChineseTrad;
break ; break ;
case wxFONTENCODING_CP1250 : case wxFONTENCODING_CP1250 :
enc = kCFStringEncodingWindowsLatin2; enc = kCFStringEncodingWindowsLatin2;
break ; break ;
@@ -1687,9 +1682,10 @@ CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
case wxFONTENCODING_CP1257 : case wxFONTENCODING_CP1257 :
enc = kCFStringEncodingWindowsBalticRim; enc = kCFStringEncodingWindowsBalticRim;
break ; break ;
case wxFONTENCODING_UTF7 : // This only really encodes to UTF7 (if that) evidently
enc = kCFStringEncodingNonLossyASCII ; // case wxFONTENCODING_UTF7 :
break ; // enc = kCFStringEncodingNonLossyASCII ;
// break ;
case wxFONTENCODING_UTF8 : case wxFONTENCODING_UTF8 :
enc = kCFStringEncodingUTF8 ; enc = kCFStringEncodingUTF8 ;
break ; break ;
@@ -1826,245 +1822,6 @@ CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
return enc ; return enc ;
} }
// Translate a Core Foundation string encoding identifier into the matching
// wxFontEncoding value.
//
// Any CFStringEncoding that has no entry in the table below yields
// wxFONTENCODING_DEFAULT.
wxFontEncoding wxFontEncFromCFStringEnc(CFStringEncoding encoding)
{
    switch ( encoding )
    {
        // ISO 8859 family
        case kCFStringEncodingISOLatin1 :        return wxFONTENCODING_ISO8859_1 ;
        case kCFStringEncodingISOLatin2 :        return wxFONTENCODING_ISO8859_2 ;
        case kCFStringEncodingISOLatin3 :        return wxFONTENCODING_ISO8859_3 ;
        case kCFStringEncodingISOLatin4 :        return wxFONTENCODING_ISO8859_4 ;
        case kCFStringEncodingISOLatinCyrillic : return wxFONTENCODING_ISO8859_5 ;
        case kCFStringEncodingISOLatinArabic :   return wxFONTENCODING_ISO8859_6 ;
        case kCFStringEncodingISOLatinGreek :    return wxFONTENCODING_ISO8859_7 ;
        case kCFStringEncodingISOLatinHebrew :   return wxFONTENCODING_ISO8859_8 ;
        case kCFStringEncodingISOLatin5 :        return wxFONTENCODING_ISO8859_9 ;
        case kCFStringEncodingISOLatin6 :        return wxFONTENCODING_ISO8859_10 ;
        case kCFStringEncodingISOLatin7 :        return wxFONTENCODING_ISO8859_13 ;
        case kCFStringEncodingISOLatin8 :        return wxFONTENCODING_ISO8859_14 ;
        case kCFStringEncodingISOLatin9 :        return wxFONTENCODING_ISO8859_15 ;

        case kCFStringEncodingKOI8_R :           return wxFONTENCODING_KOI8 ;

        // No CFStringEncoding constant is mapped to wxFONTENCODING_BULGARIAN:
        // case :
        //     return wxFONTENCODING_BULGARIAN ;

        // DOS code pages
        case kCFStringEncodingDOSLatinUS :        return wxFONTENCODING_CP437 ;
        case kCFStringEncodingDOSLatin1 :         return wxFONTENCODING_CP850 ;
        case kCFStringEncodingDOSLatin2 :         return wxFONTENCODING_CP852 ;
        case kCFStringEncodingDOSCyrillic :       return wxFONTENCODING_CP855 ;
        case kCFStringEncodingDOSRussian :        return wxFONTENCODING_CP866 ;
        case kCFStringEncodingDOSThai :           return wxFONTENCODING_CP874 ;
        case kCFStringEncodingDOSJapanese :       return wxFONTENCODING_CP932 ;
        case kCFStringEncodingDOSChineseSimplif : return wxFONTENCODING_CP936 ;
        case kCFStringEncodingDOSKorean :         return wxFONTENCODING_CP949 ;
        case kCFStringEncodingDOSChineseTrad :    return wxFONTENCODING_CP950 ;

        // Windows code pages
        case kCFStringEncodingWindowsLatin2 :    return wxFONTENCODING_CP1250 ;
        case kCFStringEncodingWindowsCyrillic :  return wxFONTENCODING_CP1251 ;
        case kCFStringEncodingWindowsLatin1 :    return wxFONTENCODING_CP1252 ;
        case kCFStringEncodingWindowsGreek :     return wxFONTENCODING_CP1253 ;
        case kCFStringEncodingWindowsLatin5 :    return wxFONTENCODING_CP1254 ;
        case kCFStringEncodingWindowsHebrew :    return wxFONTENCODING_CP1255 ;
        case kCFStringEncodingWindowsArabic :    return wxFONTENCODING_CP1256 ;
        case kCFStringEncodingWindowsBalticRim : return wxFONTENCODING_CP1257 ;

        case kCFStringEncodingEUC_JP :           return wxFONTENCODING_EUC_JP ;
        case kCFStringEncodingUnicode :          return wxFONTENCODING_UTF16 ;

        // Classic Mac OS script encodings
        case kCFStringEncodingMacRoman :           return wxFONTENCODING_MACROMAN ;
        case kCFStringEncodingMacJapanese :        return wxFONTENCODING_MACJAPANESE ;
        case kCFStringEncodingMacChineseTrad :     return wxFONTENCODING_MACCHINESETRAD ;
        case kCFStringEncodingMacKorean :          return wxFONTENCODING_MACKOREAN ;
        case kCFStringEncodingMacArabic :          return wxFONTENCODING_MACARABIC ;
        case kCFStringEncodingMacHebrew :          return wxFONTENCODING_MACHEBREW ;
        case kCFStringEncodingMacGreek :           return wxFONTENCODING_MACGREEK ;
        case kCFStringEncodingMacCyrillic :        return wxFONTENCODING_MACCYRILLIC ;
        case kCFStringEncodingMacDevanagari :      return wxFONTENCODING_MACDEVANAGARI ;
        case kCFStringEncodingMacGurmukhi :        return wxFONTENCODING_MACGURMUKHI ;
        case kCFStringEncodingMacGujarati :        return wxFONTENCODING_MACGUJARATI ;
        case kCFStringEncodingMacOriya :           return wxFONTENCODING_MACORIYA ;
        case kCFStringEncodingMacBengali :         return wxFONTENCODING_MACBENGALI ;
        case kCFStringEncodingMacTamil :           return wxFONTENCODING_MACTAMIL ;
        case kCFStringEncodingMacTelugu :          return wxFONTENCODING_MACTELUGU ;
        case kCFStringEncodingMacKannada :         return wxFONTENCODING_MACKANNADA ;
        case kCFStringEncodingMacMalayalam :       return wxFONTENCODING_MACMALAJALAM ;
        case kCFStringEncodingMacSinhalese :       return wxFONTENCODING_MACSINHALESE ;
        case kCFStringEncodingMacBurmese :         return wxFONTENCODING_MACBURMESE ;
        case kCFStringEncodingMacKhmer :           return wxFONTENCODING_MACKHMER ;
        case kCFStringEncodingMacThai :            return wxFONTENCODING_MACTHAI ;
        case kCFStringEncodingMacLaotian :         return wxFONTENCODING_MACLAOTIAN ;
        case kCFStringEncodingMacGeorgian :        return wxFONTENCODING_MACGEORGIAN ;
        case kCFStringEncodingMacArmenian :        return wxFONTENCODING_MACARMENIAN ;
        case kCFStringEncodingMacChineseSimp :     return wxFONTENCODING_MACCHINESESIMP ;
        case kCFStringEncodingMacTibetan :         return wxFONTENCODING_MACTIBETAN ;
        case kCFStringEncodingMacMongolian :       return wxFONTENCODING_MACMONGOLIAN ;
        case kCFStringEncodingMacEthiopic :        return wxFONTENCODING_MACETHIOPIC ;
        case kCFStringEncodingMacCentralEurRoman : return wxFONTENCODING_MACCENTRALEUR ;
        case kCFStringEncodingMacVietnamese :      return wxFONTENCODING_MACVIATNAMESE ;
        case kCFStringEncodingMacExtArabic :       return wxFONTENCODING_MACARABICEXT ;
        case kCFStringEncodingMacSymbol :          return wxFONTENCODING_MACSYMBOL ;
        case kCFStringEncodingMacDingbats :        return wxFONTENCODING_MACDINGBATS ;
        case kCFStringEncodingMacTurkish :         return wxFONTENCODING_MACTURKISH ;
        case kCFStringEncodingMacCroatian :        return wxFONTENCODING_MACCROATIAN ;
        case kCFStringEncodingMacIcelandic :       return wxFONTENCODING_MACICELANDIC ;
        case kCFStringEncodingMacRomanian :        return wxFONTENCODING_MACROMANIAN ;
        case kCFStringEncodingMacCeltic :          return wxFONTENCODING_MACCELTIC ;
        case kCFStringEncodingMacGaelic :          return wxFONTENCODING_MACGAELIC ;

        // Mapping for the keyboard glyph encoding is disabled:
        // case kCFStringEncodingMacKeyboardGlyphs :
        //     return wxFONTENCODING_MACKEYBOARD ;
    }

    // Not in the table: report the default encoding.
    return wxFONTENCODING_DEFAULT ;
}
class wxMBConv_cocoa : public wxMBConv class wxMBConv_cocoa : public wxMBConv
{ {
public: public:
@@ -2089,160 +1846,118 @@ public:
void Init( CFStringEncoding encoding) void Init( CFStringEncoding encoding)
{ {
m_char_encoding = encoding ; m_encoding = encoding ;
m_unicode_encoding = kCFStringEncodingUnicode;
} }
size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
{ {
wxASSERT(szUnConv); wxASSERT(szUnConv);
size_t nBufSize = strlen(szUnConv) + 1; CFStringRef theString = CFStringCreateWithBytes (
size_t nRealOutSize; NULL, //the allocator
(const UInt8*)szUnConv,
UniChar* szUniCharBuffer = (UniChar*) szOut; strlen(szUnConv),
wchar_t* szConvBuffer = szOut; m_encoding,
false //no BOM/external representation
if (szConvBuffer == NULL && nOutSize != 0)
{
szConvBuffer = new wchar_t[nOutSize] ;
}
#if SIZEOF_WCHAR_T == 4
szUniCharBuffer = new UniChar[nOutSize];
#endif
CFDataRef theData = CFDataCreateWithBytesNoCopy (
NULL, //allocator
(const UInt8*)szUnConv,
nBufSize - 1,
NULL //deallocator
);
wxASSERT(theData);
CFStringRef theString = CFStringCreateFromExternalRepresentation (
NULL,
theData,
m_char_encoding
); );
wxASSERT(theString); wxASSERT(theString);
if (nOutSize == 0) size_t nOutLength = CFStringGetLength(theString);
if (szOut == NULL)
{ {
nRealOutSize = CFStringGetLength(theString) + 1;
CFRelease(theString); CFRelease(theString);
return nRealOutSize - 1; return nOutLength;
} }
CFRange theRange = { 0, CFStringGetLength(theString) }; CFRange theRange = { 0, nOutSize };
#if SIZEOF_WCHAR_T == 4
UniChar* szUniCharBuffer = new UniChar[nOutSize];
#endif
CFStringGetCharacters(theString, theRange, szUniCharBuffer); CFStringGetCharacters(theString, theRange, szUniCharBuffer);
nRealOutSize = (CFStringGetLength(theString) + 1);
CFRelease(theString); CFRelease(theString);
szUniCharBuffer[nRealOutSize-1] = '\0' ; szUniCharBuffer[nOutLength] = '\0' ;
#if SIZEOF_WCHAR_T == 4 #if SIZEOF_WCHAR_T == 4
wxMBConvUTF16 converter ; wxMBConvUTF16 converter ;
converter.MB2WC(szConvBuffer , (const char*)szUniCharBuffer , nRealOutSize ) ; converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
delete[] szUniCharBuffer; delete[] szUniCharBuffer;
#endif #endif
if ( szOut == NULL )
delete [] szConvBuffer; return nOutLength;
return nRealOutSize ;
} }
size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
{ {
size_t nBufSize = wxWcslen(szUnConv) + 1; wxASSERT(szUnConv);
size_t nRealOutSize; size_t nRealOutSize;
char* szBuffer = szOut; size_t nBufSize = wxWcslen(szUnConv);
UniChar* szUniBuffer = (UniChar*) szUnConv; UniChar* szUniBuffer = (UniChar*) szUnConv;
if (szOut == NULL)
{
// worst case
nRealOutSize = ((nBufSize - 1) * 8) +1 ;
szBuffer = new char[ nRealOutSize ] ;
}
else
nRealOutSize = nOutSize;
#if SIZEOF_WCHAR_T == 4 #if SIZEOF_WCHAR_T == 4
wxMBConvUTF16BE converter ; wxMBConvUTF16BE converter ;
nBufSize = converter.WC2MB( NULL , szUnConv , 0 ); nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ; szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ; converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
nBufSize /= sizeof(UniChar); nBufSize /= sizeof(UniChar);
++nBufSize;
#endif #endif
CFStringRef theString = CFStringCreateWithCharactersNoCopy( CFStringRef theString = CFStringCreateWithCharactersNoCopy(
NULL, //allocator NULL, //allocator
szUniBuffer, szUniBuffer,
nBufSize, nBufSize,
NULL //deallocator kCFAllocatorNull //deallocator - we want to deallocate it ourselves
); );
wxASSERT(theString); wxASSERT(theString);
//Note that CER puts a BOM when converting to unicode //Note that CER puts a BOM when converting to unicode
//so we may want to check and use getchars instead in that case //so we check and use getchars instead in that case
CFDataRef theData = CFStringCreateExternalRepresentation( if (m_encoding == kCFStringEncodingUnicode)
NULL, //allocator {
theString, if (szOut != NULL)
m_char_encoding, CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
0 //what to put in characters that can't be converted -
//0 tells CFString to return NULL if it meets such a character nRealOutSize = CFStringGetLength(theString) + 1;
}
else
{
CFStringGetBytes(
theString,
CFRangeMake(0, CFStringGetLength(theString)),
m_encoding,
0, //what to put in characters that can't be converted -
//0 tells CFString to return NULL if it meets such a character
false, //not an external representation
(UInt8*) szOut,
nOutSize,
(CFIndex*) &nRealOutSize
); );
}
if(!theData)
return (size_t)-1;
CFRelease(theString); CFRelease(theString);
nRealOutSize = CFDataGetLength(theData); #if SIZEOF_WCHAR_T == 4
delete[] szUniBuffer;
#endif
if ( szOut == NULL )
delete[] szBuffer;
if(nOutSize == 0)
{
//TODO: This gets flagged as a non-malloced address by the debugger...
//#if SIZEOF_WCHAR_T == 4
// delete[] szUniBuffer;
//#endif
CFRelease(theData);
return nRealOutSize - 1;
}
CFRange theRange = {0, CFDataGetLength(theData) };
CFDataGetBytes(theData, theRange, (UInt8*) szBuffer);
CFRelease(theData);
//TODO: This gets flagged as a non-malloced address by the debugger...
//#if SIZEOF_WCHAR_T == 4
// delete[] szUniBuffer;
//#endif
return nRealOutSize - 1; return nRealOutSize - 1;
} }
bool IsOk() const bool IsOk() const
{ {
//TODO: check for invalid en/de/coding return m_encoding != kCFStringEncodingInvalidId &&
return true; CFStringIsEncodingAvailable(m_encoding);
} }
private: private:
CFStringEncoding m_char_encoding ; CFStringEncoding m_encoding ;
CFStringEncoding m_unicode_encoding ;
}; };
#endif // defined(__WXCOCOA__) #endif // defined(__WXCOCOA__)
@@ -2304,7 +2019,8 @@ public:
if (buf == NULL) if (buf == NULL)
{ {
n = byteInLen ; //apple specs say at least 32
n = 32 ;
tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ; tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
} }
ByteCount byteBufferLen = n * sizeof( UniChar ) ; ByteCount byteBufferLen = n * sizeof( UniChar ) ;
@@ -2334,13 +2050,6 @@ public:
return res ; return res ;
} }
//NB: This is _broken_ - in invalid conversions, instead of returning -1
//like it should, it (sometimes?) converts invalid characters of the encoding to a question
//mark character '?'.
//
//We need to do the msw double-pass check for the question marks as Vadim
//lines out above (RN: I don't recall this happening in the core foundation version,
//but it might do it there also, ick)
size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
{ {
OSStatus status = noErr ; OSStatus status = noErr ;
@@ -2351,8 +2060,8 @@ public:
if (buf == NULL) if (buf == NULL)
{ {
// worst case //apple specs say at least 32
n = ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T; n = 32;
tbuf = (char*) malloc( n ) ; tbuf = (char*) malloc( n ) ;
} }
@@ -2377,7 +2086,22 @@ public:
size_t res = byteOutLen ; size_t res = byteOutLen ;
if ( buf && res < n) if ( buf && res < n)
{
buf[res] = 0; buf[res] = 0;
//we need to double-trip to verify it didn't insert any ? in place
//of bogus characters
wxWCharBuffer wcBuf(n);
size_t pszlen = wxWcslen(psz);
if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
wxWcslen(wcBuf) != pszlen ||
memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
{
// we didn't obtain the same thing we started from, hence
// the conversion was lossy and we consider that it failed
return (size_t)-1;
}
}
return res ; return res ;
} }