Files
wxWidgets/src/html/htmltag.cpp
Vadim Zeitlin 3625820490 Fixes for parsing invalid HTML without tag ends.
The code in wxHtmlParser assumed in many places that a '<' character must
always be followed by a '>' one and could create (and sometimes dereference)
invalid iterators if this wasn't the case, resulting in asserts from the MSVC
debug CRT and possibly crashes.

Fix this by ensuring that only valid iterators are used and add a trivial unit
test for wxHtmlParser which checks that it can parse invalid HTML without
crashing.

Closes #12869.

git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@66678 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
2011-01-13 14:49:55 +00:00

645 lines
19 KiB
C++

/////////////////////////////////////////////////////////////////////////////
// Name: src/html/htmltag.cpp
// Purpose: wxHtmlTag class (represents single tag)
// Author: Vaclav Slavik
// RCS-ID: $Id$
// Copyright: (c) 1999 Vaclav Slavik
// Licence: wxWindows licence
/////////////////////////////////////////////////////////////////////////////
#include "wx/wxprec.h"
#ifdef __BORLANDC__
#pragma hdrstop
#endif
#if wxUSE_HTML
#include "wx/html/htmltag.h"
#ifndef WX_PRECOMP
#include "wx/colour.h"
#include "wx/wxcrtvararg.h"
#endif
#include "wx/html/htmlpars.h"
#include "wx/html/styleparams.h"
#include "wx/vector.h"
#include <stdio.h> // for vsscanf
#include <stdarg.h>
//-----------------------------------------------------------------------------
// wxHtmlTagsCache
//-----------------------------------------------------------------------------
// A single entry of the tags cache, describing one tag found in the source.
struct wxHtmlCacheItem
{
    // Kind of the tag described by this entry.
    enum Type
    {
        Type_Normal,              // opening tag for which an ending tag was found
        Type_NoMatchingEndingTag, // opening tag without any ending tag
        Type_EndingTag            // the ending tag itself, i.e. </..>
    };

    // Position of the '<' character starting the tag; this is the same value
    // as the "pos" argument passed to wxHtmlTag's constructor.
    wxString::const_iterator Key;

    // What kind of tag this is.
    Type type;

    // Position of the '<' of the matching ending tag; only assigned for the
    // entries of Type_Normal.
    wxString::const_iterator End1;

    // Position just after the '>' of the matching ending tag; only assigned
    // for the entries of Type_Normal.
    wxString::const_iterator End2;

    // Upper-cased name of the tag, allocated with new[]; it is only needed
    // while the cache is being built and is freed at the end of
    // wxHtmlTagsCache constructor.
    wxChar *Name;
};
// NB: this is an empty class and not typedef because of forward declaration
//
// Storage for all the cache entries: wxHtmlTagsCache constructor appends one
// item to it for each tag, in the order in which the tags are found in the
// source.
class wxHtmlTagsCacheData : public wxVector<wxHtmlCacheItem>
{
};
// Returns true if the given tag name is either SCRIPT or STYLE.
//
// The comparison is case-sensitive, so the name must already be in upper case
// for it to match.
bool wxIsCDATAElement(const wxChar *tag)
{
    if ( wxStrcmp(tag, wxT("SCRIPT")) == 0 )
        return true;

    return wxStrcmp(tag, wxT("STYLE")) == 0;
}
// Overload of the function above taking wxString: returns true if the tag
// name is exactly (case-sensitively) SCRIPT or STYLE.
bool wxIsCDATAElement(const wxString& tag)
{
    if ( wxStrcmp(tag.wx_str(), wxS("SCRIPT")) == 0 )
        return true;

    return wxStrcmp(tag.wx_str(), wxS("STYLE")) == 0;
}
// Builds the cache by scanning the entire source once.
//
// Every '<' which is followed by a '>' somewhere later in the input (and for
// which wxHtmlParser::SkipCommentTag() returns false) gets an entry in the
// cache, keyed by the position of this '<'. When an ending tag "</NAME>" is
// found, the closest preceding entry with the same name which doesn't have an
// ending tag yet is marked as Type_Normal and remembers the position of the
// ending tag.
//
// For SCRIPT and STYLE tags (see wxIsCDATAElement()), everything up to the
// corresponding ending tag is skipped without being scanned for tags.
//
// The source may be arbitrarily malformed: care is taken to never move an
// iterator beyond source.end().
wxHtmlTagsCache::wxHtmlTagsCache(const wxString& source)
{
    m_Cache = new wxHtmlTagsCacheData;
    m_CachePos = 0;

    // upper-cased name of the tag currently being examined, truncated if it
    // doesn't fit
    wxChar tagBuffer[256];

    const wxString::const_iterator end = source.end();
    for ( wxString::const_iterator pos = source.begin(); pos < end; ++pos )
    {
        if (*pos != wxT('<'))
            continue;

        // possible tag start found:

        // don't cache comment tags
        if ( wxHtmlParser::SkipCommentTag(pos, end) )
            continue;

        // Remember the starting tag position.
        wxString::const_iterator stpos = pos++;

        // And look for the ending one, copying the tag name (everything up
        // to the first whitespace or '>') to the buffer on the way.
        int i;
        for ( i = 0;
              pos < end && i < (int)WXSIZEOF(tagBuffer) - 1 &&
              *pos != wxT('>') && !wxIsspace(*pos);
              ++i, ++pos )
        {
            tagBuffer[i] = (wxChar)wxToupper(*pos);
        }
        tagBuffer[i] = wxT('\0');

        // skip the tag parameters, if any
        while (pos < end && *pos != wxT('>'))
            ++pos;

        if ( pos == end )
        {
            // We didn't find a closing bracket, this is not a valid tag after
            // all. Notice that we need to roll back pos to avoid creating an
            // invalid iterator when "++pos" is done in the loop statement.
            --pos;
            continue;
        }

        // We have a valid tag, add it to the cache.
        size_t tg = Cache().size();
        Cache().push_back(wxHtmlCacheItem());
        Cache()[tg].Key = stpos;
        Cache()[tg].Name = new wxChar[i+1];
        memcpy(Cache()[tg].Name, tagBuffer, (i+1)*sizeof(wxChar));

        if ((stpos+1) < end && *(stpos+1) == wxT('/')) // ending tag:
        {
            Cache()[tg].type = wxHtmlCacheItem::Type_EndingTag;

            // find matching begin tag: search backwards so that the closest
            // still unmatched tag wins (tagBuffer+1 skips the leading '/')
            for (i = tg; i >= 0; i--)
            {
                if ((Cache()[i].type == wxHtmlCacheItem::Type_NoMatchingEndingTag) && (wxStrcmp(Cache()[i].Name, tagBuffer+1) == 0))
                {
                    Cache()[i].type = wxHtmlCacheItem::Type_Normal;
                    Cache()[i].End1 = stpos;
                    Cache()[i].End2 = pos + 1;
                    break;
                }
            }
        }
        else
        {
            // assume there is no ending tag until we find one
            Cache()[tg].type = wxHtmlCacheItem::Type_NoMatchingEndingTag;

            if (wxIsCDATAElement(tagBuffer))
            {
                // store the orig pos in case we are missing the closing
                // tag (see below)
                const wxString::const_iterator old_pos = pos;
                bool foundCloseTag = false;

                // find next matching tag
                int tag_len = wxStrlen(tagBuffer);
                while (pos < end)
                {
                    // find the ending tag
                    while (pos + 1 < end &&
                           (*pos != '<' || *(pos+1) != '/'))
                        ++pos;
                    if (*pos == '<')
                        ++pos;

                    // see if it matches
                    int match_pos = 0;
                    while (pos < end && match_pos < tag_len )
                    {
                        wxChar c = *pos;
                        if ( c == '>' || c == '<' )
                            break;

                        // cast to wxChar needed to suppress warning in
                        // Unicode build
                        if ((wxChar)wxToupper(c) == tagBuffer[match_pos])
                        {
                            ++match_pos;
                        }
                        else if (c == wxT(' ') || c == wxT('\n') ||
                                 c == wxT('\r') || c == wxT('\t'))
                        {
                            // need to skip over these
                        }
                        else
                        {
                            match_pos = 0;
                        }
                        ++pos;
                    }

                    // found a match
                    if (match_pos == tag_len)
                    {
                        // go back to just before the '<' of the ending tag,
                        // the "++pos" of the main loop will then move to this
                        // '<' and the ending tag will be cached as usual
                        pos = pos - tag_len - 3;
                        foundCloseTag = true;
                        break;
                    }
                    else // keep looking for the closing tag
                    {
                        // We may have already reached the end of input here
                        // (e.g. if it ends with a lone '<'), in which case
                        // incrementing pos would create an invalid iterator
                        // past the end of the string, so just stop searching.
                        if ( pos == end )
                            break;

                        ++pos;
                    }
                }

                if (!foundCloseTag)
                {
                    // we didn't find closing tag; this means the markup
                    // is incorrect and the best thing we can do is to
                    // ignore the unclosed tag and continue parsing as if
                    // it didn't exist:
                    pos = old_pos;
                }
            }
        }
    }

    // ok, we're done, now we'll free .Name members of cache - we don't need it anymore:
    for ( wxHtmlTagsCacheData::iterator i = Cache().begin();
          i != Cache().end(); ++i )
    {
        wxDELETEA(i->Name);
    }
}
// Frees the cache entries storage allocated in the constructor.
wxHtmlTagsCache::~wxHtmlTagsCache()
{
delete m_Cache;
}
// Retrieves the information cached for the tag starting at the given position.
//
// at        - position of the '<' of the tag being queried
// inputEnd  - value returned in both end1 and end2 when the ending tag
//             position is not known
// end1      - filled with the position of the '<' of the ending tag
// end2      - filled with the position just after the '>' of the ending tag
// hasEnding - set to false if the tag was found in the cache but has no
//             matching ending tag, to true in all the other cases (including
//             when the tag wasn't found in the cache at all)
//
// The search starts from the entry found by the previous call and goes in the
// direction of "at" from there.
void wxHtmlTagsCache::QueryTag(const wxString::const_iterator& at,
                               const wxString::const_iterator& inputEnd,
                               wxString::const_iterator *end1,
                               wxString::const_iterator *end2,
                               bool *hasEnding)
{
    wxHtmlTagsCacheData& items = Cache();

    // Nothing was cached at all, so nothing to look for.
    if ( items.empty() )
    {
        *end2 = inputEnd;
        *end1 = inputEnd;
        *hasEnding = true;
        return;
    }

    // Move from the current entry towards the requested position until we
    // either find it or run out of entries.
    const int step = at < items[m_CachePos].Key ? -1 : 1;
    while ( items[m_CachePos].Key != at )
    {
        m_CachePos += step;

        const int count = (int)items.size();
        if ( m_CachePos < 0 || m_CachePos >= count )
        {
            // keep the current position valid for the next call
            m_CachePos = m_CachePos < 0 ? 0 : count - 1;

            // something is very wrong with HTML, give up by returning an
            // impossibly large value which is going to be ignored by the
            // caller
            *end2 = inputEnd;
            *end1 = inputEnd;
            *hasEnding = true;
            return;
        }
    }

    const wxHtmlCacheItem& found = items[m_CachePos];
    if ( found.type == wxHtmlCacheItem::Type_Normal )
    {
        *end1 = found.End1;
        *end2 = found.End2;
        *hasEnding = true;
    }
    else if ( found.type == wxHtmlCacheItem::Type_EndingTag ||
              found.type == wxHtmlCacheItem::Type_NoMatchingEndingTag )
    {
        if ( found.type == wxHtmlCacheItem::Type_EndingTag )
        {
            // this is not supposed to happen but, if it does, handle it in
            // the same way as a tag without the ending one, this is better
            // than crashing
            wxFAIL_MSG("QueryTag called for ending tag - can't be");
        }

        // If input HTML is invalid and there's no closing tag for this
        // one, pretend that it runs all the way to the end of input
        *end1 = inputEnd;
        *end2 = inputEnd;
        *hasEnding = false;
    }
}
//-----------------------------------------------------------------------------
// wxHtmlTag
//-----------------------------------------------------------------------------
// Creates the object representing the tag whose '<' is at the position pos.
//
// parent    - enclosing tag or NULL; if not NULL, the new tag is appended to
//             the end of the parent's list of children
// source    - the string being parsed; only its end() (and, in the 2.8
//             compatibility build, its begin()) is used here
// pos       - position of the '<' character starting this tag
// end_pos   - nothing is read at or beyond this position and the positions of
//             the ending tag are limited to it
// cache     - cache queried for the position of the matching ending tag; it
//             is dereferenced unconditionally and so must not be NULL
// entParser - if not NULL, all parameter values are passed through its
//             Parse() before being stored (presumably to replace the HTML
//             entities in them -- TODO confirm)
wxHtmlTag::wxHtmlTag(wxHtmlTag *parent,
const wxString *source,
const wxString::const_iterator& pos,
const wxString::const_iterator& end_pos,
wxHtmlTagsCache *cache,
wxHtmlEntitiesParser *entParser)
{
/* Setup DOM relations */
m_Next = NULL;
m_FirstChild = m_LastChild = NULL;
m_Parent = parent;
if (parent)
{
// append this tag to the end of the parent's children list
m_Prev = m_Parent->m_LastChild;
if (m_Prev == NULL)
m_Parent->m_FirstChild = this;
else
m_Prev->m_Next = this;
m_Parent->m_LastChild = this;
}
else
m_Prev = NULL;
/* Find parameters and their values: */
wxChar c wxDUMMY_INITIALIZE(0);
// fill-in name, params and begin pos:
wxString::const_iterator i(pos+1);
// find tag's name and convert it to uppercase:
// (only the letters a-z are converted; the character terminating the name is
// consumed by this loop too, so that i points just after it)
while ((i < end_pos) &&
((c = *(i++)) != wxT(' ') && c != wxT('\r') &&
c != wxT('\n') && c != wxT('\t') &&
c != wxT('>') && c != wxT('/')))
{
if ((c >= wxT('a')) && (c <= wxT('z')))
c -= (wxT('a') - wxT('A'));
m_Name << c;
}
// if the tag has parameters, read them and "normalize" them,
// i.e. convert to uppercase, replace whitespaces by spaces and
// remove whitespaces around '=':
//
// NOTE: only the unquoted values terminated by whitespace are actually
// converted to upper case below; the parameter names are stored as they
// are written in the source and an unquoted value terminated by '>' is
// stored unchanged as well.
if (*(i-1) != wxT('>'))
{
#define IS_WHITE(c) (c == wxT(' ') || c == wxT('\r') || \
c == wxT('\n') || c == wxT('\t'))
// name and value of the parameter currently being parsed
wxString pname, pvalue;
// the quote character the current value started with or 0 if the value is
// not quoted
wxChar quote;
enum
{
ST_BEFORE_NAME = 1, // skipping whitespace before a parameter name
ST_NAME, // inside a parameter name
ST_BEFORE_EQ, // after the name, '=' not seen yet
ST_BEFORE_VALUE, // after '=', skipping whitespace before the value
ST_VALUE // inside a (quoted or not) value
} state;
quote = 0;
state = ST_BEFORE_NAME;
while (i < end_pos)
{
c = *(i++);
// '>' ends the tag unless it occurs inside a quoted value
if (c == wxT('>') && !(state == ST_VALUE && quote != 0))
{
// store the parameter being parsed, if any, before stopping
if (state == ST_BEFORE_EQ || state == ST_NAME)
{
m_ParamNames.Add(pname);
m_ParamValues.Add(wxGetEmptyString());
}
else if (state == ST_VALUE && quote == 0)
{
m_ParamNames.Add(pname);
if (entParser)
m_ParamValues.Add(entParser->Parse(pvalue));
else
m_ParamValues.Add(pvalue);
}
break;
}
switch (state)
{
case ST_BEFORE_NAME:
if (!IS_WHITE(c))
{
pname = c;
state = ST_NAME;
}
break;
case ST_NAME:
if (IS_WHITE(c))
state = ST_BEFORE_EQ;
else if (c == wxT('='))
state = ST_BEFORE_VALUE;
else
pname << c;
break;
case ST_BEFORE_EQ:
if (c == wxT('='))
state = ST_BEFORE_VALUE;
else if (!IS_WHITE(c))
{
// the previous parameter had no value and a new one starts here
m_ParamNames.Add(pname);
m_ParamValues.Add(wxGetEmptyString());
pname = c;
state = ST_NAME;
}
break;
case ST_BEFORE_VALUE:
if (!IS_WHITE(c))
{
if (c == wxT('"') || c == wxT('\''))
quote = c, pvalue = wxGetEmptyString();
else
quote = 0, pvalue = c;
state = ST_VALUE;
}
break;
case ST_VALUE:
// a quoted value ends at the matching quote, an unquoted one at
// the first whitespace
if ((quote != 0 && c == quote) ||
(quote == 0 && IS_WHITE(c)))
{
m_ParamNames.Add(pname);
if (quote == 0)
{
// VS: backward compatibility, no real reason,
// but wxHTML code relies on this... :(
pvalue.MakeUpper();
}
if (entParser)
m_ParamValues.Add(entParser->Parse(pvalue));
else
m_ParamValues.Add(pvalue);
state = ST_BEFORE_NAME;
}
else
pvalue << c;
break;
}
}
#undef IS_WHITE
}
// i is now just after the '>' ending the opening tag, or at end_pos if no
// '>' was found
m_Begin = i;
// get the ending tag position from the cache but never let it go beyond
// the end of the range we were given
cache->QueryTag(pos, source->end(), &m_End1, &m_End2, &m_hasEnding);
if (m_End1 > end_pos) m_End1 = end_pos;
if (m_End2 > end_pos) m_End2 = end_pos;
#if WXWIN_COMPATIBILITY_2_8
m_sourceStart = source->begin();
#endif
// Try to parse any style parameters that can be handled simply by
// converting them to the equivalent HTML 3 attributes: this is a far cry
// from perfect but better than nothing.
static const struct EquivAttr
{
const char *style; // name of the style parameter
const char *attr; // name of the attribute it is converted to
} equivAttrs[] =
{
{ "text-align", "ALIGN" },
{ "width", "WIDTH" },
{ "vertical-align", "VALIGN" },
{ "background", "BGCOLOR" },
};
wxHtmlStyleParams styleParams(*this);
for ( unsigned n = 0; n < WXSIZEOF(equivAttrs); n++ )
{
const EquivAttr& ea = equivAttrs[n];
// an attribute specified explicitly in the tag is never overridden by
// the style parameter
if ( styleParams.HasParam(ea.style) && !HasParam(ea.attr) )
{
m_ParamNames.Add(ea.attr);
m_ParamValues.Add(styleParams.GetParam(ea.style));
}
}
}
// Deletes all children of this tag (which, in turn, delete their own
// children).
wxHtmlTag::~wxHtmlTag()
{
    for ( wxHtmlTag *child = m_FirstChild; child; )
    {
        // get the next sibling before the child is destroyed
        wxHtmlTag * const following = child->GetNextSibling();
        delete child;
        child = following;
    }
}
// Returns true if the tag has a parameter with the given name; the names are
// compared case-insensitively.
bool wxHtmlTag::HasParam(const wxString& par) const
{
    const int idx = m_ParamNames.Index(par, false);

    return idx != wxNOT_FOUND;
}
// Returns the value of the parameter with the given name (compared
// case-insensitively) or an empty string if the tag has no such parameter.
//
// If with_quotes is true, the value of an existing parameter is returned
// enclosed in double quotes.
wxString wxHtmlTag::GetParam(const wxString& par, bool with_quotes) const
{
    const int idx = m_ParamNames.Index(par, false);
    if ( idx == wxNOT_FOUND )
        return wxGetEmptyString();

    if ( !with_quotes )
        return m_ParamValues[idx];

    // VS: backward compatibility, seems to be never used by wxHTML...
    wxString quoted;
    quoted << wxT('"') << m_ParamValues[idx] << wxT('"');
    return quoted;
}
int wxHtmlTag::ScanParam(const wxString& par,
const char *format,
void *param) const
{
wxString parval = GetParam(par);
return wxSscanf(parval, format, param);
}
int wxHtmlTag::ScanParam(const wxString& par,
const wchar_t *format,
void *param) const
{
wxString parval = GetParam(par);
return wxSscanf(parval, format, param);
}
/* static */
bool wxHtmlTag::ParseAsColour(const wxString& str, wxColour *clr)
{
wxCHECK_MSG( clr, false, wxT("invalid colour argument") );
// handle colours defined in HTML 4.0 first:
if (str.length() > 1 && str[0] != wxT('#'))
{
#define HTML_COLOUR(name, r, g, b) \
if (str.IsSameAs(wxS(name), false)) \
{ clr->Set(r, g, b); return true; }
HTML_COLOUR("black", 0x00,0x00,0x00)
HTML_COLOUR("silver", 0xC0,0xC0,0xC0)
HTML_COLOUR("gray", 0x80,0x80,0x80)
HTML_COLOUR("white", 0xFF,0xFF,0xFF)
HTML_COLOUR("maroon", 0x80,0x00,0x00)
HTML_COLOUR("red", 0xFF,0x00,0x00)
HTML_COLOUR("purple", 0x80,0x00,0x80)
HTML_COLOUR("fuchsia", 0xFF,0x00,0xFF)
HTML_COLOUR("green", 0x00,0x80,0x00)
HTML_COLOUR("lime", 0x00,0xFF,0x00)
HTML_COLOUR("olive", 0x80,0x80,0x00)
HTML_COLOUR("yellow", 0xFF,0xFF,0x00)
HTML_COLOUR("navy", 0x00,0x00,0x80)
HTML_COLOUR("blue", 0x00,0x00,0xFF)
HTML_COLOUR("teal", 0x00,0x80,0x80)
HTML_COLOUR("aqua", 0x00,0xFF,0xFF)
#undef HTML_COLOUR
}
// then try to parse #rrggbb representations or set from other well
// known names (note that this doesn't strictly conform to HTML spec,
// but it doesn't do real harm -- but it *must* be done after the standard
// colors are handled above):
if (clr->Set(str))
return true;
return false;
}
bool wxHtmlTag::GetParamAsColour(const wxString& par, wxColour *clr) const
{
const wxString str = GetParam(par);
return !str.empty() && ParseAsColour(str, clr);
}
// Interprets the value of the parameter par as an integer: returns true and
// stores the number in *clr on success, returns false and leaves *clr
// unchanged if the parameter is absent or its value is not a number.
bool wxHtmlTag::GetParamAsInt(const wxString& par, int *clr) const
{
    long value;
    if ( !HasParam(par) || !GetParam(par).ToLong(&value) )
        return false;

    *clr = static_cast<int>(value);

    return true;
}
// Returns a string with all the parameters of the tag in the NAME="value"
// form, separated by single spaces.
//
// The values are enclosed in double quotes unless they contain a double quote
// themselves, in which case single quotes are used instead.
wxString wxHtmlTag::GetAllParams() const
{
    // VS: this function is for backward compatibility only,
    //     never used by wxHTML
    wxString s;
    const size_t cnt = m_ParamNames.GetCount();
    for (size_t i = 0; i < cnt; i++)
    {
        // separate the parameters from each other, otherwise they would run
        // together and the result couldn't be parsed back
        if ( i > 0 )
            s << wxT(' ');

        s << m_ParamNames[i];
        s << wxT('=');
        if (m_ParamValues[i].Find(wxT('"')) != wxNOT_FOUND)
            s << wxT('\'') << m_ParamValues[i] << wxT('\'');
        else
            s << wxT('"') << m_ParamValues[i] << wxT('"');
    }
    return s;
}
// Returns the first tag at the same level as this one (which may be this tag
// itself).
wxHtmlTag *wxHtmlTag::GetFirstSibling() const
{
    if ( m_Parent )
        return m_Parent->m_FirstChild;

    // there is no parent to ask, so follow the chain of previous siblings
    // until its beginning
    wxHtmlTag *tag = const_cast<wxHtmlTag *>(this);
    while ( tag->m_Prev )
        tag = tag->m_Prev;

    return tag;
}
// Returns the last tag at the same level as this one (which may be this tag
// itself).
wxHtmlTag *wxHtmlTag::GetLastSibling() const
{
    if ( m_Parent )
        return m_Parent->m_LastChild;

    // there is no parent to ask, so follow the chain of next siblings until
    // its end
    wxHtmlTag *tag = const_cast<wxHtmlTag *>(this);
    while ( tag->m_Next )
        tag = tag->m_Next;

    return tag;
}
// Returns the tag following this one in the depth-first traversal of the tags
// tree: the first child if there is one, otherwise the next sibling,
// otherwise the next sibling of the closest ancestor which has one. Returns
// NULL if there is no such tag.
wxHtmlTag *wxHtmlTag::GetNextTag() const
{
    if ( m_FirstChild )
        return m_FirstChild;

    if ( m_Next )
        return m_Next;

    // go up the tree until we find an ancestor with a next sibling
    for ( wxHtmlTag *ancestor = m_Parent; ancestor; ancestor = ancestor->m_Parent )
    {
        if ( ancestor->m_Next )
            return ancestor->m_Next;
    }

    return NULL;
}
#endif