fix the problem with parsing HTML comments (closes bug 1116708; based on patch 1168583)

git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@45336 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
Vadim Zeitlin
2007-04-08 21:02:03 +00:00
parent 132276cf0d
commit 4609ee2ef8
3 changed files with 72 additions and 27 deletions

View File

@@ -128,6 +128,13 @@ public:
// Returns entity parser object, used to substitute HTML &entities;
// (this simply hands back the m_entitiesParser member as-is)
wxHtmlEntitiesParser *GetEntitiesParser() const { return m_entitiesParser; }
// Returns true if the tag starting at the given position is a comment tag
// (i.e. it begins with "<!--"), false otherwise
//
// p should point to '<' character; end is the end of the text being parsed.
//
// If this is indeed a comment, p is modified to point to the closing '>' of
// the end comment tag ("--" optionally followed by white space and '>').
// If the comment is never closed before end, true is still returned but p
// is only advanced to the last '-' of the opening "<!--".
// If this is not a comment, p is left unchanged.
static bool
SkipCommentTag(wxString::const_iterator& p, wxString::const_iterator end);
protected:
// DOM structure
void CreateDOMTree();

View File

@@ -171,29 +171,11 @@ void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
wxHtmlTextPiece(textBeginning, i - textBeginning));
// if it is a comment, skip it:
if (i < end_pos-6 && m_Source.GetChar(i+1) == wxT('!') &&
m_Source.GetChar(i+2) == wxT('-') &&
m_Source.GetChar(i+3) == wxT('-'))
wxString::const_iterator iter = m_Source.begin() + i;
if ( SkipCommentTag(iter, m_Source.end()) )
{
// Comments begin with "<!--" and end with "--[ \t\r\n]*>"
// according to HTML 4.0
int dashes = 0;
i += 4;
while (i < end_pos)
{
c = m_Source.GetChar(i++);
if ((c == wxT(' ') || c == wxT('\n') ||
c == wxT('\r') || c == wxT('\t')) && dashes >= 2) {}
else if (c == wxT('>') && dashes >= 2)
{
textBeginning = i;
break;
}
else if (c == wxT('-'))
dashes++;
else
dashes = 0;
}
textBeginning =
i = iter - m_Source.begin() + 1; // skip closing '>' too
}
// add another tag to the tree:
@@ -951,4 +933,55 @@ wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
return charset;
}
#endif
// Checks whether the tag beginning at start is an HTML comment and, if it
// is, moves start past the comment body.
//
// Parameters:
//   start  in/out: must point to the '<' character opening the tag
//   end    end of the text being scanned; never dereferenced
//
// Returns false, leaving start unchanged, if the text at start does not
// begin with "<!--".
//
// Returns true otherwise, with start pointing
//   - to the closing '>' if the comment terminator was found before end, or
//   - to the last '-' of the opening "<!--" if the comment is unterminated
//     (only the opening delimiter of the broken markup is skipped).
/* static */
bool
wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
                             wxString::const_iterator end)
{
    wxASSERT_MSG( *start == '<', _T("should be called on the tag start") );

    wxString::const_iterator p = start;

    // comments begin with "<!--" in HTML 4.0
    //
    // the opening delimiter is 4 characters long, so at least 4 characters
    // must remain: with only 3 left (e.g. text ending in "<!-") the last
    // increment below would reach end and dereference it
    if ( end - p < 4 || *++p != '!' || *++p != '-' || *++p != '-' )
    {
        // not a comment at all
        return false;
    }

    // skip the start of the comment tag in any case, if we don't find the
    // closing tag we should ignore broken markup
    start = p;

    // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
    // comment delimiter and the closing tag character (section 3.2.4 of
    // http://www.w3.org/TR/html401/)
    int dashes = 0;
    while ( ++p < end )
    {
        const wxChar c = *p;

        if ( (c == wxT(' ') || c == wxT('\n') ||
              c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
        {
            // ignore white space before potential tag end: it must not
            // reset the count of dashes seen so far
            continue;
        }

        if ( c == wxT('>') && dashes >= 2 )
        {
            // found end of comment, leave start on the closing '>'
            start = p;
            break;
        }

        // count consecutive dashes, any other character breaks the run
        if ( c == wxT('-') )
            dashes++;
        else
            dashes = 0;
    }

    return true;
}
#endif // wxUSE_HTML

View File

@@ -68,11 +68,18 @@ wxHtmlTagsCache::wxHtmlTagsCache(const wxString& source)
m_CacheSize = 0;
m_CachePos = 0;
int pos = 0;
while (pos < lng)
for ( int pos = 0; pos < lng; pos++ )
{
if (src[pos] == wxT('<')) // tag found:
{
// don't cache comment tags
wxString::const_iterator iter = source.begin() + pos;
if ( wxHtmlParser::SkipCommentTag(iter, source.end()) )
{
pos = iter - source.begin();
continue;
}
if (m_CacheSize % CACHE_INCREMENT == 0)
m_Cache = (wxHtmlCacheItem*) realloc(m_Cache, (m_CacheSize + CACHE_INCREMENT) * sizeof(wxHtmlCacheItem));
int tg = m_CacheSize++;
@@ -169,8 +176,6 @@ wxHtmlTagsCache::wxHtmlTagsCache(const wxString& source)
}
}
}
pos++;
}
// ok, we're done, now we'll free .Name members of cache - we don't need it anymore: