fix the problem with parsing HTML comments (closes bug 1116708; based on patch 1168583)

git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@45336 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
2007-04-08 21:02:03 +00:00
parent 132276cf0d
commit 4609ee2ef8
3 changed files with 72 additions and 27 deletions
--- a/include/wx/html/htmlpars.h
+++ b/include/wx/html/htmlpars.h
@@ -128,6 +128,13 @@ public:
    // Returns entity parser object, used to substitute HTML &entities;
    wxHtmlEntitiesParser *GetEntitiesParser() const { return m_entitiesParser; }
    // Returns true if the tag starting at the given position is a comment tag
    //
    // p should point to '<' character and is modified to point to the closing
    // '>' of the end comment tag if this is indeed a comment
    //
    // If the text at p does not start with "<!--", false is returned and p is
    // left unchanged. If the comment is opened but never closed before end,
    // true is still returned and p is left on the last character of the
    // opening "<!--", so only the broken comment start is skipped.
    static bool
    SkipCommentTag(wxString::const_iterator& p, wxString::const_iterator end);
 protected:
    // DOM structure
    void CreateDOMTree();
--- a/src/html/htmlpars.cpp
+++ b/src/html/htmlpars.cpp
@@ -171,29 +171,11 @@ void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
                    wxHtmlTextPiece(textBeginning, i - textBeginning));
            // if it is a comment, skip it:
-            if (i < end_pos-6 && m_Source.GetChar(i+1) == wxT('!') &&
+            wxString::const_iterator iter = m_Source.begin() + i;
-                                 m_Source.GetChar(i+2) == wxT('-') &&
+            if ( SkipCommentTag(iter, m_Source.end()) )
                                 m_Source.GetChar(i+3) == wxT('-'))
            {
-                // Comments begin with "<!--" and end with "--[ \t\r\n]*>"
+                textBeginning =
-                // according to HTML 4.0
+                i = iter - m_Source.begin() + 1; // skip closing '>' too
                int dashes = 0;
                i += 4;
                while (i < end_pos)
                {
                    c = m_Source.GetChar(i++);
                    if ((c == wxT(' ') || c == wxT('\n') ||
                        c == wxT('\r') || c == wxT('\t')) && dashes >= 2) {}
                    else if (c == wxT('>') && dashes >= 2)
                    {
                        textBeginning = i;
                        break;
                    }
                    else if (c == wxT('-'))
                        dashes++;
                    else
                        dashes = 0;
                }
            }
            // add another tag to the tree:
@@ -951,4 +933,55 @@ wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
    return charset;
 }
-#endif
+/* static */
 // Checks whether the tag beginning at start is an HTML comment and, if so,
 // advances start past it.
 //
 // start must point to a '<' character inside [start, end). On return:
 //  - false: the text is not a comment ("<!--" not found); start is unchanged
 //  - true:  start points to the closing '>' of the comment or, if the
 //           comment is never terminated, to the last character of the
 //           opening "<!--" so that only the broken markup start is skipped
 bool
 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
                              wxString::const_iterator end)
 {
    wxASSERT_MSG( *start == '<', _T("should be called on the tag start") );
    wxString::const_iterator p = start;
    // comments begin with "<!--" in HTML 4.0: we need all 4 characters to be
    // present, otherwise the last "*++p" below would dereference end (e.g. for
    // a document ending with "<!-")
    if ( end - p < 4 || *++p != '!' || *++p != '-' || *++p != '-' )
    {
        // not a comment at all
        return false;
    }
    // skip the start of the comment tag in any case, if we don't find the
    // closing tag we should ignore broken markup
    start = p;
    // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
    // comment delimiter and the closing tag character (section 3.2.4 of
    // http://www.w3.org/TR/html401/)
    int dashes = 0;
    while ( ++p < end )
    {
        const wxChar c = *p;
        if ( (c == wxT(' ') || c == wxT('\n') ||
              c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
        {
            // ignore white space before potential tag end: it must not reset
            // the count of dashes seen so far
            continue;
        }
        if ( c == wxT('>') && dashes >= 2 )
        {
            // found end of comment
            start = p;
            break;
        }
        // count consecutive dashes, any other character breaks the sequence
        if ( c == wxT('-') )
            dashes++;
        else
            dashes = 0;
    }
    return true;
 }
 #endif // wxUSE_HTML
--- a/src/html/htmltag.cpp
+++ b/src/html/htmltag.cpp
@@ -68,11 +68,18 @@ wxHtmlTagsCache::wxHtmlTagsCache(const wxString& source)
    m_CacheSize = 0;
    m_CachePos = 0;
-    int pos = 0;
+    for ( int pos = 0; pos < lng; pos++ )
    while (pos < lng)
    {
        if (src[pos] == wxT('<'))   // tag found:
        {
            // don't cache comment tags
            wxString::const_iterator iter = source.begin() + pos;
            if ( wxHtmlParser::SkipCommentTag(iter, source.end()) )
            {
                pos = iter - source.begin();
                continue;
            }
            if (m_CacheSize % CACHE_INCREMENT == 0)
                m_Cache = (wxHtmlCacheItem*) realloc(m_Cache, (m_CacheSize + CACHE_INCREMENT) * sizeof(wxHtmlCacheItem));
            int tg = m_CacheSize++;
@@ -169,8 +176,6 @@ wxHtmlTagsCache::wxHtmlTagsCache(const wxString& source)
                }
            }
        }
        pos++;
    }
    // ok, we're done, now we'll free .Name members of cache - we don't need it anymore: