///////////////////////////////////////////////////////////////////////////// // Name: htmlparser.cpp // Purpose: Simple HTML parser // Author: Julian Smart // Modified by: // Created: 2002-09-25 // RCS-ID: $Id$ // Copyright: (c) Julian Smart // Licence: wxWindows license ///////////////////////////////////////////////////////////////////////////// // ---------------------------------------------------------------------------- // headers // ---------------------------------------------------------------------------- #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA) #pragma implementation "htmlparser.h" #endif // For compilers that support precompilation, includes "wx/wx.h". #include "wx/wxprec.h" #ifdef __BORLANDC__ #pragma hdrstop #endif #include "wx/wfstream.h" #include "wx/textfile.h" #include "wx/txtstrm.h" #include "htmlparser.h" /// Useful insertion operators for wxOutputStream. static wxOutputStream& operator <<(wxOutputStream& stream, const wxString& s) { wxTextOutputStream txt(stream); // This is to make sure the line-ending is native! txt.WriteString(s); return stream; } #if 0 // Gives warning because not used... static wxOutputStream& operator <<(wxOutputStream& stream, long l) { wxString str; str.Printf("%ld", l); return stream << str; } static wxOutputStream& operator <<(wxOutputStream& stream, const char c) { wxString str; str.Printf("%c", c); return stream << str; } #endif // 0 /* * wxSimpleHtmlAttribute * Representation of an attribute */ wxSimpleHtmlParser::wxSimpleHtmlParser() { m_topLevel = NULL; m_pos = 0; } wxSimpleHtmlParser::~wxSimpleHtmlParser() { Clear(); } bool wxSimpleHtmlParser::ParseFile(const wxString& filename) { wxTextFile textFile; if (textFile.Open(filename)) { wxString text; wxString line; int i; int count = textFile.GetLineCount(); for (i = 0; i < count; i++) { if (i == 0) line = textFile.GetFirstLine(); else line = textFile.GetNextLine(); text += line; if (i != (count - 1)) text += wxT("\n"); } #if 0 for ( line = textFile.GetFirstLine(); !textFile.Eof(); line = textFile.GetNextLine() ) { text += line; if (!textFile.Eof()) text += wxT("\n"); } #endif return ParseString(text); } else return false; } bool wxSimpleHtmlParser::ParseString(const wxString& str) { Clear(); m_pos = 0; m_text = str; m_length = str.Length(); m_topLevel = new wxSimpleHtmlTag(wxT("TOPLEVEL"), wxSimpleHtmlTag_TopLevel); bool bResult = ParseHtml(m_topLevel); wxASSERT(bResult); // Failed to parse the TAGs. // Hint: Check if every open tag has a close tag! return bResult; } // Main recursive parsing function bool wxSimpleHtmlParser::ParseHtml(wxSimpleHtmlTag* parent) { if (!parent) return false; while (!Eof()) { EatWhitespace(); if (IsComment()) { ParseComment(); } else if (IsDirective()) { wxSimpleHtmlTag* tag = ParseDirective(); if (tag) parent->AppendTag(tag); } else if (IsXMLDeclaration()) { wxSimpleHtmlTag* tag = ParseXMLDeclaration(); if (tag) parent->AppendTag(tag); } else if (IsTagClose()) { wxSimpleHtmlTag* tag = ParseTagClose(); if (tag) { if (IsCloseTagNeeded(tag->GetName())) { if (!parent->GetParent()) return false; parent->GetParent()->AppendTag(tag); return true; } else parent->AppendTag(tag); } } else if (IsTagStartBracket(GetChar(m_pos))) { wxSimpleHtmlTag* tag = ParseTagHeader(); if (tag) parent->AppendTag(tag); if (IsCloseTagNeeded(tag->GetName())) { if (!ParseHtml(tag)) return false; // Something didn't go ok, so don't continue. } } else { // Just a text string wxString text; ParseText(text); wxSimpleHtmlTag* tag = new wxSimpleHtmlTag(wxT("TEXT"), wxSimpleHtmlTag_Text); tag->SetText(text); if(parent->GetParent()) parent->GetParent()->AppendTag(tag); else parent->AppendTag(tag); // When this occurs it is probably the // empty lines at the end of the file... } } return true; } // Plain text, up until an angled bracket bool wxSimpleHtmlParser::ParseText(wxString& text) { while (!Eof() && GetChar(m_pos) != wxT('<')) { text += (wxChar)GetChar(m_pos); m_pos ++; } DecodeSpecialChars(text); return true; } wxSimpleHtmlTag* wxSimpleHtmlParser::ParseTagHeader() { if (IsTagStartBracket(GetChar(m_pos))) { m_pos ++; EatWhitespace(); wxString word; ReadWord(word, true); EatWhitespace(); wxSimpleHtmlTag* tag = new wxSimpleHtmlTag(word, wxSimpleHtmlTag_Open); ParseAttributes(tag); EatWhitespace(); if (IsTagEndBracket(GetChar(m_pos))) m_pos ++; return tag; } else return NULL; } wxSimpleHtmlTag* wxSimpleHtmlParser::ParseTagClose() { Matches(wxT(" while (!IsTagEndBracket(GetChar(m_pos)) && !Eof()) { EatWhitespace(); wxString attrName, attrValue; if (IsString()) { ReadString(attrName, true); tag->AppendAttribute(attrName, wxEmptyString); } else if (IsNumeric(GetChar(m_pos))) { ReadNumber(attrName, true); tag->AppendAttribute(attrName, wxEmptyString); } else { // Try to read an attribute name/value pair, or at least a name // without the value ReadLiteral(attrName, true); EatWhitespace(); if (GetChar(m_pos) == wxT('=')) { m_pos ++; EatWhitespace(); if (IsString()) ReadString(attrValue, true); else if (!Eof() && !IsTagEndBracket(GetChar(m_pos))) ReadLiteral(attrValue, true); } if (!attrName.IsEmpty()) tag->AppendAttribute(attrName, attrValue); } } return true; } // e.g. wxSimpleHtmlTag* wxSimpleHtmlParser::ParseDirective() { Matches(wxT(" wxSimpleHtmlTag* wxSimpleHtmlParser::ParseXMLDeclaration() { Matches(wxT(""), true)) { m_pos ++; } return true; } bool wxSimpleHtmlParser::EatWhitespace() { while (!Eof() && IsWhitespace(GetChar(m_pos))) m_pos ++; return true; } bool wxSimpleHtmlParser::EatWhitespace(int& pos) { while (!Eof(pos) && IsWhitespace(GetChar(pos))) pos ++; return true; } bool wxSimpleHtmlParser::ReadString(wxString& str, bool eatIt) { int pos = m_pos; if (GetChar(pos) == (int) '"') { pos ++; while (!Eof(pos) && GetChar(pos) != (int) '"') { // TODO: how are quotes escaped in HTML? str += (wxChar) GetChar(pos); pos ++; } if (GetChar(pos) == (int) '"') pos ++; if (eatIt) m_pos = pos; DecodeSpecialChars(str); return true; } else return false; } bool wxSimpleHtmlParser::ReadWord(wxString& str, bool eatIt) { int pos = m_pos; if (!IsAlpha(GetChar(pos))) return false; str += (wxChar) GetChar(pos) ; pos ++; while (!Eof(pos) && IsWordChar(GetChar(pos))) { str += (wxChar) GetChar(pos); pos ++; } if (eatIt) m_pos = pos; DecodeSpecialChars(str); return true; } bool wxSimpleHtmlParser::ReadNumber(wxString& str, bool eatIt) { int pos = m_pos; if (!IsNumeric(GetChar(pos))) return false; str += (wxChar) GetChar(pos) ; pos ++; while (!Eof(pos) && IsNumeric(GetChar(pos))) { str += (wxChar) GetChar(pos); pos ++; } if (eatIt) m_pos = pos; DecodeSpecialChars(str); return true; } // Could be number, string, whatever, but read up until whitespace or end of tag (but not a quoted string) bool wxSimpleHtmlParser::ReadLiteral(wxString& str, bool eatIt) { int pos = m_pos; while (!Eof(pos) && !IsWhitespace(GetChar(pos)) && !IsTagEndBracket(GetChar(pos)) && GetChar(pos) != wxT('=')) { str += (wxChar)GetChar(pos); pos ++; } if (eatIt) m_pos = pos; DecodeSpecialChars(str); return true; } bool wxSimpleHtmlParser::IsComment() { return Matches(wxT("