git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@27631 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
278 lines
8.4 KiB
C++
278 lines
8.4 KiB
C++
/////////////////////////////////////////////////////////////////////////////
|
|
// Name: htmlparser.h
|
|
// Purpose: Simple HTML parser
|
|
// Author: Julian Smart
|
|
// Modified by:
|
|
// Created: 2002-09-25
|
|
// RCS-ID: $Id$
|
|
// Copyright: (c) Julian Smart
|
|
// Licence: wxWindows license
|
|
/////////////////////////////////////////////////////////////////////////////
|
|
|
|
#ifndef _HTMLPARSER_H_
|
|
#define _HTMLPARSER_H_
|
|
|
|
#if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
|
|
#pragma interface "htmlparser.cpp"
|
|
#endif
|
|
|
|
//#include "wx/module.h"
|
|
#include "wx/stream.h"
|
|
|
|
/*
|
|
* wxSimpleHtmlAttribute
|
|
* Representation of an attribute
|
|
*/
|
|
|
|
class wxSimpleHtmlAttribute
|
|
{
|
|
friend class wxSimpleHtmlTag;
|
|
public:
|
|
wxSimpleHtmlAttribute(const wxString& name, const wxString& value)
|
|
{
|
|
m_name = name; m_value = value; m_next = NULL;
|
|
}
|
|
//// Operations
|
|
|
|
// Write this attribute
|
|
void Write(wxOutputStream& stream);
|
|
|
|
//// Accessors
|
|
const wxString& GetName() const { return m_name; }
|
|
const wxString& GetValue() const { return m_value; }
|
|
void SetName(const wxString& name) { m_name = name; }
|
|
void SetValue(const wxString& value) { m_value = value; }
|
|
|
|
wxSimpleHtmlAttribute* GetNextAttribute() { return m_next; }
|
|
void SetNextAttribute(wxSimpleHtmlAttribute* attr) { m_next = attr; }
|
|
|
|
bool HasName(const wxString& name) const { return (0 == m_name.CmpNoCase(name)); }
|
|
bool HasValue(const wxString& val) const { return (0 == m_value.CmpNoCase(val)); }
|
|
|
|
private:
|
|
wxString m_name;
|
|
wxString m_value;
|
|
wxSimpleHtmlAttribute* m_next;
|
|
};
|
|
|
|
|
|
/*
|
|
* wxSimpleHtmlTag
|
|
* Representation of a tag or chunk of text
|
|
*/
|
|
|
|
// Node types stored in wxSimpleHtmlTag (see GetType/SetType).
enum
{
    wxSimpleHtmlTag_Text,           // A chunk of plain text
    wxSimpleHtmlTag_TopLevel,
    wxSimpleHtmlTag_Open,
    wxSimpleHtmlTag_Close,
    wxSimpleHtmlTag_Directive,      // e.g. <!DOCTYPE ....>
    wxSimpleHtmlTag_XMLDeclaration  // e.g. <?xml .... ?>
};
|
|
|
|
class wxSimpleHtmlTag
|
|
{
|
|
public:
|
|
wxSimpleHtmlTag(const wxString& tagName, int tagType);
|
|
~wxSimpleHtmlTag();
|
|
|
|
//// Operations
|
|
void ClearAttributes();
|
|
wxSimpleHtmlAttribute* FindAttribute(const wxString& name) const ;
|
|
void AppendAttribute(const wxString& name, const wxString& value);
|
|
void ClearChildren();
|
|
// Remove 1 tag from the child list.
|
|
void RemoveChild(wxSimpleHtmlTag *remove);
|
|
// Appaned tag to the end of the child list.
|
|
void AppendTag(wxSimpleHtmlTag* tag);
|
|
// Insert tag after ourself in the parents child list.
|
|
void AppendTagAfterUs(wxSimpleHtmlTag* tag);
|
|
// Write this tag
|
|
void Write(wxOutputStream& stream);
|
|
|
|
// Gets the text from this tag and its descendants
|
|
wxString GetTagText();
|
|
|
|
//// Accessors
|
|
const wxString& GetName() const { return m_name; }
|
|
void SetName(const wxString& name) { m_name = name; }
|
|
|
|
int GetType() const { return m_type; }
|
|
void SetType(int t) { m_type = t; }
|
|
|
|
// If type is wxSimpleHtmlTag_Text, m_text will contain some text.
|
|
const wxString& GetText() const { return m_text; }
|
|
void SetText(const wxString& text) { m_text = text; }
|
|
|
|
wxSimpleHtmlAttribute* GetFirstAttribute() { return m_attributes; }
|
|
void SetFirstAttribute(wxSimpleHtmlAttribute* attr) { m_attributes = attr; }
|
|
|
|
int GetAttributeCount() const ;
|
|
wxSimpleHtmlAttribute* GetAttribute(int i) const ;
|
|
|
|
wxSimpleHtmlTag* GetChildren() const { return m_children; }
|
|
void SetChildren(wxSimpleHtmlTag* children) { m_children = children; }
|
|
|
|
wxSimpleHtmlTag* GetParent() const { return m_parent; }
|
|
void SetParent(wxSimpleHtmlTag* parent) { m_parent = parent; }
|
|
int GetChildCount() const;
|
|
wxSimpleHtmlTag* GetChild(int i) const;
|
|
wxSimpleHtmlTag* GetNext() const { return m_next; }
|
|
|
|
//// Convenience accessors & search functions
|
|
bool NameIs(const wxString& name) { return (m_name.CmpNoCase(name) == 0); }
|
|
bool HasAttribute(const wxString& name, const wxString& value) const;
|
|
bool HasAttribute(const wxString& name) const;
|
|
bool GetAttributeValue(wxString& value, const wxString& attrName);
|
|
|
|
// Search forward from this tag until we find a tag with this name & optionally attribute
|
|
wxSimpleHtmlTag* FindTag(const wxString& tagName, const wxString& attrName = wxEmptyString);
|
|
|
|
// Gather the text until we hit the given close tag
|
|
bool FindTextUntilTagClose(wxString& text, const wxString& tagName);
|
|
|
|
private:
|
|
wxString m_name;
|
|
int m_type;
|
|
wxString m_text;
|
|
wxSimpleHtmlAttribute* m_attributes;
|
|
|
|
// List of children
|
|
wxSimpleHtmlTag* m_children;
|
|
wxSimpleHtmlTag* m_next; // Next sibling
|
|
wxSimpleHtmlTag* m_parent;
|
|
};
|
|
|
|
/*
|
|
* wxSimpleHtmlParser
|
|
* Simple HTML parser, for such tasks as scanning HTML for keywords, contents, etc.
|
|
*/
|
|
|
|
class wxSimpleHtmlParser : public wxObject
|
|
{
|
|
|
|
public:
|
|
wxSimpleHtmlParser();
|
|
~wxSimpleHtmlParser();
|
|
|
|
//// Operations
|
|
bool ParseFile(const wxString& filename);
|
|
bool ParseString(const wxString& str);
|
|
void Clear();
|
|
// Write this file
|
|
void Write(wxOutputStream& stream);
|
|
bool WriteFile(wxString& filename);
|
|
|
|
//// Helpers
|
|
|
|
// Main recursive parsing function
|
|
bool ParseHtml(wxSimpleHtmlTag* parent);
|
|
|
|
wxSimpleHtmlTag* ParseTagHeader();
|
|
wxSimpleHtmlTag* ParseTagClose();
|
|
bool ParseAttributes(wxSimpleHtmlTag* tag);
|
|
wxSimpleHtmlTag* ParseDirective(); // e.g. <!DOCTYPE ....>
|
|
wxSimpleHtmlTag* ParseXMLDeclaration(); // e.g. <?xml .... ?>
|
|
bool ParseComment(); // Throw away comments
|
|
// Plain text, up until an angled bracket
|
|
bool ParseText(wxString& text);
|
|
|
|
bool EatWhitespace(); // Throw away whitespace
|
|
bool EatWhitespace(int& pos); // Throw away whitespace: using 'pos'
|
|
bool ReadString(wxString& str, bool eatIt = false);
|
|
bool ReadWord(wxString& str, bool eatIt = false);
|
|
bool ReadNumber(wxString& str, bool eatIt = false);
|
|
// Could be number, string, whatever, but read up until whitespace.
|
|
bool ReadLiteral(wxString& str, bool eatIt = false);
|
|
|
|
bool IsComment();
|
|
bool IsDirective();
|
|
bool IsXMLDeclaration();
|
|
bool IsString();
|
|
bool IsWord();
|
|
bool IsTagClose();
|
|
bool IsTagStartBracket(int ch);
|
|
bool IsTagEndBracket(int ch);
|
|
bool IsWhitespace(int ch);
|
|
bool IsAlpha(int ch);
|
|
bool IsWordChar(int ch);
|
|
bool IsNumeric(int ch);
|
|
// Check if a specific tag needs a close tag. If not this function should return false.
|
|
// If no close tag is needed the result will be that the tag will be insert in a none
|
|
// hierarchical way. i.e. if the function would return false all the time we would get
|
|
// a flat list of all tags (like it used to be previously).
|
|
virtual bool IsCloseTagNeeded(const wxString &name);
|
|
|
|
// Encode/Decode Special Characters like:
|
|
// > Begins a tag. >
|
|
// < Ends a tag. <
|
|
// " Quotation mark. "
|
|
// ' Apostrophe. '
|
|
// & Ampersand. &
|
|
static void DecodeSpecialChars(wxString &value);
|
|
static wxString EncodeSpecialChars(const wxString &value);
|
|
|
|
// Matches this string (case insensitive)
|
|
bool Matches(const wxString& tok, bool eatIt = false) ;
|
|
bool Eof() const { return (m_pos >= m_length); }
|
|
bool Eof(int pos) const { return (pos >= m_length); }
|
|
|
|
void SetPosition(int pos) { m_pos = pos; }
|
|
|
|
|
|
//// Accessors
|
|
wxSimpleHtmlTag* GetTopLevelTag() const { return m_topLevel; }
|
|
|
|
// Safe way of getting a character
|
|
int GetChar(size_t i) const;
|
|
|
|
private:
|
|
|
|
wxSimpleHtmlTag* m_topLevel;
|
|
int m_pos; // Position in string
|
|
int m_length; // Length of string
|
|
wxString m_text; // The actual text
|
|
|
|
};
|
|
|
|
/*
|
|
* wxSimpleHtmlTagSpec
|
|
* Describes a tag, and what type it is.
|
|
* wxSimpleHtmlModule will initialise/cleanup a list of these, one per tag type
|
|
*/
|
|
|
|
#if 0
|
|
class wxSimpleHtmlTagSpec : public wxObject
|
|
{
|
|
|
|
public:
|
|
wxSimpleHtmlTagSpec(const wxString& name, int type);
|
|
|
|
//// Operations
|
|
static void AddTagSpec(wxSimpleHtmlTagSpec* spec);
|
|
static void Clear();
|
|
|
|
//// Accessors
|
|
const wxString& GetName() const { return m_name; }
|
|
int GetType() const { return m_type; }
|
|
|
|
private:
|
|
|
|
wxString m_name;
|
|
int m_type;
|
|
|
|
static wxList* sm_tagSpecs;
|
|
};
|
|
|
|
/*
|
|
* wxSimpleHtmlModule
|
|
* Responsible for init/cleanup of appropriate data structures
|
|
*/
|
|
|
|
class wxSimpleHtmlModule : public wxModule
|
|
{
|
|
DECLARE_DYNAMIC_CLASS(wxSimpleHtmlModule)
|
|
|
|
public:
|
|
wxSimpleHtmlModule() {};
|
|
|
|
bool OnInit() ;
|
|
void OnExit() ;
|
|
};
|
|
#endif
|
|
|
|
#endif
|