/*************************************************************************** * Copyright (C) 2004 by Paulo Moura Guedes * * moura@kdewebdev.org * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * ***************************************************************************/ #ifndef HTML_PARSER_H #define HTML_PARSER_H #include #include #include "mstring.h" #include "node.h" #include using namespace std; typedef unsigned int uint; class HtmlParser { public: HtmlParser(); HtmlParser(TQString const& documento); ~HtmlParser(); vector const& nodes() const; bool hasBaseUrl() const; bool hasTitle() const; bool hasContentType() const; NodeBASE const& baseUrl() const; NodeTITLE const& title() const; NodeMETA const& contentTypeMetaNode() const; static uint estimativaLinks(uint doc_size); /** * Convenience function for performance as it only parse in order * to get the charset. */ static TQString findCharsetInMetaElement(TQString const& html); // test: void mostra() const; private: vector const& parseNodesOfType(TQString const& element); /** * Vector nodes passed for performance. */ static void parseNodesOfType(TQString const& element, TQString const& doc, vector& nodes); void parseNodesOfTypeA(); void parseNodesOfTypeAREA(); void parseNodesOfTypeLINK(); void parseNodesOfTypeMETA(); void parseNodesOfTypeIMG(); void parseNodesOfTypeFRAME(); void parseNodesOfTypeIFRAME(); void parseNodesOfTypeBASE(); void parseNodesOfTypeTITLE(); void stripComments(); void stripScriptContent(); /** Return the index of the next character of the end of tag. e.g. endOfTag(" luck\">") => 22 (not 15) */ static int endOfTag(TQString const& s, int index = 0, TQChar end_of_tag = '>'); private: vector aux_; // for what the hell is this? looks ugly... maybe I was drunk, can't remember vector nodes_; NodeBASE node_BASE_; NodeTITLE node_TITLE_; NodeMETA node_META_content_type_; bool is_content_type_set_; TQString document_; TQString script_; // Fica aqui guardado (JavaScript, etc) TQString comments_; }; inline HtmlParser::~HtmlParser() { //kdDebug(23100) << "*"; } inline uint HtmlParser::estimativaLinks(uint doc_size) { return doc_size / 100; // valor estimado... } inline bool HtmlParser::hasContentType() const { return is_content_type_set_; } #endif