From e9ae80694875f869892f13f4fcaf1170a00dea41 Mon Sep 17 00:00:00 2001 From: toma Date: Wed, 25 Nov 2009 17:56:58 +0000 Subject: Copy the KDE 3.5 branch to branches/trinity for new KDE 3.5 features. BUG:215923 git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdewebdev@1054174 283d02a7-25f6-0310-bc7c-ecb5cbfe19da --- klinkstatus/src/engine/Makefile.am | 9 + klinkstatus/src/engine/linkchecker.cpp | 703 +++++++++++++++++++++ klinkstatus/src/engine/linkchecker.h | 128 ++++ klinkstatus/src/engine/linkfilter.cpp | 46 ++ klinkstatus/src/engine/linkfilter.h | 49 ++ klinkstatus/src/engine/linkstatus.cpp | 214 +++++++ klinkstatus/src/engine/linkstatus.h | 187 ++++++ klinkstatus/src/engine/linkstatus_impl.h | 417 +++++++++++++ klinkstatus/src/engine/searchmanager.cpp | 916 ++++++++++++++++++++++++++++ klinkstatus/src/engine/searchmanager.h | 193 ++++++ klinkstatus/src/engine/searchmanager_impl.h | 158 +++++ 11 files changed, 3020 insertions(+) create mode 100644 klinkstatus/src/engine/Makefile.am create mode 100644 klinkstatus/src/engine/linkchecker.cpp create mode 100644 klinkstatus/src/engine/linkchecker.h create mode 100644 klinkstatus/src/engine/linkfilter.cpp create mode 100644 klinkstatus/src/engine/linkfilter.h create mode 100644 klinkstatus/src/engine/linkstatus.cpp create mode 100644 klinkstatus/src/engine/linkstatus.h create mode 100644 klinkstatus/src/engine/linkstatus_impl.h create mode 100644 klinkstatus/src/engine/searchmanager.cpp create mode 100644 klinkstatus/src/engine/searchmanager.h create mode 100644 klinkstatus/src/engine/searchmanager_impl.h (limited to 'klinkstatus/src/engine') diff --git a/klinkstatus/src/engine/Makefile.am b/klinkstatus/src/engine/Makefile.am new file mode 100644 index 00000000..1bd3ba88 --- /dev/null +++ b/klinkstatus/src/engine/Makefile.am @@ -0,0 +1,9 @@ +INCLUDES = -I$(top_srcdir)/src/ui $(all_includes) +METASOURCES = AUTO +noinst_HEADERS = linkchecker.h linkstatus.h linkstatus_impl.h searchmanager.h \ + 
searchmanager_impl.h linkfilter.h +libengine_la_LDFLAGS = $(all_libraries) +noinst_LTLIBRARIES = libengine.la +libengine_la_SOURCES = linkchecker.cpp linkstatus.cpp searchmanager.cpp \ + linkfilter.cpp +libengine_la_LIBADD = $(LIB_KHTML) diff --git a/klinkstatus/src/engine/linkchecker.cpp b/klinkstatus/src/engine/linkchecker.cpp new file mode 100644 index 00000000..bcc503ad --- /dev/null +++ b/klinkstatus/src/engine/linkchecker.cpp @@ -0,0 +1,703 @@ +/*************************************************************************** + * Copyright (C) 2004 by Puto Moura * + * mojo@localhost.localdomain * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + ***************************************************************************/ +#include "linkchecker.h" +#include "searchmanager.h" +#include "../utils/utils.h" +#include "../parser/htmlparser.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +
// NOTE(review): the sixteen bare "#include" directives in the patch text above
// lost their header names when this patch was extracted; restore them from the
// upstream file before compiling.

/** Debug counter, incremented once for every LinkChecker that is constructed. */
int LinkChecker::count_ = 0;

/**
 * Creates a checker for a single link.
 *
 * @param linkstatus the link to check; expected to be non-null and not yet checked
 * @param time_out   timeout in seconds (multiplied by 1000 for the timer in check())
 * @param parent     QObject parent
 * @param name       QObject name
 */
LinkChecker::LinkChecker(LinkStatus* linkstatus, int time_out,
                         QObject *parent, const char *name)
        : QObject(parent, name), search_manager_(0),
        linkstatus_(linkstatus), t_job_(0), time_out_(time_out), checker_(0), document_charset_(),
        redirection_(false), header_checked_(false), finnished_(false),
        parsing_(false), is_charset_checked_(false), has_defined_charset_(false)
{
    Q_ASSERT(linkstatus_);
    Q_ASSERT(!linkstatus_->checked());

    kdDebug(23100) << endl << ++count_ << ": " << "Checking " << linkstatus_->absoluteUrl().url() << endl;
}

LinkChecker::~LinkChecker()
{}

/** Stores the search manager that check(), checkRef() and processRedirection() consult. */
void LinkChecker::setSearchManager(SearchManager* search_manager)
{
    Q_ASSERT(search_manager);
    search_manager_ = search_manager;
}

/**
 * Starts checking the link.
 *
 * URLs that carry a fragment and whose mime type is text/html or
 * application/xml are resolved by checkRef(). Everything else is fetched
 * with a KIO transfer job whose signals are routed to the slot* methods,
 * and a single-shot timer is armed for slotTimeOut().
 */
void LinkChecker::check()
{
    Q_ASSERT(!finnished_);

    KURL url(linkStatus()->absoluteUrl());
    Q_ASSERT(url.isValid());

    if(url.hasRef())
    {
        KMimeType::Ptr mimeType = KMimeType::findByURL(url);
        if(mimeType->is("text/html") || mimeType->is("application/xml"))
        {
            checkRef();
            return;
        }
    }

    t_job_ = KIO::get(url, false, false);

    t_job_->addMetaData("PropagateHttpHeader", "true"); // to have the http header

    if(linkstatus_->parent())
        t_job_->addMetaData("referrer", linkstatus_->parent()->absoluteUrl().prettyURL());

    // The other users of search_manager_ assert it; do the same here and fall
    // back to "no identification" instead of dereferencing a null pointer.
    Q_ASSERT(search_manager_);
    if(search_manager_ && search_manager_->sendIdentification())
    {
        t_job_->addMetaData("SendUserAgent", "true");
        t_job_->addMetaData("UserAgent", search_manager_->userAgent());
    }
    else
        t_job_->addMetaData("SendUserAgent", "false");

    QObject::connect(t_job_, SIGNAL(data(KIO::Job *, const QByteArray &)),
                     this, SLOT(slotData(KIO::Job *, const QByteArray &)));
    QObject::connect(t_job_, SIGNAL(mimetype(KIO::Job *, const QString &)),
                     this, SLOT(slotMimetype(KIO::Job *, const QString &)));
    QObject::connect(t_job_, SIGNAL(result(KIO::Job *)),
                     this, SLOT(slotResult(KIO::Job *)));
    QObject::connect(t_job_, SIGNAL(redirection(KIO::Job *, const KURL &)),
                     this, SLOT(slotRedirection(KIO::Job *, const KURL &)));

    QTimer::singleShot( time_out_ * 1000, this, SLOT(slotTimeOut()) );

    t_job_->setInteractive(false);
}

/**
 * Timer slot: marks the link as timed out unless the check already finished
 * or the document is currently being parsed.
 */
void LinkChecker::slotTimeOut()
{
    if(finnished_ || parsing_)
        return;

    kdDebug(23100) << "timeout: " << linkstatus_->absoluteUrl().url() << endl;
    if(t_job_ && t_job_->slave())
        kdDebug(23100) << " - " << t_job_->slave() << "/" << t_job_->slave()->slave_pid() << endl;
    else
        kdDebug(23100) << endl;

    // t_job_ can legitimately be null here (e.g. after a bad result signal),
    // so it must not be dereferenced unconditionally. Without a job there is
    // nothing that could have been cancelled by the user: report the timeout.
    if(!t_job_ || t_job_->error() != KIO::ERR_USER_CANCELED)
    {
        linkstatus_->setErrorOccurred(true);
        linkstatus_->setChecked(true);
        linkstatus_->setError(i18n("Timeout"));
        linkstatus_->setStatus(LinkStatus::TIMEOUT);

        killJob(); // no-op when there is no job
        finnish();
    }
}

/**
 * Records the mime type reported by the job.
 *
 * Non-http(s) links are finished here as successful: always when only the
 * header is wanted, otherwise only when the content is not text/html.
 * http(s) links are left alone because their header is read in slotData().
 */
void LinkChecker::slotMimetype (KIO::Job* /*job*/, const QString &type)
{
    if(finnished_)
        return;

    Q_ASSERT(t_job_);
    if(!t_job_) // same guard as slotResult()
        return;

    LinkStatus* ls = linkstatus_;
    Q_ASSERT(ls);

    ls->setMimeType(type);
    KURL url = ls->absoluteUrl();

    // if an error happened let slotResult() handle it
    if(t_job_->error())
        return;

    // http and https need the header, which is only available in the data
    // response (an http job can deliver an error page although error() is false)
    if(url.protocol().startsWith("http"))
        return;

    // the file is OK; when the content is wanted, only html pages are downloaded
    if(ls->onlyCheckHeader() || type != "text/html"/* && type != "text/plain"*/)
    {
        ls->setStatusText("OK");
        ls->setStatus(LinkStatus::SUCCESSFULL);

        killJob();
        finnish();
    }
}

/**
 * Receives a chunk of data from the job.
 *
 * For http(s) links the response header is read first. Header-only checks,
 * non-html content and error pages end the check here; otherwise the chunk
 * is decoded and appended to doc_html_.
 */
void LinkChecker::slotData(KIO::Job* /*job*/, const QByteArray& data)
{
    if(finnished_)
        return;

    Q_ASSERT(t_job_);
    if(!t_job_) // same guard as slotResult()
        return;

    // slave() may be null (slotTimeOut() checks for that as well)
    if(t_job_->slave())
        kdDebug(23100) << "LinkChecker::slotData -> " << linkstatus_->absoluteUrl().url()
        << " - " << t_job_->slave() << "/" << t_job_->slave()->slave_pid() << endl;
    else
        kdDebug(23100) << "LinkChecker::slotData -> " << linkstatus_->absoluteUrl().url() << endl;

    LinkStatus* ls = linkstatus_;
    Q_ASSERT(ls);

    KURL url = ls->absoluteUrl();

    if(t_job_->error())
        return;

    if(ls->onlyCheckHeader())
    {
        Q_ASSERT(header_checked_ == false);
        // any other protocol should have been finished in slotMimetype()
        Q_ASSERT(url.protocol() == "http" || url.protocol() == "https");

        // get the header and quit
        if(url.protocol().startsWith("http"))
        {
            ls->setHttpHeader(getHttpHeader(t_job_));

            if(t_job_->isErrorPage())
                ls->setIsErrorPage(true);

            if(header_checked_)
            {
                killJob();
                linkstatus_->setStatus(getHttpStatus());
                linkstatus_->setChecked(true);
                finnish();
            }
        }
        return;
    }

    if(url.protocol().startsWith("http"))
    {
        if(!header_checked_)
            ls->setHttpHeader(getHttpHeader(t_job_));

        if(ls->mimeType() != "text/html" && header_checked_)
        {
            ls->setStatus(getHttpStatus());
            killJob();
            finnish(); // the original author notes a segfault if finnish() is called before the kill
            return;
        }
        else if(t_job_->isErrorPage() && header_checked_)
        {
            ls->setIsErrorPage(true);
            ls->setStatus(getHttpStatus());
            killJob();
            finnish();
            return;
        }
    }
    else
    {
        Q_ASSERT(ls->mimeType() == "text/html");
    }

    if(!is_charset_checked_)
        findDocumentCharset(data);

    QTextCodec* codec = 0;
    if(has_defined_charset_)
        codec = QTextCodec::codecForName(document_charset_);
    if(!codec)
        codec = QTextCodec::codecForName("iso8859-1"); // default

    // NOTE(review): each chunk is decoded on its own; a multi-byte character
    // split across two chunks would presumably be garbled - confirm.
    if(codec)
        doc_html_ += codec->toUnicode(data);
    else // codec lookup failed: decode as Latin-1 rather than dereference null
        doc_html_ += QString::fromLatin1(data.data(), data.size());
}

/**
 * Determines the document charset once, from the HTTP header when it has
 * been read, otherwise from the meta elements of the first chunk of data.
 */
void LinkChecker::findDocumentCharset(QString const& doc)
{
    Q_ASSERT(!is_charset_checked_);

    is_charset_checked_ = true; // only check the first stream of data

    if(header_checked_)
        document_charset_ = linkstatus_->httpHeader().charset();

    // try to look in the meta elements
    if(document_charset_.isNull() || document_charset_.isEmpty())
        document_charset_ = HtmlParser::findCharsetInMetaElement(doc);

    if(!document_charset_.isNull() && !document_charset_.isEmpty())
        has_defined_charset_ = true;
}

/**
 * Result slot of the job. According to the original author it is only
 * reached if an error happened, or for a clean html page when
 * onlyCheckHeader is false.
 *
 * Handles a pending redirection, translates job errors into link statuses
 * and, when html was collected, parses it and stores base URL, title and
 * child nodes on the link.
 */
void LinkChecker::slotResult(KIO::Job* /*job*/)
{
    if(finnished_)
        return;

    kdDebug(23100) << "LinkChecker::slotResult -> " << linkstatus_->absoluteUrl().url() << endl;

    Q_ASSERT(t_job_);
    if(!t_job_)
        return;

    if(redirection_ && !processRedirection(redirection_url_))
    {
        t_job_ = 0;
        linkstatus_->setChecked(true);
        finnish();
        return;
    }

    KIO::TransferJob* job = t_job_;
    t_job_ = 0;

    emit jobFinnished(this);

    if(job->error() == KIO::ERR_USER_CANCELED)
    {
        // FIXME This can happen! If the job is non interactive...
        kdWarning(23100) << endl << "Job killed quietly, yet signal result was emited..." << endl;
        kdDebug(23100) << linkstatus_->toString() << endl;
        finnish();
        return;
    }

    LinkStatus* ls = 0;
    if(redirection_)
        ls = linkStatus()->redirection();
    else
        ls = linkstatus_;
    Q_ASSERT(ls);

    if(!(!ls->onlyCheckHeader() ||
            job->error() ||
            !header_checked_))
        kdWarning(23100) << ls->toString() << endl;

    Q_ASSERT(!ls->onlyCheckHeader() || job->error() || !header_checked_);

    if(ls->isErrorPage())
        kdWarning(23100) << "\n\n" << ls->toString() << endl << endl;

    Q_ASSERT(!job->isErrorPage());

    if(job->error())
    {
        kdDebug(23100) << "Job error: " << job->errorString() << endl;
        kdDebug(23100) << "Job error code: " << job->error() << endl;

        if(job->error() == KIO::ERR_IS_DIRECTORY)
        {
            ls->setStatusText("OK");
            ls->setStatus(LinkStatus::SUCCESSFULL);
        }
        else
        {
            ls->setErrorOccurred(true);
            if(job->error() == KIO::ERR_SERVER_TIMEOUT)
                ls->setStatus(LinkStatus::TIMEOUT);
            else
                ls->setStatus(LinkStatus::BROKEN);

            if(job->errorString().isEmpty())
                kdWarning(23100) << "\n\nError string is empty, error = " << job->error() << "\n\n\n";
            if(job->error() != KIO::ERR_NO_CONTENT)
                ls->setError(job->errorString());
            else
                ls->setError(i18n("No Content"));
        }
    }
    else
    {
        if(!ls->absoluteUrl().protocol().startsWith("http"))
        {
            ls->setStatusText("OK");
            ls->setStatus(LinkStatus::SUCCESSFULL);
        }
        else
        {
            if(!header_checked_)
            {
                // NOTE(review): nothing visible here bounds the number of
                // retries - confirm this cannot loop forever.
                kdDebug(23100) << "\n\nheader not received... checking again...\n\n\n";
                //check again
                check();
                return;
            }
            Q_ASSERT(header_checked_);

            ls->setStatus(getHttpStatus());
        }

        if(!doc_html_.isNull() && !doc_html_.isEmpty())
        {
            ls->setDocHtml(doc_html_);

            // parsing_ keeps slotTimeOut() from firing a timeout meanwhile
            parsing_ = true;
            HtmlParser parser(doc_html_);

            if(parser.hasBaseUrl())
                ls->setBaseURI(KURL(parser.baseUrl().url()));
            if(parser.hasTitle())
                ls->setHtmlDocTitle(parser.title().attributeTITLE());

            ls->setChildrenNodes(parser.nodes());
            parsing_ = false;
        }
    }
    finnish();
}


/** Remembers that the job was redirected and to which URL; processed in slotResult(). */
void LinkChecker::slotRedirection (KIO::Job* /*job*/, const KURL &url)
{
    kdDebug(23100) << "LinkChecker::slotRedirection -> " <<
    linkstatus_->absoluteUrl().url() << " -> " << url.url() << endl;

    redirection_ = true;
    redirection_url_ = url;
}

/**
 * Marks the link as an HTTP redirection and attaches a new LinkStatus for
 * the target URL to it.
 *
 * @param toUrl the redirection target
 * @return false if toUrl is invalid or the search manager already knows it,
 *         true otherwise (and also when the check is already finished)
 */
bool LinkChecker::processRedirection(KURL const& toUrl)
{
    if(finnished_)
        return true;

    kdDebug(23100) << "LinkChecker::processRedirection -> " << linkstatus_->absoluteUrl().url() << " -> " << toUrl.url() << endl;

    Q_ASSERT(t_job_);
    Q_ASSERT(linkstatus_->absoluteUrl().protocol().startsWith("http"));
    Q_ASSERT(redirection_);

    linkstatus_->setHttpHeader(getHttpHeader(t_job_, false));
    linkstatus_->setIsRedirection(true);
    linkstatus_->setStatusText("redirection");
    linkstatus_->setStatus(LinkStatus::HTTP_REDIRECTION);
    linkstatus_->setChecked(true);

    // the target starts out as a copy of the redirecting link
    LinkStatus* ls_red = new LinkStatus(*linkstatus_);
    ls_red->setAbsoluteUrl(toUrl);
    ls_red->setRootUrl(linkstatus_->rootUrl());

    if(!linkstatus_->onlyCheckHeader())
        ls_red->setOnlyCheckHeader(false);

    linkstatus_->setRedirection(ls_red);
    ls_red->setParent(linkstatus_);
    ls_red->setOriginalUrl(toUrl.url());

    Q_ASSERT(search_manager_);

    if(search_manager_->localDomain(ls_red->absoluteUrl()))
        ls_red->setExternalDomainDepth(-1);
    else
    {
        if(search_manager_->localDomain(linkstatus_->absoluteUrl()))
            ls_red->setExternalDomainDepth(linkstatus_->externalDomainDepth() + 1);
        else
            ls_red->setExternalDomainDepth(linkstatus_->externalDomainDepth());
    }

    if(!toUrl.isValid() || search_manager_->existUrl(toUrl, linkstatus_->absoluteUrl()))
    {
        ls_red->setChecked(false);
        return false;
    }

    ls_red->setChecked(true);
    return true;
}

/**
 * Ends the check (at most once): marks the link as checked and emits
 * transactionFinished(). The job must already be released (t_job_ == 0).
 */
void LinkChecker::finnish()
{
    Q_ASSERT(!t_job_);

    if(finnished_)
        return;

    kdDebug(23100) << "LinkChecker::finnish -> " << linkstatus_->absoluteUrl().url() << endl;

    finnished_ = true;

    if(redirection_)
        Q_ASSERT(linkstatus_->checked());
    else
        linkstatus_->setChecked(true);

    emit transactionFinished(linkstatus_, this);
}

/**
 * Builds the HTTP response header from the "HTTP-Headers" meta data of the
 * current job.
 *
 * @param remember_check when true, a non-empty header sets header_checked_;
 *                       an empty header always clears header_checked_
 * @return the parsed header (built from an empty string if none was received)
 */
HttpResponseHeader LinkChecker::getHttpHeader(KIO::Job* /*job*/, bool remember_check)
{
    Q_ASSERT(!finnished_);
    Q_ASSERT(t_job_);

    QString header_string = t_job_->queryMetaData("HTTP-Headers");

    if(header_string.isNull() || header_string.isEmpty())
    {
        header_checked_ = false;
        kdWarning(23100) << "header_string.isNull() || header_string.isEmpty(): "
        << linkstatus_->toString() << endl;
    }
    else if(remember_check)
        header_checked_ = true;

    return HttpResponseHeader(header_string);
}

/**
 * Checks a link that carries a fragment ("#...").
 *
 * An empty fragment and "top" are accepted immediately. Otherwise the
 * document that should contain the anchor is looked up (the parent link for
 * "#..." links, else the search manager's entry for the URL without the
 * fragment); if none is known, the document is fetched by checkRef(KURL).
 */
void LinkChecker::checkRef()
{
    KURL url(linkStatus()->absoluteUrl());
    Q_ASSERT(url.hasRef());

    QString ref = url.ref();
    if(ref == "" || ref == "top")
    {
        linkstatus_->setStatusText("OK");
        linkstatus_->setStatus(LinkStatus::SUCCESSFULL);
        finnish();
        return;
    }

    // Strip the fragment up front so that every path below works on the
    // document URL. Previously the index was only computed in one branch, so
    // the download fallback of the other branch kept the fragment.
    const QString full_url = url.url();
    const int i_ref = full_url.find("#");
    const QString url_base = (i_ref == -1) ? full_url : full_url.left(i_ref);
    //kdDebug(23100) << "url_base: " << url_base << endl;

    LinkStatus const* ls_parent = 0;

    if(linkStatus()->originalUrl().startsWith("#"))
        ls_parent = linkStatus()->parent();
    else
    {
        Q_ASSERT(search_manager_);
        ls_parent = search_manager_->linkStatus(url_base);
    }

    if(ls_parent)
        checkRef(ls_parent);
    else
        checkRef(KURL::fromPathOrURL(url_base));
}

/**
 * Looks for the link's anchor in the document at @p url. The document is
 * downloaded and loaded into a KHTMLPart unless the search manager already
 * holds a part for that URL; a newly created part is handed to the search
 * manager. Sets the link's status and finishes the check.
 */
void LinkChecker::checkRef(KURL const& url)
{
    Q_ASSERT(search_manager_);

    QString url_string = url.url();
    KHTMLPart* html_part = search_manager_->htmlPart(url_string);
    if(!html_part)
    {
        kdDebug(23100) << "new KHTMLPart: " + url_string << endl;

        html_part = new KHTMLPart();
        html_part->setOnlyLocalReferences(true);

        QString tmpFile;
        if(KIO::NetAccess::download(url, tmpFile, 0))
        {
            QString doc_html = FileManager::read(tmpFile);
            html_part->begin();
            html_part->write(doc_html);
            html_part->end();

            KIO::NetAccess::removeTempFile(tmpFile);
        }
        else
        {
            kdDebug(23100) << KIO::NetAccess::lastErrorString() << endl;
        }

        search_manager_->addHtmlPart(url_string, html_part);
    }

    if(hasAnchor(html_part, linkStatus()->absoluteUrl().ref()))
    {
        linkstatus_->setStatusText("OK");
        linkstatus_->setStatus(LinkStatus::SUCCESSFULL);
    }
    else
    {
        linkstatus_->setErrorOccurred(true);
        linkstatus_->setError(i18n( "Link destination not found." ));
        linkstatus_->setStatus(LinkStatus::BROKEN);
    }

    finnish();
}

/**
 * Looks for the link's anchor in the html already stored on
 * @p linkstatus_parent. The KHTMLPart is taken from the search manager or
 * built from the parent's docHtml() and then handed to the search manager.
 * Sets the link's status and finishes the check.
 */
void LinkChecker::checkRef(LinkStatus const* linkstatus_parent)
{
    Q_ASSERT(search_manager_);

    QString url_string = linkstatus_parent->absoluteUrl().url();
    KHTMLPart* html_part = search_manager_->htmlPart(url_string);
    if(!html_part)
    {
        kdDebug(23100) << "new KHTMLPart: " + url_string << endl;

        html_part = new KHTMLPart();
        html_part->setOnlyLocalReferences(true);

        html_part->begin();
        html_part->write(linkstatus_parent->docHtml());
        html_part->end();

        search_manager_->addHtmlPart(url_string, html_part);
    }

    if(hasAnchor(html_part, linkStatus()->absoluteUrl().ref()))
    {
        linkstatus_->setStatusText("OK");
        linkstatus_->setStatus(LinkStatus::SUCCESSFULL);
    }
    else
    {
        linkstatus_->setErrorOccurred(true);
        linkstatus_->setError(i18n( "Link destination not found." ));
        linkstatus_->setStatus(LinkStatus::BROKEN);
    }

    finnish();
}

/**
 * @return true if the document of @p html_part has an anchor named
 *         @p anchor or, failing that, an element whose id is @p anchor
 */
bool LinkChecker::hasAnchor(KHTMLPart* html_part, QString const& anchor)
{
    DOM::HTMLDocument htmlDocument = html_part->htmlDocument();
    DOM::HTMLCollection anchors = htmlDocument.anchors();

    DOM::DOMString name_ref(anchor);
    Q_ASSERT(!name_ref.isNull());

    DOM::Node node = anchors.namedItem(name_ref);
    if(node.isNull())
        node = htmlDocument.getElementById(name_ref);

    return !node.isNull();
}

/** Disconnects from the current job, if any, and kills it quietly. */
void LinkChecker::killJob()
{
    if(!t_job_)
        return;

    KIO::TransferJob* aux = t_job_;
    t_job_ = 0; // cleared first, finnish() asserts that no job is left
    aux->disconnect(this);
    aux->kill(true); // quietly
}

/**
 * Maps the status code of the link's HTTP header to a LinkStatus::Status.
 *
 * The class is taken from the hundreds digit of the numeric code, so only
 * 200-299, 300-399, 400-499 and 500-599 are classified; everything else is
 * UNDETERMINED. (Classifying by the first character of the printed number
 * would also accept codes such as 20 or 2000.)
 */
LinkStatus::Status LinkChecker::getHttpStatus() const
{
    switch(linkstatus_->httpHeader().statusCode() / 100)
    {
    case 2:
        return LinkStatus::SUCCESSFULL;
    case 3:
        return LinkStatus::HTTP_REDIRECTION;
    case 4:
        return LinkStatus::HTTP_CLIENT_ERROR;
    case 5:
        return LinkStatus::HTTP_SERVER_ERROR;
    default:
        return LinkStatus::UNDETERMINED;
    }
}

#include "linkchecker.moc"
diff --git a/klinkstatus/src/engine/linkchecker.h b/klinkstatus/src/engine/linkchecker.h new file mode 100644 index 00000000..a992e5fd --- /dev/null +++ b/klinkstatus/src/engine/linkchecker.h @@ -0,0 +1,128 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * moura@kdewebdev.org * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + ***************************************************************************/ +#ifndef LINKCHECKER_H +#define LINKCHECKER_H + +#include +#include +#include + +#include +class KHTMLPart; + +#include "../parser/http.h" +#include "linkstatus.h" +class SearchManager; + +#include +using namespace std; + +/** +@author Paulo Moura Guedes +*/ +class LinkChecker : public QObject +{ + Q_OBJECT +public: + LinkChecker(LinkStatus* linkstatus, int time_out = 50, + QObject *parent = 0, const char *name = 0); + ~LinkChecker(); + + //virtual void run(); + void check(); + void setSearchManager(SearchManager* search_manager); + + LinkStatus const* linkStatus() const; + + static bool hasAnchor(KHTMLPart* html_part, QString const& anchor); + +signals: + + void transactionFinished(const LinkStatus * linkstatus, + LinkChecker * checker); + void jobFinnished(LinkChecker * checker); + +protected slots: + + void slotData(KIO::Job *, const QByteArray &data); + void slotRedirection (KIO::Job *, const KURL &url); + void slotMimetype(KIO::Job *, const QString &type); + void slotResult(KIO::Job* job); + void slotTimeOut(); + +protected: + + void finnish(); + HttpResponseHeader getHttpHeader(KIO::Job* job, bool remember_check = true); + void checkRef(); // #... + +private: + + LinkStatus::Status getHttpStatus() const; + void checkRef(LinkStatus const* linkstatus_parent); + void checkRef(KURL const& url); + void killJob(); + /** + * @param url + * @return false if the redirection was already checked by the search manager + */ + bool processRedirection(KURL const& url); + + void findDocumentCharset(QString const& data); + +private: + + SearchManager* search_manager_; + LinkStatus* const linkstatus_; + KIO::TransferJob* t_job_; + int time_out_; + LinkChecker* checker_; + QString document_charset_; +/* A redirection has appened, with the current URL. 
Several redirections + can happen until the final URL is reached.*/ + bool redirection_; + KURL redirection_url_; + QString doc_html_; + bool header_checked_; + bool finnished_; + bool parsing_; + + /** + * Whether the charset of the document is already checked. + * (e.g. ) + */ + bool is_charset_checked_; + /** + * Wheter the page define the enconding (latin1, utf8, etc). + * According to the spec (http://www.w3.org/TR/html4/charset.html), + * it first check the server response and then the info in the html meta element. + */ + bool has_defined_charset_; + + static int count_; // debug attribute that counts how many links were checked +}; + +inline LinkStatus const* LinkChecker::linkStatus() const +{ + return linkstatus_; +} + + +#endif diff --git a/klinkstatus/src/engine/linkfilter.cpp b/klinkstatus/src/engine/linkfilter.cpp new file mode 100644 index 00000000..4d15f2e6 --- /dev/null +++ b/klinkstatus/src/engine/linkfilter.cpp @@ -0,0 +1,46 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * moura@kdewebdev.org * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + ***************************************************************************/ +#include "linkfilter.h" + +#include "linkstatus.h" + + +LinkMatcher::LinkMatcher(QString const& text, ResultView::Status status) + : m_text(text), m_status(status) +{ +} + +LinkMatcher::~LinkMatcher() +{ +} + +bool LinkMatcher::matches(LinkStatus const& link ) const +{ +/* kdDebug() << link.absoluteUrl().url() << endl; + kdDebug() << link.label() << endl; + kdDebug() << link.absoluteUrl().url().contains(m_text) << endl; + kdDebug() << link.label().contains(m_text) << endl; + */ + return (link.absoluteUrl().url().contains(m_text, false) || link.label().contains(m_text, false)) && + ResultView::displayableWithStatus(&link, m_status); +} + + + diff --git a/klinkstatus/src/engine/linkfilter.h b/klinkstatus/src/engine/linkfilter.h new file mode 100644 index 00000000..84da16cb --- /dev/null +++ b/klinkstatus/src/engine/linkfilter.h @@ -0,0 +1,49 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * moura@kdewebdev.org * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + ***************************************************************************/ +#ifndef LINKFILTER_H +#define LINKFILTER_H + +#include "../ui/resultview.h" + +/** + @author Paulo Moura Guedes +*/ +class LinkMatcher +{ +public: + LinkMatcher(QString const& text, ResultView::Status status); + ~LinkMatcher(); + + bool matches(LinkStatus const& link) const; + + void setText(const QString& text) { m_text = text; } + QString text() const { return m_text; } + + void setStatus(ResultView::Status status) { m_status = status; } + ResultView::Status status() const { return m_status; } + + bool nullFilter() const { return m_text.isEmpty() && m_status == ResultView::none; } + +private: + QString m_text; + ResultView::Status m_status; +}; + +#endif diff --git a/klinkstatus/src/engine/linkstatus.cpp b/klinkstatus/src/engine/linkstatus.cpp new file mode 100644 index 00000000..c8b359ed --- /dev/null +++ b/klinkstatus/src/engine/linkstatus.cpp @@ -0,0 +1,214 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * moura@kdewebdev.org * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + ***************************************************************************/ + +#include "linkstatus.h" +#include "../parser/node.h" +#include "../ui/treeview.h" + +#include +#include + +#include + + +LinkStatus::~LinkStatus() +{ + //kdDebug(23100) << "|"; + + for(uint i = 0; i != children_nodes_.size(); ++i) + { + if(children_nodes_[i]) + { + delete children_nodes_[i]; + children_nodes_[i] = 0; + } + } + + children_nodes_.clear(); + + if(isRedirection()) + { + if(redirection_) + { + delete redirection_; + redirection_ = 0; + } + } +} + +void LinkStatus::reset() +{ + depth_ = -1; + external_domain_depth_ = -1; + is_root_ = false; + error_occurred_ = false; + is_redirection_ = false; + checked_ = false; + only_check_header_ = true; + malformed_ = false; + Q_ASSERT(!node_); + has_base_URI_ = false; + label_ = ""; + absolute_url_ = ""; + doc_html_ = ""; + http_header_ = HttpResponseHeader(); + error_ = ""; + + for(uint i = 0; i != children_nodes_.size(); ++i) + { + if(children_nodes_[i]) + { + delete children_nodes_[i]; + children_nodes_[i] = 0; + } + } + + children_nodes_.clear(); + + if(isRedirection()) + { + if(redirection_) + { + delete redirection_; + redirection_ = 0; + } + } + Q_ASSERT(!parent_); + base_URI_ = ""; +} + +QString const LinkStatus::toString() const +{ + QString aux; + + if(!is_root_) + { + Q_ASSERT(parent_); + aux += i18n( "Parent: %1" ).arg( parent()->absoluteUrl().prettyURL() ) + "\n"; + } + Q_ASSERT(!original_url_.isNull()); + + aux += i18n( "URL: %1" ).arg( absoluteUrl().prettyURL() ) + "\n"; + aux += i18n( "Original URL: %1" ).arg( originalUrl() ) + "\n"; + if(node()) + aux += i18n( "Node: %1" ).arg( node()->content() ) + "\n"; + + return aux; +} + + +LinkStatus* LinkStatus::lastRedirection(LinkStatus* ls) +{ + if(ls->isRedirection()) + if(ls->redirection()) + return lastRedirection(ls->redirection()); + else + return ls; + else + return ls; +} + +void LinkStatus::loadNode() +{ + Q_ASSERT(node_); + + setOriginalUrl(node_->url()); + 
setLabel(node_->linkLabel()); + + if(malformed()) + { + setErrorOccurred(true); + setError(i18n( "Malformed" )); + setStatus(LinkStatus::MALFORMED); + kdDebug(23100) << "Malformed:" << endl; + kdDebug(23100) << "Node: " << node()->content() << endl; + //kdDebug(23100) << toString() << endl; // probable segfault + } +} + +bool LinkStatus::malformed() const // don't inline please (#include "node.h") +{ + return (malformed_ || node_->malformed()); +} + +void LinkStatus::setChildrenNodes(vector const& nodes) // don't inline please (#include "node.h") +{ + children_nodes_.reserve(nodes.size()); + children_nodes_ = nodes; +} + +void LinkStatus::setMalformed(bool flag) +{ + malformed_ = flag; + if(flag) + { + setErrorOccurred(true); + setError(i18n( "Malformed" )); + setStatus(LinkStatus::MALFORMED); + kdDebug(23100) << "Malformed!" << endl; + kdDebug(23100) << node()->content() << endl; + //kdDebug(23100) << toString() << endl; // probable segfault + } + else if(error() == i18n( "Malformed" )) + { + setErrorOccurred(false); + setError(""); + setStatus(LinkStatus::UNDETERMINED); + } +} + +void LinkStatus::save(QDomElement& element) const +{ + QDomElement child_element = element.ownerDocument().createElement("link"); + + // + QDomElement tmp_1 = element.ownerDocument().createElement("url"); + tmp_1.appendChild(element.ownerDocument().createTextNode(absoluteUrl().prettyURL())); + child_element.appendChild(tmp_1); + + // + tmp_1 = element.ownerDocument().createElement("status"); + tmp_1.setAttribute("broken", + ResultView::displayableWithStatus(this, ResultView::bad) ? + "true" : "false"); + tmp_1.appendChild(element.ownerDocument().createTextNode(statusText())); + child_element.appendChild(tmp_1); + + //