summaryrefslogtreecommitdiffstats
path: root/klinkstatus/src/engine
diff options
context:
space:
mode:
Diffstat (limited to 'klinkstatus/src/engine')
-rw-r--r--klinkstatus/src/engine/Makefile.am9
-rw-r--r--klinkstatus/src/engine/linkchecker.cpp703
-rw-r--r--klinkstatus/src/engine/linkchecker.h128
-rw-r--r--klinkstatus/src/engine/linkfilter.cpp46
-rw-r--r--klinkstatus/src/engine/linkfilter.h49
-rw-r--r--klinkstatus/src/engine/linkstatus.cpp214
-rw-r--r--klinkstatus/src/engine/linkstatus.h187
-rw-r--r--klinkstatus/src/engine/linkstatus_impl.h417
-rw-r--r--klinkstatus/src/engine/searchmanager.cpp916
-rw-r--r--klinkstatus/src/engine/searchmanager.h193
-rw-r--r--klinkstatus/src/engine/searchmanager_impl.h158
11 files changed, 3020 insertions, 0 deletions
diff --git a/klinkstatus/src/engine/Makefile.am b/klinkstatus/src/engine/Makefile.am
new file mode 100644
index 00000000..1bd3ba88
--- /dev/null
+++ b/klinkstatus/src/engine/Makefile.am
@@ -0,0 +1,9 @@
+INCLUDES = -I$(top_srcdir)/src/ui $(all_includes)
+METASOURCES = AUTO
+noinst_HEADERS = linkchecker.h linkstatus.h linkstatus_impl.h searchmanager.h \
+ searchmanager_impl.h linkfilter.h
+libengine_la_LDFLAGS = $(all_libraries)
+noinst_LTLIBRARIES = libengine.la
+libengine_la_SOURCES = linkchecker.cpp linkstatus.cpp searchmanager.cpp \
+ linkfilter.cpp
+libengine_la_LIBADD = $(LIB_KHTML)
diff --git a/klinkstatus/src/engine/linkchecker.cpp b/klinkstatus/src/engine/linkchecker.cpp
new file mode 100644
index 00000000..bcc503ad
--- /dev/null
+++ b/klinkstatus/src/engine/linkchecker.cpp
@@ -0,0 +1,703 @@
+/***************************************************************************
+ * Copyright (C) 2004 by Puto Moura *
+ * mojo@localhost.localdomain *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ ***************************************************************************/
+#include "linkchecker.h"
+#include "searchmanager.h"
+#include "../utils/utils.h"
+#include "../parser/htmlparser.h"
+
+#include <qstring.h>
+#include <qtimer.h>
+#include <qtextcodec.h>
+#include <qcstring.h>
+
+#include <kio/netaccess.h>
+#include <kio/global.h>
+#include <kio/job.h>
+#include <kio/scheduler.h>
+#include <kio/slave.h>
+#include <kmimetype.h>
+#include <kapplication.h>
+#include <klocale.h>
+#include <khtml_part.h>
+#include <dom/html_misc.h>
+#include <dom/dom_node.h>
+#include <dom/dom_string.h>
+
+
+int LinkChecker::count_ = 0; // incremented once per constructed LinkChecker; only used to number the debug output
+
+/**
+ * Creates a checker for exactly one link.
+ *
+ * @param linkstatus status object of the link to check; must be non-null and
+ *                   must not be marked as checked yet (both are asserted)
+ * @param time_out   time budget in seconds, used by check() to arm slotTimeOut()
+ * @param parent     QObject parent, forwarded to QObject
+ * @param name       QObject name, forwarded to QObject
+ */
+LinkChecker::LinkChecker(LinkStatus* linkstatus, int time_out,
+                         QObject *parent, const char *name)
+    : QObject(parent, name),
+      search_manager_(0),
+      linkstatus_(linkstatus),
+      t_job_(0),
+      time_out_(time_out),
+      checker_(0),
+      document_charset_(),
+      redirection_(false),
+      header_checked_(false),
+      finnished_(false),
+      parsing_(false),
+      is_charset_checked_(false),
+      has_defined_charset_(false)
+{
+    Q_ASSERT(linkstatus_);
+    Q_ASSERT(!linkstatus_->checked());
+
+    // Number this checker for the debug log.
+    ++count_;
+    kdDebug(23100) << endl << count_ << ": " << "Checking " << linkstatus_->absoluteUrl().url() << endl;
+}
+
+// The destructor has an empty body; it neither kills a running job nor
+// deletes the LinkStatus it was given.
+LinkChecker::~LinkChecker()
+{
+}
+
+/**
+ * Stores the search manager that check(), processRedirection() and the
+ * checkRef() overloads consult.
+ *
+ * @param search_manager expected to be non-null (asserted)
+ */
+void LinkChecker::setSearchManager(SearchManager* search_manager)
+{
+    search_manager_ = search_manager;
+    Q_ASSERT(search_manager_);
+}
+
+/**
+ * Starts checking the link held by this checker.
+ *
+ * A URL with a fragment whose mime type (as guessed from the URL) is
+ * text/html or application/xml is resolved through checkRef(). Every other
+ * URL is fetched with an asynchronous KIO::get job whose signals are wired to
+ * slotData(), slotMimetype(), slotResult() and slotRedirection(). A
+ * single-shot timer calls slotTimeOut() after time_out_ seconds.
+ */
+void LinkChecker::check()
+{
+    Q_ASSERT(!finnished_);
+
+    KURL url(linkStatus()->absoluteUrl());
+    Q_ASSERT(url.isValid());
+
+    if(url.hasRef()) {
+        KMimeType::Ptr mimeType = KMimeType::findByURL(url);
+        if(mimeType->is("text/html") || mimeType->is("application/xml")) {
+            checkRef();
+            return;
+        }
+    }
+
+    t_job_ = KIO::get(url, false, false);
+
+    t_job_->addMetaData("PropagateHttpHeader", "true"); // to have the http header
+
+    if (linkstatus_->parent()) {
+        t_job_->addMetaData("referrer", linkstatus_->parent()->absoluteUrl().prettyURL());
+    }
+
+    // The search manager is only set through setSearchManager(). Assert like
+    // the other users do, but do not dereference a null pointer if it was
+    // never set: in that case no identification is sent.
+    Q_ASSERT(search_manager_);
+    if(search_manager_ && search_manager_->sendIdentification())
+    {
+        t_job_->addMetaData("SendUserAgent", "true");
+        t_job_->addMetaData("UserAgent", search_manager_->userAgent());
+    }
+    else
+        t_job_->addMetaData("SendUserAgent", "false");
+
+    QObject::connect(t_job_, SIGNAL(data(KIO::Job *, const QByteArray &)),
+                     this, SLOT(slotData(KIO::Job *, const QByteArray &)));
+    QObject::connect(t_job_, SIGNAL(mimetype(KIO::Job *, const QString &)),
+                     this, SLOT(slotMimetype(KIO::Job *, const QString &)));
+    QObject::connect(t_job_, SIGNAL(result(KIO::Job *)),
+                     this, SLOT(slotResult(KIO::Job *)));
+    QObject::connect(t_job_, SIGNAL(redirection(KIO::Job *, const KURL &)),
+                     this, SLOT(slotRedirection(KIO::Job *, const KURL &)));
+
+    // time_out_ is in seconds, the timer wants milliseconds.
+    QTimer::singleShot( time_out_ * 1000, this, SLOT(slotTimeOut()) );
+
+    t_job_->setInteractive(false);
+}
+
+/**
+ * Fired by the single-shot timer armed in check().
+ *
+ * Does nothing when the check already finished or the document is currently
+ * being parsed. Otherwise the link is marked as timed out, the job (if any)
+ * is killed and the check is finished, unless the job reports that it was
+ * cancelled by the user.
+ */
+void LinkChecker::slotTimeOut()
+{
+    if(finnished_ || parsing_)
+        return;
+
+    kdDebug(23100) << "timeout: " << linkstatus_->absoluteUrl().url() << endl;
+    if(t_job_ && t_job_->slave())
+        kdDebug(23100) << " - " << t_job_->slave() << "/" << t_job_->slave()->slave_pid() << endl;
+    else
+        kdDebug(23100) << endl;
+
+    // t_job_ can legitimately be null here (e.g. after a bad result signal),
+    // so it must not be dereferenced unconditionally. Without a job there is
+    // no "cancelled by user" state to honour and the link simply times out.
+    if(t_job_ && t_job_->error() == KIO::ERR_USER_CANCELED)
+        return;
+
+    linkstatus_->setErrorOccurred(true);
+    linkstatus_->setChecked(true);
+    linkstatus_->setError(i18n("Timeout"));
+    linkstatus_->setStatus(LinkStatus::TIMEOUT);
+
+    killJob();  // no-op when there is no job; must precede finnish(), which asserts !t_job_
+    finnish();
+}
+
+/**
+ * Receives the mime type reported by the transfer job.
+ *
+ * The type is stored on the link status. For non-http(s) URLs the check can
+ * end right here: either only the header was requested, or the document is
+ * not text/html and therefore its content is not needed. http(s) URLs are
+ * left alone because their header only becomes available in slotData().
+ * If the job already carries an error, slotResult() is left to deal with it.
+ *
+ * @param type mime type string reported by the job
+ */
+void LinkChecker::slotMimetype (KIO::Job* /*job*/, const QString &type)
+{
+    if(finnished_)
+        return;
+
+    Q_ASSERT(t_job_);
+
+    // The link's own status object is always used; an earlier variant that
+    // switched to the redirection target is disabled.
+    LinkStatus* const ls = linkstatus_;
+    Q_ASSERT(ls);
+
+    ls->setMimeType(type);
+    KURL url = ls->absoluteUrl();
+
+    if(t_job_->error())
+        return; // let slotResult() report the error
+
+    bool const header_only = ls->onlyCheckHeader();
+
+    // http can deliver an error page although the job reports no error, so
+    // http(s) links always have to go through slotData() to get the header.
+    if(url.protocol().startsWith("http"))
+        return;
+
+    // The content of an html page is still wanted when a full check was requested.
+    if(!header_only && type == "text/html")
+        return;
+
+    ls->setStatusText("OK");
+    ls->setStatus(LinkStatus::SUCCESSFULL);
+
+    killJob();
+    finnish();
+}
+
+/**
+ * Receives a chunk of data from the transfer job.
+ *
+ * For http(s) links the response header is read from the job on the first
+ * chunk. Header-only checks end as soon as the header is known. Full checks
+ * end early for non-html documents and for error pages; otherwise the chunk
+ * is decoded with the document charset (ISO 8859-1 when none is defined) and
+ * appended to doc_html_, which slotResult() parses later.
+ *
+ * @param data raw bytes of this chunk
+ */
+void LinkChecker::slotData(KIO::Job* /*job*/, const QByteArray& data)
+{
+    if(finnished_)
+        return;
+
+    // Check the job before touching it, as slotResult() does.
+    Q_ASSERT(t_job_);
+    if(!t_job_)
+        return;
+
+    // The job is not guaranteed to have a slave attached (slotTimeOut()
+    // guards the same access), so only print the slave details when present.
+    KIO::Slave* const slave = t_job_->slave();
+    if(slave)
+        kdDebug(23100) << "LinkChecker::slotData -> " << linkstatus_->absoluteUrl().url()
+                       << " - " << slave << "/" << slave->slave_pid() << endl;
+    else
+        kdDebug(23100) << "LinkChecker::slotData -> " << linkstatus_->absoluteUrl().url() << endl;
+
+    LinkStatus* ls = linkstatus_;
+    Q_ASSERT(ls);
+
+    KURL url = ls->absoluteUrl();
+
+    if(t_job_->error())
+        return; // slotResult() handles job errors
+
+    if(ls->onlyCheckHeader())
+    {
+        Q_ASSERT(header_checked_ == false);
+        // for any other protocol the job should have been killed in slotMimetype
+        Q_ASSERT(url.protocol() == "http" || url.protocol() == "https");
+
+        // get the header and quit
+        if(url.protocol().startsWith("http"))
+        {
+            ls->setHttpHeader(getHttpHeader(t_job_)); // sets header_checked_ on success
+
+            if(t_job_->isErrorPage())
+                ls->setIsErrorPage(true);
+
+            if(header_checked_)
+            {
+                killJob();
+                linkstatus_->setStatus(getHttpStatus());
+                linkstatus_->setChecked(true);
+                finnish();
+            }
+        }
+        return;
+    }
+
+    if(url.protocol().startsWith("http"))
+    {
+        if(!header_checked_)
+        {
+            ls->setHttpHeader(getHttpHeader(t_job_));
+        }
+        if(ls->mimeType() != "text/html" && header_checked_)
+        {
+            // not an html page: the header is all that is needed
+            ls->setStatus(getHttpStatus());
+            killJob();
+            finnish(); // the job has to be killed before finnish() is called
+            return;
+        }
+        else if(t_job_->isErrorPage() && header_checked_)
+        {
+            ls->setIsErrorPage(true);
+            ls->setStatus(getHttpStatus());
+            killJob();
+            finnish();
+            return;
+        }
+    }
+    else
+    {
+        Q_ASSERT(ls->mimeType() == "text/html");
+    }
+
+    // The charset is determined once, from the first chunk of data.
+    if(!is_charset_checked_)
+        findDocumentCharset(data);
+
+    QTextCodec* codec = 0;
+    if(has_defined_charset_)
+        codec = QTextCodec::codecForName(document_charset_);
+    if(!codec)
+        codec = QTextCodec::codecForName("iso8859-1"); // default
+
+    // NOTE(review): every chunk is decoded on its own; a multi-byte character
+    // split across two chunks may be decoded wrongly. Confirm, and consider a
+    // stateful decoder kept across calls.
+    doc_html_ += codec->toUnicode(data);
+}
+
+/**
+ * Determines the charset of the document; must be called at most once
+ * (asserted), i.e. only for the first chunk of data.
+ *
+ * The charset announced in the http header wins (only consulted when a
+ * header was received); otherwise the html meta elements of @p doc are
+ * searched. has_defined_charset_ is raised when a charset was found.
+ *
+ * @param doc beginning of the document, as text
+ */
+void LinkChecker::findDocumentCharset(QString const& doc)
+{
+    Q_ASSERT(!is_charset_checked_);
+    is_charset_checked_ = true;
+
+    if(header_checked_)
+        document_charset_ = linkstatus_->httpHeader().charset();
+
+    // QString::isEmpty() is also true for a null string.
+    if(document_charset_.isEmpty())
+        document_charset_ = HtmlParser::findCharsetInMetaElement(doc);
+
+    if(!document_charset_.isEmpty())
+        has_defined_charset_ = true;
+}
+
+// Handles the result signal of the transfer job: finalises a pending redirection, translates a job error or the
+// http status into the link status, parses a downloaded html document for child nodes and finally calls finnish().
+void LinkChecker::slotResult(KIO::Job* /*job*/)
+{
+ if(finnished_)
+ return;
+
+ kdDebug(23100) << "LinkChecker::slotResult -> " << linkstatus_->absoluteUrl().url() << endl;
+
+ Q_ASSERT(t_job_);
+ if(!t_job_)
+ return;
+ // slotRedirection() only records the target; the redirection is processed here
+ if(redirection_) {
+ if(!processRedirection(redirection_url_)) { // false: target invalid or already known to the search manager
+ t_job_ = 0; // finnish() asserts that no job is attached
+ linkstatus_->setChecked(true);
+ finnish();
+ return;
+ }
+ }
+ // keep a local handle: the member is cleared before anything else can run
+ KIO::TransferJob* job = t_job_;
+ t_job_ = 0;
+
+ emit jobFinnished(this);
+
+ if(job->error() == KIO::ERR_USER_CANCELED)
+ {
+ // FIXME This can happen! If the job is non interactive...
+ kdWarning(23100) << endl << "Job killed quietly, yet signal result was emited..." << endl;
+ kdDebug(23100) << linkstatus_->toString() << endl;
+ finnish();
+ return;
+ }
+ // from here on results are written to the redirection target when there was a redirection
+ LinkStatus* ls = 0;
+ if(redirection_)
+ ls = linkStatus()->redirection();
+ else
+ ls = linkstatus_;
+ Q_ASSERT(ls);
+ // log the link before the assertion below fires (same condition, negated)
+ if(!(!ls->onlyCheckHeader() ||
+ job->error() ||
+ !header_checked_))
+ kdWarning(23100) << ls->toString() << endl;
+ // a header-only check without error and with a received header is expected to have ended in slotData()
+ Q_ASSERT(!ls->onlyCheckHeader() || job->error() || !header_checked_);
+
+ if(ls->isErrorPage())
+ kdWarning(23100) << "\n\n" << ls->toString() << endl << endl;
+
+ Q_ASSERT(!job->isErrorPage());
+
+ if(job->error())
+ {
+ kdDebug(23100) << "Job error: " << job->errorString() << endl;
+ kdDebug(23100) << "Job error code: " << job->error() << endl;
+
+ if(job->error() == KIO::ERR_IS_DIRECTORY) // a directory counts as a working link
+ {
+ ls->setStatusText("OK");
+ ls->setStatus(LinkStatus::SUCCESSFULL);
+ }
+ else
+ {
+ ls->setErrorOccurred(true);
+ if(job->error() == KIO::ERR_SERVER_TIMEOUT)
+ ls->setStatus(LinkStatus::TIMEOUT);
+ else
+ ls->setStatus(LinkStatus::BROKEN);
+
+ if(job->errorString().isEmpty())
+ kdWarning(23100) << "\n\nError string is empty, error = " << job->error() << "\n\n\n";
+ if(job->error() != KIO::ERR_NO_CONTENT)
+ ls->setError(job->errorString());
+ else
+ ls->setError(i18n("No Content")); // fixed text instead of the job's error string
+ }
+ }
+ // no job error: derive the status from the protocol / http header
+ else
+ {
+ if(!ls->absoluteUrl().protocol().startsWith("http")) {
+ ls->setStatusText("OK");
+ ls->setStatus(LinkStatus::SUCCESSFULL);
+ }
+ else
+ {
+ if(!header_checked_)
+ {
+ kdDebug(23100) << "\n\nheader not received... checking again...\n\n\n";
+ // check again; NOTE(review): jobFinnished was already emitted above and no retry limit is visible here
+ check();
+ return;
+ }
+ Q_ASSERT(header_checked_);
+
+ ls->setStatus(getHttpStatus());
+ }
+ // doc_html_ was filled by slotData(); parse it for base URL, title and child nodes
+ if(!doc_html_.isNull() && !doc_html_.isEmpty())
+ {
+ ls->setDocHtml(doc_html_);
+
+ parsing_ = true; // slotTimeOut() does nothing while this flag is set
+ HtmlParser parser(doc_html_);
+
+ if(parser.hasBaseUrl())
+ ls->setBaseURI(KURL(parser.baseUrl().url()));
+ if(parser.hasTitle())
+ ls->setHtmlDocTitle(parser.title().attributeTITLE());
+
+ ls->setChildrenNodes(parser.nodes());
+ parsing_ = false;
+ }
+ }
+ finnish();
+}
+
+
+/**
+ * Records that the job was redirected. The target is only remembered here;
+ * it is processed when the job's result arrives (see slotResult()).
+ *
+ * @param url redirection target reported by the job
+ */
+void LinkChecker::slotRedirection (KIO::Job* /*job*/, const KURL &url)
+{
+    kdDebug(23100) << "LinkChecker::slotRedirection -> "
+                   << linkstatus_->absoluteUrl().url() << " -> " << url.url() << endl;
+
+    redirection_url_ = url;
+    redirection_ = true;
+}
+
+/**
+ * Turns the checked link into a redirection and attaches a new LinkStatus
+ * for the redirection target to it.
+ *
+ * The original link gets the http header of the job, the redirection status
+ * and is marked as checked. The target is created as a copy of the original,
+ * pointed at @p toUrl, parented to the original and given its external
+ * domain depth.
+ *
+ * @param toUrl redirection target
+ * @return true when the check already finished, or when the target is valid
+ *         and not yet known to the search manager; false otherwise
+ */
+bool LinkChecker::processRedirection(KURL const& toUrl)
+{
+    if(finnished_)
+        return true;
+
+    kdDebug(23100) << "LinkChecker::processRedirection -> " << linkstatus_->absoluteUrl().url() << " -> " << toUrl.url() << endl;
+
+    Q_ASSERT(t_job_);
+    Q_ASSERT(linkstatus_->absoluteUrl().protocol().startsWith("http"));
+    Q_ASSERT(redirection_);
+
+    // Finalise the link that redirected. The header is read without
+    // remembering the check, so header_checked_ is not raised by this call.
+    linkstatus_->setHttpHeader(getHttpHeader(t_job_, false));
+    linkstatus_->setIsRedirection(true);
+    linkstatus_->setStatusText("redirection");
+    linkstatus_->setStatus(LinkStatus::HTTP_REDIRECTION);
+    linkstatus_->setChecked(true);
+
+    // Build the status object of the target from a copy of the original.
+    LinkStatus* target = new LinkStatus(*linkstatus_);
+    target->setAbsoluteUrl(toUrl);
+    target->setRootUrl(linkstatus_->rootUrl());
+    if(!linkstatus_->onlyCheckHeader())
+        target->setOnlyCheckHeader(false);
+
+    linkstatus_->setRedirection(target);
+    target->setParent(linkstatus_);
+    target->setOriginalUrl(toUrl.url());
+
+    Q_ASSERT(search_manager_);
+
+    // Local targets get depth -1; leaving the local domain adds one level.
+    if(search_manager_->localDomain(target->absoluteUrl()))
+        target->setExternalDomainDepth(-1);
+    else if(search_manager_->localDomain(linkstatus_->absoluteUrl()))
+        target->setExternalDomainDepth(linkstatus_->externalDomainDepth() + 1);
+    else
+        target->setExternalDomainDepth(linkstatus_->externalDomainDepth());
+
+    bool const usable = toUrl.isValid()
+                        && !search_manager_->existUrl(toUrl, linkstatus_->absoluteUrl());
+    target->setChecked(usable);
+    return usable;
+}
+
+/**
+ * Ends the check exactly once: marks the link as checked (a redirection is
+ * expected to be marked already) and emits transactionFinished().
+ * No job may be attached any more when this is called (asserted).
+ */
+void LinkChecker::finnish()
+{
+    Q_ASSERT(!t_job_);
+
+    if(finnished_)
+        return;
+
+    kdDebug(23100) << "LinkChecker::finnish -> " << linkstatus_->absoluteUrl().url() << endl;
+
+    finnished_ = true;
+
+    if(redirection_)
+    {
+        // processRedirection() already marked the link as checked
+        Q_ASSERT(linkstatus_->checked());
+    }
+    else
+    {
+        linkstatus_->setChecked(true);
+    }
+
+    emit transactionFinished(linkstatus_, this);
+}
+
+/**
+ * Reads the http response header from the current job's meta data.
+ *
+ * @param remember_check when true and a header was found, header_checked_ is
+ *                       raised; an empty header always clears header_checked_
+ * @return the parsed header (built from an empty string when none was found)
+ */
+HttpResponseHeader LinkChecker::getHttpHeader(KIO::Job* /*job*/, bool remember_check)
+{
+    Q_ASSERT(!finnished_);
+    Q_ASSERT(t_job_);
+
+    QString const raw_header = t_job_->queryMetaData("HTTP-Headers");
+
+    // QString::isEmpty() is also true for a null string.
+    bool const missing = raw_header.isEmpty();
+    if(missing)
+    {
+        header_checked_ = false;
+        kdWarning(23100) << "header_string.isNull() || header_string.isEmpty(): "
+                         << linkstatus_->toString() << endl;
+    }
+    else if(remember_check)
+    {
+        header_checked_ = true;
+    }
+
+    return HttpResponseHeader(raw_header);
+}
+
+/**
+ * Checks a link that points to an anchor inside a document (URL with "#...").
+ *
+ * An empty fragment and the fragment "top" are always accepted. Otherwise
+ * the document containing the anchor is looked up: for links written as
+ * "#anchor" it is the parent document, else the search manager is asked for
+ * the document at the URL without the fragment. If no such document is
+ * known, the URL without the fragment is handed to checkRef(KURL const&).
+ */
+void LinkChecker::checkRef()
+{
+    KURL url(linkStatus()->absoluteUrl());
+    Q_ASSERT(url.hasRef());
+
+    QString ref = url.ref();
+    if(ref == "" || ref == "top") {
+        linkstatus_->setStatusText("OK");
+        linkstatus_->setStatus(LinkStatus::SUCCESSFULL);
+        finnish();
+        return;
+    }
+
+    // URL of the document itself, i.e. everything before the '#'. It is
+    // computed up front so that the fallback below never works with the
+    // fragment still attached, which would make every anchor of a page a
+    // separate download and cache entry.
+    QString const full_url = url.url();
+    int const i_ref = full_url.find("#");
+    QString const url_base = (i_ref == -1) ? full_url : full_url.left(i_ref);
+
+    LinkStatus const* ls_parent = 0;
+
+    if(linkStatus()->originalUrl().startsWith("#"))
+        ls_parent = linkStatus()->parent();
+    else
+    {
+        Q_ASSERT(search_manager_);
+        ls_parent = search_manager_->linkStatus(url_base);
+    }
+
+    if(ls_parent)
+        checkRef(ls_parent);
+    else
+        checkRef(KURL::fromPathOrURL(url_base));
+}
+
+/**
+ * Checks the anchor of this link against the document at @p url.
+ *
+ * The KHTMLPart for the document is taken from the search manager; when
+ * there is none yet, the document is downloaded synchronously, loaded into
+ * a new KHTMLPart and registered with the search manager (also when the
+ * download failed, in which case the part stays empty). The link is then
+ * marked OK or broken depending on whether the anchor exists, and the check
+ * is finished.
+ *
+ * @param url location of the document that should contain the anchor
+ */
+void LinkChecker::checkRef(KURL const& url)
+{
+    Q_ASSERT(search_manager_);
+
+    QString url_string = url.url();
+    KHTMLPart* part = search_manager_->htmlPart(url_string);
+    if(!part)
+    {
+        kdDebug() << "new KHTMLPart: " + url_string << endl;
+
+        part = new KHTMLPart();
+        part->setOnlyLocalReferences(true);
+
+        QString local_copy;
+        if(!KIO::NetAccess::download(url, local_copy, 0))
+        {
+            kdDebug(23100) << KIO::NetAccess::lastErrorString() << endl;
+        }
+        else
+        {
+            QString contents = FileManager::read(local_copy);
+            part->begin();
+            part->write(contents);
+            part->end();
+
+            KIO::NetAccess::removeTempFile(local_copy);
+        }
+
+        search_manager_->addHtmlPart(url_string, part);
+    }
+
+    bool const anchor_found = hasAnchor(part, linkStatus()->absoluteUrl().ref());
+    if(!anchor_found)
+    {
+        linkstatus_->setErrorOccurred(true);
+        linkstatus_->setError(i18n( "Link destination not found." ));
+        linkstatus_->setStatus(LinkStatus::BROKEN);
+    }
+    else
+    {
+        linkstatus_->setStatusText("OK");
+        linkstatus_->setStatus(LinkStatus::SUCCESSFULL);
+    }
+
+    finnish();
+}
+
+/**
+ * Checks the anchor of this link against an already downloaded document.
+ *
+ * The KHTMLPart for the document is taken from the search manager; when
+ * there is none yet, a new one is filled with the html stored in
+ * @p linkstatus_parent and registered with the search manager. The link is
+ * then marked OK or broken depending on whether the anchor exists, and the
+ * check is finished.
+ *
+ * @param linkstatus_parent status object of the document that should
+ *                          contain the anchor; dereferenced without a check
+ */
+void LinkChecker::checkRef(LinkStatus const* linkstatus_parent)
+{
+    Q_ASSERT(search_manager_);
+
+    QString url_string = linkstatus_parent->absoluteUrl().url();
+    KHTMLPart* part = search_manager_->htmlPart(url_string);
+    if(!part)
+    {
+        kdDebug() << "new KHTMLPart: " + url_string << endl;
+
+        part = new KHTMLPart();
+        part->setOnlyLocalReferences(true);
+
+        part->begin();
+        part->write(linkstatus_parent->docHtml());
+        part->end();
+
+        search_manager_->addHtmlPart(url_string, part);
+    }
+
+    bool const anchor_found = hasAnchor(part, linkStatus()->absoluteUrl().ref());
+    if(!anchor_found)
+    {
+        linkstatus_->setErrorOccurred(true);
+        linkstatus_->setError(i18n( "Link destination not found." ));
+        linkstatus_->setStatus(LinkStatus::BROKEN);
+    }
+    else
+    {
+        linkstatus_->setStatusText("OK");
+        linkstatus_->setStatus(LinkStatus::SUCCESSFULL);
+    }
+
+    finnish();
+}
+
+/**
+ * Tells whether the document loaded in @p html_part contains @p anchor.
+ * The document's anchor collection is searched by name first; if that fails
+ * an element with that id is looked for.
+ *
+ * @param html_part part holding the document; dereferenced without a check
+ * @param anchor    anchor name, without the leading '#'
+ * @return true when a matching node exists
+ */
+bool LinkChecker::hasAnchor(KHTMLPart* html_part, QString const& anchor)
+{
+    DOM::HTMLDocument document = html_part->htmlDocument();
+    DOM::HTMLCollection named_anchors = document.anchors();
+
+    DOM::DOMString wanted(anchor);
+    Q_ASSERT(!wanted.isNull());
+
+    DOM::Node hit = named_anchors.namedItem(wanted);
+    if(hit.isNull())
+        hit = document.getElementById(wanted);
+
+    return !hit.isNull();
+}
+
+/**
+ * Detaches and quietly kills the running transfer job, if there is one.
+ * The member is cleared and the job's signals are disconnected from this
+ * object before the job is killed.
+ */
+void LinkChecker::killJob()
+{
+    KIO::TransferJob* const job = t_job_;
+    if(!job)
+        return;
+
+    t_job_ = 0;
+    job->disconnect(this);
+    job->kill(true); // quietly
+}
+
+/**
+ * Maps the http status code stored in the link's header to a link status,
+ * judging by the first character of the code's decimal representation.
+ *
+ * @return SUCCESSFULL for 2.., HTTP_REDIRECTION for 3.., HTTP_CLIENT_ERROR
+ *         for 4.., HTTP_SERVER_ERROR for 5.., UNDETERMINED otherwise
+ */
+LinkStatus::Status LinkChecker::getHttpStatus() const
+{
+    QString const code_text = QString::number(linkstatus_->httpHeader().statusCode());
+    QChar const leading = code_text.at(0);
+
+    switch(leading.latin1())
+    {
+    case '2':
+        return LinkStatus::SUCCESSFULL;
+    case '3':
+        return LinkStatus::HTTP_REDIRECTION;
+    case '4':
+        return LinkStatus::HTTP_CLIENT_ERROR;
+    case '5':
+        return LinkStatus::HTTP_SERVER_ERROR;
+    default:
+        return LinkStatus::UNDETERMINED;
+    }
+}
+
+#include "linkchecker.moc"
diff --git a/klinkstatus/src/engine/linkchecker.h b/klinkstatus/src/engine/linkchecker.h
new file mode 100644
index 00000000..a992e5fd
--- /dev/null
+++ b/klinkstatus/src/engine/linkchecker.h
@@ -0,0 +1,128 @@
+/***************************************************************************
+ * Copyright (C) 2004 by Paulo Moura Guedes *
+ * moura@kdewebdev.org *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ ***************************************************************************/
+#ifndef LINKCHECKER_H
+#define LINKCHECKER_H
+
+#include <qobject.h>
+#include <qthread.h>
+#include <qstring.h>
+
+#include <kio/jobclasses.h>
+class KHTMLPart;
+
+#include "../parser/http.h"
+#include "linkstatus.h"
+class SearchManager;
+
+#include <iostream>
+using namespace std;
+
+/**
+ * Checks a single link: fetches it with a KIO transfer job (or resolves an
+ * anchor through a KHTMLPart), records the outcome in the LinkStatus it was
+ * given and reports completion through its signals.
+ *
+@author Paulo Moura Guedes
+*/
+class LinkChecker : public QObject
+{
+ Q_OBJECT
+public:
+ LinkChecker(LinkStatus* linkstatus, int time_out = 50, // time_out is in seconds (multiplied by 1000 for the timer)
+ QObject *parent = 0, const char *name = 0);
+ ~LinkChecker();
+
+ //virtual void run();
+ void check(); // starts the check; the outcome is reported through the signals below
+ void setSearchManager(SearchManager* search_manager); // search_manager is asserted to be non-null
+
+ LinkStatus const* linkStatus() const; // the status object passed to the constructor
+
+ static bool hasAnchor(KHTMLPart* html_part, QString const& anchor); // true if the document has an anchor or element id with that name
+
+signals:
+ // emitted by finnish(), at most once per checker
+ void transactionFinished(const LinkStatus * linkstatus,
+ LinkChecker * checker);
+ void jobFinnished(LinkChecker * checker); // emitted by slotResult() after the job pointer was released
+
+protected slots:
+ // receivers of the transfer job's signals, connected in check()
+ void slotData(KIO::Job *, const QByteArray &data);
+ void slotRedirection (KIO::Job *, const KURL &url);
+ void slotMimetype(KIO::Job *, const QString &type);
+ void slotResult(KIO::Job* job);
+ void slotTimeOut(); // target of the single-shot timer armed in check()
+
+protected:
+
+ void finnish(); // marks the link as checked and emits transactionFinished()
+ HttpResponseHeader getHttpHeader(KIO::Job* job, bool remember_check = true); // remember_check: raise header_checked_ when a header was found
+ void checkRef(); // entry point for URLs with a fragment (#...)
+
+private:
+
+ LinkStatus::Status getHttpStatus() const; // status derived from the first digit of the http status code
+ void checkRef(LinkStatus const* linkstatus_parent); // anchor lookup in an already downloaded document
+ void checkRef(KURL const& url); // anchor lookup in a document that may still have to be downloaded
+ void killJob(); // disconnects and quietly kills t_job_, if any
+ /**
+ * @param url
+ * @return false if the redirection was already checked by the search manager
+ */
+ bool processRedirection(KURL const& url);
+
+ void findDocumentCharset(QString const& data);
+
+private:
+ // NOTE: the declaration order below is the order used by the constructor's initialiser list
+ SearchManager* search_manager_;
+ LinkStatus* const linkstatus_;
+ KIO::TransferJob* t_job_; // current transfer job; 0 when none is attached
+ int time_out_; // seconds
+ LinkChecker* checker_; // set to 0 by the constructor
+ QString document_charset_;
+/* A redirection has happened with the current URL. Several redirections
+ can happen until the final URL is reached.*/
+ bool redirection_;
+ KURL redirection_url_; // target recorded by slotRedirection()
+ QString doc_html_; // decoded document text accumulated by slotData()
+ bool header_checked_; // an http header was received and remembered
+ bool finnished_; // finnish() already ran
+ bool parsing_; // set while slotResult() parses the document
+
+ /**
+ * Whether the charset of the document is already checked.
+ * (e.g. <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>)
+ */
+ bool is_charset_checked_;
+ /**
+ * Whether the page defines the encoding (latin1, utf8, etc).
+ * According to the spec (http://www.w3.org/TR/html4/charset.html),
+ * the server response is checked first and then the info in the html meta element.
+ */
+ bool has_defined_charset_;
+
+ static int count_; // debug attribute that counts how many links were checked
+};
+
+/** @return the status object this checker was constructed with (read-only). */
+inline const LinkStatus* LinkChecker::linkStatus() const
+{ return linkstatus_; }
+
+
+#endif
diff --git a/klinkstatus/src/engine/linkfilter.cpp b/klinkstatus/src/engine/linkfilter.cpp
new file mode 100644
index 00000000..4d15f2e6
--- /dev/null
+++ b/klinkstatus/src/engine/linkfilter.cpp
@@ -0,0 +1,46 @@
+/***************************************************************************
+ * Copyright (C) 2004 by Paulo Moura Guedes *
+ * moura@kdewebdev.org *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ ***************************************************************************/
+#include "linkfilter.h"
+
+#include "linkstatus.h"
+
+
+/**
+ * Builds a matcher.
+ * @param text   text to look for in a link's URL or label
+ * @param status result-view status a link has to be displayable with
+ */
+LinkMatcher::LinkMatcher(QString const& text, ResultView::Status status)
+    : m_text(text),
+      m_status(status)
+{}
+
+// The destructor has an empty body.
+LinkMatcher::~LinkMatcher()
+{}
+
+/**
+ * Tells whether @p link passes this filter.
+ *
+ * The link passes when the filter text occurs (ignoring case) in its
+ * absolute URL or in its label, and the link is displayable with the
+ * filter's status according to ResultView::displayableWithStatus().
+ */
+bool LinkMatcher::matches(LinkStatus const& link ) const
+{
+    // The label is only searched when the URL did not contain the text.
+    bool const text_found = link.absoluteUrl().url().contains(m_text, false)
+                            || link.label().contains(m_text, false);
+    if(!text_found)
+        return false;
+
+    return ResultView::displayableWithStatus(&link, m_status);
+}
+
+
+
diff --git a/klinkstatus/src/engine/linkfilter.h b/klinkstatus/src/engine/linkfilter.h
new file mode 100644
index 00000000..84da16cb
--- /dev/null
+++ b/klinkstatus/src/engine/linkfilter.h
@@ -0,0 +1,49 @@
+/***************************************************************************
+ * Copyright (C) 2004 by Paulo Moura Guedes *
+ * moura@kdewebdev.org *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ ***************************************************************************/
+#ifndef LINKFILTER_H
+#define LINKFILTER_H
+
+#include "../ui/resultview.h"
+
+/**
+ * Filter criterion for links: a text fragment together with a ResultView
+ * status.
+ *
+ * @author Paulo Moura Guedes <moura@kdewebdev.org>
+ */
+class LinkMatcher
+{
+public:
+    LinkMatcher(QString const& text, ResultView::Status status);
+    ~LinkMatcher();
+
+    /** @return whether @p link passes the text and the status criterion. */
+    bool matches(LinkStatus const& link) const;
+
+    /** Replaces the text criterion. */
+    void setText(const QString& text)
+    {
+        m_text = text;
+    }
+
+    /** @return the text criterion. */
+    QString text() const
+    {
+        return m_text;
+    }
+
+    /** Replaces the status criterion. */
+    void setStatus(ResultView::Status status)
+    {
+        m_status = status;
+    }
+
+    /** @return the status criterion. */
+    ResultView::Status status() const
+    {
+        return m_status;
+    }
+
+    /** @return true when the text is empty and the status is ResultView::none. */
+    bool nullFilter() const
+    {
+        if(m_status != ResultView::none)
+            return false;
+        return m_text.isEmpty();
+    }
+
+private:
+    QString m_text;
+    ResultView::Status m_status;
+};
+
+#endif
diff --git a/klinkstatus/src/engine/linkstatus.cpp b/klinkstatus/src/engine/linkstatus.cpp
new file mode 100644
index 00000000..c8b359ed
--- /dev/null
+++ b/klinkstatus/src/engine/linkstatus.cpp
@@ -0,0 +1,214 @@
+/***************************************************************************
+ * Copyright (C) 2004 by Paulo Moura Guedes *
+ * moura@kdewebdev.org *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ ***************************************************************************/
+
+#include "linkstatus.h"
+#include "../parser/node.h"
+#include "../ui/treeview.h"
+
+#include <klocale.h>
+#include <kcharsets.h>
+
+#include <qdom.h>
+
+
+/**
+ * Deletes the child nodes held by this object and, when this link is a
+ * redirection, the attached redirection target.
+ */
+LinkStatus::~LinkStatus()
+{
+    // Deleting a null pointer is a no-op, so no per-entry check is needed.
+    for(vector<Node*>::iterator it = children_nodes_.begin(); it != children_nodes_.end(); ++it)
+    {
+        delete *it;
+        *it = 0;
+    }
+    children_nodes_.clear();
+
+    if(isRedirection())
+    {
+        delete redirection_;
+        redirection_ = 0;
+    }
+}
+
+/**
+ * Puts the object back into its initial state: flags and texts are cleared,
+ * the child nodes are deleted and an attached redirection target is freed.
+ * Must only be used on objects without node and without parent (asserted).
+ */
+void LinkStatus::reset()
+{
+    // Remember whether a redirection target is attached before the flag is
+    // cleared below. Testing isRedirection() after the flag was reset would
+    // never free the target.
+    // NOTE(review): assumes isRedirection() reports is_redirection_ - confirm.
+    bool const had_redirection = isRedirection();
+
+    depth_ = -1;
+    external_domain_depth_ = -1;
+    is_root_ = false;
+    error_occurred_ = false;
+    is_redirection_ = false;
+    checked_ = false;
+    only_check_header_ = true;
+    malformed_ = false;
+    Q_ASSERT(!node_);
+    has_base_URI_ = false;
+    label_ = "";
+    absolute_url_ = "";
+    doc_html_ = "";
+    http_header_ = HttpResponseHeader();
+    error_ = "";
+
+    for(uint i = 0; i != children_nodes_.size(); ++i)
+    {
+        delete children_nodes_[i]; // deleting a null pointer is a no-op
+        children_nodes_[i] = 0;
+    }
+
+    children_nodes_.clear();
+
+    if(had_redirection)
+    {
+        delete redirection_;
+        redirection_ = 0;
+    }
+    Q_ASSERT(!parent_);
+    base_URI_ = "";
+}
+
+/**
+ * Builds a multi-line, translated description of this link for log output:
+ * parent URL (non-root links only), URL, original URL and, when a node is
+ * attached, the node content.
+ *
+ * @return the description; every entry ends with a newline
+ */
+QString const LinkStatus::toString() const
+{
+    QString aux;
+
+    if(!is_root_)
+    {
+        Q_ASSERT(parent_);
+        // The assertion does not necessarily stop execution, and this
+        // function is called from logging code: leave the line out rather
+        // than dereference a missing parent.
+        LinkStatus const* const up = parent();
+        if(up)
+            aux += i18n( "Parent: %1" ).arg( up->absoluteUrl().prettyURL() ) + "\n";
+    }
+    Q_ASSERT(!original_url_.isNull());
+
+    aux += i18n( "URL: %1" ).arg( absoluteUrl().prettyURL() ) + "\n";
+    aux += i18n( "Original URL: %1" ).arg( originalUrl() ) + "\n";
+    if(node())
+        aux += i18n( "Node: %1" ).arg( node()->content() ) + "\n";
+
+    return aux;
+}
+
+
+/**
+ * Follows a chain of redirections to its end.
+ *
+ * @param ls link to start from; dereferenced without a check
+ * @return the first link in the chain that is not a redirection or has no
+ *         redirection target attached (possibly @p ls itself)
+ */
+LinkStatus* LinkStatus::lastRedirection(LinkStatus* ls)
+{
+    LinkStatus* current = ls;
+    while(current->isRedirection() && current->redirection())
+        current = current->redirection();
+
+    return current;
+}
+
+/**
+ * Copies original URL and label from the attached node (which must exist,
+ * asserted) and flags the link as malformed when malformed() says so.
+ */
+void LinkStatus::loadNode()
+{
+    Q_ASSERT(node_);
+
+    setOriginalUrl(node_->url());
+    setLabel(node_->linkLabel());
+
+    if(!malformed())
+        return;
+
+    setErrorOccurred(true);
+    setError(i18n( "Malformed" ));
+    setStatus(LinkStatus::MALFORMED);
+    kdDebug(23100) << "Malformed:" << endl;
+    kdDebug(23100) << "Node: " << node()->content() << endl;
+}
+
+/**
+ * @return true when the link was flagged as malformed or its node reports
+ *         itself as malformed. Objects without a node (reset() expects
+ *         them to exist) only report their own flag.
+ */
+bool LinkStatus::malformed() const // don't inline please (#include "node.h")
+{
+    return (malformed_ || (node_ && node_->malformed()));
+}
+
+/**
+ * Replaces the list of child nodes with a copy of @p nodes (the pointers are
+ * copied, not the nodes). Nodes held before the call are not deleted here.
+ */
+void LinkStatus::setChildrenNodes(vector<Node*> const& nodes) // don't inline please (#include "node.h")
+{
+    children_nodes_.reserve(nodes.size());
+    children_nodes_.assign(nodes.begin(), nodes.end());
+}
+
+/**
+ * Sets or clears the malformed flag.
+ *
+ * Setting it also records the "Malformed" error and status. Clearing it
+ * removes that error again, but only when the current error text is the
+ * "Malformed" one.
+ *
+ * @param flag new value of the malformed flag
+ */
+void LinkStatus::setMalformed(bool flag)
+{
+    malformed_ = flag;
+    if(flag)
+    {
+        setErrorOccurred(true);
+        setError(i18n( "Malformed" ));
+        setStatus(LinkStatus::MALFORMED);
+        kdDebug(23100) << "Malformed!" << endl;
+        // Not every LinkStatus has a node; toString() guards the same access.
+        if(node())
+            kdDebug(23100) << node()->content() << endl;
+    }
+    else if(error() == i18n( "Malformed" ))
+    {
+        setErrorOccurred(false);
+        setError("");
+        setStatus(LinkStatus::UNDETERMINED);
+    }
+}
+
+/**
+ * Appends a <link> element describing this link to @p element.
+ *
+ * The element contains, in this order: <url> (pretty absolute URL),
+ * <status> (status text, with attribute "broken" set to "true"/"false"),
+ * <label> (label with entities resolved) and <referrers> holding one <url>
+ * per referrer. At least one referrer is expected (asserted).
+ *
+ * @param element parent element; new nodes are created in its owner document
+ */
+void LinkStatus::save(QDomElement& element) const
+{
+    QDomDocument doc = element.ownerDocument();
+    QDomElement link_element = doc.createElement("link");
+
+    // <url>
+    QDomElement url_element = doc.createElement("url");
+    url_element.appendChild(doc.createTextNode(absoluteUrl().prettyURL()));
+    link_element.appendChild(url_element);
+
+    // <status>
+    QDomElement status_element = doc.createElement("status");
+    if(ResultView::displayableWithStatus(this, ResultView::bad))
+        status_element.setAttribute("broken", "true");
+    else
+        status_element.setAttribute("broken", "false");
+    status_element.appendChild(doc.createTextNode(statusText()));
+    link_element.appendChild(status_element);
+
+    // <label>
+    QDomElement label_element = doc.createElement("label");
+    label_element.appendChild(doc.createTextNode(KCharsets::resolveEntities(label())));
+    link_element.appendChild(label_element);
+
+    // <referrers>
+    QDomElement referrers_element = doc.createElement("referrers");
+    QValueVector<KURL>::const_iterator const last = referrers_.end();
+    for(QValueVector<KURL>::const_iterator it = referrers_.begin(); it != last; ++it)
+    {
+        QDomElement referrer_element = doc.createElement("url");
+        referrer_element.appendChild(doc.createTextNode(it->prettyURL()));
+        referrers_element.appendChild(referrer_element);
+    }
+    Q_ASSERT(!referrers_.isEmpty());
+    link_element.appendChild(referrers_element);
+
+    element.appendChild(link_element);
+}
+
diff --git a/klinkstatus/src/engine/linkstatus.h b/klinkstatus/src/engine/linkstatus.h
new file mode 100644
index 00000000..e7567460
--- /dev/null
+++ b/klinkstatus/src/engine/linkstatus.h
@@ -0,0 +1,187 @@
+/***************************************************************************
+ * Copyright (C) 2004 by Paulo Moura Guedes *
+ * moura@kdewebdev.org *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ ***************************************************************************/
+
+#ifndef LINKSTATUS_H
+#define LINKSTATUS_H
+
+#include "../parser/http.h"
+#include "../utils/mvector.h"
+
+#include <kurl.h>
+#include <klocale.h>
+#include <kdebug.h>
+class TreeView;
+class TreeViewItem;
+
+#include <qstring.h>
+#include <qobject.h>
+#include <qvaluevector.h>
+class QDomElement;
+
+#include <vector>
+#include <iostream>
+
+using namespace std;
+
+
+class Node;
+
+class LinkStatus // State and check result of a single link: URLs, status, error, referrers, child nodes.
+{
+public:
+
+ enum Status { // Outcome categories stored via setStatus()
+ UNDETERMINED,
+ SUCCESSFULL,
+ BROKEN,
+ HTTP_REDIRECTION,
+ HTTP_CLIENT_ERROR,
+ HTTP_SERVER_ERROR,
+ TIMEOUT,
+ NOT_SUPPORTED,
+ MALFORMED
+ };
+
+ LinkStatus();
+ LinkStatus(KURL const& absolute_url); // same defaults as LinkStatus(), plus the absolute URL
+ LinkStatus(Node* node, LinkStatus* parent); // child link: depth is parent's depth + 1, root URL copied from parent
+ ~LinkStatus();
+
+ void save(QDomElement& element) const; // appends a <link> element describing this link to element
+
+ void reset();
+ void setRootUrl(KURL const& url);
+ void setStatus(Status status);
+ void setDepth(uint depth);
+ void setParent(LinkStatus* parent); // also records the parent's absolute URL as a referrer
+ void setOriginalUrl(QString const& url_original);
+ void setLabel(QString const& label);
+ void setAbsoluteUrl(KURL const& url_absoluto);
+ void setDocHtml(QString const& doc_html);
+ void setHttpHeader(HttpResponseHeader const& cabecalho_http);
+ void setStatusText(QString const& statusText); // FIXME Legacy. This should be eliminated in favor of LinkStatus::Status
+ void setError(QString const& error);
+ void setIsRoot(bool flag);
+ void setErrorOccurred(bool houve_error);
+ void setIsRedirection(bool e_redirection);
+ void setRedirection(LinkStatus* redirection); // asserts isRedirection()
+ void setNode(Node* node);
+ void setChildrenNodes(vector<Node*> const& nodes);
+ void addChildNode(Node* node);
+ void reserveMemoryForChildrenNodes(int n);
+ void setChecked(bool flag);
+ void setExternalDomainDepth(int p);
+ void setOnlyCheckHeader(bool flag);
+ void setMalformed(bool flag = true);
+ void setHasBaseURI(bool flag = true);
+ void setHasHtmlDocTitle(bool flag = true);
+ void setBaseURI(KURL const& base_url); // also makes hasBaseURI() true
+ void setHtmlDocTitle(QString const& title); // also makes hasHtmlDocTitle() true
+ void setIgnored(bool flag = true);
+ void setMimeType(QString const& mimetype);
+ void setIsErrorPage(bool flag);
+ void setIsLocalRestrict(bool flag);
+ void setTreeViewItem(TreeViewItem* tree_view_item);
+ void addReferrer(KURL const& url);
+
+ KURL const& rootUrl() const;
+ Status const& status() const;
+ uint depth() const; // depth_ is stored as int and starts at -1; that value is returned converted to uint
+ bool local() const; // linkstatus.paradigma.co.pt == paradigma.co.pt
+ bool isLocalRestrict() const; // linkstatus.paradigma.co.pt != paradigma.co.pt
+ LinkStatus const* parent() const;
+ QString const& originalUrl() const;
+ QString const& label() const;
+ KURL const& absoluteUrl() const;
+ QString const& docHtml() const;
+ HttpResponseHeader const& httpHeader() const;
+ HttpResponseHeader& httpHeader();
+ QString statusText() const; // FIXME Legacy. This should be eliminated in favor of LinkStatus::Status
+ QString const& error() const;
+ bool isRoot() const;
+ bool errorOccurred() const;
+ bool isRedirection() const;
+ LinkStatus* redirection() const; // asserts isRedirection()
+ Node* node() const; // may be 0
+ vector<Node*> const& childrenNodes() const;
+ QString const toString() const;
+ bool checked() const;
+ int externalDomainDepth() const;
+ bool onlyCheckHeader() const;
+ bool malformed() const;
+ bool hasBaseURI() const;
+ bool hasHtmlDocTitle() const;
+ KURL const& baseURI() const; // asserts hasBaseURI()
+ QString const& htmlDocTitle() const; // asserts hasHtmlDocTitle()
+ bool ignored() const;
+ bool redirectionExists(KURL const& url) const; // to avoid cyclic links
+ QString mimeType() const;
+ bool isErrorPage() const;
+ TreeViewItem* treeViewItem() const;
+ QValueVector<KURL> const& referrers() const;
+
+ static LinkStatus* lastRedirection(LinkStatus* ls);
+
+private:
+
+ /**
+ Loads some attributes depending on its parent node.
+ */
+ void loadNode();
+
+private:
+
+ KURL root_url_; // The URL which made the search start
+ Status status_;
+ int depth_; // -1 until setDepth() is called
+ int external_domain_depth_; // Allows choosing to explore different domains up to depth n; -1 means local (see local())
+ QString original_url_;
+ QString label_;
+ KURL absolute_url_;
+ QString doc_html_;
+ HttpResponseHeader http_header_;
+ QString status_text_; // FIXME Legacy. This should be eliminated in favor of LinkStatus::Status
+ QString error_;
+ bool is_root_;
+ bool error_occurred_;
+ bool is_redirection_;
+ vector<Node*> children_nodes_;
+ LinkStatus* parent_;
+ LinkStatus* redirection_;
+ bool checked_;
+ bool only_check_header_;
+ bool malformed_;
+ Node* node_;
+ bool has_base_URI_;
+ bool has_html_doc_title_;
+ KURL base_URI_;
+ QString html_doc_title_;
+ bool ignored_;
+ QString mimetype_;
+ bool is_error_page_;
+ bool is_local_restrict_;
+ TreeViewItem* tree_view_item_;
+ QValueVector<KURL> referrers_; // URLs of the documents that link to this URL
+};
+
+#include "../parser/url.h"
+#include "linkstatus_impl.h"
+
+#endif
diff --git a/klinkstatus/src/engine/linkstatus_impl.h b/klinkstatus/src/engine/linkstatus_impl.h
new file mode 100644
index 00000000..3359664c
--- /dev/null
+++ b/klinkstatus/src/engine/linkstatus_impl.h
@@ -0,0 +1,417 @@
+/***************************************************************************
+ * Copyright (C) 2004 by Paulo Moura Guedes *
+ * moura@kdewebdev.org *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ ***************************************************************************/
+
+/**
+ * Builds an empty, unchecked link: status UNDETERMINED, both depths at -1,
+ * and no parent, node, redirection or tree view item.
+ */
+inline LinkStatus::LinkStatus()
+    : status_(LinkStatus::UNDETERMINED), depth_(-1), external_domain_depth_(-1), is_root_(false),
+    error_occurred_(false), is_redirection_(false), parent_(0), redirection_(0), checked_(false),
+    only_check_header_(true), malformed_(false),
+    node_(0), has_base_URI_(false), has_html_doc_title_(false), ignored_(false),
+    mimetype_(""), is_error_page_(false),
+    is_local_restrict_(false), // previously left uninitialized, so isLocalRestrict() read an indeterminate value
+    tree_view_item_(0)
+{}
+
+/**
+ * Builds an unchecked link for @p absolute_url.
+ *
+ * All other attributes get the same defaults as the default constructor:
+ * status UNDETERMINED, both depths at -1, no parent, node or redirection.
+ */
+inline LinkStatus::LinkStatus(KURL const& absolute_url)
+    : status_(LinkStatus::UNDETERMINED), depth_(-1), external_domain_depth_(-1), is_root_(false),
+    error_occurred_(false), is_redirection_(false), parent_(0), redirection_(0), checked_(false),
+    only_check_header_(true), malformed_(false),
+    node_(0), has_base_URI_(false), has_html_doc_title_(false), ignored_(false),
+    mimetype_(""), is_error_page_(false),
+    is_local_restrict_(false), // previously left uninitialized, so isLocalRestrict() read an indeterminate value
+    tree_view_item_(0)
+{
+    setAbsoluteUrl(absolute_url);
+}
+
+/**
+ * Builds the link found in @p node inside the document of @p parent.
+ *
+ * The node attributes are loaded through loadNode(); the depth is the
+ * parent's depth plus one and the root URL is copied from the parent.
+ * @p parent is dereferenced and therefore must not be null.
+ */
+inline LinkStatus::LinkStatus(Node* node, LinkStatus* parent)
+    : status_(LinkStatus::UNDETERMINED), depth_(-1), external_domain_depth_(-1), is_root_(false),
+    error_occurred_(false), is_redirection_(false), parent_(0), redirection_(0), checked_(false),
+    only_check_header_(true), malformed_(false),
+    node_(node), has_base_URI_(false), has_html_doc_title_(false), ignored_(false),
+    mimetype_(""), is_error_page_(false),
+    is_local_restrict_(false), // previously left uninitialized, so isLocalRestrict() read an indeterminate value
+    tree_view_item_(0)
+{
+    loadNode();
+
+    setDepth(parent->depth() + 1);
+    setParent(parent);
+    setRootUrl(parent->rootUrl());
+}
+
+inline void LinkStatus::setRootUrl(KURL const& url) // Stores the URL from which the search was started.
+{
+ root_url_ = url;
+}
+
+inline void LinkStatus::setStatus(Status status) // Stores the check outcome category.
+{
+ status_ = status;
+}
+
+inline void LinkStatus::setDepth(uint depth) // Stores the depth of this link; the uint is converted to the int member depth_.
+{
+ depth_ = depth;
+}
+
+inline void LinkStatus::setParent(LinkStatus* parent) // Sets the parent link and registers its absolute URL as a referrer.
+{
+ Q_ASSERT(parent);
+
+ parent_ = parent;
+ addReferrer(parent->absoluteUrl()); // the parent's document is the first known referrer
+}
+
+inline void LinkStatus::setAbsoluteUrl(KURL const& url_absoluto) // Stores the absolute URL of this link.
+{
+ absolute_url_ = url_absoluto;
+}
+
+inline void LinkStatus::setOriginalUrl(QString const& url_original) // Stores the original URL text of this link.
+{
+ original_url_ = url_original;
+}
+
+inline void LinkStatus::setLabel(QString const& label) // Stores the label of this link.
+{
+ label_ = label;
+}
+
+inline void LinkStatus::setDocHtml(QString const& doc_html) // Stores the HTML document text; asserts it is not empty.
+{
+ Q_ASSERT(!doc_html.isEmpty());
+ doc_html_ = doc_html;
+}
+
+inline void LinkStatus::setHttpHeader(HttpResponseHeader const& cabecalho_http) // Stores a copy of the HTTP response header.
+{
+ http_header_ = cabecalho_http;
+}
+
+inline void LinkStatus::setStatusText(QString const& status) // Stores the legacy status text; asserts it is not empty.
+{
+ Q_ASSERT(!status.isEmpty());
+ status_text_ = status;
+}
+
+inline void LinkStatus::setError(QString const& error) // Stores the error text; asserts it is not empty.
+{
+ Q_ASSERT(!error.isEmpty());
+ error_ = error;
+}
+
+inline void LinkStatus::setErrorOccurred(bool houve_error) // Sets the flag read by errorOccurred().
+{
+ error_occurred_ = houve_error;
+}
+
+/**
+ * Marks (or unmarks) this link as the root of the search.
+ *
+ * The "ROOT" label is applied only when @p flag is true; clearing the flag
+ * no longer overwrites whatever label the link currently has.
+ */
+inline void LinkStatus::setIsRoot(bool flag)
+{
+    is_root_ = flag;
+    if(flag)
+        label_ = i18n("ROOT");
+}
+
+inline void LinkStatus::setRedirection(LinkStatus* redirection) // Stores the redirection target; asserts it is non-null and that isRedirection() is already true.
+{
+ Q_ASSERT(redirection != NULL);
+ Q_ASSERT(isRedirection());
+ redirection_ = redirection;
+}
+
+inline void LinkStatus::setIsRedirection(bool e_redirection) // Sets the flag read by isRedirection().
+{
+ is_redirection_ = e_redirection;
+}
+
+inline void LinkStatus::addChildNode(Node* node) // Appends node to the list returned by childrenNodes().
+{
+ children_nodes_.push_back(node);
+}
+
+inline void LinkStatus::reserveMemoryForChildrenNodes(int n) // Reserves capacity for n child nodes; asserts n > 0.
+{
+ Q_ASSERT(n > 0);
+ children_nodes_.reserve(n);
+}
+
+inline void LinkStatus::setChecked(bool flag) // Sets the flag read by checked().
+{
+ checked_ = flag;
+}
+
+inline void LinkStatus::setExternalDomainDepth(int p) // Stores the external-domain depth; asserts p >= -1 (-1 is what local() tests for).
+{
+ Q_ASSERT(p >= -1);
+ external_domain_depth_ = p;
+}
+
+inline void LinkStatus::setOnlyCheckHeader(bool flag) // Sets the flag read by onlyCheckHeader().
+{
+ only_check_header_= flag;
+}
+
+inline void LinkStatus::setHasBaseURI(bool flag) // Sets the flag read by hasBaseURI(); the stored base URI itself is not changed.
+{
+ has_base_URI_ = flag;
+}
+
+inline void LinkStatus::setHasHtmlDocTitle(bool flag) // Sets the flag read by hasHtmlDocTitle(); the stored title itself is not changed.
+{
+ has_html_doc_title_ = flag;
+}
+
+/**
+ * Stores @p base_url as the base URI of this document and makes
+ * hasBaseURI() true.
+ *
+ * An invalid URL is reported with a warning and trips an assertion, but is
+ * still stored.
+ */
+inline void LinkStatus::setBaseURI(KURL const& base_url)
+{
+    if(!base_url.isValid())
+    {
+        // parent() is 0 for links created without a parent, so it must not
+        // be dereferenced unconditionally while building the warning.
+        QString const parent_url = parent() ? parent()->absoluteUrl().prettyURL() : QString::null;
+        kdWarning(23100) << "base url not valid: " << endl
+        << "parent: " << parent_url << endl
+        << "url: " << absoluteUrl().prettyURL() << endl
+        << "base url resolved: " << base_url.prettyURL() << endl;
+    }
+
+    Q_ASSERT(base_url.isValid());
+    has_base_URI_ = true;
+    base_URI_ = base_url;
+}
+
+inline void LinkStatus::setHtmlDocTitle(QString const& title) // Stores the HTML document title and makes hasHtmlDocTitle() true.
+{
+ if(title.isNull() || title.isEmpty()) // log the offending link before the assertion below fires
+ {
+ kdError(23100) << "HTML doc title is null or empty!" << endl
+ << toString() << endl;
+ }
+ Q_ASSERT(!title.isNull() && !title.isEmpty());
+
+ has_html_doc_title_ = true; // set even when the title was null or empty
+ html_doc_title_ = title;
+}
+
+inline void LinkStatus::setIgnored(bool flag) // Sets the flag read by ignored().
+{
+ ignored_ = flag;
+}
+
+inline void LinkStatus::setMimeType(QString const& mimetype) // Stores the MIME type; asserts it is neither null nor empty.
+{
+ Q_ASSERT(!mimetype.isNull() && !mimetype.isEmpty());
+ mimetype_ = mimetype;
+}
+
+inline void LinkStatus::setIsErrorPage(bool flag) // Sets the flag read by isErrorPage().
+{
+ is_error_page_ = flag;
+}
+
+inline void LinkStatus::setIsLocalRestrict(bool flag) // Sets the flag read by isLocalRestrict().
+{
+ is_local_restrict_ = flag;
+}
+
+inline void LinkStatus::setTreeViewItem(TreeViewItem* tree_view_item) // Stores the associated tree view item pointer; asserts it is non-null.
+{
+ Q_ASSERT(tree_view_item);
+ tree_view_item_ = tree_view_item;
+}
+
+inline void LinkStatus::addReferrer(KURL const& url) // Appends url to the referrer list; asserts it is valid. Duplicates are not filtered here.
+{
+ Q_ASSERT(url.isValid());
+
+ referrers_.push_back(url);
+}
+
+
+
+
+inline KURL const& LinkStatus::rootUrl() const // URL from which the search was started.
+{
+ return root_url_;
+}
+
+inline LinkStatus::Status const& LinkStatus::status() const // Outcome category last stored with setStatus() (UNDETERMINED after construction).
+{
+ return status_;
+}
+
+inline uint LinkStatus::depth() const // Depth of this link; the constructors leave depth_ at -1, which is returned here converted to uint.
+{
+ return depth_;
+}
+
+inline bool LinkStatus::local() const // True when the external-domain depth is -1.
+{
+ return external_domain_depth_ == -1;
+}
+
+inline bool LinkStatus::isLocalRestrict() const // Returns the is_local_restrict_ flag.
+{
+ return is_local_restrict_;
+}
+
+inline LinkStatus const* LinkStatus::parent() const // Parent link; 0 when no parent was set.
+{
+ return parent_;
+}
+
+inline QString const& LinkStatus::originalUrl() const // URL text last stored with setOriginalUrl().
+{
+ return original_url_;
+}
+
+inline QString const& LinkStatus::label() const // Label of this link.
+{
+ return label_;
+}
+
+inline KURL const& LinkStatus::absoluteUrl() const // Absolute URL of this link.
+{
+ return absolute_url_;
+}
+
+inline QString const& LinkStatus::docHtml() const // HTML document text last stored with setDocHtml().
+{
+ return doc_html_;
+}
+
+inline HttpResponseHeader const& LinkStatus::httpHeader() const // Read-only access to the stored HTTP response header.
+{
+ return http_header_;
+}
+
+inline HttpResponseHeader& LinkStatus::httpHeader() // Mutable access to the stored HTTP response header.
+{
+ return http_header_;
+}
+
+/**
+ * Returns the text to display as the status of this link.
+ *
+ * In order of precedence: the error text when an error occurred; the stored
+ * status text when the protocol does not start with "http" or the URL has a
+ * reference; "OK" for HTTP status code 200; otherwise the HTTP status code
+ * as a string.
+ */
+inline QString LinkStatus::statusText() const
+{
+    if(errorOccurred())
+        return error();
+
+    if(!absoluteUrl().protocol().startsWith("http"))
+        return status_text_;
+
+    QString const code_text = QString::number(httpHeader().statusCode());
+
+    if(absoluteUrl().hasRef()) // ref URL
+        return status_text_;
+
+    if(code_text == "200"/* or code_text == "304"*/)
+        return "OK";
+
+    return code_text;
+}
+
+inline QString const& LinkStatus::error() const // Error text last stored with setError().
+{
+ return error_;
+}
+
+inline bool LinkStatus::isRoot() const // True when this link was marked as the search root.
+{
+ return is_root_;
+}
+
+inline bool LinkStatus::errorOccurred() const // True when the error flag is set.
+{
+ return error_occurred_;
+}
+
+inline bool LinkStatus::isRedirection() const // True when this link was flagged as a redirection.
+{
+ return is_redirection_;
+}
+
+inline LinkStatus* LinkStatus::redirection() const // Redirection target; asserts isRedirection(). The pointer is 0 until setRedirection() is called.
+{
+ Q_ASSERT(isRedirection());
+
+ return redirection_;
+}
+
+inline Node* LinkStatus::node() const // Node this link was built from; may be 0 (the non-null assertion below is disabled).
+{
+ //Q_ASSERT(node_);
+ return node_;
+}
+
+inline vector<Node*> const& LinkStatus::childrenNodes() const // Nodes stored for this link's document.
+{
+ return children_nodes_;
+}
+
+inline bool LinkStatus::checked() const // True once setChecked(true) was called.
+{
+ return checked_;
+}
+
+inline int LinkStatus::externalDomainDepth() const // External-domain depth; -1 is the value local() tests for.
+{
+ return external_domain_depth_;
+}
+
+inline bool LinkStatus::onlyCheckHeader() const // Returns the only_check_header_ flag (true after construction).
+{
+ return only_check_header_;
+}
+
+inline bool LinkStatus::hasBaseURI() const // True when a base URI was set.
+{
+ return has_base_URI_;
+}
+
+inline bool LinkStatus::hasHtmlDocTitle() const // True when an HTML document title was set.
+{
+ return has_html_doc_title_;
+}
+
+inline KURL const& LinkStatus::baseURI() const // Base URI of the document; asserts hasBaseURI().
+{
+ Q_ASSERT(hasBaseURI());
+ return base_URI_;
+}
+
+inline QString const& LinkStatus::htmlDocTitle() const // Title of the HTML document; asserts that a title was set.
+{
+ Q_ASSERT(has_html_doc_title_);
+ return html_doc_title_;
+}
+
+inline bool LinkStatus::ignored() const // True when this link was flagged as ignored.
+{
+ return ignored_;
+}
+
+inline QString LinkStatus::mimeType() const // MIME type, returned by value; asserts it is not a null string.
+{
+ Q_ASSERT(!mimetype_.isNull());
+ return mimetype_;
+}
+
+inline bool LinkStatus::isErrorPage() const // Returns the is_error_page_ flag.
+{
+ return is_error_page_;
+}
+
+inline TreeViewItem* LinkStatus::treeViewItem() const // Associated tree view item; 0 until setTreeViewItem() is called.
+{
+ return tree_view_item_;
+}
+
+inline QValueVector<KURL> const& LinkStatus::referrers() const // URLs recorded through addReferrer().
+{
+ return referrers_;
+}
+
diff --git a/klinkstatus/src/engine/searchmanager.cpp b/klinkstatus/src/engine/searchmanager.cpp
new file mode 100644
index 00000000..81562a7a
--- /dev/null
+++ b/klinkstatus/src/engine/searchmanager.cpp
@@ -0,0 +1,916 @@
+/***************************************************************************
+ * Copyright (C) 2004 by Paulo Moura Guedes *
+ * moura@kdewebdev.org *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ ***************************************************************************/
+
+#include <kapplication.h>
+#include <kdebug.h>
+#include <klocale.h>
+#include <khtml_part.h>
+#include <kprotocolmanager.h>
+
+#include <qstring.h>
+#include <qvaluevector.h>
+#include <qdom.h>
+
+#include <iostream>
+#include <unistd.h>
+
+#include "searchmanager.h"
+#include "../parser/mstring.h"
+#include "../cfg/klsconfig.h"
+
+
+SearchManager::SearchManager(int max_simultaneous_connections, int time_out, // Creates an idle manager: no depth limit set (depth_ == -1), all counters at zero.
+ QObject *parent, const char *name)
+ : QObject(parent, name),
+ max_simultaneous_connections_(max_simultaneous_connections), has_document_root_(false),
+ depth_(-1), current_depth_(0), external_domain_depth_(0),
+ current_node_(0), current_index_(0), links_being_checked_(0),
+ finished_connections_(max_simultaneous_connections_),
+ maximum_current_connections_(-1), general_domain_(false),
+ checked_general_domain_(false), time_out_(time_out), current_connections_(0),
+ send_identification_(true), canceled_(false), searching_(false), checked_links_(0), ignored_links_(0),
+ check_parent_dirs_(true), check_external_links_(true), check_regular_expressions_(false),
+ number_of_level_links_(0), number_of_links_to_check_(0)
+{
+ root_.setIsRoot(true);
+
+ if (KLSConfig::userAgent().isEmpty()) { // no user agent configured yet: store the KDE default in the configuration
+ KLSConfig::setUserAgent(KProtocolManager::defaultUserAgent());
+ }
+ user_agent_ = KLSConfig::userAgent();
+}
+
+/**
+ * Returns the manager to its initial state so a new search can be started.
+ *
+ * Resets the root link, deletes every stored result, clears the traversal
+ * position, the domain settings and the per-search counters, reloads the
+ * user agent from the configuration and removes the HTML parts.
+ */
+void SearchManager::reset()
+{
+    kdDebug(23100) << "SearchManager::reset()" << endl;
+
+    //Q_ASSERT(not links_being_checked_);
+
+    root_.reset();
+    cleanItems();
+    depth_ = -1;
+    current_depth_ = 0;
+    current_node_ = 0;
+    current_index_ = 0;
+    finished_connections_ = max_simultaneous_connections_;
+    domain_ = "";
+    maximum_current_connections_ = -1;
+    general_domain_ = false;
+    checked_general_domain_ = false;
+    check_regular_expressions_ = false;
+    current_connections_ = 0;
+    canceled_ = false;
+    searching_ = false;
+    checked_links_ = 0;
+    // Per-search counter like checked_links_; without this the ignored-link
+    // count of the previous search would carry over into the next one.
+    ignored_links_ = 0;
+    if(KLSConfig::userAgent().isEmpty()) {
+        KLSConfig::setUserAgent(KProtocolManager::defaultUserAgent());
+    }
+    user_agent_ = KLSConfig::userAgent();
+
+    removeHtmlParts();
+}
+
+SearchManager::~SearchManager() // Releases the stored results by running reset().
+{
+ reset();
+}
+
+/**
+ * Deletes every LinkStatus stored in search_results_ and empties the
+ * container, level by level and node by node.
+ */
+void SearchManager::cleanItems()
+{
+    for(uint level = 0; level != search_results_.size(); ++level)
+    {
+        vector< vector<LinkStatus*> >& nodes = search_results_[level];
+        for(uint n = 0; n != nodes.size(); ++n)
+        {
+            vector<LinkStatus*>& links = nodes[n];
+            for(uint k = 0; k != links.size(); ++k)
+            {
+                if(links[k] == 0)
+                {
+                    kdDebug(23100) << "LinkStatus NULL!!" << endl;
+                    continue;
+                }
+                delete links[k];
+                links[k] = 0;
+            }
+            links.clear();
+        }
+        nodes.clear();
+    }
+    search_results_.clear();
+    kdDebug(23100) << endl;
+}
+
+/**
+ * Starts a new search at @p root using search mode @p modo.
+ *
+ * Restarts the timer, derives the domain from the root URL when none is set
+ * yet, initializes the root link and then checks the root URL.
+ */
+void SearchManager::startSearch(KURL const& root, SearchMode const& modo)
+{
+    canceled_ = false;
+
+    //time_.restart();
+    time_.start();
+
+    Q_ASSERT(root.isValid());
+    //Q_ASSERT(root.protocol() == "http" || root.protocol() == "https");
+
+    // Only derive the domain from the root URL when the caller set none.
+    if(root.hasHost() && (domain_.isNull() || domain_.isEmpty()))
+    {
+        setDomain(root.host() + root.directory());
+        kdDebug(23100) << "Domain: " << domain_ << endl;
+    }
+
+    root_.setIsRoot(true);
+    root_.setDepth(0);
+    root_.setOriginalUrl(root.prettyURL());
+    root_.setAbsoluteUrl(root);
+    root_.setOnlyCheckHeader(false);
+    root_.setRootUrl(root);
+
+    search_mode_ = modo;
+    // A domain search runs without a depth limit (depth_ == -1); every
+    // other mode needs one.
+    if(modo == domain)
+    {
+        Q_ASSERT(depth_ == -1);
+    }
+    else
+    {
+        Q_ASSERT(depth_ != -1);
+    }
+
+    searching_ = true;
+
+    //Q_ASSERT(domain_ != QString::null);
+    checkRoot();
+}
+
+void SearchManager::resume() // Clears the canceled state and continues the search from the current position.
+{
+ searching_ = true;
+ canceled_ = false;
+ continueSearch();
+}
+
+void SearchManager::finnish() // Marks the search as stopped, waits until no link is being checked, then emits signalSearchFinished().
+{
+ searching_ = false;
+ while(links_being_checked_) // NOTE(review): sleep() does not run the event loop; confirm the counter can actually drop to 0 while this loop blocks
+ {
+ kdDebug(23100) << "links_being_checked_: " << links_being_checked_ << endl;
+ sleep(1);
+ }
+ emit signalSearchFinished();
+}
+
+void SearchManager::pause() // Marks the search as stopped, waits until no link is being checked, then emits signalSearchPaused().
+{
+ searching_ = false;
+ while(links_being_checked_) // NOTE(review): sleep() does not run the event loop; confirm the counter can actually drop to 0 while this loop blocks
+ {
+ kdDebug(23100) << "links_being_checked_: " << links_being_checked_ << endl;
+ sleep(1);
+ }
+ emit signalSearchPaused();
+}
+
+void SearchManager::cancelSearch() // Only raises the canceled_ flag; slotLinkChecked() reacts to it when checks complete.
+{
+ canceled_ = true;
+}
+
+void SearchManager::checkRoot() // Creates a LinkChecker for the root link and starts it; the result arrives in slotRootChecked().
+{
+ LinkChecker* checker = new LinkChecker(&root_, time_out_, this, "link_checker"); // deleted at the end of slotRootChecked()
+ checker->setSearchManager(this);
+
+ connect(checker, SIGNAL(transactionFinished(const LinkStatus *, LinkChecker *)),
+ this, SLOT(slotRootChecked(const LinkStatus *, LinkChecker *)));
+ /*
+ connect(checker, SIGNAL(jobFinnished(LinkChecker *)),
+ this, SLOT(slotLinkCheckerFinnished(LinkChecker *)));
+ */
+ checker->check();
+}
+
+void SearchManager::slotRootChecked(const LinkStatus * link, LinkChecker * checker) // Slot run when the root check completes: publishes the result, then starts the first level or finishes. Deletes checker.
+{
+ kdDebug(23100) << "SearchManager::slotRootChecked:" << endl;
+ kdDebug(23100) << link->absoluteUrl().url() << " -> " <<
+ LinkStatus::lastRedirection(&root_)->absoluteUrl().url() << endl;
+
+ Q_ASSERT(checked_links_ == 0);
+ Q_ASSERT(search_results_.size() == 0);
+
+ ++checked_links_;
+ //kdDebug(23100) << "++checked_links_: SearchManager::slotRootChecked" << endl;
+ emit signalRootChecked(link, checker);
+
+ if(search_mode_ != depth || depth_ > 0) // go below the root unless a depth search was requested with depth 0
+ {
+ current_depth_ = 1;
+
+ vector<LinkStatus*> no = children(LinkStatus::lastRedirection(&root_)); // links of the document the root finally redirected to
+
+ emit signalLinksToCheckTotalSteps(no.size());
+
+ vector< vector<LinkStatus*> > nivel; // first level: a single node holding the root's children
+ nivel.push_back(no);
+
+ search_results_.push_back(nivel);
+
+ if(search_results_.size() != 1)
+ {
+ kdDebug(23100) << "search_results_.size() != 1:" << endl;
+ kdDebug(23100) << "size: " << search_results_.size() << endl;
+ }
+ Q_ASSERT(search_results_.size() == 1);
+
+ if(no.size() > 0)
+ {
+ startSearch();
+ }
+ else
+ {
+ kdDebug(23100) << "SearchManager::slotRootChecked#1" << endl;
+ finnish();
+ }
+ }
+
+ else
+ {
+ Q_ASSERT(search_results_.size() == 0);
+ kdDebug(23100) << "SearchManager::slotRootChecked#2" << endl;
+ finnish();
+ }
+
+ delete checker;
+ checker = 0; // only clears the local copy of the pointer
+}
+
+vector<LinkStatus*> SearchManager::children(LinkStatus* link) // Builds a new LinkStatus (allocated with new) for every child node of link that must be checked.
+{
+ vector<LinkStatus*> children;
+
+ if(!link || link->absoluteUrl().hasRef()) // nothing to expand for a null link or a URL with a reference
+ return children;
+
+ vector<Node*> const& nodes = link->childrenNodes();
+
+ int count = 0; // nodes visited since the event loop was last serviced
+ for(uint i = 0; i != nodes.size(); ++i)
+ {
+ ++count;
+
+ Node* node = nodes[i];
+ KURL url;
+ if(node->url().isEmpty())
+ url = "";
+ else
+ url = Url::normalizeUrl(node->url(), *link, documentRoot().path());
+
+ if( (node->isLink() && // accepted: a non-empty, checkable link not yet in this batch, or any malformed node
+ checkable(url, *link) && // note: checkable() calls existUrl(), which may add link's URL as a referrer of an already known URL
+ !Url::existUrl(url, children) &&
+ !node->url().isEmpty())
+ ||
+ node->malformed() )
+ {
+ LinkStatus* ls = new LinkStatus(node, link);
+ ls->setAbsoluteUrl(url);
+
+ if(localDomain(ls->absoluteUrl()))
+ ls->setExternalDomainDepth(-1);
+ else
+ ls->setExternalDomainDepth(link->externalDomainDepth() + 1);
+
+ //ls->setIsLocalRestrict(localDomain(url));
+ ls->setIsLocalRestrict(ls->local()); // @todo clean this nonsense
+
+ if(!validUrl(url)) {
+ ls->setMalformed(true);
+ ls->setErrorOccurred(true);
+ }
+
+ ls->setOnlyCheckHeader(onlyCheckHeader(ls));
+
+ if(link->externalDomainDepth() > external_domain_depth_) // dump both links before the assertion below fires
+ {
+ kdDebug(23100) << "link->externalDomainDepth() > external_domain_depth_: "
+ << link->externalDomainDepth() << endl;
+ kdDebug(23100) << "link: " << endl << link->toString() << endl;
+ kdDebug(23100) << "child: " << endl << ls->toString() << endl;
+ }
+ Q_ASSERT(link->externalDomainDepth() <= external_domain_depth_);
+
+ children.push_back(ls);
+ }
+ if(count == 50) // service the event loop every 50 nodes
+ {
+ kapp->processEvents();
+ count = 0;
+ }
+ }
+
+ return children;
+}
+
+/**
+ * Tells whether @p url is already known to the search.
+ *
+ * An empty URL and the root's original URL count as known. When the URL is
+ * found among the stored results, @p url_parent is added to that link's
+ * referrers unless it is already listed there.
+ *
+ * @return true when the URL is known, false otherwise.
+ */
+bool SearchManager::existUrl(KURL const& url, KURL const& url_parent) const
+{
+    if(url.prettyURL().isEmpty() || root_.originalUrl() == url.prettyURL())
+        return true;
+
+    for(uint level = 0; level != search_results_.size(); ++level)
+    {
+        vector< vector<LinkStatus*> > const& nodes = search_results_[level];
+        for(uint n = 0; n != nodes.size(); ++n)
+        {
+            vector<LinkStatus*> const& links = nodes[n];
+            for(uint k = 0; k != links.size(); ++k)
+            {
+                LinkStatus* known = links[k];
+                Q_ASSERT(known);
+                if(known->absoluteUrl() == url)
+                { // URL exists
+                    QValueVector<KURL> known_referrers(known->referrers());
+
+                    // Add new referrer
+                    bool already_listed = false;
+                    for(uint r = 0; r != known_referrers.size() && !already_listed; ++r)
+                        already_listed = (known_referrers[r] == url_parent);
+
+                    if(!already_listed)
+                        known->addReferrer(url_parent);
+
+                    return true;
+                }
+            }
+        }
+    }
+
+    return false;
+}
+
+LinkStatus const* SearchManager::linkStatus(QString const& s_url) const // Returns the root or the first already-checked stored link whose URL equals s_url, or 0.
+{
+ Q_ASSERT(!s_url.isEmpty());
+
+ if(root_.absoluteUrl().url() == s_url)
+ return &root_;
+
+ int count = 0; // links visited since the event loop was last serviced
+ for(uint i = 0; i != search_results_.size(); ++i)
+ for(uint j = 0; j != search_results_[i].size(); ++j)
+ for(uint l = 0; l != (search_results_[i])[j].size(); ++l)
+ {
+ ++count;
+
+ LinkStatus* ls = search_results_[i][j][l];
+ Q_ASSERT(ls);
+ if(ls->absoluteUrl().url() == s_url && ls->checked()) // unchecked links are skipped
+ return ls;
+
+ if(count == 50) // service the event loop every 50 links
+ {
+ count = 0;
+ kapp->processEvents();
+ }
+
+ }
+
+ return 0;
+}
+
+
+/**
+ * Starts checking the first level (the children of the root).
+ *
+ * Expects current_depth_ == 1 with exactly one node stored for that level.
+ * A depth search whose limit is already exceeded finishes immediately.
+ */
+void SearchManager::startSearch()
+{
+    Q_ASSERT(current_depth_ == 1);
+    Q_ASSERT(search_results_[current_depth_ - 1].size() == 1);
+    Q_ASSERT(current_node_ == 0);
+
+    bool const depth_exceeded = (int)current_depth_ > depth_ && search_mode_ == depth;
+    if(depth_exceeded)
+    {
+        kdDebug(23100) << "Search Finished! (SearchManager::comecaPesquisa)" << endl;
+        finnish();
+        return;
+    }
+
+    checkVectorLinks(nodeToAnalize());
+}
+
+void SearchManager::continueSearch() // Advances the traversal: next batch of the current node, else next node, else next level, else finish.
+{
+ Q_ASSERT(!links_being_checked_);
+
+ vector<LinkStatus*> const& no = nodeToAnalize();
+
+ if((uint)current_index_ < no.size()) // the current node still has unchecked links
+ checkVectorLinks(no);
+
+ else
+ {
+ current_index_ = 0;
+ kdDebug(23100) << "Next node_____________________\n\n";
+ ++current_node_;
+ if( (uint)current_node_ < (search_results_[current_depth_ - 1]).size() ) // another node remains on this level
+ checkVectorLinks(nodeToAnalize());
+ else
+ {
+ kdDebug(23100) << "Next Level_____________________________________________________________________________________\n\n\n";
+ if(search_mode_ == SearchManager::domain ||
+ current_depth_ < depth_)
+ {
+ current_node_ = 0;
+ ++current_depth_;
+
+ addLevel();
+
+ if( (uint)current_depth_ == search_results_.size() ) // addLevel() kept the new level only when it found links
+ checkVectorLinks(nodeToAnalize());
+ else
+ {
+ kdDebug(23100) << "Search Finished! (SearchManager::continueSearch#1)" << endl;
+ finnish();
+ }
+ }
+ else
+ {
+ kdDebug(23100) << "Search Finished! (SearchManager::continueSearch#2)" << endl;
+ finnish();
+ }
+ }
+ }
+}
+
+vector<LinkStatus*> const& SearchManager::nodeToAnalize() const // Links of the node at (current_depth_ - 1, current_node_); asserts both indices are in range.
+{
+ Q_ASSERT( (uint)current_depth_ == search_results_.size() );
+ Q_ASSERT( (uint)current_node_ < (search_results_[current_depth_ - 1]).size() );
+
+ return (search_results_[current_depth_ - 1])[current_node_];
+}
+
+void SearchManager::checkVectorLinks(vector<LinkStatus*> const& links) // Picks the next batch from links with chooseLinks() and checks it.
+{
+ checkLinksSimultaneously(chooseLinks(links));
+}
+
+/**
+ * Takes the next batch of links to check from @p links.
+ *
+ * Starting at current_index_, at most max_simultaneous_connections_ links
+ * are returned and current_index_ is advanced past them.
+ */
+vector<LinkStatus*> SearchManager::chooseLinks(vector<LinkStatus*> const& links)
+{
+    vector<LinkStatus*> batch;
+    int taken = 0;
+    while(taken != max_simultaneous_connections_ && (uint)current_index_ < links.size())
+    {
+        batch.push_back(links[current_index_]);
+        ++current_index_;
+        ++taken;
+    }
+    return batch;
+}
+
+/**
+ * Starts the check of every link in @p links.
+ *
+ * Malformed links and "javascript:" URLs are marked as checked right away
+ * and reported synchronously through slotLinkChecked(); every other link
+ * gets its own LinkChecker, which reports to slotLinkChecked() when its
+ * transaction finishes.
+ */
+void SearchManager::checkLinksSimultaneously(vector<LinkStatus*> const& links)
+{
+    Q_ASSERT(finished_connections_ <= max_simultaneous_connections_);
+    finished_connections_ = 0;
+    links_being_checked_ = 0;
+
+    // Number of results to wait for before the search continues.
+    if(links.size() < (uint)max_simultaneous_connections_)
+        maximum_current_connections_ = links.size();
+    else
+        maximum_current_connections_ = max_simultaneous_connections_;
+
+    for(uint i = 0; i != links.size(); ++i)
+    {
+        LinkStatus* ls(links[i]);
+        Q_ASSERT(ls);
+
+        ++links_being_checked_;
+        Q_ASSERT(links_being_checked_ <= max_simultaneous_connections_);
+
+        if(ls->malformed())
+        {
+            Q_ASSERT(ls->errorOccurred());
+            Q_ASSERT(ls->status() == LinkStatus::MALFORMED);
+
+            ls->setChecked(true);
+            slotLinkChecked(ls, 0);
+        }
+
+        else if(ls->absoluteUrl().prettyURL().contains("javascript:", false))
+        {
+            ++ignored_links_;
+            ls->setIgnored(true);
+            ls->setErrorOccurred(true);
+            ls->setError(i18n( "Javascript not supported" ));
+            ls->setStatus(LinkStatus::NOT_SUPPORTED);
+            ls->setChecked(true);
+            slotLinkChecked(ls, 0);
+        }
+        /*
+        // Disabled protocol filter; the protocol is only needed here.
+        QString protocol = ls->absoluteUrl().protocol();
+        else if(!(protocol == "http" || protocol == "https"))
+        {
+            ++ignored_links_;
+            ls->setIgnored(true);
+            ls->setErrorOccurred(true);
+            ls->setError(i18n("Protocol %1 not supported").arg(protocol));
+            ls->setStatus(LinkStatus::MALFORMED);
+            ls->setChecked(true);
+            slotLinkChecked(ls, 0);
+        }
+        */
+        else
+        {
+            LinkChecker* checker = new LinkChecker(ls, time_out_, this, "link_checker");
+            checker->setSearchManager(this);
+
+            connect(checker, SIGNAL(transactionFinished(const LinkStatus *, LinkChecker *)),
+                    this, SLOT(slotLinkChecked(const LinkStatus *, LinkChecker *)));
+            /*
+            connect(checker, SIGNAL(jobFinnished(LinkChecker *)),
+                    this, SLOT(slotLinkCheckerFinnished(LinkChecker *)));
+            */
+            checker->check();
+        }
+    }
+}
+
+/**
+ * Slot run for every link whose check completed.
+ *
+ * Publishes the result, updates the counters and then either pauses (the
+ * search was canceled and no check is pending any more) or continues with
+ * the next batch once the whole current batch has reported.
+ * @p checker is 0 for links that were resolved without a LinkChecker.
+ */
+void SearchManager::slotLinkChecked(const LinkStatus * link, LinkChecker * checker)
+{
+    kdDebug(23100) << "SearchManager::slotLinkChecked:" << endl;
+//     kdDebug(23100) << link->absoluteUrl().url() << " -> " <<
+//     LinkStatus::lastRedirection((const_cast<LinkStatus*> (link)))->absoluteUrl().url() << endl;
+
+    Q_ASSERT(link);
+    emit signalLinkChecked(link, checker);
+    ++checked_links_;
+    ++finished_connections_;
+    --links_being_checked_;
+
+    if(links_being_checked_ < 0)
+    {
+        kdDebug(23100) << link->toString() << endl;
+    }
+    Q_ASSERT(links_being_checked_ >= 0);
+
+    if(canceled_)
+    {
+        if(searching_ && !links_being_checked_)
+        {
+            pause();
+        }
+    }
+    else if(finished_connections_ == maximumCurrentConnections())
+    {
+        continueSearch();
+    }
+    /*
+    delete checker;
+    checker = 0;
+    */
+}
+
+/**
+ * Builds the next level of the search from the links of the current last
+ * level.
+ *
+ * For every link of the previous level the children of its last redirection
+ * are collected; each non-empty child list becomes a node of the new level.
+ * The new level is dropped again when it ends up empty.
+ */
+void SearchManager::addLevel()
+{
+    search_results_.push_back(vector< vector <LinkStatus*> >());
+    vector< vector <LinkStatus*> >& previous_level(search_results_[search_results_.size() - 2]);
+
+    number_of_level_links_ = 0;
+    number_of_links_to_check_ = 0;
+    uint const node_count = previous_level.size();
+
+    // Count the links of the previous level for the progress signal.
+    for(uint n = 0; n != node_count; ++n)
+        number_of_level_links_ += previous_level[n].size();
+
+    if(number_of_level_links_)
+        emit signalAddingLevelTotalSteps(number_of_level_links_);
+
+    for(uint n = 0; n != node_count; ++n) // nodes
+    {
+        uint const link_count = previous_level[n].size();
+        for(uint k = 0; k != link_count; ++k) // links
+        {
+            vector <LinkStatus*> found(children( LinkStatus::lastRedirection(previous_level[n][k]) ));
+            if(!found.empty())
+            {
+                search_results_.back().push_back(found);
+                number_of_links_to_check_ += found.size();
+            }
+
+            emit signalAddingLevelProgress();
+//             kapp->processEvents();
+        }
+    }
+
+    if(search_results_.back().empty())
+        search_results_.pop_back();
+    else
+        emit signalLinksToCheckTotalSteps(number_of_links_to_check_);
+}
+
+/**
+ * Decides whether @p url, found in the document of @p link_parent, must be
+ * checked.
+ *
+ * A URL is rejected when it is already known (existUrl(), which may also
+ * record the parent as a referrer), when it is outside the allowed domain
+ * depth, when parent directories or external links are excluded and it is
+ * one of those, or when it matches the exclusion regular expression.
+ */
+bool SearchManager::checkable(KURL const& url, LinkStatus const& link_parent) const
+{
+    if(existUrl(url, link_parent.absoluteUrl()))
+        return false;
+
+    if(!checkableByDomain(url, link_parent))
+        return false;
+
+    if(!check_parent_dirs_ && Url::parentDir(root_.absoluteUrl(), url))
+        return false;
+
+    if(!check_external_links_ && Url::externalLink(root_.absoluteUrl(), url))
+        return false;
+
+    if(check_regular_expressions_)
+    {
+        Q_ASSERT(!reg_exp_.isEmpty());
+
+        bool const excluded = reg_exp_.search(url.url()) != -1;
+        if(excluded)
+            return false;
+    }
+
+    //kdDebug(23100) << "url " << url.url() << " is checkable!" << endl;
+    return true;
+}
+
+/**
+ * Tells whether @p url may be checked as far as its domain is concerned:
+ * either it belongs to the local domain, or following it from
+ * @p link_parent stays below the configured external domain depth.
+ */
+bool SearchManager::checkableByDomain(KURL const& url, LinkStatus const& link_parent) const
+{
+    if(localDomain(url))
+        return true;
+
+    bool const within_external_depth =
+        (link_parent.externalDomainDepth() + 1) < external_domain_depth_;
+    /*
+      if(!within_external_depth)
+        kdDebug(23100) << "\n\nURL " << url.url() << " is not checkable by domain\n\n" << endl;
+    */
+    return within_external_depth;
+}
+/*
+bool SearchManager::localDomain(KURL const& url) const
+ {
+ KURL url_root = root_.absoluteUrl();
+
+ if(url_root.protocol() != url.protocol())
+ return false;
+
+ if(url_root.hasHost())
+ {
+ if(generalDomain())
+ {
+ return equalHost(domain_, url.host());
+ }
+ else
+ {
+ vector<QString> referencia = tokenizeWordsSeparatedBy(domain_, QChar('/'));
+ vector<QString> a_comparar = tokenizeWordsSeparatedBy(url.host() + url.directory(), QChar('/'));
+
+ if(a_comparar.size() < referencia.size())
+ return false;
+ else
+ {
+ for(uint i = 0; i != referencia.size(); ++i)
+ {
+ if(i == 0)
+ { // host, deal with specific function
+ if(!equalHost(referencia[i], a_comparar[i], !check_parent_dirs_))
+ return false;
+ }
+ else if(referencia[i] != a_comparar[i])
+ return false;
+ }
+ }
+ return true;
+ }
+ }
+ else if(checkParentDirs())
+ return true;
+ else
+ return url_root.isParentOf(url);
+ }
+*/
+
+/**
+ The same as SearchManager::localDomain(), but only for http or https.
+ http://linkstatus.paradigma.co.pt != http://paradigma.co.pt
+*/
+/*
+bool SearchManager::isLocalRestrict(KURL const& url) const
+ {
+ Q_ASSERT(url.protocol() == "http" || url.protocol() == "https");
+
+ KURL url_root = root_.absoluteUrl();
+
+ if(url_root.protocol() != url.protocol())
+ return false;
+
+ if(url_root.hasHost())
+ {
+ vector<QString> referencia = tokenizeWordsSeparatedBy(domain_, QChar('/'));
+ vector<QString> a_comparar = tokenizeWordsSeparatedBy(url.host() + url.directory(), QChar('/'));
+
+ if(a_comparar.size() < referencia.size())
+ return false;
+ else
+ {
+ for(uint i = 0; i != referencia.size(); ++i)
+ {
+ if(i == 0)
+ { // host, deal with specific function
+ if(!equalHost(referencia[i], a_comparar[i], true))
+ return false;
+ }
+ else if(referencia[i] != a_comparar[i])
+ return false;
+ }
+ }
+ return true;
+ }
+ else
+ return false;
+ }
+*/
+/**
+ * Tells whether domain_ is a "general" (vague) domain.
+ *
+ * When a result has already been cached (checked_general_domain_), that
+ * cached value is returned. Otherwise the answer is computed from domain_:
+ *  - false when parent directories are not checked;
+ *  - false when domain_ contains a '/' that is not its last character;
+ *  - true when the first dot-separated word is "www";
+ *  - true when there are exactly two dot-separated words;
+ *  - false in every other case.
+ *
+ * This function only reads the cache; it never writes it.
+ */
+bool SearchManager::generalDomain() const
+{
+    if(checked_general_domain_)
+        return general_domain_;
+
+    Q_ASSERT(!domain_.isEmpty());
+
+    if(!check_parent_dirs_)
+        return false;
+
+    int const slash = domain_.find('/');
+    bool const has_inner_slash = slash != -1 && (uint)slash != domain_.length() - 1;
+    if(has_inner_slash)
+    {
+        kdDebug(23100) << "Domain nao vago" << endl;
+        return false;
+    }
+
+    vector<QString> words = tokenizeWordsSeparatedByDots(domain_);
+    Q_ASSERT(words.size() >= 1); // host might be localhost
+
+    bool is_general;
+    if(words[0] == "www")
+    {
+        Q_ASSERT(words.size() >= 3);
+        is_general = true;
+    }
+    else
+        is_general = (words.size() == 2);
+
+    kdDebug(23100) << (is_general ? "Domain vago" : "Domain nao vago") << endl;
+    return is_general;
+}
+
+/**
+ * Reports whether @p ls sits at the limit of the active search mode.
+ *
+ *  - depth mode: true when current_depth_ equals depth_;
+ *  - domain mode: true when the link is not local and its external domain
+ *    depth equals external_domain_depth_ - 1;
+ *  - any other mode: true when either of the two tests above holds.
+ *
+ * NOTE(review): going by the name, callers presumably use this to fetch only
+ * the header of such links - confirm against the caller.
+ *
+ * @param ls link to test; dereferenced except in depth mode, so it must not
+ *           be null there.
+ */
+bool SearchManager::onlyCheckHeader(LinkStatus* ls) const
+{
+    switch(search_mode_)
+    {
+    case depth:
+        return current_depth_ == depth_;
+
+    case domain:
+        return !ls->local() &&
+               ls->externalDomainDepth() == external_domain_depth_ - 1;
+
+    default:
+        if(current_depth_ == depth_)
+            return true;
+        return !ls->local() &&
+               ls->externalDomainDepth() == external_domain_depth_ - 1;
+    }
+}
+
+/**
+ * Slot with an empty body; it performs no work.
+ */
+void SearchManager::slotSearchFinished()
+{
+}
+
+/**
+ * Destroys a LinkChecker.
+ *
+ * @param checker the checker to delete; asserted to be non-null.
+ */
+void SearchManager::slotLinkCheckerFinnished(LinkChecker * checker)
+{
+    kdDebug(23100) << "deleting linkchecker" << endl;
+
+    Q_ASSERT(checker);
+
+    // The pointer is a by-value parameter, so there is nothing to reset
+    // after the delete.
+    delete checker;
+}
+
+/**
+ * Looks up the KHTMLPart stored under @p key_url.
+ *
+ * @return the stored part, or 0 when the map has no entry for that key.
+ */
+KHTMLPart* SearchManager::htmlPart(QString const& key_url) const
+{
+    KHTMLPartMap::ConstIterator const found = html_parts_.find(key_url);
+    if(found == html_parts_.end())
+        return 0;
+
+    return found.data();
+}
+
+/**
+ * Stores @p html_part in the map under @p key_url.
+ *
+ * When the map already holds more than 150 parts, all stored parts are
+ * deleted (removeHtmlParts()) before the new one is inserted.
+ *
+ * @param key_url   map key; asserted to be non-empty.
+ * @param html_part part to store; asserted to be non-null.
+ */
+void SearchManager::addHtmlPart(QString const& key_url, KHTMLPart* html_part)
+{
+    Q_ASSERT(!key_url.isEmpty());
+    Q_ASSERT(html_part);
+
+    // FIXME configurable
+    uint const max_stored_parts = 150;
+    if(html_parts_.count() > max_stored_parts)
+        removeHtmlParts();
+
+    html_parts_.insert(key_url, html_part);
+}
+
+/**
+ * Deletes every KHTMLPart held in html_parts_ and then empties the map.
+ */
+void SearchManager::removeHtmlParts()
+{
+    KHTMLPartMap::Iterator entry = html_parts_.begin();
+    while(entry != html_parts_.end())
+    {
+        KHTMLPart*& part = entry.data();
+        delete part;
+        part = 0;
+        ++entry;
+    }
+
+    html_parts_.clear();
+}
+
+/**
+ * Appends <tag>text</tag> as the last child of @p parent, using the
+ * document that owns @p parent.
+ */
+static void appendSearchSettingElement(QDomElement& parent, QString const& tag, QString const& text)
+{
+    QDomDocument doc = parent.ownerDocument();
+    QDomElement setting = doc.createElement(tag);
+    setting.appendChild(doc.createTextNode(text));
+    parent.appendChild(setting);
+}
+
+/**
+ * Writes the search settings and the checked links as children of
+ * @p element.
+ *
+ * Children appended, in order: <url>, <recursively>, <depth>,
+ * <check_parent_folders>, <check_external_links>,
+ * <check_regular_expression> (with a "check" attribute, and the pattern as
+ * text only when regular expressions are enabled) and <link_list>.
+ * Every LinkStatus in search_results_ whose checked() is true saves itself
+ * into <link_list>.
+ */
+void SearchManager::save(QDomElement& element) const
+{
+    QDomDocument doc = element.ownerDocument();
+
+    // <url>
+    appendSearchSettingElement(element, "url", root_.absoluteUrl().prettyURL());
+
+    // <recursively>
+    bool const recursively = searchMode() == domain || depth_ > 0;
+    appendSearchSettingElement(element, "recursively", recursively ? "true" : "false");
+
+    // <depth>
+    appendSearchSettingElement(element, "depth",
+                               searchMode() == domain ? QString("Unlimited") : QString::number(depth_));
+
+    // <check_parent_folders>
+    appendSearchSettingElement(element, "check_parent_folders", checkParentDirs() ? "true" : "false");
+
+    // <check_external_links>
+    appendSearchSettingElement(element, "check_external_links", checkExternalLinks() ? "true" : "false");
+
+    // <check_regular_expression>
+    QDomElement reg_exp_element = doc.createElement("check_regular_expression");
+    reg_exp_element.setAttribute("check", checkRegularExpressions() ? "true" : "false");
+    if(checkRegularExpressions())
+        reg_exp_element.appendChild(doc.createTextNode(reg_exp_.pattern()));
+    element.appendChild(reg_exp_element);
+
+    // <link_list>
+    QDomElement link_list = doc.createElement("link_list");
+    element.appendChild(link_list);
+
+    for(uint level = 0; level != search_results_.size(); ++level)
+    {
+        vector< vector <LinkStatus*> > const& nodes = search_results_[level];
+        for(uint node = 0; node != nodes.size(); ++node)
+        {
+            vector <LinkStatus*> const& links = nodes[node];
+            for(uint k = 0; k != links.size(); ++k)
+            {
+                LinkStatus* link = links[k];
+                if(link->checked())
+                    link->save(link_list);
+            }
+        }
+    }
+}
+
+/**
+ * Serializes the search as an XML document.
+ *
+ * The document starts with an XML processing instruction and has a single
+ * <klinkstatus> root element, which is filled in by save().
+ *
+ * @return the document text, produced by QDomDocument::toString(4).
+ */
+QString SearchManager::toXML() const
+{
+    QDomDocument document;
+
+    QDomProcessingInstruction const xml_header =
+        document.createProcessingInstruction( "xml",
+                                              "version=\"1.0\" encoding=\"UTF-8\"");
+    document.appendChild(xml_header);
+
+    QDomElement top = document.createElement("klinkstatus");
+    document.appendChild(top);
+
+    save(top);
+
+    int const indent = 4;
+    return document.toString(indent);
+}
+
+#include "searchmanager.moc"
diff --git a/klinkstatus/src/engine/searchmanager.h b/klinkstatus/src/engine/searchmanager.h
new file mode 100644
index 00000000..135d267a
--- /dev/null
+++ b/klinkstatus/src/engine/searchmanager.h
@@ -0,0 +1,193 @@
+/***************************************************************************
+ * Copyright (C) 2004 by Paulo Moura Guedes *
+ * moura@kdewebdev.org *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ ***************************************************************************/
+
+#ifndef GESTOR_PESQUISA_H
+#define GESTOR_PESQUISA_H
+
+#include <kurl.h>
+
+#include <qobject.h>
+#include <qstring.h>
+#include <qdatetime.h>
+#include <qregexp.h>
+#include <qmap.h>
+class QDomElement;
+
+#include <vector>
+
+#include "linkstatus.h"
+#include "linkchecker.h"
+#include "../parser/node.h"
+#include "../parser/url.h"
+
+using namespace std;
+
+typedef QMap<QString, KHTMLPart*> KHTMLPartMap;
+
+class SearchManager: public QObject // holds the settings, state and results of one link search
+{
+ Q_OBJECT
+
+public:
+
+ enum SearchMode {
+ depth, // onlyCheckHeader() compares current_depth_ against depth_
+ domain, // onlyCheckHeader() looks at the link's external domain depth; save() writes the depth as "Unlimited"
+ depth_and_domain // onlyCheckHeader() applies either of the two tests above
+ };
+
+ SearchManager(int max_simultaneous_connections = 3, int time_out = 50,
+ QObject *parent = 0, const char *name = 0);
+ ~SearchManager();
+
+ QString toXML() const; // XML document with a <klinkstatus> root filled in by save()
+ void save(QDomElement& element) const; // appends the search settings and the checked links to element
+
+ KHTMLPartMap const& htmlParts() const { return html_parts_; }
+
+ KHTMLPart* htmlPart(QString const& key_url) const; // returns 0 when nothing is stored under key_url
+ void addHtmlPart(QString const& key_url, KHTMLPart* html_part); // empties the map first once it holds more than 150 parts
+ void removeHtmlParts(); // deletes every stored KHTMLPart and clears the map
+
+ void startSearch(KURL const& root); // forwards to the two-argument overload with the current search mode
+ void startSearch(KURL const& root, SearchMode const& modo);
+ void resume();
+ void cancelSearch();
+
+ bool hasDocumentRoot() const;
+ KURL const& documentRoot() const;
+ void setDocumentRoot(KURL const& url); // asserts a valid URL whose protocol does not start with "http"
+
+ void setSearchMode(SearchMode modo);
+ void setDepth(int depth);
+ void setExternalDomainDepth(int depth);
+ void setDomain(QString const& domain); // also computes and caches generalDomain()
+ void setCheckParentDirs(bool flag);
+ void setCheckExternalLinks(bool flag);
+ void setCheckRegularExpressions(bool flag);
+ void setRegularExpression(QString const& reg_exp, bool case_sensitive);
+ void setTimeOut(int time_out); // asserts time_out > 0
+
+ void cleanItems();
+ void reset();
+
+ bool searching() const;
+ bool localDomain(KURL const& url, bool restrict = true) const; // delegates to Url::localDomain() with the root's absolute URL
+ //bool isLocalRestrict(KURL const& url) const;
+ SearchMode const& searchMode() const;
+ bool checkRegularExpressions() const { return check_regular_expressions_; }
+ bool existUrl(KURL const& url, KURL const& url_parent) const;
+ LinkStatus const* linkStatus(QString const& s_url) const;
+ int checkedLinks() const; // asserts that the count is positive
+ QTime timeElapsed() const; // time_.elapsed() expressed as a QTime offset from 00:00
+ bool checkParentDirs() const;
+ bool checkExternalLinks() const;
+ LinkStatus const* linkStatusRoot() const; // address of root_
+ int maxSimultaneousConnections() const;
+ int timeOut() const;
+
+ bool sendIdentification() const { return send_identification_; }
+ QString const& userAgent() const { return user_agent_; }
+
+private:
+
+ void checkRoot();
+ void checkVectorLinks(vector<LinkStatus*> const& links); // corresponds to one node of a depth level
+ vector<LinkStatus*> children(LinkStatus* link);
+ void startSearch();
+ void continueSearch();
+ void finnish();
+ void pause();
+ vector<LinkStatus*> const& nodeToAnalize() const;
+ vector<LinkStatus*> chooseLinks(vector<LinkStatus*> const& links);
+ void checkLinksSimultaneously(vector<LinkStatus*> const& links);
+ void addLevel();
+ bool checkableByDomain(KURL const& url, LinkStatus const& link_parent) const;
+ bool checkable(KURL const& url, LinkStatus const& link_parent) const;
+ int maximumCurrentConnections() const; // asserts the value is not -1
+ bool onlyCheckHeader(LinkStatus* ls) const;
+
+ /*
+ A vague (general) domain is a domain of the form www.google.pt or google.pt, so that,
+ for example, imagens.google.pt is considered to be in the same domain.
+ pwp.netcabo.pt or www.google.pt/imagens are not considered vague domains.
+ */
+ bool generalDomain() const;
+ bool generalDomainChecked() const; // To guarantee that generalDomain() is only computed once
+
+private slots:
+
+ void slotRootChecked(const LinkStatus * link, LinkChecker * checker);
+ void slotLinkChecked(const LinkStatus * link, LinkChecker * checker);
+ void slotSearchFinished(); // empty body
+ void slotLinkCheckerFinnished(LinkChecker * checker); // deletes checker
+
+signals:
+
+ void signalRootChecked(const LinkStatus * link, LinkChecker * checker);
+ void signalLinkChecked(const LinkStatus * link, LinkChecker * checker);
+ void signalSearchFinished();
+ void signalSearchPaused();
+ void signalAddingLevelTotalSteps(uint number_of_links);
+ void signalAddingLevelProgress();
+ void signalLinksToCheckTotalSteps(uint links_to_check);
+ //void signalLinksToCheckProgress();
+
+private:
+
+ int max_simultaneous_connections_;
+ SearchMode search_mode_;
+ LinkStatus root_;
+ bool has_document_root_;
+ KURL document_root_url_; // in case of non http protocols the document root must be explicitly given
+ int depth_;
+ int current_depth_;
+ int external_domain_depth_;
+ int current_node_;
+ int current_index_;
+ int links_being_checked_;
+ int finished_connections_;
+ int maximum_current_connections_;
+ QRegExp reg_exp_;
+ QString domain_;
+ bool general_domain_; // cached generalDomain() result, meaningful only when checked_general_domain_ is true
+ bool checked_general_domain_; // true once general_domain_ holds a cached result
+ int time_out_;
+ int current_connections_;
+ bool send_identification_; // user-agent
+ QString user_agent_;
+
+ bool canceled_;
+ bool searching_;
+ int checked_links_;
+ QTime time_;
+ int ignored_links_;
+ bool check_parent_dirs_;
+ bool check_external_links_;
+ bool check_regular_expressions_;
+ uint number_of_level_links_;
+ uint number_of_links_to_check_;
+ vector< vector< vector <LinkStatus*> > > search_results_; // indexed as [level][node][link]
+ KHTMLPartMap html_parts_; // KHTMLPart objects keyed by a URL string; removeHtmlParts() deletes them
+};
+
+#include "searchmanager_impl.h"
+
+#endif
diff --git a/klinkstatus/src/engine/searchmanager_impl.h b/klinkstatus/src/engine/searchmanager_impl.h
new file mode 100644
index 00000000..eaa5e572
--- /dev/null
+++ b/klinkstatus/src/engine/searchmanager_impl.h
@@ -0,0 +1,158 @@
+/***************************************************************************
+ * Copyright (C) 2004 by Paulo Moura Guedes *
+ * moura@kdewebdev.org *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ ***************************************************************************/
+
+
+
+
+/**
+ * Returns maximum_current_connections_, asserting that it is not -1.
+ */
+inline int SearchManager::maximumCurrentConnections() const
+{
+    Q_ASSERT(maximum_current_connections_ != -1);
+
+    return maximum_current_connections_;
+}
+
+/**
+ * Returns a reference to the current search mode.
+ */
+inline SearchManager::SearchMode const& SearchManager::searchMode() const
+{
+    return search_mode_;
+}
+
+/**
+ * Returns the value of the checked_links_ counter, asserting that it is
+ * positive.
+ */
+inline int SearchManager::checkedLinks() const
+{
+    Q_ASSERT(checked_links_ > 0);
+
+    return checked_links_;
+}
+
+/**
+ * Returns the milliseconds reported by time_.elapsed(), converted to a
+ * QTime by adding them to 00:00.
+ */
+inline QTime SearchManager::timeElapsed() const
+{
+    QTime const midnight(0, 0);
+    return midnight.addMSecs(time_.elapsed());
+}
+
+/**
+ * Starts a search at @p root using the search mode that is currently set.
+ */
+inline void SearchManager::startSearch(KURL const& root)
+{
+    // Delegate to the two-argument overload with the stored mode.
+    startSearch(root, search_mode_);
+}
+
+/**
+ * Stores @p modo as the search mode.
+ */
+inline void SearchManager::setSearchMode(SearchMode modo)
+{
+    search_mode_ = modo;
+}
+
+/**
+ * Stores @p depth in depth_.
+ */
+inline void SearchManager::setDepth(int depth)
+{
+    depth_ = depth;
+}
+
+/**
+ * Stores @p depth in external_domain_depth_.
+ */
+inline void SearchManager::setExternalDomainDepth(int depth)
+{
+    external_domain_depth_ = depth;
+}
+
+/**
+ * Sets the search domain and caches whether it is a general domain.
+ *
+ * @param domain the domain string; asserted not to contain "http://".
+ *
+ * generalDomain() returns the cached value whenever
+ * checked_general_domain_ is true. The flag is therefore cleared before the
+ * call, so that the verdict is computed for the new domain rather than
+ * being copied from a previous setDomain() call.
+ */
+inline void SearchManager::setDomain(QString const& domain)
+{
+    Q_ASSERT(domain.find("http://") == -1);
+
+    domain_ = domain;
+
+    // Invalidate the cache so generalDomain() really inspects domain_.
+    checked_general_domain_ = false;
+    general_domain_ = generalDomain();
+    checked_general_domain_ = true;
+}
+
+/**
+ * Stores @p flag in check_parent_dirs_.
+ */
+inline void SearchManager::setCheckParentDirs(bool flag)
+{
+    check_parent_dirs_ = flag;
+}
+
+/**
+ * Stores @p flag in check_external_links_.
+ */
+inline void SearchManager::setCheckExternalLinks(bool flag)
+{
+    check_external_links_ = flag;
+}
+
+/**
+ * Stores @p flag in check_regular_expressions_.
+ */
+inline void SearchManager::setCheckRegularExpressions(bool flag)
+{
+    check_regular_expressions_ = flag;
+}
+
+/**
+ * Replaces reg_exp_ with a QRegExp built from @p reg_exp and
+ * @p case_sensitive.
+ */
+inline void SearchManager::setRegularExpression(QString const& reg_exp, bool case_sensitive)
+{
+    QRegExp const expression(reg_exp, case_sensitive);
+    reg_exp_ = expression;
+}
+
+/**
+ * Stores @p time_out in time_out_, asserting that it is positive.
+ */
+inline void SearchManager::setTimeOut(int time_out)
+{
+    Q_ASSERT(time_out > 0);
+
+    time_out_ = time_out;
+}
+
+
+
+/**
+ * Returns the check_parent_dirs_ flag.
+ */
+inline bool SearchManager::checkParentDirs() const
+{
+    return check_parent_dirs_;
+}
+
+/**
+ * Returns the check_external_links_ flag.
+ */
+inline bool SearchManager::checkExternalLinks() const
+{
+    return check_external_links_;
+}
+
+/**
+ * Returns the address of the root LinkStatus member.
+ */
+inline LinkStatus const* SearchManager::linkStatusRoot() const
+{
+    return &root_;
+}
+
+/**
+ * Returns the searching_ flag.
+ */
+inline bool SearchManager::searching() const
+{
+    return searching_;
+}
+
+/**
+ * Returns the result of Url::localDomain() for the root's absolute URL,
+ * @p url and @p restrict.
+ */
+inline bool SearchManager::localDomain(KURL const& url, bool restrict) const
+{
+    return Url::localDomain(root_.absoluteUrl(), url, restrict);
+}
+
+/**
+ * Returns max_simultaneous_connections_.
+ */
+inline int SearchManager::maxSimultaneousConnections() const
+{
+    return max_simultaneous_connections_;
+}
+
+/**
+ * Returns time_out_.
+ */
+inline int SearchManager::timeOut() const
+{
+    return time_out_;
+}
+
+/**
+ * Returns the has_document_root_ flag, which setDocumentRoot() sets to true.
+ */
+inline bool SearchManager::hasDocumentRoot() const
+{
+    return has_document_root_;
+}
+
+/**
+ * Returns a reference to the stored document root URL.
+ */
+inline KURL const& SearchManager::documentRoot() const
+{
+    return document_root_url_;
+}
+
+/**
+ * Stores @p url as the document root and marks the document root as set.
+ *
+ * @param url asserted to be valid and to have a protocol that does not
+ *            start with "http".
+ */
+inline void SearchManager::setDocumentRoot(KURL const& url)
+{
+    Q_ASSERT(url.isValid()); // includes empty URLs
+    Q_ASSERT(!url.protocol().startsWith("http"));
+
+    has_document_root_ = true;
+    document_root_url_ = url;
+}
+
+