//
// Retriever.cc
//
// Retriever: Crawls from a list of URLs and calls the appropriate
//            parsers. The parser notifies the Retriever object that
//            it got something (got_* functions) and the Retriever
//            object feeds the databases and statistics accordingly.
//
// Part of the ht://Dig package
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
//
// $Id: Retriever.cc,v 1.94 2004/05/28 13:15:15 lha Exp $
//

#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#ifdef _MSC_VER /* _WIN32 */
# include <io.h>        // unlink()
# include <windows.h>   // MSG, GetMessage() for win32_check_messages()
#endif

#include "Retriever.h"
#include "htdig.h"
#include "HtWordList.h"
#include "WordRecord.h"
#include "URLRef.h"
#include "Server.h"
#include "Parsable.h"
#include "Document.h"
#include "StringList.h"
#include "WordType.h"
#include "Transport.h"
#include "HtHTTP.h"    // For HTTP statistics
#include "md5.h"
#include "defaults.h"

#ifndef _MSC_VER /* _WIN32 */
#include <pwd.h>       // getpwnam(), struct passwd
#endif

#include <signal.h>
#include <stdio.h>

static int noSignal;

// no_store_phrases:
// If true, only store first occurrence of each word in a document
static bool no_store_phrases;


//*****************************************************************************
// Retriever::Retriever()
//
Retriever::Retriever(RetrieverLog flags):
    words(*(HtConfiguration::config())),
    words_to_add(100, 0.75)
{
    HtConfiguration *config = HtConfiguration::config();
    FILE *urls_parsed;

    currenthopcount = 0;
    max_hop_count = config->Value("max_hop_count", 999999);
    no_store_phrases = !config->Boolean("store_phrases");

    //
    // Initialize the flags for the various HTML factors
    //

    // text_factor
    factor[0] = FLAG_TEXT;
    // title_factor
    factor[1] = FLAG_TITLE;
    // heading factor (now generic)
    factor[2] = FLAG_HEADING;
    factor[3] = FLAG_HEADING;
    factor[4] = FLAG_HEADING;
    factor[5] = FLAG_HEADING;
    factor[6] = FLAG_HEADING;
    factor[7] = FLAG_HEADING;
    // img alt text
    //factor[8] = FLAG_KEYWORDS;
    factor[8] = FLAG_TEXT;    // treat alt text as plain text, until it has
                              // its own FLAG and factor.
    // keywords factor
    factor[9] = FLAG_KEYWORDS;
    // META description factor
    factor[10] = FLAG_DESCRIPTION;
    factor[11] = FLAG_AUTHOR;
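    //
    // Example of how this table is used (see got_word() below): a parser
    // reporting a word inside an HTML heading passes a heading index in
    // the 2..7 range, so the word is stored with FLAG_HEADING set in its
    // flags; an out-of-range index falls back to factor[0] == FLAG_TEXT.
    //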

    doc = new Document();
    minimumWordLength = config->Value("minimum_word_length", 3);

    log = flags;
    // if in restart mode
    if (Retriever_noLog != log)
    {
        String filelog = config->Find("url_log");
        char buffer[1024];
        int l;

        urls_parsed = fopen((char *) filelog, "r");
        if (urls_parsed != 0)
        {
            // read all urls discovered but not fetched before
            while (fgets(buffer, sizeof(buffer), urls_parsed))
            {
                l = strlen(buffer);
                buffer[l - 1] = 0;    // strip the trailing newline
                Initial(buffer, 2);
            }
            fclose(urls_parsed);
        }
        unlink((char *) filelog);
    }

    check_unique_md5 = config->Boolean("check_unique_md5", 0);
    check_unique_date = config->Boolean("check_unique_date", 0);

    d_md5 = 0;
    if (check_unique_md5)
    {
        d_md5 = Database::getDatabaseInstance(DB_HASH);

        if (d_md5->OpenReadWrite(config->Find("md5_db"), 0666) != OK)
        {
            cerr << "DocumentDB::Open: " << config->Find("md5_db") << " "
                 << strerror(errno) << "\n";
        }
    }
}


//*****************************************************************************
// Retriever::~Retriever()
//
Retriever::~Retriever()
{
    if (d_md5)
        d_md5->Close();
    delete doc;
}


//*****************************************************************************
// void Retriever::setUsernamePassword(const char *credentials)
//
void Retriever::setUsernamePassword(const char *credentials)
{
    doc->setUsernamePassword(credentials);
}


//*****************************************************************************
// void Retriever::Initial(const String &list, int from)
//   Add a single URL to the list of URLs to visit.
//   Since URLs are stored on a per-server basis, we first need to find
//   the correct server to add the URL's path to.
//
//   from == 0  urls in db.docs and no db.log
//   from == 1  urls in start_url, add url only if not already in the list
//   from == 2  add url from db.log
//   from == 3  urls in db.docs and there was a db.log
//
void Retriever::Initial(const String & list, int from)
{
    //
    // Split the list of urls up into individual urls.
    //
    StringList tokens(list, " \t");
    String sig;
    String url;
    Server *server;

    for (int i = 0; i < tokens.Count(); i++)
    {
        URL u(tokens[i]);
        url = u.get();    // get before u.signature() resolves aliases
        server = (Server *) servers[u.signature()];
        if (debug > 2)
            cout << "\t" << from << ":" << (int) log << ":" << url;
        if (!server)
        {
            String robotsURL = u.signature();
            robotsURL << "robots.txt";
            StringList *localRobotsFile = GetLocal(robotsURL);

            server = new Server(u, localRobotsFile);
            servers.Add(u.signature(), server);
            delete localRobotsFile;
        }
        if (from && visited.Exists(url))
        {
            if (debug > 2)
                cout << " skipped" << endl;
            continue;
        }
        else if (IsValidURL(url) != 1)
        {
            if (debug > 2)
                cout << endl;
            continue;
        }
        if (Retriever_noLog == log || from != 3)
        {
            if (debug > 2)
                cout << " pushed";
            server->push(u.get(), 0, 0, IsLocalURL(url.get()));
        }
        if (debug > 2)
            cout << endl;
        visited.Add(url, 0);
    }
}
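
// A minimal usage sketch (hypothetical driver code, not part of this
// file): seed the queue from the start_url attribute and run the crawl.
// Retriever_noLog is the flag used in this file; the rest is illustrative.
//
//     Retriever retriever(Retriever_noLog);
//     retriever.Initial(config->Find("start_url"), 1);   // from == 1
//     retriever.Start();
//     retriever.ReportStatistics("htdig");
//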

//*****************************************************************************
// void Retriever::Initial(List &list, int from)
//
void Retriever::Initial(List & list, int from)
{
    list.Start_Get();
    String *str;

    // from == 0 is an optimisation for pushing urls in update mode,
    // assuming that
    //   1) there are many more urls in docdb,
    //   2) they are pushed first, and
    //   3) there are no duplicate urls in docdb,
    // so they don't need to be checked against already pushed urls.
    // But 2) can be false with the -l option.
    //
    // FIXME: it's nasty. What really has to be tested is: we have urls
    // to push from db.docs, but do we already have them in db.log?
    // For this it relies on a side effect of 'visited' and on the facts
    // that urls in db.docs are only pushed via this method, and that
    // db.log urls are pushed first, db.docs second, start_urls third!
    //
    if (!from && visited.Count())
    {
        from = 3;
    }
    while ((str = (String *) list.Get_Next()))
    {
        Initial(str->get(), from);
    }
}


//*****************************************************************************
//
static void sigexit(int)
{
    noSignal = 0;    // don't exit here.. just set the flag.
}

static void sigpipe(int)
{
}


//*****************************************************************************
// static void sig_handlers
//   initialise signal handlers
//
static void sig_handlers(void)
{
#ifndef _MSC_VER /* _WIN32 */
    //POSIX SIGNALS
    struct sigaction action;

    /* SIGINT, SIGQUIT, SIGTERM */
    action.sa_handler = sigexit;
    sigemptyset(&action.sa_mask);
    action.sa_flags = 0;
    if (sigaction(SIGINT, &action, NULL) != 0)
        reportError("Cannot install SIGINT handler\n");
    if (sigaction(SIGQUIT, &action, NULL) != 0)
        reportError("Cannot install SIGQUIT handler\n");
    if (sigaction(SIGTERM, &action, NULL) != 0)
        reportError("Cannot install SIGTERM handler\n");
    if (sigaction(SIGHUP, &action, NULL) != 0)
        reportError("Cannot install SIGHUP handler\n");
#else
    //ANSI C signal handling - Limited to supported Windows signals.
    signal(SIGINT, sigexit);
    signal(SIGTERM, sigexit);
#endif //_MSC_VER /* _WIN32 */
}


static void sig_phandler(void)
{
#ifndef _MSC_VER /* _WIN32 */
    struct sigaction action;

    sigemptyset(&action.sa_mask);
    action.sa_handler = sigpipe;
    action.sa_flags = SA_RESTART;
    if (sigaction(SIGPIPE, &action, NULL) != 0)
        reportError("Cannot install SIGPIPE handler\n");
#endif //_MSC_VER /* _WIN32 */
}


//*****************************************************************************
// static void win32_check_messages
//   Check WIN32 messages!
//
#ifdef _MSC_VER /* _WIN32 */
static void win32_check_messages(void)
{
    // NEAL - NEEDS FINISHING/TESTING
#if 0
    MSG msg = {0, 0, 0, 0};
    int cDown = 0;
    int controlDown = 0;

    if (GetMessage(&msg, 0, 0, 0))
    {
        switch (msg.message)
        {
            case WM_KEYDOWN:
            {
                if (LOWORD(msg.message) == 17)
                    controlDown = 1;
                else if (LOWORD(msg.message) == 67)
                {
                    cDown = 1;
                }
            }
            break;
            case WM_KEYUP:
            {
                if (LOWORD(msg.message) == 17)
                    controlDown = 0;
                else if (LOWORD(msg.message) == 67)
                    cDown = 0;
            }
            break;
        }
    }
    DispatchMessage(&msg);
#endif
}
#endif //_MSC_VER /* _WIN32 */


//*****************************************************************************
// void Retriever::Start()
//   This is the main loop of the retriever. We will go through the
//   list of paths stored for each server. While parsing the
//   retrieved documents, new paths will be added to the servers. We
//   return if no more paths need to be retrieved.
//
void Retriever::Start()
{
    //
    // Main digger loop. The todo list should initially have the start
    // URL and all the URLs which were seen in a previous dig. The
    // loop will continue as long as there are more URLs to visit.
    //
    int more = 1;
    Server *server;
    URLRef *ref;
    HtConfiguration *config = HtConfiguration::config();

    //
    // Always install the signal handlers. The delay bothers me,
    // but a bad db is worse.
    //
    if (Retriever_noLog != log)
    {
        sig_handlers();
    }
    sig_phandler();
    noSignal = 1;

    ///////
    //    Main loop. We keep on retrieving until a signal is received
    //    or all the servers' queues are empty.
    ///////

#ifdef _MSC_VER /* _WIN32 */
    win32_check_messages();
#endif

    while (more && noSignal)
    {
        more = 0;

        //
        // Go through all the current servers in sequence.
        // If a server supports persistent connections, we keep on popping
        // from the same server queue until it's empty or we reach a maximum
        // number of consecutive requests ("max_connection_requests").
        // The loop may also continue indefinitely, if
        // "max_connection_requests" is set to -1.
        // If the server doesn't support persistent connections, we take
        // only one URL from it, then we skip to the next server.
        //
        // Since 15.05.02: even when persistent connections are activated
        // we should wait for a 'server_wait_time' number of seconds
        // after the 'max_connection_requests' value has been reached.
        //
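        //
        // An illustrative configuration for the throttling described
        // above (attribute names appear in this file; values made up):
        //
        //     persistent_connections: true
        //     max_connection_requests: 10
        //     server_wait_time: 30
        //
        // With these settings, up to 10 URLs in a row are popped from one
        // server's queue over a single connection, after which
        // server->delay() enforces the server_wait_time pause.
        //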
        // Let's position at the beginning
        servers.Start_Get();

        int count;

        // Maximum number of repeated requests with the same
        // TCP connection (so on the same Server:Port).
        int max_connection_requests;

#ifdef _MSC_VER /* _WIN32 */
        win32_check_messages();
#endif

        while ((server = (Server *) servers.Get_NextElement()) && noSignal)
        {
            if (debug > 1)
                cout << "pick: " << server->host() << ", # servers = "
                     << servers.Count() << endl;

            // We already know whether a server supports HTTP persistent
            // connections, because we asked it for the robots.txt file
            // (constructor of the class).

            // If the Server doesn't support persistent connections,
            // we turn it down to 1.
            if (server->IsPersistentConnectionAllowed())
            {
                // Let's check for a '0' value (out of range).
                // If set, we change it to 1.
                if (config->Value("server", server->host(),
                                  "max_connection_requests") == 0)
                    max_connection_requests = 1;
                else
                    max_connection_requests =
                        config->Value("server", server->host(),
                                      "max_connection_requests");

                if (debug > 2)
                {
                    cout << "> " << server->host()
                         << " supports HTTP persistent connections";
                    if (max_connection_requests == -1)
                        cout << " (" << "infinite" << ")" << endl;
                    else
                        cout << " (" << max_connection_requests << ")" << endl;
                }
            }
            else
            {
                // No HTTP persistent connections. So we request only
                // 1 document.
                max_connection_requests = 1;

                if (debug > 2)
                    cout << "> " << server->host()
                         << " with a traditional HTTP connection" << endl;
            }

            count = 0;

#ifdef _MSC_VER /* _WIN32 */
            win32_check_messages();
#endif

            while (((max_connection_requests == -1) ||
                    (count < max_connection_requests)) &&
                   (ref = server->pop()) && noSignal)
            {
                count++;

                //
                // We have a URL to index now. We need to register the
                // fact that we are not done yet by setting the 'more'
                // variable, so we will restart scanning the queue.
                //
                more = 1;

                //
                // Deal with the actual URL.
                // We'll check with the server to see if we need to sleep()
                // before parsing it.
                //
                parse_url(*ref);
                delete ref;

                // If we have reached the maximum number of connections
                // (either with or without persistent connections), we
                // must pause and respect the 'net ethic'.
                if ((max_connection_requests - count) == 0)
                    server->delay();    // This will pause if needed
                                        // and reset the time

#ifdef _MSC_VER /* _WIN32 */
                win32_check_messages();
#endif
            }

#ifdef _MSC_VER /* _WIN32 */
            win32_check_messages();
#endif
        }
    }

#ifdef _MSC_VER /* _WIN32 */
    win32_check_messages();
#endif

    // if we exited on a signal
    if (Retriever_noLog != log && !noSignal)
    {
        FILE *urls_parsed;
        String filelog = config->Find("url_log");

        // save urls seen but not fetched
        urls_parsed = fopen((char *) filelog, "w");
        if (0 == urls_parsed)
        {
            reportError(form("Unable to create URL log file '%s'",
                             filelog.get()));
        }
        else
        {
            servers.Start_Get();
            while ((server = (Server *) servers.Get_NextElement()))
            {
                while (NULL != (ref = server->pop()))
                {
                    fprintf(urls_parsed, "%s\n",
                            (const char *) ref->GetURL().get());
                    delete ref;
                }
            }
            fclose(urls_parsed);
        }
    }
    words.Close();
}
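
// Note: the url_log file written above is plain text, one URL per line,
// for example (made-up URLs):
//
//     http://www.example.com/a.html
//     http://www.example.com/b.html
//
// On a restart, the constructor reads this file back and re-queues each
// line via Initial(buffer, 2).
//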

//*****************************************************************************
// void Retriever::parse_url(URLRef &urlRef)
//
void Retriever::parse_url(URLRef & urlRef)
{
    HtConfiguration *config = HtConfiguration::config();
    URL url;
    DocumentRef *ref;
    int old_document;
    time_t date;
    static int index = 0;
    static int local_urls_only = config->Boolean("local_urls_only");
    static int mark_dead_servers = config->Boolean("ignore_dead_servers");
    Server *server;

    url.parse(urlRef.GetURL().get());

    currenthopcount = urlRef.GetHopCount();
    ref = docs[url.get()];    // It might be nice to have just an Exists() here
    if (ref)
    {
        //
        // We already have an entry for this document in our database.
        // This means we can get the document ID and last modification
        // time from there.
        //
        current_id = ref->DocID();
        date = ref->DocTime();
        if (ref->DocAccessed())
            old_document = 1;
        else
            // we haven't retrieved it yet, so we only have the first link
            old_document = 0;
        ref->DocBackLinks(ref->DocBackLinks() + 1);    // we had a new link
        ref->DocAccessed(time(0));
        ref->DocState(Reference_normal);
        currenthopcount = ref->DocHopCount();
    }
    else
    {
        //
        // Never seen this document before. We need to create an
        // entry for it. This implies that it gets a new document ID.
        //
        date = 0;
        current_id = docs.NextDocID();
        ref = new DocumentRef;
        ref->DocID(current_id);
        ref->DocURL(url.get());
        ref->DocState(Reference_normal);
        ref->DocAccessed(time(0));
        ref->DocHopCount(currenthopcount);
        ref->DocBackLinks(1);    // We had to have a link to get here!
        old_document = 0;
    }

    word_context.DocID(ref->DocID());

    if (debug > 0)
    {
        //
        // Display progress
        //
        cout << index++ << ':' << current_id << ':'
             << currenthopcount << ':' << url.get() << ": ";
        cout.flush();
    }

    // Reset the document to clean out any old data
    doc->Reset();

    doc->Url(url.get());
    doc->Referer(urlRef.GetReferer().get());

    base = doc->Url();

    // Retrieve the document, first trying local file access if possible.
    Transport::DocStatus status;
    server = (Server *) servers[url.signature()];
    StringList *local_filenames = GetLocal(url.get());
    if (local_filenames)
    {
        if (debug > 1)
            cout << "Trying local files" << endl;
        status = doc->RetrieveLocal(date, local_filenames);
        if (status == Transport::Document_not_local)
        {
            if (debug > 1)
                cout << "Local retrieval failed, trying HTTP" << endl;
            if (server && !server->IsDead() && !local_urls_only)
                status = doc->Retrieve(server, date);
            else
                status = Transport::Document_no_host;
        }
        delete local_filenames;
    }
    else if (server && !server->IsDead() && !local_urls_only)
        status = doc->Retrieve(server, date);
    else
        status = Transport::Document_no_host;

    current_ref = ref;

    //
    // Determine what to do by looking at the status code returned by
    // the Document retrieval process.
    //

    String shash;
    String sx;
    char bhash[16];
    time_t ddate;

    switch (status)
    {
    case Transport::Document_ok:
        trackWords = 1;
        if (check_unique_md5)
        {
            if (doc->StoredLength() > 0)
            {
                if (check_unique_date)
                {
                    ddate = doc->ModTime();
                    if (ddate < time(NULL) - 10)
                    {
                        // Unknown date was set to current time
                        md5(bhash, doc->Contents(), doc->StoredLength(),
                            &ddate, debug);
                    }
                    else
                    {
                        md5(bhash, doc->Contents(), doc->StoredLength(),
                            0, debug);
                    }
                }
                else
                    md5(bhash, doc->Contents(), doc->StoredLength(),
                        0, debug);

                shash.append(bhash, MD5_LENGTH);
                d_md5->Get(shash, sx);

                if (!sx.empty())
                {
                    if (debug > 1)
                    {
                        cout << " Detected duplicate by md5 hash" << endl;
                    }
                    words.Skip();
                    break;    // Duplicate - don't index
                }
                else
                {
                    d_md5->Put(shash, "x");
                }
            }
        }

        if (old_document)
        {
            if (doc->ModTime() == ref->DocTime())
            {
                words.Skip();
                if (debug)
                    cout << " retrieved but not changed" << endl;
                break;
            }
            //
            // Since we already had a record of this document and
            // we were able to retrieve it, it must have changed
            // since the last time we scanned it. This means that
            // we need to assign a new document ID to it and mark
            // the old one as obsolete.
            //
            words.Skip();
            int backlinks = ref->DocBackLinks();
            ref->DocState(Reference_obsolete);
            docs.Add(*ref);
            delete ref;

            current_id = docs.NextDocID();
            word_context.DocID(current_id);
            ref = new DocumentRef;
            ref->DocID(current_id);
            ref->DocURL(url.get());
            ref->DocState(Reference_normal);
            ref->DocAccessed(time(0));
            ref->DocHopCount(currenthopcount);
            ref->DocBackLinks(backlinks);
            if (debug)
                cout << " (changed) ";
        }
        RetrievedDocument(*doc, url.get(), ref);

        // Hey! If this document is marked noindex, don't even bother
        // adding new words. Mark this as gone and get rid of it!
        if (ref->DocState() == Reference_noindex)
        {
            if (debug > 1)
                cout << " ( " << ref->DocURL() << " ignored)";
            words.Skip();
        }
        else
            words.Flush();

        if (debug)
            cout << " size = " << doc->Length() << endl;

        if (urls_seen)
        {
            fprintf(urls_seen, "%s|%d|%s|%d|%d|1\n",
                    (const char *) url.get(), doc->Length(),
                    doc->ContentType(), (int) doc->ModTime(),
                    currenthopcount);
        }
        break;

    case Transport::Document_not_changed:
        if (debug)
            cout << " not changed" << endl;
        words.Skip();
        break;

    case Transport::Document_not_found:
        ref->DocState(Reference_not_found);
        if (debug)
            cout << " not found" << endl;
        recordNotFound(url.get(), urlRef.GetReferer().get(),
                       Transport::Document_not_found);
        words.Skip();
        break;

    case Transport::Document_no_host:
        ref->DocState(Reference_not_found);
        if (debug)
            cout << " host not found" << endl;
        recordNotFound(url.get(), urlRef.GetReferer().get(),
                       Transport::Document_no_host);
        words.Skip();
        // Mark the server as being down
        if (server && mark_dead_servers)
            server->IsDead(1);
        break;

    case Transport::Document_no_port:
        ref->DocState(Reference_not_found);
        if (debug)
            cout << " host not found (port)" << endl;
        recordNotFound(url.get(), urlRef.GetReferer().get(),
                       Transport::Document_no_port);
        words.Skip();
        // Mark the server as being down
        if (server && mark_dead_servers)
            server->IsDead(1);
        break;

    case Transport::Document_not_parsable:
        ref->DocState(Reference_noindex);
        if (debug)
            cout << " not Parsable" << endl;
        words.Skip();
        break;

    case Transport::Document_redirect:
        if (debug)
            cout << " redirect" << endl;
        ref->DocState(Reference_obsolete);
        words.Skip();
        got_redirect(doc->Redirected(), ref, (urlRef.GetReferer()).get());
        break;

    case Transport::Document_not_authorized:
        ref->DocState(Reference_not_found);
        if (debug)
            cout << " not authorized" << endl;
        words.Skip();
        break;

    case Transport::Document_not_local:
        ref->DocState(Reference_not_found);
        if (debug)
            cout << " not local" << endl;
        words.Skip();
        break;

    case Transport::Document_no_header:
        ref->DocState(Reference_not_found);
        if (debug)
            cout << " no header" << endl;
        words.Skip();
        break;

    case Transport::Document_connection_down:
        ref->DocState(Reference_not_found);
        if (debug)
            cout << " connection down" << endl;
        words.Skip();
        break;

    case Transport::Document_no_connection:
        ref->DocState(Reference_not_found);
        if (debug)
            cout << " no connection" << endl;
        words.Skip();
        break;

    case Transport::Document_not_recognized_service:
        ref->DocState(Reference_not_found);
        if (debug)
            cout << " service not recognized" << endl;
        // Mark the server as being down
        if (server && mark_dead_servers)
            server->IsDead(1);
        words.Skip();
        break;

    case Transport::Document_other_error:
        ref->DocState(Reference_not_found);
        if (debug)
            cout << " other error" << endl;
        words.Skip();
        break;
    }
    docs.Add(*ref);
    delete ref;
}
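
// Note on the two indexing paths used below and in got_word(): with
//
//     store_phrases: false
//
// (the attribute checked in the constructor), each distinct word is
// buffered once per document in words_to_add, OR-ing together the FLAG_*
// bits of every context it appears in; RetrievedDocument() then flushes
// that buffer. Otherwise every occurrence is written to the word
// database as it is seen.
//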

//*****************************************************************************
// void Retriever::RetrievedDocument(Document &doc, const String &url,
//                                   DocumentRef *ref)
//   We found a document that needs to be parsed. Since we don't know the
//   document type, we'll let the Document itself return an appropriate
//   Parsable object which we can call upon to parse the document contents.
//
void Retriever::RetrievedDocument(Document & doc, const String & url,
                                  DocumentRef * ref)
{
    n_links = 0;
    current_ref = ref;
    current_title = 0;
    word_context.Anchor(0);
    current_time = 0;
    current_head = 0;
    current_meta_dsc = 0;

    //
    // Create a parser object and let it have a go at the document.
    // We will pass ourselves as a callback object for all the got_*()
    // routines.
    // This will generate the Parsable object as a specific parser.
    //
    Parsable *parsable = doc.getParsable();
    if (parsable)
        parsable->parse(*this, *base);
    else
    {
        // If we didn't get a parser, then we should get rid of this!
        ref->DocState(Reference_noindex);
        return;
    }

    // If just storing the first occurrence of each word in a document,
    // we must now flush the words we saw in that document
    if (no_store_phrases)
    {
        DictionaryCursor cursor;
        char *key;
        HtWordReference wordRef;
        for (words_to_add.Start_Get(cursor);
             (key = words_to_add.Get_Next(cursor)); )
        {
            word_entry *entry = (word_entry *) (words_to_add[key]);

            wordRef.Location(entry->location);
            wordRef.Flags(entry->flags);
            wordRef.Word(key);
            words.Replace(WordReference::Merge(wordRef, entry->context));

            // How do I clean up properly?
            delete entry;
        }
        words_to_add.Release();
    }

    //
    // We don't need to dispose of the parsable object since it will
    // automatically be reused.
    //

    //
    // Update the document reference
    //
    ref->DocHead((char *) current_head);
    ref->DocMetaDsc((char *) current_meta_dsc);
    if (current_time == 0)
        ref->DocTime(doc.ModTime());
    else
        ref->DocTime(current_time);
    ref->DocTitle((char *) current_title);
    ref->DocSize(doc.Length());
    ref->DocAccessed(time(0));
    ref->DocLinks(n_links);
}


//*****************************************************************************
// int Retriever::Need2Get(const String &u)
//   Return TRUE if we need to retrieve the given url. This will
//   check the list of urls we have already visited.
//
int Retriever::Need2Get(const String & u)
{
    static String url;
    url = u;

    return !visited.Exists(url);
}
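
// The URL filters applied in IsValidURL() below are all driven by
// configuration attributes; an illustrative configuration (values made
// up) might look like:
//
//     bad_extensions: .gif .jpg .png
//     valid_extensions: .html .htm .txt
//     exclude_urls: /cgi-bin/ .cgi
//     bad_querystr: PHPSESSID=
//     limit_urls_to: http://www.example.com/
//
// The 'limits' and 'limitsn' lists checked below are built elsewhere,
// from the limit_urls_to and limit_normalized attributes.
//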

//*****************************************************************************
// int Retriever::IsValidURL(const String &u)
//   Return 1 if the given url passes all the configured limits;
//   otherwise return a negative HTDIG_ERROR_TESTURL_* code.
//
int Retriever::IsValidURL(const String & u)
{
    HtConfiguration *config = HtConfiguration::config();
    Dictionary invalids;
    Dictionary valids;
    URL aUrl(u);
    StringList tmpList;

    // A list of bad extensions, separated by spaces or tabs
    String t = config->Find(&aUrl, "bad_extensions");
    String lowerp;
    char *p = strtok(t, " \t");
    while (p)
    {
        // Extensions are case insensitive
        lowerp = p;
        lowerp.lowercase();
        invalids.Add(lowerp, 0);
        p = strtok(0, " \t");
    }

    //
    // Valid extensions are handled similarly
    //
    // A list of valid extensions, separated by spaces or tabs
    t = config->Find(&aUrl, "valid_extensions");
    p = strtok(t, " \t");
    while (p)
    {
        // Extensions are case insensitive
        lowerp = p;
        lowerp.lowercase();
        valids.Add(lowerp, 0);
        p = strtok(0, " \t");
    }

    static String url;
    url = u;

    //
    // If the URL contains any of the patterns in the exclude list,
    // mark it as invalid
    //
    String exclude_urls = config->Find(&aUrl, "exclude_urls");
    static String *prevexcludes = 0;
    static HtRegexList *excludes = 0;
    if (!excludes || !prevexcludes ||
        prevexcludes->compare(exclude_urls) != 0)
    {
        if (!excludes)
            excludes = new HtRegexList;
        if (prevexcludes)
            delete prevexcludes;
        prevexcludes = new String(exclude_urls);
        tmpList.Create(exclude_urls, " \t");
        excludes->setEscaped(tmpList, config->Boolean("case_sensitive"));
        tmpList.Destroy();
    }
    if (excludes->match(url, 0, 0) != 0)
    {
        if (debug > 2)
            cout << endl << " Rejected: item in exclude list ";
        return (HTDIG_ERROR_TESTURL_EXCLUDE);
    }

    //
    // If the URL has a query string and it is in the bad query list,
    // mark it as invalid
    //
    String bad_querystr = config->Find(&aUrl, "bad_querystr");
    static String *prevbadquerystr = 0;
    static HtRegexList *badquerystr = 0;
    if (!badquerystr || !prevbadquerystr ||
        prevbadquerystr->compare(bad_querystr) != 0)
    {
        if (!badquerystr)
            badquerystr = new HtRegexList;
        if (prevbadquerystr)
            delete prevbadquerystr;
        prevbadquerystr = new String(bad_querystr);
        tmpList.Create(bad_querystr, " \t");
        badquerystr->setEscaped(tmpList, config->Boolean("case_sensitive"));
        tmpList.Destroy();
    }
    char *ext = strrchr((char *) url, '?');
    if (ext && badquerystr->match(ext, 0, 0) != 0)
    {
        if (debug > 2)
            cout << endl << " Rejected: item in bad query list ";
        return (HTDIG_ERROR_TESTURL_BADQUERY);
    }

    //
    // See if the file extension is in the list of invalid ones
    //
    String urlpath = url.get();
    int parm = urlpath.indexOf('?');    // chop off URL parameter
    if (parm >= 0)
        urlpath.chop(urlpath.length() - parm);
    ext = strrchr((char *) urlpath.get(), '.');
    String lowerext;
    if (ext && strchr(ext, '/'))    // Ignore a dot if it's not in the
        ext = NULL;                 // final component of the path.
    if (ext)
    {
        lowerext.set(ext);
        lowerext.lowercase();
        if (invalids.Exists(lowerext))
        {
            if (debug > 2)
                cout << endl << " Rejected: Extension is invalid!";
            return (HTDIG_ERROR_TESTURL_EXTENSION);
        }
    }

    //
    // Or NOT in the list of valid ones
    //
    if (ext && valids.Count() > 0 && !valids.Exists(lowerext))
    {
        if (debug > 2)
            cout << endl << " Rejected: Extension is not valid!";
        return (HTDIG_ERROR_TESTURL_EXTENSION2);
    }

    //
    // If none of the limits is matched, we disallow the URL
    //
    if (limits.match(url, 1, 0) == 0)
    {
        if (debug > 1)
            cout << endl << " Rejected: URL not in the limits! ";
        return (HTDIG_ERROR_TESTURL_LIMITS);
    }

    //
    // Likewise if not in the list of normalized urls
    //
    // Warning!
    // This should be the last check, because of aUrl normalization.
    //
    // signature() implicitly normalizes the URL. Be efficient...
    Server *server = (Server *) servers[aUrl.signature()];
    // aUrl.normalize();
    if (limitsn.match(aUrl.get(), 1, 0) == 0)
    {
        if (debug > 2)
            cout << endl << " Rejected: not in \"limit_normalized\" list!";
        return (HTDIG_ERROR_TESTURL_LIMITSNORM);
    }

    //
    // After that gauntlet, check to see if the server allows it
    // (robots.txt)
    //
    if (server && server->IsDisallowed(url) != 0)
    {
        if (debug > 2)
            cout << endl << " Rejected: forbidden by server robots.txt!";
        return (HTDIG_ERROR_TESTURL_ROBOT_FORBID);
    }

    return (1);
}


//*****************************************************************************
// StringList* Retriever::GetLocal(const String &strurl)
//   Returns a list of strings containing the (possible) local filenames
//   of the given url, or 0 if it's definitely not local.
//   THE CALLER MUST FREE THE STRINGLIST AFTER USE!
//   Returned strings are not hex encoded.
//
StringList *Retriever::GetLocal(const String & strurl)
{
    HtConfiguration *config = HtConfiguration::config();
    static StringList *prefixes = 0;
    String url = strurl;
    static StringList *paths = 0;
    StringList *defaultdocs = 0;
    URL aUrl(url);
    url = aUrl.get();    // make sure we look at a parsed URL

    //
    // Initialize prefix/path list if this is the first time.
    // The list is given in format "prefix1=path1 prefix2=path2 ..."
    //
    if (!prefixes)
    {
        prefixes = new StringList();
        paths = new StringList();

        String t = config->Find("local_urls");
        char *p = strtok(t, " \t");
        while (p)
        {
            char *path = strchr(p, '=');
            if (!path)
            {
                p = strtok(0, " \t");
                continue;
            }
            *path++ = '\0';
            String *pre = new String(p);
            decodeURL(*pre);
            prefixes->Add(pre);
            String *pat = new String(path);
            decodeURL(*pat);
            paths->Add(pat);
            p = strtok(0, " \t");
        }
    }
    if (!config->Find(&aUrl, "local_default_doc").empty())
    {
        defaultdocs = new StringList();
        String t = config->Find(&aUrl, "local_default_doc");
        char *p = strtok(t, " \t");
        while (p)
        {
            String *def = new String(p);
            decodeURL(*def);
            defaultdocs->Add(def);
            p = strtok(0, " \t");
        }
        if (defaultdocs->Count() == 0)
        {
            delete defaultdocs;
            defaultdocs = 0;
        }
    }

    // Begin by hex-decoding the URL...
    String hexurl = url;
    decodeURL(hexurl);
    url = hexurl.get();

    // Check first for a local user...
    if (strchr(url.get(), '~'))
    {
        StringList *local = GetLocalUser(url, defaultdocs);
        if (local)
        {
            if (defaultdocs)
                delete defaultdocs;
            return local;
        }
    }
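
    //
    // For example, with the hypothetical settings
    //
    //     local_urls: http://www.example.com/=/var/www/html/
    //     local_default_doc: index.html
    //
    // the URL http://www.example.com/docs/ is mapped by the prefix loop
    // below to the candidate filename /var/www/html/docs/index.html.
    //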
    // This shouldn't happen, but check anyway...
    if (strstr(url.get(), ".."))
        return 0;

    String *prefix, *path;
    String *defaultdoc;
    StringList *local_names = new StringList();
    prefixes->Start_Get();
    paths->Start_Get();
    while ((prefix = (String *) prefixes->Get_Next()))
    {
        path = (String *) paths->Get_Next();
        if (mystrncasecmp((char *) *prefix, (char *) url,
                          prefix->length()) == 0)
        {
            int l = strlen(url.get()) - prefix->length() +
                    path->length() + 4;
            String *local = new String(*path, l);
            *local += &url[prefix->length()];
            if (local->last() == '/' && defaultdocs)
            {
                defaultdocs->Start_Get();
                while ((defaultdoc = (String *) defaultdocs->Get_Next()))
                {
                    String *localdefault =
                        new String(*local, local->length() +
                                   defaultdoc->length() + 1);
                    localdefault->append(*defaultdoc);
                    local_names->Add(localdefault);
                }
                delete local;
            }
            else
                local_names->Add(local);
        }
    }

    if (local_names->Count() > 0)
    {
        if (defaultdocs)
            delete defaultdocs;
        return local_names;
    }

    if (defaultdocs)
        delete defaultdocs;
    delete local_names;
    return 0;
}


//*****************************************************************************
// StringList* Retriever::GetLocalUser(const String &url,
//                                     StringList *defaultdocs)
//   If the URL has a ~user part, return a list of strings containing the
//   (possible) local filenames of the given url, or 0 if it's
//   definitely not local.
//   THE CALLER MUST FREE THE STRINGLIST AFTER USE!
//
StringList *Retriever::GetLocalUser(const String & url,
                                    StringList * defaultdocs)
{
    // NOTE: Native Windows does not have this construct for user Web files
#ifndef _MSC_VER /* _WIN32 */
    HtConfiguration *config = HtConfiguration::config();
    static StringList *prefixes = 0, *paths = 0, *dirs = 0;
    static Dictionary home_cache;
    URL aUrl(url);

    //
    // Initialize prefix/path list if this is the first time.
    // The list is given in format "prefix1=path1,dir1 ..."
    // If the path is zero-length, the user's home directory is looked up.
    //
    if (!prefixes)
    {
        prefixes = new StringList();
        paths = new StringList();
        dirs = new StringList();

        String t = config->Find("local_user_urls");
        char *p = strtok(t, " \t");
        while (p)
        {
            char *path = strchr(p, '=');
            if (!path)
            {
                p = strtok(0, " \t");
                continue;
            }
            *path++ = '\0';
            char *dir = strchr(path, ',');
            if (!dir)
            {
                p = strtok(0, " \t");
                continue;
            }
            *dir++ = '\0';
            String *pre = new String(p);
            decodeURL(*pre);
            prefixes->Add(pre);
            String *pat = new String(path);
            decodeURL(*pat);
            paths->Add(pat);
            String *ptd = new String(dir);
            decodeURL(*ptd);
            dirs->Add(ptd);
            p = strtok(0, " \t");
        }
    }
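
    //
    // For example, with the hypothetical setting
    //
    //     local_user_urls: http://www.example.com/=,/public_html/
    //
    // (empty path, so the home directory is looked up), the URL
    // http://www.example.com/~joe/a.html maps to
    // <joe's home dir>/public_html/a.html via getpwnam() below.
    //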
    // Can we do anything about this?
    if (!strchr(url, '~') || !prefixes->Count() || strstr(url, ".."))
        return 0;

    // Split the URL into components
    String tmp = url;
    char *name = strchr((char *) tmp, '~');
    *name++ = '\0';
    char *rest = strchr(name, '/');
    if (!rest || (rest - name <= 1) || (rest - name > 32))
        return 0;
    *rest++ = '\0';

    // Look it up in the prefix/path/dir table
    prefixes->Start_Get();
    paths->Start_Get();
    dirs->Start_Get();
    String *prefix, *path, *dir;
    String *defaultdoc;
    StringList *local_names = new StringList();
    while ((prefix = (String *) prefixes->Get_Next()))
    {
        path = (String *) paths->Get_Next();
        dir = (String *) dirs->Get_Next();
        if (mystrcasecmp((char *) *prefix, (char *) tmp) != 0)
            continue;

        String *local = new String;
        // No path, look up the home directory
        if (path->length() == 0)
        {
            String *home = (String *) home_cache[name];
            if (!home)
            {
                struct passwd *passwd = getpwnam(name);
                if (passwd)
                {
                    home = new String(passwd->pw_dir);
                    home_cache.Add(name, home);
                }
            }
            if (home)
                *local += *home;
            else
                continue;
        }
        else
        {
            *local += *path;
            *local += name;
        }
        *local += *dir;
        *local += rest;
        if (local->last() == '/' && defaultdocs)
        {
            defaultdocs->Start_Get();
            while ((defaultdoc = (String *) defaultdocs->Get_Next()))
            {
                String *localdefault =
                    new String(*local, local->length() +
                               defaultdoc->length() + 1);
                localdefault->append(*defaultdoc);
                local_names->Add(localdefault);
            }
            delete local;
        }
        else
            local_names->Add(local);
    }

    if (local_names->Count() > 0)
        return local_names;

    delete local_names;
#endif //_MSC_VER /* _WIN32 */
    return 0;
}


//*****************************************************************************
// int Retriever::IsLocalURL(const String &url)
//   Returns 1 if the given url has a (possible) local filename
//   or 0 if it's definitely not local.
//
int Retriever::IsLocalURL(const String & url)
{
    int ret;

    StringList *local_filename = GetLocal(url);
    ret = (local_filename != 0);
    if (local_filename)
        delete local_filename;

    return ret;
}


//*****************************************************************************
// void Retriever::got_word(const char *word, int location, int heading)
//   The location is normalized to be in the range 0 - 1000.
//
void Retriever::got_word(const char *word, int location, int heading)
{
    if (debug > 3)
        cout << "word: " << word << '@' << location << endl;

    if (heading >= (int) (sizeof(factor) / sizeof(factor[0])) || heading < 0)
        heading = 0;    // Assume it's just normal text

    if (trackWords && strlen(word) >= (unsigned int) minimumWordLength)
    {
        String w = word;
        HtWordReference wordRef;

        if (no_store_phrases)
        {
            // Add a new word, or mark an existing word as also being
            // at this heading level
            word_entry *entry;
            if ((entry = (word_entry *) words_to_add.Find(w)) == NULL)
            {
                words_to_add.Add(w, new word_entry(location,
                                                   factor[heading],
                                                   word_context));
            }
            else
            {
                entry->flags |= factor[heading];
            }
        }
        else
        {
            wordRef.Location(location);
            wordRef.Flags(factor[heading]);
            wordRef.Word(w);
            words.Replace(WordReference::Merge(wordRef, word_context));
        }
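
        //
        // For example, a token like "time-of-day" is re-split below on
        // punctuation: the individual parts ("time", "day", ...) are
        // indexed, and so are adjacent groups of parts with the
        // punctuation stripped ("timeof", "ofday", ...), provided each
        // candidate meets minimum_word_length.
        //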
        // Check for compound words...
        String parts = word;
        int added;
        int nparts = 1;
        do
        {
            added = 0;
            char *start = parts.get();
            char *punctp = 0, *nextp = 0, *p;
            char punct;
            int n;
            while (*start)
            {
                p = start;
                for (n = 0; n < nparts; n++)
                {
                    while (HtIsStrictWordChar((unsigned char) *p))
                        p++;
                    punctp = p;
                    if (!*punctp && n + 1 < nparts)
                        break;
                    while (*p && !HtIsStrictWordChar((unsigned char) *p))
                        p++;
                    if (n == 0)
                        nextp = p;
                }
                if (n < nparts)
                    break;

                punct = *punctp;
                *punctp = '\0';
                if (*start && (*p || start > parts.get()))
                {
                    w = start;
                    HtStripPunctuation(w);
                    if (w.length() >= minimumWordLength)
                    {
                        if (no_store_phrases)
                        {
                            // Add a new word, or mark an existing word
                            // as also being at this heading level
                            word_entry *entry;
                            if ((entry = (word_entry *)
                                 words_to_add.Find(w)) == NULL)
                            {
                                words_to_add.Add(w,
                                    new word_entry(location,
                                                   factor[heading],
                                                   word_context));
                            }
                            else
                            {
                                entry->flags |= factor[heading];
                            }
                        }
                        else
                        {
                            wordRef.Word(w);
                            words.Replace(WordReference::Merge(wordRef,
                                                               word_context));
                        }
                        if (debug > 3)
                            cout << "word part: " << start << '@'
                                 << location << endl;
                    }
                    added++;
                }
                start = nextp;
                *punctp = punct;
            }
            nparts++;
        } while (added > 2);
    }
}


//*****************************************************************************
// void Retriever::got_title(const char *title)
//
void Retriever::got_title(const char *title)
{
    if (debug > 1)
        cout << "\ntitle: " << title << endl;
    current_title = title;
}


//*****************************************************************************
// void Retriever::got_author(const char *author)
//
void Retriever::got_author(const char *author)
{
    if (debug > 1)
        cout << "\nauthor: " << author << endl;
    current_ref->DocAuthor(author);
}


//*****************************************************************************
// void Retriever::got_time(const char *time)
//
void Retriever::got_time(const char *time)
{
    HtDateTime new_time(current_time);

    if (debug > 1)
        cout << "\ntime: " << time << endl;

    //
    // As defined by the Dublin Core, this should be YYYY-MM-DD.
    // In the future, we'll need to deal with the scheme portion
    // in case someone picks a different format.
    //
    new_time.SetFTime(time, "%Y-%m-%d");
    current_time = new_time.GetTime_t();

    // If we can't convert it, current_time stays the same and we get
    // the default--the date returned by the server...
}


//*****************************************************************************
// void Retriever::got_anchor(const char *anchor)
//
void Retriever::got_anchor(const char *anchor)
{
    if (debug > 2)
        cout << "anchor: " << anchor << endl;
    current_ref->AddAnchor(anchor);
    word_context.Anchor(word_context.Anchor() + 1);
}


//*****************************************************************************
// void Retriever::got_image(const char *src)
//
void Retriever::got_image(const char *src)
{
    URL url(src, *base);
    const char *image = (const char *) url.get();

    if (debug > 2)
        cout << "image: " << image << endl;

    if (images_seen)
        fprintf(images_seen, "%s\n", image);
}


//*****************************************************************************
// void Retriever::got_href(URL &url, const char *description, int hops)
//
void Retriever::got_href(URL & url, const char *description, int hops)
{
    DocumentRef *ref = 0;
    Server *server = 0;
    int valid_url_code = 0;

    // Rewrite the URL (if need be) before we do anything to it.
    url.rewrite();

    if (debug > 2)
        cout << "href: " << url.get() << " (" << description << ')' << endl;

    n_links++;

    if (urls_seen)
        fprintf(urls_seen, "%s\n", (const char *) url.get());

    //
    // Check if this URL falls within the valid range of URLs.
    //
    valid_url_code = IsValidURL(url.get());
    if (valid_url_code > 0)
    {
        //
        // It is valid. Normalize it (resolve cnames for the server)
        // and check again...
        //
        if (debug > 2)
        {
            cout << "resolving '" << url.get() << "'\n";
            cout.flush();
        }

        url.normalize();

        // If it is a backlink from the current document,
        // just update that field. Writing to the database
        // is meaningless, as it will be overwritten.
        // Adding it as a new document may even be harmful, as
        // that will be a duplicate. This can happen if the
        // current document has never been referenced before, as
        // with a start_url.
        if (strcmp(url.get(), current_ref->DocURL()) == 0)
        {
            current_ref->DocBackLinks(current_ref->DocBackLinks() + 1);
            current_ref->AddDescription(description, words);
        }
        else
        {
            //
            // First add it to the document database
            //
            ref = docs[url.get()];
            // if ref exists we have to call AddDescription even
            // if max_hop_count is reached
            if (!ref && currenthopcount + hops > max_hop_count)
                return;

            if (!ref)
            {
                //
                // Didn't see this one yet. Create a new reference
                // for it with a unique document ID
                //
                ref = new DocumentRef;
                ref->DocID(docs.NextDocID());
                ref->DocHopCount(currenthopcount + hops);
                ref->DocURL(url.get());
            }
            ref->DocBackLinks(ref->DocBackLinks() + 1);    // This one!
            ref->AddDescription(description, words);

            //
            // If the dig is restricting by hop count, perform the
            // check here too
            //
            if (currenthopcount + hops > max_hop_count)
            {
                delete ref;
                return;
            }

            if (ref->DocHopCount() > currenthopcount + hops)
                ref->DocHopCount(currenthopcount + hops);

            docs.Add(*ref);

            //
            // Now put it in the list of URLs to still visit.
            //
            if (Need2Get(url.get()))
            {
                if (debug > 1)
                    cout << "\n pushing " << url.get() << endl;
                server = (Server *) servers[url.signature()];
                if (!server)
                {
                    //
                    // Hadn't seen this server yet. Register it.
                    //
                    String robotsURL = url.signature();
                    robotsURL << "robots.txt";
                    StringList *localRobotsFile = GetLocal(robotsURL.get());

                    server = new Server(url, localRobotsFile);
                    servers.Add(url.signature(), server);
                    delete localRobotsFile;
                }

                //
                // Let's just be sure we're not pushing an empty URL
                //
                if (strlen(url.get()))
                    server->push(url.get(), ref->DocHopCount(), base->get(),
                                 IsLocalURL(url.get()));

                String temp = url.get();
                visited.Add(temp, 0);
                if (debug)
                    cout << '+';
            }
            else if (debug)
                cout << '*';
            delete ref;
        }
    }
    else
    {
        //
        // Not a valid URL
        //
        if (debug > 1)
            cout << "\nurl rejected: (level 1) " << url.get() << endl;
        if (debug == 1)
            cout << '-';

        if (urls_seen)
        {
            fprintf(urls_seen, "%s|||||%d\n",
                    (const char *) url.get(), valid_url_code);
        }
    }
    if (debug)
        cout.flush();
}
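
// Note: the target of a redirect may be relative, which is why
// got_redirect() below resolves it against the old document's URL first;
// e.g. a redirect from http://www.example.com/a/b.html to "c.html"
// (hypothetical URLs) resolves to http://www.example.com/a/c.html.
//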
//*****************************************************************************
// void Retriever::got_redirect(const char *new_url, DocumentRef *old_ref,
//                              const char *referer)
//
void Retriever::got_redirect(const char *new_url, DocumentRef * old_ref,
                             const char *referer)
{
    // First we must piece together the new URL, which may be relative
    URL parent(old_ref->DocURL());
    URL url(new_url, parent);

    // Rewrite the URL (if need be) before we do anything to it.
    url.rewrite();

    if (debug > 2)
        cout << "redirect: " << url.get() << endl;

    n_links++;

    if (urls_seen)
        fprintf(urls_seen, "%s\n", (const char *) url.get());

    //
    // Check if this URL falls within the valid range of URLs.
    //
    if (IsValidURL(url.get()) > 0)
    {
        //
        // It is valid. Normalize it (resolve cnames for the server)
        // and check again...
        //
        if (debug > 2)
        {
            cout << "resolving '" << url.get() << "'\n";
            cout.flush();
        }

        url.normalize();
        //
        // First add it to the document database
        //
        DocumentRef *ref = docs[url.get()];
        if (!ref)
        {
            //
            // Didn't see this one yet. Create a new reference
            // for it with a unique document ID
            //
            ref = new DocumentRef;
            ref->DocID(docs.NextDocID());
            ref->DocHopCount(currenthopcount);
        }
        ref->DocURL(url.get());

        //
        // Copy the descriptions of the old DocRef to this one
        //
        List *d = old_ref->Descriptions();
        if (d)
        {
            d->Start_Get();
            String *str;
            while ((str = (String *) d->Get_Next()))
            {
                ref->AddDescription(str->get(), words);
            }
        }
        if (ref->DocHopCount() > old_ref->DocHopCount())
            ref->DocHopCount(old_ref->DocHopCount());

        // Copy the number of backlinks
        ref->DocBackLinks(old_ref->DocBackLinks());

        docs.Add(*ref);

        //
        // Now put it in the list of URLs to still visit.
        //
        if (Need2Get(url.get()))
        {
            if (debug > 1)
                cout << " pushing " << url.get() << endl;
            Server *server = (Server *) servers[url.signature()];
            if (!server)
            {
                //
                // Hadn't seen this server yet. Register it.
                //
                String robotsURL = url.signature();
                robotsURL << "robots.txt";
                StringList *localRobotsFile = GetLocal(robotsURL.get());

                server = new Server(url, localRobotsFile);
                servers.Add(url.signature(), server);
                delete localRobotsFile;
            }
            if (!referer || strlen(referer) == 0)
                server->push(url.get(), ref->DocHopCount(), base->get(),
                             IsLocalURL(url.get()), 0);
            else
                server->push(url.get(), ref->DocHopCount(), referer,
                             IsLocalURL(url.get()), 0);

            String temp = url.get();
            visited.Add(temp, 0);
        }

        delete ref;
    }
}


//*****************************************************************************
// void Retriever::got_head(const char *head)
//
void Retriever::got_head(const char *head)
{
    if (debug > 4)
        cout << "head: " << head << endl;
    current_head = head;
}


//*****************************************************************************
// void Retriever::got_meta_dsc(const char *md)
//
void Retriever::got_meta_dsc(const char *md)
{
    if (debug > 4)
        cout << "meta description: " << md << endl;
    current_meta_dsc = md;
}


//*****************************************************************************
// void Retriever::got_meta_email(const char *e)
//
void Retriever::got_meta_email(const char *e)
{
    if (debug > 1)
        cout << "\nmeta email: " << e << endl;
    current_ref->DocEmail(e);
}


//*****************************************************************************
// void Retriever::got_meta_notification(const char *e)
//
void Retriever::got_meta_notification(const char *e)
{
    if (debug > 1)
        cout << "\nmeta notification date: " << e << endl;
    current_ref->DocNotification(e);
}


//*****************************************************************************
// void Retriever::got_meta_subject(const char *e)
//
void Retriever::got_meta_subject(const char *e)
{
    if (debug > 1)
        cout << "\nmeta subject: " << e << endl;
    current_ref->DocSubject(e);
}


//*****************************************************************************
// void Retriever::got_noindex()
//
void Retriever::got_noindex()
{
    if (debug > 1)
        cout << "\nMETA ROBOT: Noindex " << current_ref->DocURL() << endl;
    current_ref->DocState(Reference_noindex);
}


//*****************************************************************************
// void Retriever::recordNotFound(const String &url, const String &referer,
//                                int reason)
//
void Retriever::recordNotFound(const String & url, const String & referer,
                               int reason)
{
    const char *message = "";

    switch (reason)
    {
    case Transport::Document_not_found:
        message = "Not found";
        break;

    case Transport::Document_no_host:
        message = "Unknown host or unable to contact server";
        break;

    case Transport::Document_no_port:
        message = "Unknown host or unable to contact server (port)";
        break;

    default:
        break;
    }

    notFound << message << ": " << url << " Ref: " << referer << '\n';
}

//*****************************************************************************
// void Retriever::ReportStatistics(const String &name)
//
void Retriever::ReportStatistics(const String & name)
{
    HtConfiguration *config = HtConfiguration::config();
    cout << name << ": Run complete\n";
    cout << name << ": " << servers.Count() << " server";
    if (servers.Count() > 1)
        cout << "s";
    cout << " seen:\n";

    Server *server;
    String buffer;
    StringList results;
    String newname = name;

    newname << ": ";

    servers.Start_Get();
    while ((server = (Server *) servers.Get_NextElement()))
    {
        buffer = 0;
        server->reportStatistics(buffer, newname);
        results.Add(buffer);
    }
    results.Sort();

    for (int i = 0; i < results.Count(); i++)
    {
        cout << results[i] << "\n";
    }

    if (notFound.length() > 0)
    {
        cout << "\n" << name << ": Errors to take note of:\n";
        cout << notFound;
    }

    cout << endl;

    // Report HTTP connection statistics
    cout << "HTTP statistics" << endl;
    cout << "===============" << endl;

    if (config->Boolean("persistent_connections"))
    {
        cout << " Persistent connections : Yes" << endl;

        if (config->Boolean("head_before_get"))
            cout << " HEAD call before GET : Yes" << endl;
        else
            cout << " HEAD call before GET : No" << endl;
    }
    else
    {
        cout << " Persistent connections : No" << endl;
    }

    HtHTTP::ShowStatistics(cout) << endl;
}
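
// For reference, the summary block printed above looks roughly like this
// (per-server statistics and HtHTTP output elided; the name "htdig" is
// assumed for illustration):
//
//     htdig: Run complete
//     htdig: 2 servers seen:
//     ... sorted per-server statistics ...
//
//     HTTP statistics
//     ===============
//      Persistent connections : Yes
//      HEAD call before GET : No
//     ...
//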