//
// Server.cc
//
// Server: A class to keep track of server specific information.
//
// Part of the ht://Dig package   <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: Server.cc,v 1.29 2004/05/28 13:15:16 lha Exp $
//

#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#include "htdig.h"
#include "Server.h"
#include "good_strtok.h"
#include "htString.h"
#include "URL.h"
#include "Document.h"
#include "URLRef.h"
#include "Transport.h"
#include "HtHTTP.h"     // for checking persistent connections
#include "StringList.h"

#include <ctype.h>      // for isspace()

#include "defaults.h"

//*****************************************************************************
// Server::Server(URL u, StringList *local_robots_files)
//   u is the base URL for this server
//
Server::Server(URL u, StringList *local_robots_files)
:
    _host(u.host()),
    _port(u.port()),
    _bad_server(0),
    _documents(0),
    _accept_language(0)
{
    HtConfiguration *config = HtConfiguration::config();

    if (debug)
        cout << endl << "New server: " << _host << ", " << _port << endl;

    // We take it from the configuration
    _persistent_connections = config->Boolean("server", _host.get(), "persistent_connections");
    _head_before_get = config->Boolean("server", _host.get(), "head_before_get");
    _max_documents = config->Value("server", _host.get(), "server_max_docs");
    _connection_space = config->Value("server", _host.get(), "server_wait_time");
    _user_agent = config->Find("server", _host.get(), "user_agent");
    _disable_cookies = config->Boolean("server", _host.get(), "disable_cookies");

    // Accept-Language directive
    StringList _accept_language_list(config->Find("server", _host.get(), "accept_language"), " \t");

    _accept_language.trunc();   // maybe not needed
    for (int i = 0; i < _accept_language_list.Count(); i++)
    {
        if (i > 0)
            _accept_language << ",";    // for multiple choices
        _accept_language << _accept_language_list[i];
    }

    // Timeout setting
    _timeout = config->Value("server", _host.get(), "timeout");

    // Number of consecutive attempts to establish a TCP connection
    _tcp_max_retries = config->Value("server", _host.get(), "tcp_max_retries");

    // Seconds to wait after a timeout occurs
    _tcp_wait_time = config->Value("server", _host.get(), "tcp_wait_time");

    if (debug > 1)
    {
        cout << " - Persistent connections: " << (_persistent_connections ? "enabled" : "disabled") << endl;
        cout << " - HEAD before GET: " << (_head_before_get ? "enabled" : "disabled") << endl;
        cout << " - Timeout: " << _timeout << endl;
        cout << " - Connection space: " << _connection_space << endl;
        cout << " - Max Documents: " << _max_documents << endl;
        cout << " - TCP retries: " << _tcp_max_retries << endl;
        cout << " - TCP wait time: " << _tcp_wait_time << endl;
        cout << " - Accept-Language: " << _accept_language << endl;
    }

    _last_connection.SettoNow();

    // For getting robots.txt
    if (strcmp(u.service(), "http") == 0 || strcmp(u.service(), "https") == 0)
    {
        //
        // Attempt to get a robots.txt file from the specified server
        //
        String url;
        url.trunc();

        if (debug > 1)
            cout << "Trying to retrieve robots.txt file" << endl;

        url << u.signature() << "robots.txt";

        static int local_urls_only = config->Boolean("local_urls_only");
        time_t timeZero = 0;    // Right now we want to get this every time
        Document doc(url, 0);
        Transport::DocStatus status;

        if (local_robots_files)
        {
            if (debug > 1)
                cout << "Trying local files" << endl;
            status = doc.RetrieveLocal(timeZero, local_robots_files);

            if (status == Transport::Document_not_local)
            {
                if (local_urls_only)
                    status = Transport::Document_not_found;
                else
                {
                    if (debug > 1)
                        cout << "Local retrieval failed, trying HTTP" << endl;
                    status = doc.Retrieve(this, timeZero);
                }
            }
        }
        else if (!local_urls_only)
        {
            status = doc.Retrieve(this, timeZero);

            // Let's check if persistent connections are both
            // allowed by the configuration and possible after
            // having requested the robots.txt file.
            HtHTTP *http;
            if (IsPersistentConnectionAllowed() && (http = doc.GetHTTPHandler()))
            {
                if (!http->isPersistentConnectionPossible())
                    _persistent_connections = 0;    // not possible. Let's disable
                                                    // them on this server.
            }
        }
        else
            status = Transport::Document_not_found;

        switch (status)
        {
        case Transport::Document_ok:
            //
            // Found a robots.txt file.  Go parse it.
            //
            robotstxt(doc);
            break;

        case Transport::Document_not_found:
        case Transport::Document_not_parsable:
        case Transport::Document_redirect:
        case Transport::Document_not_authorized:
            //
            // These cases are for when there is no robots.txt file.
            // We will just go on happily without restrictions.
            //
            break;

        case Transport::Document_no_host:
        default:
            //
            // In all other cases the server could not be reached.
            // We will remember this fact so that no more attempts to
            // contact this server will be made.
            //
            _bad_server = 1;
            break;
        }   // end switch
    }   // end if (http || https)
}


// Copy constructor
Server::Server(const Server &rhs)
:
    _host(rhs._host),
    _port(rhs._port),
    _bad_server(rhs._bad_server),
    _connection_space(rhs._connection_space),
    _last_connection(rhs._last_connection),
    _paths(rhs._paths),
    _disallow(rhs._disallow),
    _documents(rhs._documents),
    _max_documents(rhs._max_documents),
    _persistent_connections(rhs._persistent_connections),
    _head_before_get(rhs._head_before_get),
    _disable_cookies(rhs._disable_cookies),
    _timeout(rhs._timeout),
    _tcp_wait_time(rhs._tcp_wait_time),
    _tcp_max_retries(rhs._tcp_max_retries),
    _user_agent(rhs._user_agent),
    _accept_language(rhs._accept_language)
{
}


//*****************************************************************************
// Server::~Server()
//
Server::~Server()
{
}


//*****************************************************************************
// void Server::robotstxt(Document &doc)
//   This will parse the robots.txt file which is contained in the document.
//
void Server::robotstxt(Document &doc)
{
    HtConfiguration *config = HtConfiguration::config();
    String  contents = doc.Contents();
    int     length;
    int     pay_attention = 0;
    String  pattern;
    String  myname = config->Find("server", _host.get(), "robotstxt_name");
    int     seen_myname = 0;
    char    *name, *rest;

    if (debug > 1)
        cout << "Parsing robots.txt file using myname = " << myname << "\n";

    //
    // Go through the lines in the file and determine if we need to
    // pay attention to them
    //
    for (char *line = strtok(contents, "\r\n"); line; line = strtok(0, "\r\n"))
    {
        if (debug > 2)
            cout << "Robots.txt line: " << line << endl;

        //
        // Strip comments
        //
        if (strchr(line, '#'))
        {
            *(strchr(line, '#')) = '\0';
        }

        name = good_strtok(line, ':');
        if (!name)
            continue;
        while (name && isspace(*name))
            name++;

        rest = good_strtok(NULL, '\r');
        if (!rest)
            rest = "";
        while (rest && isspace(*rest))
            rest++;

        // Trim trailing whitespace from the value
        length = strlen(rest);
        if (length > 0)
        {
            while (length > 0 && isspace(rest[length - 1]))
                length--;
            rest[length] = '\0';
        }

        if (mystrcasecmp(name, "user-agent") == 0)
        {
            if (debug > 1)
                cout << "Found 'user-agent' line: " << rest << endl;

            if (*rest == '*' && !seen_myname)
            {
                //
                // This matches all search engines...
                //
                pay_attention = 1;
            }
            else if (mystrncasecmp(rest, (char *)myname, myname.length()) == 0)
            {
                //
                // This is for us!
                // This will override any previous patterns
                // that may have been set.
                //
                if (!seen_myname)   // only take first section with our name
                {
                    seen_myname = 1;
                    pay_attention = 1;
                    pattern = 0;    // ignore previous User-agent: *
                }
                else
                    pay_attention = 0;
            }
            else
            {
                //
                // This doesn't concern us
                //
                pay_attention = 0;
            }
        }
        else if (pay_attention && mystrcasecmp(name, "disallow") == 0)
        {
            if (debug > 1)
                cout << "Found 'disallow' line: " << rest << endl;

            //
            // Add this path to our list to ignore
            //
            if (*rest)
            {
                if (pattern.length())
                    pattern << '|';
                while (*rest)
                {
                    // Escape characters that are special in regular expressions
                    if (strchr("^.[$()|*+?{\\", *rest))
                        pattern << '\\';
                    pattern << *rest++;
                }
            }
        }
        //
        // Ignore anything else (comments)
        //
    }

    //
    // Compile the pattern (if any...)
    //
    if (debug > 1)
        cout << "Pattern: " << pattern << endl;

    // Empty "disallow" allows all, so don't make entry which matches all.
    if (!pattern.empty())
    {
        String fullpatt = "^[^:]*://[^/]*(";
        fullpatt << pattern << ')';
        _disallow.set(fullpatt, config->Boolean("case_sensitive"));
    }
}


//*****************************************************************************
// void Server::push(const String &path, int hopcount, const String &referer,
//                   int local, int newDoc)
//
void Server::push(const String &path, int hopcount, const String &referer,
                  int local, int newDoc)
{
    if (_bad_server && !local)
        return;

    if (IsDisallowed(path) != 0)
    {
        if (debug > 2)
            cout << endl << "   Rejected: forbidden by server robots.txt!";
        return;
    }

    // We use -1 as no limit, but we also don't want
    // to forbid redirects from old places
    if (_max_documents != -1 && newDoc && _documents >= _max_documents)
    {
        if (debug > 2)  // Hey! we only want to get max_docs
            cout << "Limit of " << _max_documents << " reached for " << _host << endl;
        return;
    }

    URLRef *ref = new URLRef();
    ref->SetURL(path);
    ref->SetHopCount(hopcount);
    ref->SetReferer(referer);
    _paths.Add(ref);

    if (newDoc)
        _documents++;

    // cout << "***** pushing '" << path << "' with '" << referer << "'\n";
}


//*****************************************************************************
// URLRef *Server::pop()
//
URLRef *Server::pop()
{
    URLRef *ref = (URLRef *) _paths.Remove();

    if (!ref)
        return 0;

    return ref;
}


//*****************************************************************************
// void Server::delay()
//
// Keeps track of how long it's been since we've seen this server
// and calls sleep if necessary
//
void Server::delay()
{
    HtDateTime now;
    int time_taken = HtDateTime::GetDiff(now, _last_connection);    // arg1 - arg2 > 0

    if (time_taken < _connection_space)
        sleep(_connection_space - time_taken);

    now.SettoNow();
    _last_connection = now;     // Reset the clock for the next delay!

    return;
}


//*****************************************************************************
// void Server::reportStatistics(String &out, char *name)
//
void Server::reportStatistics(String &out, char *name)
{
    out << name << " " << _host << ":" << _port;
    out << " " << _documents << " document";
    if (_documents != 1)
        out << "s";
}
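

//*****************************************************************************
// Usage sketch (illustrative only, not part of the original ht://Dig code):
// a minimal view of how a crawling loop would drive this class.  The real
// caller in ht://Dig is the Retriever, which is considerably more involved.
// crawl_one_server is a hypothetical name, and the sketch assumes URL::get()
// returns the full URL as a String; it is wrapped in "#if 0" so it is never
// compiled.
//
#if 0
static void crawl_one_server(URL base, StringList *local_robots_files)
{
    // The constructor fetches and parses robots.txt for this server,
    // filling _disallow and possibly marking the server as bad.
    Server server(base, local_robots_files);

    // Queue a start URL: hopcount 0, empty referer, not local, new document.
    server.push(base.get(), 0, "", 0, 1);

    URLRef *ref;
    while ((ref = server.pop()) != 0)
    {
        server.delay();     // honor server_wait_time between requests
        // ... retrieve the URL held by *ref, push any discovered links ...
        delete ref;
    }
}
#endif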