diff options
author | Slávek Banko <slavek.banko@axis.cz> | 2021-11-05 13:28:23 +0100 |
---|---|---|
committer | Slávek Banko <slavek.banko@axis.cz> | 2021-11-05 13:28:23 +0100 |
commit | 8c787c3591c1c885b91a54128835b400858c5cca (patch) | |
tree | eca1b776912a305c4d45b3964038278a2fae1ead /debian/htdig/htdig-3.2.0b6/htcommon/URL.cc | |
parent | fe188b907cdf30dfdfe0eba9412e7f8749fec158 (diff) | |
download | extra-dependencies-8c787c3591c1c885b91a54128835b400858c5cca.tar.gz extra-dependencies-8c787c3591c1c885b91a54128835b400858c5cca.zip |
DEB htdig: Added to repository.
Signed-off-by: Slávek Banko <slavek.banko@axis.cz>
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htcommon/URL.cc')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/htcommon/URL.cc | 936 |
1 files changed, 936 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htcommon/URL.cc b/debian/htdig/htdig-3.2.0b6/htcommon/URL.cc new file mode 100644 index 00000000..9ccbe5d5 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htcommon/URL.cc @@ -0,0 +1,936 @@ +// +// URL.cc +// +// URL: A URL parsing class, implementing as closely as possible the standard +// laid out in RFC2396 (e.g. http://www.faqs.org/rfcs/rfc2396.html) +// including support for multiple services. (schemes in the RFC) +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: URL.cc,v 1.16 2004/06/04 08:51:01 angusgb Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "URL.h" +#include "QuotedStringList.h" +#include "Dictionary.h" +#include "HtConfiguration.h" +#include "StringMatch.h" +#include "StringList.h" +#include "HtURLRewriter.h" + +#include <string.h> +#include <stdlib.h> +#include <stdio.h> + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +#include <sys/types.h> +#include <ctype.h> + +#ifndef _MSC_VER /* _WIN32 */ +#include <sys/socket.h> +#include <netdb.h> +#include <arpa/inet.h> +#endif + +#define NNTP_DEFAULT_PORT 119 + +static Dictionary *slashCount = 0; + +//***************************************************************************** +// URL::URL() +// Default Constructor +// +URL::URL() +: _url(0), + _path(0), + _service(0), + _host(0), + _port(0), + _normal(0), + _hopcount(0), + _signature(0), + _user(0) +{ +} + + +//***************************************************************************** +// URL::URL(const URL& rhs) +// Copy constructor +// +URL::URL(const URL& rhs) +: _url(rhs._url), + _path(rhs._path), + _service(rhs._service), + _host(rhs._host), + _port(rhs._port), + _normal(rhs._normal), + _hopcount(rhs._hopcount), + _signature(rhs._signature), + _user(rhs._user) +{ +} + + +//***************************************************************************** +// URL::URL(const String &nurl) +// Construct a URL from a String (obviously parses the string passed in) +// +URL::URL(const String &nurl) +: _url(0), + _path(0), + _service(0), + _host(0), + _port(0), + _normal(0), + _hopcount(0), + _signature(0), + _user(0) +{ + parse(nurl); +} + + +//***************************************************************************** +// Assignment operator +const URL &URL::operator = (const URL &rhs) +{ + if (this == &rhs) + return *this; + + // Copy the attributes + _url = rhs._url; + _path = rhs._path; + _service = rhs._service; + _host = rhs._host; + _port = rhs._port; + _normal = rhs._normal; + _hopcount = rhs._hopcount; + _signature = rhs._signature; + _user = rhs._user; + + return *this; +} + +//***************************************************************************** +// URL::URL(const String &url, const URL &parent) +// Parse a reference given a parent url. This is needed to resolve relative +// references which do NOT have a full url. +// +URL::URL(const String &url, const URL &parent) +: _url(0), + _path(0), + _service(parent._service), + _host(parent._host), + _port(parent._port), + _normal(parent._normal), + _hopcount(parent._hopcount + 1), // Since this is one hop *after* the parent, we should account for this + _signature(parent._signature), + _user(parent._user) +{ + HtConfiguration* config= HtConfiguration::config(); + int allowspace = config->Boolean("allow_space_in_url", 0); + String temp; + const char *urp = url.get(); + while (*urp) + { + if (*urp == ' ' && temp.length() > 0 && allowspace) + { + // Replace space character with %20 if there's more non-space + // characters to come... + const char *s = urp+1; + while (*s && isspace(*s)) + s++; + if (*s) + temp << "%20"; + } + else if (!isspace(*urp)) + temp << *urp; + urp++; + } + char* ref = temp; + + // + // Strip any optional anchor from the reference. If, however, the + // reference contains CGI parameters after the anchor, the parameters + // will be moved left to replace the anchor. The overall effect is that + // the anchor is removed. + // Thanks goes to David Filiatrault <dwf@WebThreads.Com> for suggesting + // this removal process. + // + char *anchor = strchr(ref, '#'); + char *params = strchr(ref, '?'); + if (anchor) + { + *anchor = '\0'; + if (params) + { + if (anchor < params) + { + while (*params) + { + *anchor++ = *params++; + } + *anchor = '\0'; + } + } + } + + // + // If, after the removal of a possible '#' we have nothing left, + // we just want to use the base URL (we're on the same page but + // different anchors) + // + if (!*ref) + { + // We've already copied much of the info + _url = parent._url; + _path = parent._path; + // Since this is on the same page, we want the same hopcount + _hopcount = parent._hopcount; + return; + } + + // OK, now we need to work out what type of child URL this is + char *p = ref; + while (isalpha(*p)) // Skip through the service portion + p++; + int hasService = (*p == ':'); + // Why single out http? Shouldn't others be the same? + // Child URL of the form https:/child or ftp:child called "full" + // How about using slashes()? + if (hasService && ((strncmp(ref, "http://", 7) == 0) || + (strncmp(ref, "http:", 5) != 0))) + { + // + // No need to look at the parent url since this is a complete url... + // + parse(ref); + } + else if (strncmp(ref, "//", 2) == 0) + { + // look at the parent url's _service, to make this is a complete url... + String fullref(parent._service); + fullref << ':' << ref; + parse((char*)fullref); + } + else + { + if (hasService) + ref = p + 1; // Relative URL, skip "http:" + + if (*ref == '/') + { + // + // The reference is on the same server as the parent, but + // an absolute path was given... + // + _path = ref; + + // + // Get rid of loop-causing constructs in the path + // + normalizePath(); + } + else + { + // + // The reference is relative to the parent + // + + _path = parent._path; + int i = _path.indexOf('?'); + if (i >= 0) + { + _path.chop(_path.length() - i); + } + + // + // Remove any leading "./" sequences which could get us into + // recursive loops. + // + while (strncmp(ref, "./", 2) == 0) + ref += 2; + + if (_path.last() == '/') + { + // + // Parent was a directory. Easy enough: just append + // the current ref to it + // + _path << ref; + } + else + { + // + // Parent was a file. We need to strip the last part + // of the path before we add the reference to it. + // + String temp = _path; + p = strrchr((char*)temp, '/'); + if (p) + { + p[1] = '\0'; + _path = temp.get(); + _path << ref; + } + else + { + // + // Something must be wrong since there were no '/' + // found in the parent url. + // + // We do nothing here. The new url is the parent. + // + } + } + + // + // Get rid of loop-causing constructs in the path + // + normalizePath(); + } + + // + // Build the url. (Note, the host name has NOT been normalized!) + // No need for this if we have called URL::parse. + // + constructURL(); + } +} + + +//***************************************************************************** +// void URL::rewrite() +// +void URL::rewrite() +{ + if (HtURLRewriter::instance()->replace(_url) > 0) + parse(_url.get()); +} + + +//***************************************************************************** +// void URL::parse(const String &u) +// Given a URL string, extract the service, host, port, and path from it. +// +void URL::parse(const String &u) +{ + HtConfiguration* config= HtConfiguration::config(); + int allowspace = config->Boolean("allow_space_in_url", 0); + String temp; + const char *urp = u.get(); + while (*urp) + { + if (*urp == ' ' && temp.length() > 0 && allowspace) + { + // Replace space character with %20 if there's more non-space + // characters to come... + const char *s = urp+1; + while (*s && isspace(*s)) + s++; + if (*s) + temp << "%20"; + } + else if (!isspace(*urp)) + temp << *urp; + urp++; + } + char *nurl = temp; + + // + // Ignore any part of the URL that follows the '#' since this is just + // an index into a document. + // + char *p = strchr(nurl, '#'); + if (p) + *p = '\0'; + + // Some members need to be reset. If not, the caller would + // have used URL::URL(char *ref, URL &parent) + // (which may call us, if the URL is found to be absolute). + _normal = 0; + _signature = 0; + _user = 0; + + // + // Extract the service + // + p = strchr(nurl, ':'); + if (p) + { + _service = strtok(nurl, ":"); + p = strtok(0, "\n"); + } + else + { + _service = "http"; + p = strtok(nurl, "\n"); + } + _service.lowercase(); + + // + // Extract the host + // + if (!p || strncmp(p, "//", 2) != 0) + { + // No host specified, it's all a path. + _host = 0; + _port = 0; + _url = 0; + if (p) // if non-NULL, skip (some) leading slashes in path + { + int i; + for (i = slashes (_service); i > 0 && *p == '/'; i--) + p++; + if (i) // if fewer slashes than specified for protocol don't + // delete any. -> Backwards compatible (necessary??) + p -= slashes (_service) - i; + } + _path = p; + if (strcmp((char*)_service, "file") == 0 || slashes (_service) < 2) + _host = "localhost"; + } + else + { + p += 2; + + // + // p now points to the host + // + char *q = strchr(p, ':'); + char *slash = strchr(p, '/'); + + _path = "/"; + if (strcmp((char*)_service, "file") == 0) + { + // These should be of the form file:/// (i.e. no host) + // if there is a file://host/path then strip the host + if (strncmp(p, "/", 1) != 0) + { + p = strtok(p, "/"); + _path << strtok(0, "\n"); + } + else + _path << strtok(p+1, "\n"); // _path is "/" - don't double + _host = "localhost"; + _port = 0; + } + else if (q && ((slash && slash > q) || !slash)) + { + _host = strtok(p, ":"); + p = strtok(0, "/"); + if (p) + _port = atoi(p); + if (!p || _port <= 0) + _port = DefaultPort(); + // + // The rest of the input string is the path. + // + _path << strtok(0, "\n"); + + } + else + { + _host = strtok(p, "/"); + _host.chop(" \t"); + _port = DefaultPort(); + + // + // The rest of the input string is the path. + // + _path << strtok(0, "\n"); + + } + + // Check to see if host contains a user@ portion + int atMark = _host.indexOf('@'); + if (atMark != -1) + { + _user = _host.sub(0, atMark); + _host = _host.sub(atMark + 1); + } + } + + // + // Get rid of loop-causing constructs in the path + // + normalizePath(); + + // + // Build the url. (Note, the host name has NOT been normalized!) + // + constructURL(); +} + + +//***************************************************************************** +// void URL::normalizePath() +// Called from: URL(const String &url, const URL &parent) +// +void URL::normalizePath() +{ + // + // Rewrite the path to be the minimal. + // Remove "//", "/../" and "/./" components + // + HtConfiguration* config= HtConfiguration::config(); + + int i, limit; + int leadingdotdot = 0; + String newPath; + int pathend = _path.indexOf('?'); // Don't mess up query strings. + if (pathend < 0) + pathend = _path.length(); + + // + // get rid of "//" first, or "/foo//../" will become "/foo/" not "/" + // Some database lookups interpret empty paths (// != /), so give + // the use the option to turn this off. + // + if (!config->Boolean ("allow_double_slash")) + while ((i = _path.indexOf("//")) >= 0 && i < pathend) + { + newPath = _path.sub(0, i).get(); + newPath << _path.sub(i + 1).get(); + _path = newPath; + pathend = _path.indexOf('?'); + if (pathend < 0) + pathend = _path.length(); + } + + // + // Next get rid of redundant "/./". This could cause infinite + // loops. Moreover, "/foo/./../" should become "/", not "/foo/" + // + while ((i = _path.indexOf("/./")) >= 0 && i < pathend) + { + newPath = _path.sub(0, i).get(); + newPath << _path.sub(i + 2).get(); + _path = newPath; + pathend = _path.indexOf('?'); + if (pathend < 0) + pathend = _path.length(); + } + if ((i = _path.indexOf("/.")) >= 0 && i == pathend-2) + { + newPath = _path.sub(0, i+1).get(); // keep trailing slash + newPath << _path.sub(i + 2).get(); + _path = newPath; + pathend--; + } + + // + // Now that "empty" path components are gone, remove ("/../"). + // + while ((i = _path.indexOf("/../")) >= 0 && i < pathend) + { + if ((limit = _path.lastIndexOf('/', i - 1)) >= 0) + { + newPath = _path.sub(0, limit).get(); + newPath << _path.sub(i + 3).get(); + _path = newPath; + } + else + { + _path = _path.sub(i + 3).get(); + leadingdotdot++; + } + pathend = _path.indexOf('?'); + if (pathend < 0) + pathend = _path.length(); + } + if ((i = _path.indexOf("/..")) >= 0 && i == pathend-3) + { + if ((limit = _path.lastIndexOf('/', i - 1)) >= 0) + newPath = _path.sub(0, limit+1).get(); // keep trailing slash + else + { + newPath = '/'; + leadingdotdot++; + } + newPath << _path.sub(i + 3).get(); + _path = newPath; + pathend = _path.indexOf('?'); + if (pathend < 0) + pathend = _path.length(); + } + // The RFC gives us a choice of what to do when we have .. left and + // we're at the top level. By principle of least surprise, we'll just + // toss any "leftovers" Otherwise, we'd have a loop here to add them. + + // Finally change all "%7E" to "~" for sanity + while ((i = _path.indexOf("%7E")) >= 0 && i < pathend) + { + newPath = _path.sub(0, i).get(); + newPath << "~"; + newPath << _path.sub(i + 3).get(); + _path = newPath; + pathend = _path.indexOf('?'); + if (pathend < 0) + pathend = _path.length(); + } + + // If the server *isn't* case sensitive, we want to lowercase the path + if (!config->Boolean("case_sensitive", 1)) + _path.lowercase(); + + // And don't forget to remove index.html or similar file. +// if (strcmp((char*)_service, "file") != 0) (check is now internal) + removeIndex(_path, _service); +} + +//***************************************************************************** +// void URL::dump() +// +void URL::dump() +{ + cout << "service = " << _service.get() << endl; + cout << "user = " << _user.get() << endl; + cout << "host = " << _host.get() << endl; + cout << "port = " << _port << endl; + cout << "path = " << _path << endl; + cout << "url = " << _url << endl; +} + + +//***************************************************************************** +// void URL::path(const String &newpath) +// +void URL::path(const String &newpath) +{ + HtConfiguration* config= HtConfiguration::config(); + _path = newpath; + if (!config->Boolean("case_sensitive",1)) + _path.lowercase(); + constructURL(); +} + + +//***************************************************************************** +// void URL::removeIndex(String &path, String &service) +// Attempt to remove the remove_default_doc from the end of a URL path if +// the service allows that. (File, ftp don't. Do others?) +// This needs to be done to normalize the paths and make .../ the +// same as .../index.html +// Called from: URL::normalize() from URL::signature() [redundant?] +// URL::normalizePath() +// +void URL::removeIndex(String &path, String &service) +{ + HtConfiguration* config= HtConfiguration::config(); + static StringMatch *defaultdoc = 0; + + if (strcmp((char*)_service, "file") == 0 || + strcmp((char*)_service, "ftp") == 0) + return; + + if (path.length() == 0 || strchr((char*)path, '?')) + return; + + int filename = path.lastIndexOf('/') + 1; + if (filename == 0) + return; + + if (! defaultdoc) + { + StringList l(config->Find("remove_default_doc"), " \t"); + defaultdoc = new StringMatch(); + defaultdoc->IgnoreCase(); + defaultdoc->Pattern(l.Join('|')); + } + int which, length; + if (defaultdoc->hasPattern() && + defaultdoc->CompareWord((char*)path.sub(filename), which, length) && + filename+length == path.length()) + path.chop(path.length() - filename); +} + + +//***************************************************************************** +// void URL::normalize() +// Make sure that URLs are always in the same format. +// +void URL::normalize() +{ + HtConfiguration* config= HtConfiguration::config(); + static int hits = 0, misses = 0; + + if (_service.length() == 0 || _normal) + return; + + +// if (strcmp((char*)_service, "http") != 0) + // if service specifies "doesn't specify an IP host", don't normalize it + if (slashes (_service) != 2) + return; + +// if (strcmp ((char*)_service, "http") == 0) (check is now internal) + removeIndex(_path, _service); + + // + // Convert a hostname to an IP address + // + _host.lowercase(); + + if (!config->Boolean("allow_virtual_hosts", 1)) + { + static Dictionary hostbyname; + unsigned long addr; + struct hostent *hp; + + String *ip = (String *) hostbyname[_host]; + if (ip) + { + memcpy((char *) &addr, ip->get(), ip->length()); + hits++; + } + else + { + addr = inet_addr(_host.get()); + if (addr == 0xffffffff) + { + hp = gethostbyname(_host.get()); + if (hp == NULL) + { + return; + } + memcpy((char *)&addr, (char *)hp->h_addr, hp->h_length); + ip = new String((char *) &addr, hp->h_length); + hostbyname.Add(_host, ip); + misses++; + } + } + + static Dictionary machines; + String key; + key << int(addr); + String *realname = (String *) machines[key]; + if (realname) + _host = realname->get(); + else + machines.Add(key, new String(_host)); + } + ServerAlias(); + + // + // Reconstruct the url + // + constructURL(); + _normal = 1; + _signature = 0; +} + + +//***************************************************************************** +// const String &URL::signature() +// Return a string which uniquely identifies the server the current +// URL is refering to. +// This is the first portion of a url: service://user@host:port/ +// (in short this is the URL pointing to the root of this server) +// +const String &URL::signature() +{ + if (_signature.length()) + return _signature; + + if (!_normal) + normalize(); + _signature = _service; + _signature << "://"; + if (_user.length()) + _signature << _user << '@'; + _signature << _host; + _signature << ':' << _port << '/'; + return _signature; +} + +//***************************************************************************** +// void URL::ServerAlias() +// Takes care of the server aliases, which attempt to simplify virtual +// host problems +// +void URL::ServerAlias() +{ + HtConfiguration* config= HtConfiguration::config(); + static Dictionary *serveraliases= 0; + + if (! serveraliases) + { + String l= config->Find("server_aliases"); + String from, *to; + serveraliases = new Dictionary(); + char *p = strtok(l, " \t"); + char *salias= NULL; + while (p) + { + salias = strchr(p, '='); + if (! salias) + { + p = strtok(0, " \t"); + continue; + } + *salias++= '\0'; + from = p; + from.lowercase(); + if (from.indexOf(':') == -1) + from.append(":80"); + to= new String(salias); + to->lowercase(); + if (to->indexOf(':') == -1) + to->append(":80"); + serveraliases->Add(from.get(), to); + // fprintf (stderr, "Alias: %s->%s\n", from.get(), to->get()); + p = strtok(0, " \t"); + } + } + + String *al= 0; + int newport; + int delim; + String serversig = _host; + serversig << ':' << _port; + if ((al= (String *) serveraliases->Find(serversig))) + { + delim= al->indexOf(':'); + // fprintf(stderr, "\nOld URL: %s->%s\n", (char *) serversig, (char *) *al); + _host= al->sub(0,delim).get(); + sscanf((char*)al->sub(delim+1), "%d", &newport); + _port= newport; + // fprintf(stderr, "New URL: %s:%d\n", (char *) _host, _port); + } +} + +//***************************************************************************** +// int URL::slash(const String &protocol) +// Returns number of slashes folowing the service name for protocol +// +int +URL::slashes(const String &protocol) +{ + if (!slashCount) + { + HtConfiguration* config= HtConfiguration::config(); + slashCount = new Dictionary(); + + slashCount->Add (String("mailto"), new String("0")); + slashCount->Add (String("news"), new String("0")); + slashCount->Add (String("http"), new String("2")); + slashCount->Add (String("ftp"), new String("2")); + // file:/// has three, but the last counts as part of the path... + slashCount->Add (String("file"), new String("2")); + + QuotedStringList qsl(config->Find("external_protocols"), " \t"); + String from; + int i; + int sep,colon; + + for (i = 0; qsl[i]; i += 2) + { + from = qsl[i]; + sep = from.indexOf("->"); + if (sep != -1) + from = from.sub(0, sep).get(); // "get" aids portability... + + colon = from.indexOf(":"); + // if service specified as "help:/" or "man:", note trailing slashes + // Default is 2. + if (colon != -1) + { + int i; + char count [2]; + for (i = colon+1; from[i] == '/'; i++) + ; + count [0] = i - colon + '0' - 1; + count [1] = '\0'; + from = from.sub(0,colon).get(); + slashCount->Add (from, new String (count)); + } else + slashCount->Add (from, new String ("2")); + } + } + + // Default to two slashes for unknown protocols + String *count = (String *)slashCount->Find(protocol); + return count ? (count->get()[0] - '0') : 2; +} + +//***************************************************************************** +// void URL::constructURL() +// Constructs the _url member from everything else +// Also ensures the port number is correct for the service +// Called from URL::URL(const String &url, const URL &parent) +// URL::parse(const String &u) +// URL::path(const String &newpath) +// URL::normalize() +// +void URL::constructURL() +{ + if (strcmp((char*)_service, "file") != 0 && _host.length() == 0) { + _url = ""; + return; + } + + _url = _service; + _url << ":"; + + // Add correct number of slashes after service name + int i; + for (i = slashes (_service); i > 0; i--) + { + _url << "/"; + } + + if (slashes (_service) == 2) // services specifying a particular + { // IP host must begin "service://" + if (strcmp((char*)_service, "file") != 0) + { + if (_user.length()) + _url << _user << '@'; + _url << _host; + } + + if (_port != DefaultPort() && _port != 0) // Different than the default port + _url << ':' << _port; + } + + _url << _path; +} + + +/////// + // Get the default port for the recognised service +/////// + +int URL::DefaultPort() +{ + if (strcmp((char*)_service, "http") == 0) + return 80; + else if (strcmp((char*)_service, "https") == 0) + return 443; + else if (strcmp((char*)_service, "ftp") == 0) + return 21; + else if (strcmp((char*)_service, "gopher") == 0) + return 70; + else if (strcmp((char*)_service, "file") == 0) + return 0; + else if (strcmp((char*)_service, "news") == 0) + return NNTP_DEFAULT_PORT; + else return 80; +} |