diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.cc')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.cc | 316 |
1 files changed, 316 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.cc new file mode 100644 index 00000000..d6862550 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/BasicDocument.cc @@ -0,0 +1,316 @@ +// +// BasicDocument.cc +// +// 2/6/2002 created for libhtdig to simplify & mimic Document.cc +// +// Neal Richter nealr@rightnow.com +// +// +// BasicDocument: This class holds everything there is to know about a document. +// The actual contents of the document may or may not be present at +// all times for memory conservation reasons. +// +// This is a basic extensable container for plain text holding documents. +// +// Uses any Parser with parse method handling this class. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: BasicDocument.cc,v 1.3 2004/05/28 13:15:28 lha Exp $ +// +//-------------------------------------------------------------------- + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <signal.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <ctype.h> + +#include "BasicDocument.h" +#include "TextCollector.h" +#include "StringList.h" +#include "htdig.h" +#include "Plaintext.h" +#include "HTML.h" +#include "ExternalParser.h" +#include "lib.h" + +#include "defaults.h" + +#if 1 +typedef void (*SIGNAL_HANDLER) (...); +#else +typedef SIG_PF SIGNAL_HANDLER; +#endif + +//***************************************************************************** +// BasicDocument::BasicDocument(char *loc) +// Initialize with the given loc-parameter as the location for this document. +// If the max_size is given, use that for size, otherwise use the +// config value. +// +BasicDocument::BasicDocument(char *loc, int suggested_size) +{ + int temp_size = 0; + + id = 0; + location = 0; + title = 0; + metacontent = 0; + contents = 0; + document_length = 0; + + + HtConfiguration *config = HtConfiguration::config(); + + //We probably need to move assignment of max_doc_size, according + //to a configuration value. + + if (suggested_size > 0) + temp_size = suggested_size; + else + temp_size = config->Value("max_doc_size"); + + contents.allocate(temp_size + 100); + + contentType = ""; + + if (loc) + { + Location(loc); + } +} + + +//***************************************************************************** +// BasicDocument::~BasicDocument() +// +BasicDocument::~BasicDocument() +{ + // We delete only the derived class objects + +#if MEM_DEBUG + char *p = new char; + cout << "==== BasicDocument deleted: " << this << " new at " << ((void *) p) << endl; + delete p; +#endif +} + + +//***************************************************************************** +// void BasicDocument::Reset() +// Restore the BasicDocument object to an initial state. +// +void +BasicDocument::Reset() +{ + + id = 0; + location = 0; + title = 0; + metacontent = 0; + contents = 0; + + contentType = 0; + document_length = 0; + +} + +//***************************************************************************** +// void BasicDocument::Length() +// Return/Calc length of BasicDocument... icummulative size of the Strings +// +int +BasicDocument::Length() +{ + if (document_length < 0) + { + document_length = 0; + document_length += location.length(); + document_length += title.length(); + document_length += metacontent.length(); + document_length += contents.length(); + document_length += id.length(); + } + + return (document_length); +} + + +//***************************************************************************** +// Parsable *BasicDocument::getParsable() +// Given the content-type of a document, returns a document parser. +// This will first look through the list of user supplied parsers and +// then at our (limited) builtin list of parsers. The user supplied +// parsers are external programs that will be used. + +Parsable * +BasicDocument::getParsable() +{ + static HTML *html = 0; + static Plaintext *plaintext = 0; + static ExternalParser *externalParser = 0; + + Parsable *parsable = 0; + + if (ExternalParser::canParse(contentType)) + { + if (externalParser) + { + delete externalParser; + } + externalParser = new ExternalParser(contentType); + parsable = externalParser; + } + else if (mystrncasecmp((char *) contentType, "text/html", 9) == 0) + { + if (!html) + html = new HTML(); + parsable = html; + } + else if (mystrncasecmp((char *) contentType, "text/plain", 10) == 0) + { + if (!plaintext) + plaintext = new Plaintext(); + parsable = plaintext; + } + else if (mystrncasecmp((char *) contentType, "text/css", 8) == 0) + { + return NULL; + } + else if (mystrncasecmp((char *) contentType, "text/", 5) == 0) + { + if (!plaintext) + plaintext = new Plaintext(); + parsable = plaintext; + if (debug > 1) + { + cout << '"' << contentType << "\" not a recognized type. Assuming text/plain\n"; + } + } + else + { + if (debug > 1) + { + cout << '"' << contentType << "\" not a recognized type. Ignoring\n"; + } + return NULL; + } + + parsable->setContents(contents.get(), contents.length()); + return parsable; +} + +//***************************************************************************** +// +// Test for self parseaable +// +int +BasicDocument::SelfParseable() +{ + + if (mystrncasecmp((char *) contentType, "text/vnd.customdocument", 10) == 0) + { + return (TRUE); + } + else + return (FALSE); + +} + + +//***************************************************************************** +// Parsable *BasicDocument::internalParser() +int +BasicDocument::internalParser(TextCollector & textcollector) +{ + HtConfiguration* config= HtConfiguration::config(); + char *position = NULL; + static int minimumWordLength = config->Value("minimum_word_length", 3); + int wordIndex = 1; + String word; + int letter_count = 0; + + //First Process Title + textcollector.got_title((char *) title); + + //Next Process Contents + position = contents; + + while (*position) + { + word = 0; + + if (HtIsStrictWordChar(*position)) + { + // + // Start of a word. Try to find the whole thing + // + //TODO NEAL RICHTER Imposed a 50-letter word length limit here + // + while (*position && HtIsWordChar(*position) && (letter_count < 50)) + { + word << *position; + position++; + letter_count++; + } + + letter_count = 0; + if (word.length() >= minimumWordLength) + { + textcollector.got_word((char *) word, wordIndex++, 0); + } + } + + if (*position) + position++; + + }//end while + + textcollector.got_head((char*) contents); + + //Third, Process MetaContent + position = metacontent; + textcollector.got_meta_dsc(metacontent); + + + //max_meta_description_length??? + + while (*position) + { + word = 0; + + if (HtIsStrictWordChar(*position)) + { + // + // Start of a word. Try to find the whole thing + // + while (*position && HtIsWordChar(*position) && (letter_count < 50)) + { + word << *position; + position++; + letter_count++; + } + + letter_count = 0; + + if (word.length() >= minimumWordLength) + { + textcollector.got_word((char *) word, wordIndex++, 9); + } + } + + if (*position) + position++; + + }//end while + + return(1); +} |