//
// WordList.h
//
// NAME
//
// manage and use an inverted index file.
//
// SYNOPSIS
//
// #include <mifluz.h>
//
// Configuration* config;
// WordReference wordRef;
// ...
// WordList* words = new WordList(*config);
//
// delete words;
//
// DESCRIPTION
//
// WordList is the mifluz equivalent of a database handler. Each
// WordList object is bound to an inverted index file and implements the
// operations to create it, fill it with word occurrences and search
// for an entry matching a given criterion.
//
// CONFIGURATION
//
// wordlist_extend {true|false} (default false)
//   If true maintain reference count of unique
//   words. The Noccurrence method gives access to this count.
//
// wordlist_verbose (default 0)
//   Set the verbosity level of the WordList class.
//
//   1 walk logic
//
//   2 walk logic details
//
// 3 walk logic lots of details // // wordlist_page_size (default 8192) // Berkeley DB page size (see Berkeley DB documentation) // // wordlist_cache_size (default 500K) // Berkeley DB cache size (see Berkeley DB documentation) // Cache makes a huge difference in performance. It must be at least 2% // of the expected total data size. Note that if compression is activated // the data size is eight times larger than the actual file size. In this // case the cache must be scaled to 2% of the data size, not 2% // of the file size. See Cache tuning in the mifluz guide for // more hints. // // wordlist_compress {true|false} (default false) // Activate compression of the index. The resulting index is eight times // smaller than the uncompressed index. // // // END // // Part of the ht://Dig package // Copyright (c) 1999-2004 The ht://Dig Group // For copyright details, see the file COPYING in your distribution // or the GNU Library General Public License (LGPL) version 2 or later // // // $Id: WordList.h,v 1.10 2004/05/28 13:15:28 lha Exp $ // #ifndef _WordList_h_ #define _WordList_h_ #include #include #ifndef SWIG #include "Dictionary.h" #include "List.h" #include "htString.h" #include "WordRecord.h" #include "WordReference.h" #include "WordType.h" #include "WordDB.h" #include "WordDBCompress.h" #include "Configuration.h" #include "WordCursor.h" #endif /* SWIG */ class List; class WordList; class WordDBCursor; // // Inverted index interface // class WordList { public: //- // Constructor. Build inverted index handling object using // run time configuration parameters listed in the CONFIGURATION // section. // WordList(const Configuration& config_arg); virtual ~WordList(); //- // Insert wordRef in index. It is an error to insert // the same wordRef twice. This requires a lookup in the index // prior to the insertion. // Returns OK on success, NOTOK on error. // int Insert(const WordReference& wordRef) { return Put(wordRef, DB_NOOVERWRITE); } //- // Insert wordRef in index. 
If the Key() part of // the wordRef exists in the index, override it. // Returns OK on success, NOTOK on error. // int Override(const WordReference& wordRef) { return Put(wordRef, 0); } #ifndef SWIG int Put(const WordReference& wordRef, int flags); #endif /* SWIG */ //- // Returns OK if wordRef exists in the index, NOTOK otherwise. // int Exists(const WordReference& wordRef) { return db.Exists(wordRef) == 0 ? OK : NOTOK; } #ifndef SWIG //- // Returns OK if word exists in the index, NOTOK otherwise. // int Exists(const String& word) { return Exists(WordReference(word)); } #endif /* SWIG */ // // Delete permanently // //- // Delete all entries in the index whose key matches the // Key() part of wordRef, using the Walk // method. // Returns the number of entries successfully deleted. // int WalkDelete(const WordReference& wordRef); //- // Delete the entry in the index that exactly matches the // Key() part of wordRef. // Returns OK if deletion is successfull, NOTOK otherwise. // int Delete(const WordReference& wordRef) { if(db.Del(wordRef) == 0) return Unref(wordRef); else return NOTOK; } #ifdef SWIG %name(DeleteCursor) #endif /* SWIG */ //- // Delete the inverted index entry currently pointed to by the // cursor. // Returns 0 on success, Berkeley DB error code on error. This // is mainly useful when implementing a callback function for // a WordCursor. // int Delete(WordDBCursor& cursor) { return cursor.Del(); } //- // Open inverted index filename. mode // may be O_RDONLY or O_RDWR. If mode is // O_RDWR it can be or'ed with O_TRUNC to reset // the content of an existing inverted index. // If word_only is true, entries will compare equal if the "word" part // of the key is equal, even if the numeric fields aren't. (What are the // numeric fields, anyway??) // Return OK on success, NOTOK otherwise. // int Open(const String& filename, int mode, int word_only=false); //- // Close inverted index. 
// int Close(); // // These returns a list of all the WordReference * matching // the constraint. //- // Returns the list of word occurrences exactly matching the // Key() part of wordRef. The List returned // contains pointers to WordReference objects. It is // the responsibility of the caller to free the list. See List.h // header for usage. // List *Find(const WordReference& wordRef) { return (*this)[wordRef]; } //- // Returns the list of word occurrences exactly matching the // word. The List returned // contains pointers to WordReference objects. It is // the responsibility of the caller to free the list. See List.h // header for usage. // List *FindWord(const String& word) { return (*this)[word]; } #ifndef SWIG //- // Alias to the Find method. // List *operator [] (const WordReference& wordRef); //- // Alias to the FindWord method. // List *operator [] (const String& word) { return (*this)[WordReference(word)]; } #endif /* SWIG */ //- // Returns the list of word occurrences matching the Key() // part of wordRef. In the Key(), the string // (accessed with GetWord()) matches any string that begins // with it. The List returned contains pointers to // WordReference objects. It is the responsibility of the // caller to free the list. // List *Prefix (const WordReference& prefix); #ifndef SWIG //- // Returns the list of word occurrences matching the // word. In the Key(), the string (accessed with // GetWord()) matches any string that begins with it. The // List returned contains pointers to WordReference // objects. It is the responsibility of the caller to free the // list. // List *Prefix (const String& prefix) { return this->Prefix(WordReference(prefix)); } #endif /* SWIG */ // // Iterate over the complete database. // #ifndef SWIG //- // Returns a list of all unique words contained in the inverted // index. The List returned contains pointers to // String objects. It is the responsibility of the caller // to free the list. See List.h header for usage. 
// List *Words(); #endif /* SWIG */ //- // Returns a list of all entries contained in the // inverted index. The List returned contains pointers to // WordReference objects. It is the responsibility of // the caller to free the list. See List.h header for usage. // List *WordRefs(); #ifndef SWIG //- // Create a cursor that searches all the occurrences in the // inverted index and call ncallback with // ncallback_data for every match. // WordCursor *Cursor(wordlist_walk_callback_t callback, Object *callback_data) { return new WordCursor(this, callback, callback_data); } #endif /* SWIG */ //- // Create a cursor that searches all the occurrences in the // inverted index and that match nsearchKey. If // naction is set to HTDIG_WORDLIST_WALKER calls // searchKey.callback with searchKey.callback_data // for every match. If naction is set to // HTDIG_WORDLIST_COLLECT push each match in searchKey.collectRes // data member as a WordReference object. It is the responsibility // of the caller to free the searchKey.collectRes list. // WordCursor *Cursor(const WordKey &searchKey, int action = HTDIG_WORDLIST_WALKER) { return new WordCursor(this, searchKey, action); } #ifndef SWIG //- // Create a cursor that searches all the occurrences in the // inverted index and that match nsearchKey and calls // ncallback with ncallback_data for every match. // WordCursor *Cursor(const WordKey &searchKey, wordlist_walk_callback_t callback, Object * callback_data) { return new WordCursor(this, searchKey, callback, callback_data); } #endif /* SWIG */ // // Update/get global word statistics statistics // //- // Add one to the reference count for the string contained // in the Key().GetWord() part of wordRef. // Returns OK on success, NOTOK otherwise. // int Ref(const WordReference& wordRef); //- // Substract one to the reference count for the string contained // in the Key().GetWord() part of wordRef. // Returns OK on success, NOTOK otherwise. 
// int Unref(const WordReference& wordRef); #ifndef SWIG //- // Return in noccurrence the number of occurrences of the // string contained in the GetWord() part of key. // Returns OK on success, NOTOK otherwise. // int Noccurrence(const WordKey& key, unsigned int& noccurrence) const; // // Accessors // // // Get the Berkeley DB object // const WordType& GetWordType() const { return wtype; } #endif /* SWIG */ //- // Return the Configuration object used to initialize // the WordList object. // const Configuration& GetConfiguration() const { return config; } #ifndef SWIG // // Input/Output // //- // Write on file descriptor f an ASCII description of the // index. Each line of the file contains a WordReference // ASCII description. // Returns 0 on success, not 0 otherwise. // int Write(FILE* f); // //- // Read WordReference ASCII descriptions from f, // returns the number of inserted WordReference or < 0 if an error // occurs. Invalid descriptions are ignored as well as empty // lines. // int Read(FILE* f); #endif /* SWIG */ // // Retrieve WordReferences from the database. // Backend of WordRefs, operator[], Prefix... // List *Collect(const WordReference& word); #ifndef SWIG // // Compressor object accessors // WordDBCompress *GetCompressor() { return compressor; } void SetCompressor(WordDBCompress* compressor_arg) { compressor = compressor_arg; } const WordType wtype; const Configuration& config; int isopen; int isread; // // If true enable extended functionalities of WordList such // as per-word statistics. Read from wordlist_extended configuration // parameter. // int extended; WordDB db; WordDBCompress *compressor; int verbose; #endif /* SWIG */ }; #endif /* _WordList_h_ */