//
// WordList.cc
//
// WordList: Interface to the word database. Previously, this wrote to
// a temporary text file. Now it writes directly to the
// word database.
// NOTE: Some code previously attempted to directly read from
// the word db. This will no longer work, so it's preferred to
// use the access methods here.
// Configuration parameter used:
// wordlist_extend
// wordlist_verbose 1 walk logic
// wordlist_verbose 2 walk logic details
// wordlist_verbose 3 walk logic lots of details
//
// Part of the ht://Dig package
// Copyright (c) 1999-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
//
//
// $Id: WordList.cc,v 1.13 2004/05/28 13:15:27 lha Exp $
//
#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */
#include "WordList.h"
#include "WordReference.h"
#include "WordRecord.h"
#include "WordType.h"
#include "WordStat.h"
#include "Configuration.h"
#include "htString.h"
#include "HtPack.h"
#include "HtTime.h"
#include "WordDBCompress.h"
#include
#include
#include
#include
// *****************************************************************************
//
WordList::WordList(const Configuration& config_arg) :
wtype(config_arg),
config(config_arg)
{
// The database itself hasn't been opened yet
isopen = 0;
isread = 0;
extended = config.Boolean("wordlist_extend");
verbose = config.Value("wordlist_verbose");
compressor = 0;
}
// *****************************************************************************
//
WordList::~WordList()
{
Close();
}
// *****************************************************************************
//
int WordList::Open(const String& filename, int mode, int word_only)
{
int usecompress=0;
// If word_only, entries compare equal if the "word" part matches.
// This should only be used for querying the database, not writing it.
// It is needed by speling to test for the existence of words.
db.set_bt_compare(word_only ? word_only_db_cmp : word_db_cmp);
if(config.Value("wordlist_page_size", 0))
db.set_pagesize(config.Value("wordlist_page_size"));
if(config.Boolean("wordlist_compress") == 1) {
usecompress = DB_COMPRESS;
WordDBCompress* compressor = new WordDBCompress(
config.Boolean("wordlist_compress_zlib",0), config.Value("compression_level",0));
// compressor->debug = config.Value("wordlist_compress_debug");
SetCompressor(compressor);
db.CmprInfo(compressor->CmprInfo());
}
int flags = (mode & O_RDWR) ? DB_CREATE : DB_RDONLY;
if(mode & O_TRUNC) {
if(flags == DB_CREATE)
flags |= DB_TRUNCATE;
else
fprintf(stderr, "WordList::Open: O_TRUNC | O_RDONLY is meaningless\n");
}
flags |= usecompress;
int ret = db.Open(filename, DB_BTREE, flags, 0666) == 0 ? OK : NOTOK;
isread = mode & O_RDONLY;
isopen = 1;
return ret;
}
// *****************************************************************************
//
int WordList::Close()
{
if(isopen) {
if(db.Close() != 0) return NOTOK;
isopen = 0;
isread = 0;
}
{
WordDBCompress* compressor = GetCompressor();
if(compressor) {
delete compressor;
SetCompressor(0);
}
}
return OK;
}
// ****************************************************************************
//
int WordList::Put(const WordReference& arg, int flags)
{
if (arg.Key().GetWord().length() == 0) {
fprintf(stderr, "WordList::Put(%s) word is zero length\n", (char*)arg.Get());
return NOTOK;
}
if (!arg.Key().Filled()) {
fprintf(stderr, "WordList::Put(%s) key is not fully defined\n", (char*)arg.Get());
return NOTOK;
}
WordReference wordRef(arg);
String word = wordRef.Key().GetWord();
if(wtype.Normalize(word) & WORD_NORMALIZE_NOTOK)
return NOTOK;
wordRef.Key().SetWord(word);
//
// The two case could be grouped in a more compact way.
// However, the resources consumption difference between
// a Put(DB_NOOVERWRITE) and Put(0) is huge (the first is 75%
// slower than the second). Check the db_put sources for the
// explanation.
//
int ret = NOTOK;
if(flags) {
//
// First attempt tells us if the key exists. If it
// does not we just increment the reference count.
// Otherwise, and only if flags does not contain DB_NOOVERWRITE,
// we override the key/record pair.
//
int error;
if((error = db.Put(wordRef, DB_NOOVERWRITE)) != 0) {
if(error == DB_KEYEXIST && flags == 0)
ret = db.Put(wordRef, 0) == 0 ? OK : NOTOK;
} else {
ret = Ref(wordRef);
}
} else {
if((ret = db.Put(wordRef, 0)) == 0)
ret = Ref(wordRef);
}
return ret;
}
// *****************************************************************************
//
List *WordList::operator [] (const WordReference& wordRef)
{
return Collect(wordRef);
}
// *****************************************************************************
//
List *WordList::Prefix (const WordReference& prefix)
{
WordReference prefix2(prefix);
prefix2.Key().UndefinedWordSuffix();
return Collect(prefix2);
}
// *****************************************************************************
//
List *WordList::WordRefs()
{
return Collect(WordReference());
}
// *****************************************************************************
//
List *WordList::Collect(const WordReference& wordRef)
{
WordCursor *search = Cursor(wordRef.Key(), HTDIG_WORDLIST_COLLECTOR);
if(search->Walk() != OK) return 0;
List* result = search->GetResults();
delete search;
return result;
}
// *****************************************************************************
//
// Callback data dedicated to Dump and dump_word communication
//
class DeleteWordData : public Object
{
public:
DeleteWordData() { count = 0; }
int count;
};
// *****************************************************************************
//
//
static int delete_word(WordList *words, WordDBCursor &cursor, const WordReference *word, Object &data)
{
if(words->Delete(cursor) == 0) {
words->Unref(*word);
((DeleteWordData&)data).count++;
return OK;
} else {
fprintf(stderr, "WordList delete_word: deleting %s failed\n", (char*)word->Get());
return NOTOK;
}
}
// *****************************************************************************
//
// Delete all records matching wordRef, return the number of
// deleted records.
//
int WordList::WalkDelete(const WordReference& wordRef)
{
DeleteWordData data;
WordCursor *description = Cursor(wordRef.Key(), delete_word, &data);
description->Walk();
delete description;
return data.count;
}
// *****************************************************************************
//
//
List *WordList::Words()
{
List *list = 0;
String key;
String record;
WordReference lastWord;
WordDBCursor cursor;
if(cursor.Open(db.db) != 0) return 0;
//
// Move past the first word count record
//
const WordReference& last = WordStat::Last();
last.Pack(key, record);
if(cursor.Get(key, record, DB_SET_RANGE) != 0)
return 0;
list = new List;
do {
WordReference wordRef(key, record);
if(lastWord.Key().GetWord().empty() ||
wordRef.Key().GetWord() != lastWord.Key().GetWord())
{
list->Add(new String(wordRef.Key().GetWord()));
lastWord = wordRef;
}
} while (cursor.Get(key, record, DB_NEXT) == 0);
return list;
}
// *****************************************************************************
//
// Returns the reference count for word in arg
//
int WordList::Noccurrence(const WordKey& key, unsigned int& noccurrence) const
{
noccurrence = 0;
WordStat stat(key.GetWord());
int ret;
if((ret = db.Get(stat)) != 0) {
if(ret != DB_NOTFOUND)
return NOTOK;
} else {
noccurrence = stat.Noccurrence();
}
return OK;
}
// *****************************************************************************
//
// Increment reference count for wordRef
//
int WordList::Ref(const WordReference& wordRef)
{
if(!extended) return OK;
WordStat stat(wordRef.Key().GetWord());
int ret;
if((ret = db.Get(stat)) != 0 && ret != DB_NOTFOUND)
return NOTOK;
stat.Noccurrence()++;
return db.Put(stat, 0) == 0 ? OK : NOTOK;
}
// *****************************************************************************
//
// Decrement reference count for wordRef
//
int WordList::Unref(const WordReference& wordRef)
{
if(!extended) return OK;
WordStat stat(wordRef.Key().GetWord());
int ret;
if((ret = db.Get(stat)) != 0) {
if(ret == DB_NOTFOUND)
fprintf(stderr, "WordList::Unref(%s) Unref on non existing word occurrence\n", (char*)wordRef.Get());
return NOTOK;
}
if(stat.Noccurrence() == 0) {
fprintf(stderr, "WordList::Unref(%s) Unref on 0 occurrences word\n", (char*)wordRef.Get());
return NOTOK;
}
stat.Noccurrence()--;
if(stat.Noccurrence() > 0) {
ret = db.Put(stat, 0) == 0 ? OK : NOTOK;
} else
ret = db.Del(stat) == 0 ? OK : NOTOK;
return ret;
}
// *****************************************************************************
//
// streaming operators for ascii dumping and reading a list
class FileOutData : public Object
{
public:
FILE* f;
FileOutData(FILE* f_arg) : f(f_arg) { }
};
// *****************************************************************************
//
static int
wordlist_walk_callback_file_out(WordList *, WordDBCursor& , const WordReference *word, Object &data)
{
fprintf(((FileOutData&)data).f, "%s\n", (char*)word->Get());
return OK;
}
// *****************************************************************************
//
int
WordList::Write(FILE* f)
{
WordKey empty;
FileOutData data(f);
WordCursor *description = Cursor(empty, wordlist_walk_callback_file_out, (Object *)&data);
description->Walk();
delete description;
return 0;
}
// *****************************************************************************
//
int
WordList::Read(FILE* f)
{
WordReference word;
#define WORD_BUFFER_SIZE 1024
char buffer[WORD_BUFFER_SIZE + 1];
String line;
int line_number = 0;
int inserted = 0;
while(fgets(buffer, WORD_BUFFER_SIZE, f)) {
line_number++;
int buffer_length = strlen(buffer);
int eol = buffer[buffer_length - 1] == '\n';
if(eol) buffer[--buffer_length] = '\0';
line.append(buffer, buffer_length);
//
// Join big lines
//
if(!eol) continue;
//
// If line ends with a \ continue
//
if(line.last() == '\\') {
line.chop(1);
continue;
}
if(!line.empty()) {
if(word.Set(line) != OK) {
fprintf(stderr, "WordList::Read: line %d : %s\n", line_number, (char*)line);
fprintf(stderr, " cannot build WordReference (ignored)\n");
} else {
if(Insert(word) != OK) {
fprintf(stderr, "WordList::Read: line %d : %s\n", line_number, (char*)line);
fprintf(stderr, " insert failed (ignored)\n");
} else {
inserted++;
}
if(verbose) fprintf(stderr, "WordList::Read: inserting %s\n", (char*)word.Get());
}
line.trunc();
}
}
return inserted;
}