// path: root/debian/htdig/htdig-3.2.0b6/htword/WordDict.h
// blob: 86b457173b36639ccdeccd7b7406ae2d50b99a0e
//
// WordDict.h
//
// NAME
// 
// manage and use an inverted index dictionary.
//
// SYNOPSIS
// 
// #include <mifluz.h>
// 
// WordList* words = ...;
// WordDict* dict = words->Dict();
// 
// DESCRIPTION
// 
// WordDict maps strings to unique identifiers and frequency in the 
// inverted index. Whenever a new word is found, the WordDict class 
// can be asked to assign it a serial number. When doing so, an entry
// is created in the dictionary with a frequency of zero. The application
// may then increment or decrement the frequency to reflect the inverted
// index content.
//
// The serial numbers range from 1 to 2^32 - 1 inclusive (assuming a 32-bit
// unsigned int); 0 is reserved as WORD_DICT_SERIAL_INVALID.
//
// A WordDict object is automatically created by the WordList object and
// should not be created directly by the application.
//
// END
//
// Part of the ht://Dig package   <http://www.htdig.org/>
// Copyright (c) 1999-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: WordDict.h,v 1.4 2004/05/28 13:15:26 lha Exp $
//

#ifndef _WordDict_h_
#define _WordDict_h_

#include <stdio.h>

#ifndef SWIG
#include "htString.h"
#include "WordDB.h"

class WordList;
class WordDictCursor;

// Serial number value meaning "no serial": a freshly constructed
// WordDictRecord carries this id, and WordDict::SerialExists() is documented
// to report it for words that are absent from the dictionary.
#define WORD_DICT_SERIAL_INVALID	0

class WordDictRecord {
 public:
  inline WordDictRecord() { count = 0; id = WORD_DICT_SERIAL_INVALID; }

  inline int Unpack(const String& coded) {
    int offset = 0;
    coded.ber_shift(offset, count);
    coded.ber_shift(offset, id);
    return OK;
  }

  inline int Pack(String& coded) const {
    int offset = 0;
    coded.ber_push(offset, count);
    coded.ber_push(offset, id);
    return OK;
  }

  inline int Get(WordDB* db, const String& word) {
    String tmp_word = word;
    String coded(BER_MAX_BYTES * 2);
    int ret;
    if((ret = db->Get(0, tmp_word, coded, 0)) != 0) return ret;

    Unpack(coded);

    return ret;
  }
  
  inline int Put(WordDB* db, const String& word) {
    String coded(BER_MAX_BYTES * 2);
    Pack(coded);
    return db->Put(0, word, coded, 0);
  }

  inline int Del(WordDB* db, const String& word) {
    return db->Del(0, word);
  }

  inline unsigned int Count() { return count; }
  inline unsigned int Id() { return id; }

  unsigned int count;
  unsigned int id;
};
#endif /* SWIG */

class WordDict 
{
 public:
#ifndef SWIG
  //-
  // Private constructor. 
  //
  WordDict() { words = 0; db = 0; }
  ~WordDict();

  //-
  // Bind the object a WordList inverted index. Return OK on success,
  // NOTOK otherwise.
  //
  int Initialize(WordList* words);

  //-
  // Open the underlying Berkeley DB sub-database. The enclosing 
  // file is given by the <i>words</i> data member. Return OK on success,
  // NOTOK otherwise.
  //
  int Open();
  //-
  // Destroy the underlying Berkeley DB sub-database. Return OK on success,
  // NOTOK otherwise.
  //
  int Remove();
  //-
  // Close the underlying Berkeley DB sub-database. Return OK on success,
  // NOTOK otherwise.
  //
  int Close();
    
  //-
  // If the <b>word</b> argument exists in the dictionnary, return its
  // serial number in the <b>serial</b> argument. If it does not already
  // exists, assign it a serial number, create an entry with a frequency
  // of zero and return the new serial in the <b>serial</b> argument.
  // Return OK on success, NOTOK otherwise.
  //
  int Serial(const String& word, unsigned int& serial);
  //-
  // If the <b>word</b> argument exists in the dictionnary, return its
  // serial number in the <b>serial</b> argument. If it does not exists
  // set the <b>serial</b> argument to WORD_DICT_SERIAL_INVALID.
  // Return OK on success, NOTOK otherwise.
  //
  int SerialExists(const String& word, unsigned int& serial);
  //-
  // Short hand for Serial() followed by Ref().
  // Return OK on success, NOTOK otherwise.
  //
  int SerialRef(const String& word, unsigned int& serial);
  //-
  // Return the frequency of the <b>word</b> argument
  // in the <b>noccurrence</b> argument. 
  // Return OK on success, NOTOK otherwise.
  //
  int Noccurrence(const String& word, unsigned int& noccurrence) const;
#endif /* SWIG */

  //-
  // Short hand for words->GetContext()->GetType()->Normalize(word).
  // Return OK on success, NOTOK otherwise.
  // 
  int Normalize(String& word) const;

  //-
  // Short hand for Incr(word, 1)
  //
  int Ref(const String& word) { return Incr(word, 1); }
  //-
  // Add <b>incr</b> to the frequency of the <b>word</b>. 
  // Return OK on success, NOTOK otherwise.
  //
  int Incr(const String& word, unsigned int incr);
  //-
  // Short hand for Decr(word, 1)
  //
  int Unref(const String& word) { return Decr(word, 1); }
  //-
  // Subtract <b>decr</b> to the frequency of the <b>word</b>. If
  // the frequency becomes lower or equal to zero, remove the entry
  // from the dictionnary and lose the association between the word and its
  // serial number.
  // Return OK on success, NOTOK otherwise.
  //
  int Decr(const String& word, unsigned int decr);
  //-
  // Set the frequency of <b>word</b> with the value of the <b>noccurrence</b>
  // argument.
  //
  int Put(const String& word, unsigned int noccurrence);

  //-
  // Return true if <b>word</b> exists in the dictionnary, false otherwise.
  //
  int Exists(const String& word) const;

#ifndef SWIG
  //-
  // Return a pointer to the associated WordList object.
  //
  List* Words() const;

  //-
  // Return a cursor to sequentially walk the dictionnary using the 
  // <b>Next</b> method. 
  //
  WordDictCursor* Cursor() const;
  //-
  // Return the next entry in the dictionnary. The <b>cursor</b> argument
  // must have been created using the <i>Cursor</i> method. The word is
  // returned in the <b>word</b> argument and the record is returned in
  // the <b>record</b> argument. 
  // On success the function returns 0, at the end of the dictionnary it
  // returns DB_NOTFOUND. The <b>cursor</b> argument is deallocated when
  // the function hits the end of the dictionnary or an error occurs.
  // 
  int Next(WordDictCursor* cursor, String& word, WordDictRecord& record);

  //-
  // Return a cursor to sequentially walk the entries of the dictionnary
  // that start with the <b>prefix</b> argument, using the 
  // <b>NextPrefix</b> method. 
  //
  WordDictCursor* CursorPrefix(const String& prefix) const;
  //-
  // Return the next prefix from the dictionnary. The <b>cursor</b> argument
  // must have been created using the <i>CursorPrefix</i> method. The word is
  // returned in the <b>word</b> argument and the record is returned in
  // the <b>record</b> argument. The <b>word</b> is guaranteed to start with
  // the prefix specified to the <b>CursorPrefix</b> method.
  // On success the function returns 0, at the end of the dictionnary it
  // returns DB_NOTFOUND. The <b>cursor</b> argument is deallocated when
  // the function hits the end of the dictionnary or an error occurs.
  // 
  int NextPrefix(WordDictCursor* cursor, String& word, WordDictRecord& record);

  //-
  // Dump the complete dictionary in the file descriptor <b>f.</b> The
  // format of the dictionary is <i>word serial frequency</i>, one by
  // line. 
  //
  int Write(FILE* f);

 private:
  WordList*			words;
  WordDB*	            	db;
#endif /* SWIG */
};
#endif /* _WordDict_h_ */