1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
|
/***************************************************************************
* Copyright (C) 2004-2007 by Georgy Yunaev, gyunaev@ulduzsoft.com *
* Portions Copyright (C) 2003 Razvan Cojocaru <razvanco@gmx.net> *
* Please do not use email address above for bug reports; see *
* the README file *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
***************************************************************************/
#include "chm_lib.h"
#include "libchmfile.h"
#include "libchmtocimage.h"
#include <sys/types.h> /* for u_int{32,64}_t */
//! Keeps the intermediate search result.
//!
//! The default constructor zero-initializes both offsets, so an object that is
//! default-constructed (for example, as a container element) never carries
//! indeterminate values that could later be copied or read.
class LCHMSearchProgressResult
{
    public:
        //! Creates an empty result; both offsets are set to zero.
        inline LCHMSearchProgressResult() : titleoff(0), urloff(0) {}

        //! Creates a result with the title offset \a t and the URL offset \a u.
        inline LCHMSearchProgressResult( u_int32_t t, u_int32_t u ) : titleoff(t), urloff(u) {}

        //! Offsets collected for this result.
        //! NOTE(review): presumably the word offsets kept when phrase search is requested - confirm.
        QValueVector<u_int64_t> offsets;

        //! Title offset of the match (presumably resolved into a page title later - confirm).
        u_int32_t titleoff;

        //! URL offset of the match (presumably resolved into a page URL later - confirm).
        u_int32_t urloff;
};
//! An array that keeps the intermediate search results
typedef QT34VECTOR<LCHMSearchProgressResult> LCHMSearchProgressResults;
//! Implementation class behind the CHM file processor (LCHMFile).
class LCHMFileImpl
{
    public:
        LCHMFileImpl();
        ~LCHMFileImpl();

        // ---- Implementations for LCHMFile members ----

        bool loadFile( const QString& archiveName );
        void closeAll();

        //! Returns the CHM title passed through the currently selected text codec.
        QString title() const
        {
            return encodeWithCurrentCodec( m_title );
        }

        //! Returns the home URL passed through the currently selected text codec.
        QString homeUrl() const
        {
            return encodeWithCurrentCodec( m_home );
        }

        bool getFileContentAsString( QString * str, const QString& url, bool internal_encoding = false );
        bool getFileContentAsBinary( QByteArray * data, const QString& url ) const;
        bool getFileSize( unsigned int * size, const QString& url );
        bool enumerateFiles( QStringList * files );
        QString getTopicByUrl ( const QString& url ) const;
        const QPixmap * getBookIconPixmap( unsigned int imagenum );
        bool setCurrentEncoding( const LCHMTextEncoding * encoding );

        //! Parses the HHC or HHS file and fills \a data: the context array when \a asIndex
        //! is false, the index array when \a asIndex is true.
        bool parseFileAndFillArray (const QString& file, QT34VECTOR< LCHMParsedEntry > * data, bool asIndex );

        /*!
         * \brief Fast search using the $FIftiMain file in the .chm.
         * \param word The text we're looking for.
         * \param wholeWords Are we looking for whole words only?
         * \param titlesOnly Are we looking for titles only?
         * \param results Receives the intermediate matches; they are meant to be
         *        resolved afterwards by getSearchResults().
         * \param phrase_search Indicates that word offset information should be kept.
         * \return true if the search found something, false otherwise.
         */
        bool searchWord( const QString& word,
                         bool wholeWords,
                         bool titlesOnly,
                         LCHMSearchProgressResults& results,
                         bool phrase_search );

        /*!
         * \brief Finalizes the search: resolves the matches and generates the results list.
         * \param tempres Temporary search results produced by searchWord().
         * \param results List that receives the resolved results.
         *        NOTE(review): older documentation described a URL-to-title map here, but the
         *        parameter is a QStringList - confirm what exactly is stored.
         * \param limit_results Presumably the maximum number of results to generate
         *        (defaults to 500) - confirm against the implementation.
         */
        void getSearchResults( const LCHMSearchProgressResults& tempres,
                               QStringList * results,
                               unsigned int limit_results = 500 );

        //! Looks up fileName in the archive.
        bool ResolveObject( const QString& fileName, chmUnitInfo *ui ) const;

        //! Retrieves an uncompressed chunk of a file in the .chm.
        size_t RetrieveObject(const chmUnitInfo *ui, unsigned char *buffer, LONGUINT64 fileOffset, LONGINT64 bufferSize) const;

        //! Converts \a str with the currently selected text codec when one is set;
        //! otherwise the string is returned unchanged.
        inline QString encodeWithCurrentCodec (const QString& str) const
        {
            if ( !m_textCodec )
                return str;

            return m_textCodec->toUnicode (str);
        }

        //! Converts \a str with the currently selected text codec when one is set;
        //! otherwise the C string is simply wrapped into a QString.
        inline QString encodeWithCurrentCodec (const char * str) const
        {
            if ( !m_textCodec )
                return QString( str );

            return m_textCodec->toUnicode (str);
        }

        //! Converts a string coming from internal files with the codec selected for
        //! special files when one is set; otherwise the string is returned unchanged.
        inline QString encodeInternalWithCurrentCodec (const QString& str) const
        {
            if ( !m_textCodecForSpecialFiles )
                return str;

            return m_textCodecForSpecialFiles->toUnicode (str);
        }

        //! Converts a C string coming from internal files with the codec selected for
        //! special files when one is set; otherwise it is simply wrapped into a QString.
        inline QString encodeInternalWithCurrentCodec (const char * str) const
        {
            if ( !m_textCodecForSpecialFiles )
                return QString( str );

            return m_textCodecForSpecialFiles->toUnicode (str);
        }

        //! Helper. Translates a Win32 font charset into an encoding name.
        //! NOTE(review): the earlier comment spoke of "generic wxWidgets" encodings - confirm
        //! what the returned name actually denotes.
        const char * GetFontEncFromCharSet (const QString& font) const;

        //! Helper. Returns the $FIftiMain offset of the leaf node, or 0.
        u_int32_t GetLeafNodeOffset(const QString& text,
                                    u_int32_t initalOffset,
                                    u_int32_t buffSize,
                                    u_int16_t treeDepth );

        //! Helper. Processes the word location code (WLC) entries while searching.
        //! NOTE(review): ds/dr, cs/cr and ls/lr look like scale/root pairs of the index
        //! encoding - confirm against the implementation.
        bool ProcessWLC(u_int64_t wlc_count,
                        u_int64_t wlc_size,
                        u_int32_t wlc_offset,
                        unsigned char ds,
                        unsigned char dr,
                        unsigned char cs,
                        unsigned char cr,
                        unsigned char ls,
                        unsigned char lr,
                        LCHMSearchProgressResults& results,
                        bool phrase_search );

        //! Looks up as much information as possible from #WINDOWS/#STRINGS.
        bool getInfoFromWindows();

        //! Looks up as much information as possible from #SYSTEM.
        bool getInfoFromSystem();

        //! Fills the url-to-topic map.
        void fillTopicsUrlMap();

        //! Sets up the text codec by its name.
        void setupTextCodec (const char * name);

        //! Guesses the text encoding in use from m_detectedLCID and m_font, and sets up m_textCodec.
        bool guessTextEncoding ();

        //! Changes the current CHM encoding for internal files and texts.
        //! The encoding is either a plain Qt codepage name, or a pair such as CP1251/KOI8,
        //! which selects the encoding for text (first) and for internal files (second) separately.
        bool changeFileEncoding( const char *qtencoding );

        //! Converts the search word so that it has an appropriate encoding.
        QCString convertSearchWord ( const QString &src );

        /*!
         * Helper for TOC parsing: extracts the string between the quotes (first or last),
         * optionally decoding HTML entities such as &iacute;
         */
        int findStringInQuotes (const QString& tag, int offset, QString& value, bool firstquote, bool decodeentities );

        /*!
         * Decodes Unicode HTML entities according to the current encoding.
         */
        QString decodeEntity (const QString& entity );

        /*!
         * \brief Returns the list of all available text encodings.
         * \return A pointer to the first entry of the text encoding table. The table can be
         *         walked until an entry with language == 0, which marks the end of the table.
         *
         * \ingroup encoding
         */
        static const LCHMTextEncoding * getTextEncodingTable();

        /*!
         * \brief Looks up an encoding by LCID.
         * \param lcid LCID to look up
         * \return A pointer to the encoding structure.
         *
         * \ingroup encoding
         */
        static const LCHMTextEncoding * lookupByLCID( short lcid );

        /*!
         * \brief Gets the index of an encoding.
         * \param enc Encoding
         * \return An index into the encoding table; getTextEncodingTable() + i yields the encoding.
         *
         * \ingroup encoding
         */
        static int getEncodingIndex( const LCHMTextEncoding * enc);

        /*!
         * Normalizes a path so that it can be searched for in the internal arrays.
         */
        QString normalizeUrl (const QString& path ) const;

        // ---- Members ----

        //! Pointer to the chmlib structure
        chmFile * m_chmFile;

        //! Name of the opened file
        QString m_filename;

        //! Home url, taken from the CHM file
        QString m_home;

        //! Context tree filename, taken from the CHM file
        QString m_topicsFile;

        //! Index filename, taken from the CHM file
        QString m_indexFile;

        //! CHM title, taken from the CHM file
        QString m_title;

        // ---- Localization ----

        //! LCID from the CHM file, used in encoding detection
        short m_detectedLCID;

        //! Font charset from the CHM file, used in encoding detection
        QString m_font;

        //! Chosen text codec
        QTextCodec * m_textCodec;

        //! Text codec applied to strings from internal (special) files
        QTextCodec * m_textCodecForSpecialFiles;

        //! Current encoding
        const LCHMTextEncoding * m_currentEncoding;

        //! Map used to decode HTML entities like &acute; based on the current encoding
        QMap<QString, QString> m_entityDecodeMap;

        //! TRUE if /#TOPICS, /#STRINGS, /#URLTBL and /#URLSTR are resolved, and the members below are valid
        bool m_lookupTablesValid;

        //! pointer to /#TOPICS
        chmUnitInfo m_chmTOPICS;

        //! pointer to /#STRINGS
        chmUnitInfo m_chmSTRINGS;

        //! pointer to /#URLTBL
        chmUnitInfo m_chmURLTBL;

        //! pointer to /#URLSTR
        chmUnitInfo m_chmURLSTR;

        //! Indicates whether the built-in search is available. This is true only when
        //! m_lookupTablesValid is TRUE and m_chmFIftiMain is resolved.
        bool m_searchAvailable;

        //! pointer to /$FIftiMain
        chmUnitInfo m_chmFIftiMain;

        //! Storage for the book TOC icon images
        LCHMTocImageKeeper m_imagesKeeper;

        //! Map url->topic
        QMap< QString, QString > m_url2topics;
};
|