lib/libchmfile/libchmfileimpl.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286

/***************************************************************************
 *   Copyright (C) 2004-2007 by Georgy Yunaev, gyunaev@ulduzsoft.com       *
 *   Portions Copyright (C) 2003  Razvan Cojocaru <razvanco@gmx.net>       *  
 *   Please do not use email address above for bug reports; see            *
 *   the README file                                                       *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.             *
 ***************************************************************************/

#include "chm_lib.h"

#include "libchmfile.h"
#include "libchmtocimage.h"

#include <sys/types.h> /* for u_int{32,64}_t */

//! Keeps the intermediate search result
class LCHMSearchProgressResult
{
	public:
		//! Creates an empty result. Both offsets are zero-initialized, so a
		//! default-constructed object (e.g. an element created when a vector
		//! is resized) never carries indeterminate values.
		inline LCHMSearchProgressResult() : titleoff(0), urloff(0) {}

		//! Creates a result with \a t stored as titleoff and \a u stored as urloff.
		inline LCHMSearchProgressResult( u_int32_t t, u_int32_t u ) : titleoff(t),urloff(u) {}
		
		//! 64-bit offsets attached to this result.
		//! NOTE(review): presumably the word offsets kept for phrase search - confirm.
		QValueVector<u_int64_t>		offsets;

		//! Title offset of the matched entry.
		//! NOTE(review): which table this offset refers to is not visible here - confirm.
		u_int32_t					titleoff;

		//! URL offset of the matched entry.
		//! NOTE(review): which table this offset refers to is not visible here - confirm.
		u_int32_t					urloff;
};

//! An array that keeps the intermediate search results
typedef QT34VECTOR<LCHMSearchProgressResult>	LCHMSearchProgressResults;


//! CHM files processor; the implementation
class LCHMFileImpl
{
	public:
		LCHMFileImpl();
		~LCHMFileImpl();

		// ------------------------------------------------------------------
		// Implementations for LCHMFile members
		// ------------------------------------------------------------------
		bool 		loadFile( const QString& archiveName );
		void		closeAll();

		//! Returns m_title passed through encodeWithCurrentCodec().
		QString 	title() const
		{
			return encodeWithCurrentCodec( m_title );
		}

		//! Returns m_home passed through encodeWithCurrentCodec().
		QString 	homeUrl() const
		{
			return encodeWithCurrentCodec( m_home );
		}

		bool 		getFileContentAsString( QString * str, const QString& url, bool internal_encoding = false );
		bool 		getFileContentAsBinary( QByteArray * data, const QString& url ) const;
		bool 		getFileSize( unsigned int * size, const QString& url );

		bool		enumerateFiles( QStringList * files );
		QString		getTopicByUrl ( const QString& url )  const;

		const QPixmap * getBookIconPixmap( unsigned int imagenum );

		bool		setCurrentEncoding( const LCHMTextEncoding * encoding );

		/*!
		 * \brief Parses the HHC or HHS file and fills the \a data array.
		 * \param file The file to parse.
		 * \param data The array to fill.
		 * \param asIndex false fills the context array, true fills the index array.
		 */
		bool  		parseFileAndFillArray (const QString& file, QT34VECTOR< LCHMParsedEntry > * data, bool asIndex );

		/*!
		 * \brief Fast search using the $FIftiMain file in the .chm.
		 * \param word The word we're looking for.
		 * \param wholeWords Are we looking for whole words only?
		 * \param titlesOnly Are we looking for titles only?
		 * \param results The array of intermediate search results; it is meant to be
		 *                handed to getSearchResults() afterwards.
		 * \param phrase_search Indicates that word offset information should be kept.
		 * \return true if the search found something, false otherwise.
		 */
		bool searchWord( const QString& word,
						 bool wholeWords,
						 bool titlesOnly,
						 LCHMSearchProgressResults& results,
						 bool phrase_search );

		/*!
		 * \brief Finalizes the search: resolves the matches and generates the results list.
		 * \param tempres Temporary search results from searchWord().
		 * \param results The list that receives the final results.
		 *                NOTE(review): an older description called this a URL->title
		 *                hashmap, but the type is a QStringList - confirm what each
		 *                entry holds.
		 * \param limit_results Defaults to 500; presumably the maximum number of
		 *                results produced - confirm against the implementation.
		 */
		void getSearchResults( const LCHMSearchProgressResults& tempres,
							   QStringList * results,
							   unsigned int limit_results = 500 );

		//! Looks up fileName in the archive.
		bool ResolveObject( const QString& fileName, chmUnitInfo *ui ) const;

		//! Retrieves an uncompressed chunk of a file in the .chm.
		size_t RetrieveObject( const chmUnitInfo *ui,
							   unsigned char *buffer,
							   LONGUINT64 fileOffset,
							   LONGINT64 bufferSize ) const;

		//! Runs \a str through m_textCodec when a codec is set; otherwise the
		//! string is returned as-is.
		inline QString encodeWithCurrentCodec (const QString& str) const
		{
			if ( !m_textCodec )
				return str;

			return m_textCodec->toUnicode (str);
		}

		//! Overload for C strings: runs \a str through m_textCodec when a codec
		//! is set; otherwise the string is converted to QString as-is.
		inline QString encodeWithCurrentCodec (const char * str) const
		{
			if ( !m_textCodec )
				return QString( str );

			return m_textCodec->toUnicode (str);
		}

		//! Same as encodeWithCurrentCodec(), but uses the codec selected for the
		//! internal (special) files, m_textCodecForSpecialFiles. Returns the
		//! string as-is when that codec is not set.
		inline QString encodeInternalWithCurrentCodec (const QString& str) const
		{
			if ( !m_textCodecForSpecialFiles )
				return str;

			return m_textCodecForSpecialFiles->toUnicode (str);
		}

		//! Overload for C strings: uses m_textCodecForSpecialFiles when it is set;
		//! otherwise the string is converted to QString as-is.
		inline QString encodeInternalWithCurrentCodec (const char * str) const
		{
			if ( !m_textCodecForSpecialFiles )
				return QString( str );

			return m_textCodecForSpecialFiles->toUnicode (str);
		}

		//! Helper. Translates from a Win32 font charset to an encoding name.
		//! NOTE(review): the previous comment spoke of "generic wxWidgets"
		//! encodings - confirm the format of the returned name.
		const char * GetFontEncFromCharSet (const QString& font) const;

		//! Helper. Returns the $FIftiMain offset of leaf node or 0.
		u_int32_t GetLeafNodeOffset( const QString& text,
									 u_int32_t initalOffset,
									 u_int32_t buffSize,
									 u_int16_t treeDepth );

		//! Helper. Processes the word location code entries while searching.
		bool ProcessWLC( u_int64_t wlc_count,
						 u_int64_t wlc_size,
						 u_int32_t wlc_offset,
						 unsigned char ds,
						 unsigned char dr,
						 unsigned char cs,
						 unsigned char cr,
						 unsigned char ls,
						 unsigned char lr,
						 LCHMSearchProgressResults& results,
						 bool phrase_search );

		//! Looks up as much information as possible from #WINDOWS/#STRINGS.
		bool getInfoFromWindows();

		//! Looks up as much information as possible from #SYSTEM.
		bool getInfoFromSystem();

		//! Fills the topic-url map.
		void	fillTopicsUrlMap();

		//! Sets up the text codec.
		void setupTextCodec (const char * name);

		//! Guesses the used text encoding from m_detectedLCID and m_font, and sets up m_textCodec.
		bool guessTextEncoding ();

		//! Changes the current CHM encoding for internal files and texts.
		//! The encoding is either a simple Qt codepage, or a pair like CP1251/KOI8,
		//! which sets the encodings separately for text (first) and internal
		//! files (second).
		bool  changeFileEncoding( const char *qtencoding );

		//! Converts the word, so it has an appropriate encoding.
		QCString convertSearchWord ( const QString &src );

		/*!
		 * Helper procedure in TOC parsing. Decodes the string between the quotes
		 * (first or last), with decoding of HTML entities like &iacute;
		 */
		int findStringInQuotes (const QString& tag, int offset, QString& value, bool firstquote, bool decodeentities );

		/*!
		 * Decodes Unicode HTML entities according to the current encoding.
		 */
		QString decodeEntity (const QString& entity );

		/*!
		 * \brief Returns the list of all available text encodings.
		 * \return A pointer to the beginning of the text encoding table. The table can be
		 *         enumerated until language == 0, which marks the end of the table.
		 *
		 * \ingroup encoding
		 */
		static const LCHMTextEncoding	* 	getTextEncodingTable();

		/*!
		 * \brief Looks up an encoding by LCID.
		 * \param lcid LCID to look up.
		 * \return A pointer to the encoding structure.
		 *
		 * \ingroup encoding
		 */
		static const LCHMTextEncoding * lookupByLCID( short lcid );

		/*!
		 * \brief Gets the encoding index.
		 * \param enc Encoding.
		 * \return An index i in the encoding table; getTextEncodingTable() + i gets the encoding.
		 *
		 * \ingroup encoding
		 */
		static int getEncodingIndex( const LCHMTextEncoding * enc);

		/*!
		 * Normalizes the path for searching in the internal arrays.
		 */
		QString normalizeUrl (const QString& path ) const;


		// ------------------------------------------------------------------
		// Members
		// ------------------------------------------------------------------

		//! Pointer to the chmlib structure
		chmFile	*	m_chmFile;

		//! Name of the opened file
		QString  	m_filename;

		//! Home url, taken from the CHM file
		QString  	m_home;

		//! Context tree filename, taken from the CHM file
		QString  	m_topicsFile;

		//! Index filename, taken from the CHM file
		QString 	m_indexFile;

		//! CHM title, taken from the CHM file
		QString		m_title;

		// Localization stuff

		//! LCID from the CHM file, used in encoding detection
		short			m_detectedLCID;

		//! Font charset from the CHM file, used in encoding detection
		QString 		m_font;

		//! Chosen text codec (used by encodeWithCurrentCodec())
		QTextCodec	*	m_textCodec;

		//! Chosen text codec for the internal files (used by encodeInternalWithCurrentCodec())
		QTextCodec	*	m_textCodecForSpecialFiles;

		//! Current encoding
		const LCHMTextEncoding * m_currentEncoding;

		//! Map to decode HTML entities like &acute; based on the current encoding
		QMap<QString, QString>					m_entityDecodeMap;

		//! TRUE if /#TOPICS, /#STRINGS, /#URLTBL and /#URLSTR are resolved, and the
		//! four unit info members below are valid
		bool		m_lookupTablesValid;

		//! Unit info for /#TOPICS
		chmUnitInfo	m_chmTOPICS;

		//! Unit info for /#STRINGS
		chmUnitInfo	m_chmSTRINGS;

		//! Unit info for /#URLTBL
		chmUnitInfo	m_chmURLTBL;

		//! Unit info for /#URLSTR
		chmUnitInfo	m_chmURLSTR;

		//! Indicates whether the built-in search is available. This is true only when
		//! m_lookupTablesValid is TRUE, and m_chmFIftiMain is resolved.
		bool			m_searchAvailable;

		//! Unit info for /$FIftiMain
		chmUnitInfo	m_chmFIftiMain;

		//! Storage for the book TOC icon images
		LCHMTocImageKeeper	m_imagesKeeper;

		//! Map url->topic
		QMap< QString, QString >	m_url2topics;
};