Added abandoned KDE3 version of kchmviewer

git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/applications/kchmviewer@1090662 283d02a7-25f6-0310-bc7c-ecb5cbfe19da
author: tpearson <tpearson@283d02a7-25f6-0310-bc7c-ecb5cbfe19da> 2010-02-15 18:23:18 +0000
committer: tpearson <tpearson@283d02a7-25f6-0310-bc7c-ecb5cbfe19da> 2010-02-15 18:23:18 +0000
commit: 3133bbc63a2d32dac638db58fa13e966488e88b5 (patch)
tree: 326595b5fefc87fd7cf5ab3905bc00883fe3c919 /src/kchmsearchengine_impl.cpp
download: kchmviewer-3133bbc63a2d32dac638db58fa13e966488e88b5.tar.gz
kchmviewer-3133bbc63a2d32dac638db58fa13e966488e88b5.zip
1 files changed, 572 insertions, 0 deletions
diff --git a/src/kchmsearchengine_impl.cpp b/src/kchmsearchengine_impl.cpp
new file mode 100644
index 0000000..286832a
--- /dev/null
+++ b/src/kchmsearchengine_impl.cpp
@@ -0,0 +1,572 @@
+/***************************************************************************
+ *   Copyright (C) 2004-2007 by Georgy Yunaev, gyunaev@ulduzsoft.com       *
+ *   Portions Copyright (C) 2000-2005 Trolltech AS.                        *   
+ *   Please do not use email address above for bug reports; see            *
+ *   the README file                                                       *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.             *
+ ***************************************************************************/
+
+
+#include <qfile.h>
+#include <qdir.h>
+#include <qstringlist.h>
+#include <qdict.h>
+#include <qapplication.h>
+
+#include <ctype.h>
+
+#include "kchmsearchengine_impl.h"
+#include "kchmmainwindow.h"
+#include "libchmfileimpl.h"
+
+
+namespace QtAs {
+
+// Those characters are splitters (i.e. split the word), but added themselves into dictionary too.
+// This makes the dictionary MUCH larger, but ensure that for the piece of "window->print" both 
+// search for "print" and "->print" will find it.
+static const char SPLIT_CHARACTERS[] = "!()*&^%#@[]{}':;,.?/|/?<>\\-+=~`";
+	
+// Those characters are parts of word - for example, '_' is here, and search for _debug will find only _debug.
+static const char WORD_CHARACTERS[] = "$_";
+
+	
+int TermList::compareItems( QPtrCollection::Item i1, QPtrCollection::Item i2 )
+{
+	if( ( (Term*)i1 )->frequency == ( (Term*)i2 )->frequency )
+		return 0;
+	if( ( (Term*)i1 )->frequency < ( (Term*)i2 )->frequency )
+		return -1;
+	return 1;
+}
+
+QDataStream &operator>>( QDataStream &s, Document &l )
+{
+	s >> l.docNumber;
+	s >> l.frequency;
+	return s;
+}
+
+QDataStream &operator<<( QDataStream &s, const Document &l )
+{
+	s << (Q_INT16)l.docNumber;
+	s << (Q_INT16)l.frequency;
+	return s;
+}
+
+Index::Index( const QString &dp, const QString & )
+	: QObject( 0, 0 ), dict( 8999 ), docPath( dp )
+{
+	lastWindowClosed = false;
+	connect( qApp, SIGNAL( lastWindowClosed() ),
+			 this, SLOT( setLastWinClosed() ) );
+}
+
+Index::Index( const QStringList &dl, const QString & )
+	: QObject( 0, 0 ), dict( 20011 )
+{
+	docList = dl;
+	lastWindowClosed = false;
+	connect( qApp, SIGNAL( lastWindowClosed() ),
+			 this, SLOT( setLastWinClosed() ) );
+}
+
+void Index::setLastWinClosed()
+{
+	lastWindowClosed = true;
+}
+
+void Index::setDictionaryFile( const QString &f )
+{
+	dictFile = f;
+}
+
+void Index::setDocListFile( const QString &f )
+{
+	docListFile = f;
+}
+
+void Index::setDocList( const QStringList &lst )
+{
+	docList = lst;
+}
+
+bool Index::makeIndex()
+{
+	if ( docList.isEmpty() )
+		return false;
+	
+	QStringList::Iterator it = docList.begin();
+	int steps = docList.count() / 100;
+	
+	if ( !steps )
+		steps++;
+	
+	int prog = 0;
+	
+	for ( int i = 0; it != docList.end(); ++it, ++i )
+	{
+		if ( lastWindowClosed )
+			return false;
+
+		parseDocument( *it, i );
+		
+		if ( i%steps == 0 )
+		{
+			prog++;
+			emit indexingProgress( prog );
+		}
+	}
+	
+	return true;
+}
+
+
+void Index::insertInDict( const QString &str, int docNum )
+{
+	Entry *e = 0;
+	if ( dict.count() )
+		e = dict[ str ];
+	
+	if ( e )
+	{
+		if ( e->documents.first().docNumber != docNum )
+			e->documents.prepend( Document( docNum, 1 ) );
+		else
+			e->documents.first().frequency++;
+	}
+	else
+	{
+		dict.insert( str, new Entry( docNum ) );
+	}
+}
+
+
+bool Index::parseDocumentToStringlist( const QString & filename, QStringList & tokenlist )
+{
+	QString parsedbuf, parseentity;
+	QString text;
+	
+	if ( !::mainWindow->chmFile()->getFileContentAsString( &text, filename ) )
+	{
+		qWarning( "Index::parseDocument: Could not retrieve the content at %s", filename.ascii() );
+		return false;
+	}
+	
+	if ( text.isNull() )
+	{
+		qWarning( "Index::parseDocument: Retrieved content for %s is empty", filename.ascii() );
+		return false;
+	}
+
+	m_charssplit = SPLIT_CHARACTERS;
+	m_charsword = WORD_CHARACTERS;
+	
+	tokenlist.clear();
+	
+	// State machine states
+	enum state_t
+	{
+		STATE_OUTSIDE_TAGS,		// outside HTML tags; parse text
+		STATE_IN_HTML_TAG,		// inside HTML tags; wait for end tag
+		STATE_IN_QUOTES,		// inside HTML tags and inside quotes; wait for end quote (in var QuoteChar)
+		STATE_IN_HTML_ENTITY,	// inside HTML entity; parse the entity
+	};
+	
+	state_t state = STATE_OUTSIDE_TAGS;
+	QChar QuoteChar; // used in STATE_IN_QUOTES
+	
+	for ( unsigned int j = 0; j < text.length(); j++ )
+	{
+		QChar ch = text[j];
+		
+		if ( (j % 20000) == 0 )
+			qApp->processEvents( QEventLoop::ExcludeUserInput );
+		
+		if ( state == STATE_IN_HTML_TAG )
+		{
+			// We are inside HTML tag.
+			// Ignore everything until we see '>' (end of HTML tag) or quote char (quote start)
+			if ( ch == '"' || ch == '\'' )
+			{
+				state = STATE_IN_QUOTES;
+				QuoteChar = ch;
+			}
+			else if ( ch == '>' )
+				state = STATE_OUTSIDE_TAGS;
+				
+			continue;
+		}
+		else if ( state == STATE_IN_QUOTES )
+		{
+			// We are inside quoted text inside HTML tag. 
+			// Ignore everything until we see the quote character again
+			if ( ch == QuoteChar )
+				state = STATE_IN_HTML_TAG;
+				
+			continue;
+		}
+		else if ( state == STATE_IN_HTML_ENTITY )
+		{
+			// We are inside encoded HTML entity (like &nbsp;).
+			// Collect to parsedbuf everything until we see ;
+			if ( ch.isLetterOrNumber() )
+			{
+				// get next character of this entity
+				parseentity.append( ch );
+				continue;
+			}
+				
+			// The entity ended
+			state = STATE_OUTSIDE_TAGS;
+			
+			// Some shitty HTML does not terminate entities correctly. Screw it.			
+			if ( ch != ';' && ch != '<' )
+			{
+				if ( parseentity.isEmpty() )
+				{
+					// straight '&' symbol. Add and continue.
+					parsedbuf += "&";
+				}
+				else
+					qWarning( "Index::parseDocument: incorrectly terminated HTML entity '&%s%c', ignoring", parseentity.ascii(), ch.latin1() );
+				
+				j--; // parse this character again, but in different state
+				continue;
+			}
+			
+			// Don't we have a space?
+			if ( parseentity.lower() != "nbsp" )
+			{
+				QString entity = ::mainWindow->chmFile()->impl()->decodeEntity( parseentity );
+			
+				if ( entity.isNull() )
+				{
+					// decodeEntity() already printed error message
+					//qWarning( "Index::parseDocument: failed to decode entity &%s;", parsedbuf.ascii() );
+					continue;
+				}
+			
+				parsedbuf += entity;
+				continue;
+			}
+			else
+				ch = ' '; // We got a space, so treat it like it, and not add it to parsebuf
+		}
+		
+		// 
+		// Now process STATE_OUTSIDE_TAGS
+		//
+		
+		// Check for start of HTML tag, and switch to STATE_IN_HTML_TAG if it is
+		if ( ch == '<' )
+		{
+			state = STATE_IN_HTML_TAG;
+			goto tokenize_buf;
+		}
+		
+		// Check for start of HTML entity
+		if ( ch == '&' )
+		{
+			state = STATE_IN_HTML_ENTITY;
+			parseentity = QString::null;
+			continue;
+		}
+		
+		// Replace quote by ' - quotes are used in search window to set the phrase
+		if ( ch == '"' )
+			ch = '\'';
+		
+		// Ok, we have a valid character outside HTML tags, and probably some in buffer already.
+		// If it is char or letter, add it and continue
+		if ( ch.isLetterOrNumber() || m_charsword.find( ch ) != -1 )
+		{
+			parsedbuf.append( ch );
+			continue;
+		}
+		
+		// If it is a split char, add the word to the dictionary, and then add the char itself.
+		if ( m_charssplit.find( ch ) != -1 )
+		{
+			if ( !parsedbuf.isEmpty() )
+				tokenlist.push_back( parsedbuf.lower() );
+			
+			tokenlist.push_back( ch.lower() );
+			parsedbuf = QString::null;
+			continue;
+		}
+		
+tokenize_buf:		
+		// Just add the word; it is most likely a space or terminated by tokenizer.
+		if ( !parsedbuf.isEmpty() )
+		{
+			tokenlist.push_back( parsedbuf.lower() );
+			parsedbuf = QString::null;
+		}
+	}
+	
+	// Add the last word if still here - for broken htmls.
+	if ( !parsedbuf.isEmpty() )
+		tokenlist.push_back( parsedbuf.lower() );
+	
+	return true;
+}
+
+
+void Index::parseDocument( const QString &filename, int docNum )
+{
+	QStringList terms;
+	
+	if ( !parseDocumentToStringlist( filename, terms ) )
+		return;
+	
+	for ( QStringList::Iterator it = terms.begin(); it != terms.end(); ++it )
+		insertInDict( *it, docNum );
+}
+
+
+void Index::writeDict()
+{
+	QDictIterator<Entry> it( dict );
+	QFile f( dictFile );
+	
+	if ( !f.open( IO_WriteOnly ) )
+	{
+		qWarning( "Index::writeDict: could not write dictionary file %s", dictFile.ascii() );
+		return;
+	}
+	
+	QDataStream s( &f );
+	s << (int) 1; // version
+	s << m_charssplit;
+	s << m_charsword;
+	
+	for( ; it.current(); ++it )
+	{
+		Entry *e = it.current();
+		s << it.currentKey();
+		s << e->documents;
+	}
+	
+	f.close();
+	writeDocumentList();
+}
+
+void Index::writeDocumentList()
+{
+	QFile f( docListFile );
+	if ( !f.open( IO_WriteOnly ) )
+	{
+		qWarning( "Index::writeDocumentList: could not write dictionary file %s", docListFile.ascii() );
+		return;
+	}
+	QDataStream s( &f );
+	s << docList;
+}
+
+bool Index::readDict()
+{
+	QFile f( dictFile );
+	if ( !f.open( IO_ReadOnly ) )
+		return false;
+
+	dict.clear();
+	QDataStream s( &f );
+	QString key;
+	int version;
+	QValueList<Document> docs;
+	
+	s >> version;
+	s >> m_charssplit;
+	s >> m_charsword;
+	
+	while ( !s.atEnd() )
+	{
+		s >> key;
+		s >> docs;
+		dict.insert( key, new Entry( docs ) );
+	}
+	
+	f.close();
+	return dict.size() > 0 && readDocumentList();
+}
+
+bool Index::readDocumentList()
+{
+	QFile f( docListFile );
+	if ( !f.open( IO_ReadOnly ) )
+		return false;
+	QDataStream s( &f );
+	s >> docList;
+	return true;
+}
+
+QStringList Index::query( const QStringList &terms, const QStringList &termSeq, const QStringList &seqWords )
+{
+	TermList termList;
+
+	QStringList::ConstIterator it = terms.begin();
+	for ( it = terms.begin(); it != terms.end(); ++it )
+	{
+		Entry *e = 0;
+		
+		if ( dict[ *it ] )
+		{
+			e = dict[ *it ];
+			termList.append( new Term( *it, e->documents.count(), e->documents ) );
+		}
+		else
+		{
+			return QStringList();
+		}
+	}
+	
+	termList.sort();
+
+	Term *minTerm = termList.first();
+	
+	if ( !termList.count() )
+		return QStringList();
+	
+	termList.removeFirst();
+
+	QValueList<Document> minDocs = minTerm->documents;
+	QValueList<Document>::iterator C;
+	QValueList<Document>::ConstIterator It;
+	Term *t = termList.first();
+	
+	for ( ; t; t = termList.next() )
+	{
+		QValueList<Document> docs = t->documents;
+		C = minDocs.begin();
+		
+		while ( C != minDocs.end() )
+		{
+			bool found = false;
+			
+			for ( It = docs.begin(); It != docs.end(); ++It )
+			{
+				if ( (*C).docNumber == (*It).docNumber )
+				{
+					(*C).frequency += (*It).frequency;
+					found = true;
+					break;
+				}
+			}
+			
+			if ( !found )
+				C = minDocs.remove( C );
+			else
+				++C;
+		}
+	}
+
+	QStringList results;
+	qHeapSort( minDocs );
+	
+	if ( termSeq.isEmpty() )
+	{
+		for ( C = minDocs.begin(); C != minDocs.end(); ++C )
+			results << docList[ (int)(*C).docNumber ];
+		
+		return results;
+	}
+
+	QString fileName;
+	
+	for ( C = minDocs.begin(); C != minDocs.end(); ++C )
+	{
+		fileName =  docList[ (int)(*C).docNumber ];
+		
+		if ( searchForPhrases( termSeq, seqWords, fileName ) )
+			results << fileName;
+	}
+	
+	return results;
+}
+
+
+bool Index::searchForPhrases( const QStringList &phrases, const QStringList &words, const QString &filename )
+{
+	QStringList parsed_document;
+	
+	if ( !parseDocumentToStringlist( filename, parsed_document ) )
+		return false;
+
+	miniDict.clear();
+	
+	// Initialize the dictionary with the words in phrase(s)
+	for ( QStringList::ConstIterator cIt = words.begin(); cIt != words.end(); ++cIt )
+		miniDict.insert( *cIt, new PosEntry( 0 ) );
+
+	// Fill the dictionary with the words from the document
+	unsigned int word_offset = 3;
+	for ( QStringList::ConstIterator it = parsed_document.begin(); it != parsed_document.end(); it++, word_offset++ )
+	{
+		PosEntry * entry = miniDict[ *it ];
+		
+		if ( entry )
+			entry->positions.append( word_offset );
+	}
+	
+	// Dump it
+/*	
+	QDictIterator<PosEntry> it( miniDict );
+	for( ; it.current(); ++it )
+	{
+		QString text( it.currentKey() );
+		QValueList<uint> pos = miniDict[text]->positions;
+		for ( unsigned int i = 1; i < pos.size(); i++ )
+			text += " " + QString::number( pos[i] );
+		
+		qDebug( "%s", text.ascii());
+	}
+*/				
+	
+	QValueList<uint> first_word_positions;
+	
+	for ( QStringList::ConstIterator phrase_it = phrases.begin(); phrase_it != phrases.end(); phrase_it++ )
+	{
+		QStringList phrasewords = QStringList::split( ' ', *phrase_it );
+		first_word_positions = miniDict[ phrasewords[0] ]->positions;
+		
+		for ( unsigned int j = 1; j < phrasewords.count(); ++j )
+		{
+			QValueList<uint> next_word_it = miniDict[ phrasewords[j] ]->positions;
+			QValueList<uint>::iterator dict_it = first_word_positions.begin();
+			
+			while ( dict_it != first_word_positions.end() )
+			{
+				if ( next_word_it.find( *dict_it + 1 ) != next_word_it.end() )
+				{
+					(*dict_it)++;
+					++dict_it;
+				}
+				else
+					dict_it = first_word_positions.remove( dict_it );
+			}
+		}
+	}
+	
+	if ( first_word_positions.count() )
+		return true;
+	
+	return false;
+}
+
+
+};
author	tpearson <tpearson@283d02a7-25f6-0310-bc7c-ecb5cbfe19da>	2010-02-15 18:23:18 +0000
committer	tpearson <tpearson@283d02a7-25f6-0310-bc7c-ecb5cbfe19da>	2010-02-15 18:23:18 +0000
commit	3133bbc63a2d32dac638db58fa13e966488e88b5 (patch)
tree	326595b5fefc87fd7cf5ab3905bc00883fe3c919 /src/kchmsearchengine_impl.cpp
download	kchmviewer-3133bbc63a2d32dac638db58fa13e966488e88b5.tar.gz kchmviewer-3133bbc63a2d32dac638db58fa13e966488e88b5.zip