/*************************************************************************** * Copyright (C) 2004-2009 by Thomas Fischer * * fischer@unix-ag.uni-kl.de * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ***************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "findduplicates.h" #define max(a,b) ((a)>(b)?(a):(b)) #define min(a,b) ((a)<(b)?(a):(b)) namespace KBibTeX { const unsigned int FindDuplicates::maxDistance = 0xffffff; FindDuplicates::FindDuplicates( DuplicateCliqueList &result, unsigned int sensitivity, BibTeX::File *file, TQWidget *parent ) : TQObject( NULL, NULL ), m_doCancel( false ) { if ( file->count() < 2 ) return; int len = file->count() * ( file->count() - 1 ) / 2; unsigned int *distVector = new unsigned int[len]; memset( distVector, 0xff, sizeof( unsigned int )*len ); TQMap mapElementToIndex; TQApplication::setOverrideCursor( TQt::waitCursor ); KProgressDialog *progDlg = new KProgressDialog( parent, NULL, i18n( "Find Duplicates" ), i18n( "Searching for duplicates..." ), true ); connect( progDlg, SIGNAL( cancelClicked() ), this, SLOT( slotCancel() ) ); progDlg->progressBar()->setTotalSteps( len ); determineDistances( file, distVector, mapElementToIndex, progDlg ); progDlg->progressBar()->setValue( len ); if ( !m_doCancel ) buildClique( result, file, distVector, mapElementToIndex, sensitivity ); delete progDlg; delete[] distVector; TQApplication::restoreOverrideCursor(); } /** * Determine the distance between elements either from two different BibTeX * files (merging operation) or within the same file (find duplicates). * Inter-element distances will be written into the distance vector. * @param file BibTeX file * @param distVector inter-element distance vector * @param mapElementToIndex map from elements to indices (will be written) * @param progDlg progress dialog to write status information to */ void FindDuplicates::determineDistances( BibTeX::File *file, unsigned int *distVector, TQMap &mapElementToIndex, KProgressDialog *progDlg ) { int progress = 0, i = 0; for ( BibTeX::File::ElementList::ConstIterator it1 = file->constBegin(); !m_doCancel && it1 != file->constEnd(); ++it1, ++i ) { BibTeX::Entry *entryA = dynamic_cast( *it1 ); if ( entryA != NULL ) { mapElementToIndex.insert( entryA, i ); int j = i + 1; for ( BibTeX::File::ElementList::ConstIterator it2 = ++BibTeX::File::ElementList::ConstIterator( it1 ); !m_doCancel && it2 != file->constEnd(); ++it2, ++j ) { BibTeX::Entry *entryB = dynamic_cast( *it2 ); if ( entryB == NULL ) continue; unsigned int d = entryDistance( entryA, entryB ); distVector[arrayOffset( i, j )] = d; progDlg->progressBar()->setValue( ++progress ); tqApp->processEvents(); } } else { BibTeX::Macro *macroA = dynamic_cast( *it1 ); if ( macroA != NULL ) { mapElementToIndex.insert( macroA, i ); int j = i + 1; for ( BibTeX::File::ElementList::ConstIterator it2 = ++BibTeX::File::ElementList::ConstIterator( it1 ); !m_doCancel && it2 != file->constEnd(); ++it2, ++j ) { BibTeX::Macro *macroB = dynamic_cast( *it2 ); if ( macroB == NULL ) continue; distVector[arrayOffset( i, j )] = macroDistance( macroA, macroB ); progDlg->progressBar()->setValue( ++progress ); tqApp->processEvents(); } } else { BibTeX::Preamble *preambleA = dynamic_cast( *it1 ); if ( preambleA != NULL ) { mapElementToIndex.insert( preambleA, i ); int j = i + 1; for ( BibTeX::File::ElementList::ConstIterator it2 = ++BibTeX::File::ElementList::ConstIterator( it1 ); !m_doCancel && it2 != file->constEnd(); ++it2, ++j ) { BibTeX::Preamble *preambleB = dynamic_cast( *it2 ); if ( preambleB == NULL ) continue; distVector[arrayOffset( i, j )] = preambleDistance( preambleA, preambleB ); progDlg->progressBar()->setValue( ++progress ); tqApp->processEvents(); } } } } } } /** * Build a list of clique of BibTeX elements with a distance below the * sensitivity threshold. The list of cliques is added to the cliqueList * parameter. * @param cliqueList List of cliques found in this function * @param file BibTeX file * @param distVector inter-element distance vector * @param mapElementToIndex map from elements to indices * @param sensitivity sensitivity threshold value */ void FindDuplicates::buildClique( DuplicateCliqueList &cliqueList, BibTeX::File *file, unsigned int *distVector, TQMap &mapElementToIndex, unsigned int sensitivity ) { int usedLen = file->count(); bool* used = new bool[usedLen]; memset( used, false, sizeof( bool ) * usedLen ); TQValueList queue; for ( BibTeX::File::ElementList::ConstIterator it1 = file->constBegin(); it1 != file->constEnd(); ++it1 ) { /** current element must be either entry, preamble, or macro */ BibTeX::Element *elem1 = dynamic_cast( *it1 ); if ( elem1 == NULL ) elem1 = dynamic_cast( *it1 ); if ( elem1 == NULL ) elem1 = dynamic_cast( *it1 ); /** skip element otherwise or if already used */ if ( elem1 == NULL || used[mapElementToIndex[elem1]] ) continue; DuplicateClique clique; queue.clear(); queue.append( elem1 ); used[mapElementToIndex[elem1]] = true; while ( !queue.isEmpty() ) { elem1 = *( queue.begin() ); queue.remove( queue.begin() ); int curIndex = mapElementToIndex[elem1]; clique.append( elem1 ); for ( BibTeX::File::ElementList::ConstIterator it2 = file->constBegin(); it2 != file->constEnd(); ++it2 ) { /** current element must be either entry, preamble, or macro */ BibTeX::Element *elem2 = dynamic_cast( *it2 ); int otherIndex=mapElementToIndex[elem2]; if ( elem2 == NULL ) elem2 = dynamic_cast( *it2 ); if ( elem2 == NULL ) elem2 = dynamic_cast( *it2 ); /** skip element otherwise or if already used */ if ( elem2 == NULL || used[( otherIndex = mapElementToIndex[elem2] )] ) continue; unsigned int distance = distVector[arrayOffset( curIndex, otherIndex )]; if ( distance <= sensitivity ) { queue.append( elem2 ); used[otherIndex ] = true; } } } if ( clique.size() > 1 ) cliqueList.append( clique ); } delete[] used; } /** * Distance between two BibTeX entries, scaled by maxDistance. */ unsigned int FindDuplicates::entryDistance( BibTeX::Entry *entryA, BibTeX::Entry *entryB ) { double titleValue = levenshteinDistance( extractTitle( entryA ), extractTitle( entryB ) ); double authorValue = levenshteinDistance( authorsLastName( entryA ), authorsLastName( entryB ) ); double yearValue = extractYear( entryA ) - extractYear( entryB ); yearValue = min( 1.0, yearValue * yearValue / 100.0 ); unsigned int distance = ( unsigned int )( maxDistance * ( titleValue * 0.6 + authorValue * 0.3 + yearValue * 0.1 ) ); return distance; } /** * Distance between two BibTeX macros, scaled by maxDistance. */ unsigned int FindDuplicates::macroDistance( BibTeX::Macro *macroA, BibTeX::Macro *macroB ) { double keyValue = levenshteinDistance( extractMacroKey( macroA ), extractMacroKey( macroB ) ); double valueValue = levenshteinDistance( extractMacroValue( macroA ), extractMacroValue( macroB ) ); unsigned int distance = ( unsigned int )( maxDistance * ( keyValue * 0.7 + valueValue * 0.3 ) ); return distance; } unsigned int FindDuplicates::preambleDistance( BibTeX::Preamble *preambleA, BibTeX::Preamble *preambleB ) { return ( unsigned int )( maxDistance * levenshteinDistance( preambleA->value()->text(), preambleB->value()->text() ) ); } FindDuplicates::~FindDuplicates() { // nothing } /** * Determine the Levenshtein distance between two sentences, * where each sentence is in a string (not split into single words). * See also http://en.wikipedia.org/wiki/Levenshtein_distance * @param s first sentence * @param t second sentence * @return distance between both sentences */ double FindDuplicates::levenshteinDistance( const TQString &s, const TQString &t ) { const TQRegExp nonWordRegExp( "[^a-zA-Z']+" ); if ( s == TQString::null || t == TQString::null ) return 1.0; return levenshteinDistance( TQStringList::split( nonWordRegExp, s ), TQStringList::split( nonWordRegExp, t ) ); } /** * Determine the Levenshtein distance between two words. * See also http://en.wikipedia.org/wiki/Levenshtein_distance * @param s first word * @param t second word * @return distance between both words */ double FindDuplicates::levenshteinDistanceWord( const TQString &s, const TQString &t ) { TQString mys = s.lower(), myt = t.lower(); int m = s.length(), n = t.length(); if ( m < 1 && n < 1 ) return 0.0; if ( m < 1 || n < 1 ) return 1.0; int **d = new int*[m+1]; for ( int i = 0; i <= m; ++i ) {d[i] = new int[n+1]; d[i][0] = i;} for ( int i = 0; i <= n; ++i ) d[0][i] = i; for ( int i = 1; i <= m;++i ) for ( int j = 1; j <= n;++j ) { d[i][j] = d[i-1][j] + 1; int c = d[i][j-1] + 1; if ( c < d[i][j] ) d[i][j] = c; c = d[i-1][j-1] + ( mys[i-1] == myt[j-1] ? 0 : 1 ); if ( c < d[i][j] ) d[i][j] = c; } double result = d[m][n]; for ( int i = 0; i <= m; ++i ) delete[] d[i]; delete [] d; result = result / ( double )max( m, n ); result *= result; return result; } /** * Determine the Levenshtein distance between two sentences (list of words). * See also http://en.wikipedia.org/wiki/Levenshtein_distance * @param s first sentence * @param t second sentence * @return distance between both sentences */ double FindDuplicates::levenshteinDistance( const TQStringList &s, const TQStringList &t ) { int m = s.size(), n = t.size(); if ( m < 1 && n < 1 ) return 0.0; if ( m < 1 || n < 1 ) return 1.0; double **d = new double*[m+1]; for ( int i = 0; i <= m; ++i ) {d[i] = new double[n+1]; d[i][0] = i;} for ( int i = 0; i <= n; ++i ) d[0][i] = i; for ( int i = 1; i <= m;++i ) for ( int j = 1; j <= n;++j ) { d[i][j] = d[i-1][j] + 1; double c = d[i][j-1] + 1; if ( c < d[i][j] ) d[i][j] = c; c = d[i-1][j-1] + levenshteinDistanceWord( s[i-1], t[j-1] ); if ( c < d[i][j] ) d[i][j] = c; } double result = d[m][n]; for ( int i = 0; i <= m; ++i ) delete[] d[i]; delete [] d; result = result / ( double )max( m, n ); return result; } /** * Linearize a two-dimensional triangle matrix */ int FindDuplicates::arrayOffset( int a, int b ) { if ( a == b ) return -1; else if ( b < a ) { int swap = a; a = b; b = swap; } return ( b * ( b - 1 ) / 2 + a ); } /** * Determine title for a given entry */ TQString FindDuplicates::extractTitle( BibTeX::Entry *entry ) { /** retrieve field holding title information for entry */ BibTeX::EntryField *field = entry->getField( BibTeX::EntryField::ftTitle ); if ( field == NULL ) return TQString::null; /** no title field available */ /** *fetch value item holding title */ BibTeX::ValueItem *valueItem = field->value()->items.isEmpty() ? NULL : field->value()->items.first(); if ( valueItem == NULL ) return TQString::null; /** no value item found or is empty */ return valueItem->text(); // TODO: Perform some postprocessing? } /** * Determine list of authors for a given entry */ TQStringList FindDuplicates::authorsLastName( BibTeX::Entry *entry ) { TQStringList result; /** retrieve field holding authors information for entry */ BibTeX::EntryField *field = entry->getField( BibTeX::EntryField::ftAuthor ); if ( field == NULL ) return result; /** no author field available */ /** fetch container holding list of author names */ BibTeX::PersonContainer *personContainer = field != NULL ? dynamic_cast( field->value()->items.isEmpty() ? NULL : field->value()->items.first() ) : NULL; if ( personContainer == NULL || personContainer->persons.isEmpty() ) return result; /** container not found or is empty */ /** iterate through container and fetch each author's last name */ for ( TQValueList::ConstIterator it = personContainer->persons.begin(); it != personContainer->persons.end(); ++it ) result.append(( *it )->lastName() ); return result; } /** * Determine year for a given entry */ int FindDuplicates::extractYear( BibTeX::Entry *entry ) { /** retrieve field holding year information for entry */ BibTeX::EntryField *field = entry->getField( BibTeX::EntryField::ftYear ); if ( field == NULL ) return -1; /** no year field available */ /** *fetch value item holding year */ BibTeX::ValueItem *valueItem = field != NULL ? ( field->value()->items.isEmpty() ? NULL : field->value()->items.first() ) : NULL; if ( valueItem == NULL ) return -1; /** no value item found or is empty */ /** parse value item's text */ bool ok = FALSE; int year = TQString( valueItem->text() ).toInt( &ok ); if ( !ok ) year = -1; return year; } /** * Determine key from a given macro */ TQString FindDuplicates::extractMacroKey( BibTeX::Macro *macro ) { return macro->key(); } /** * Determine key from a given macro */ TQString FindDuplicates::extractMacroValue( BibTeX::Macro *macro ) { return macro->value()->text(); } void FindDuplicates::slotCancel() { m_doCancel = true; } } #include "findduplicates.moc"