summaryrefslogtreecommitdiffstats
path: root/src/findduplicates.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/findduplicates.cpp')
-rw-r--r--src/findduplicates.cpp444
1 files changed, 444 insertions, 0 deletions
diff --git a/src/findduplicates.cpp b/src/findduplicates.cpp
new file mode 100644
index 0000000..ddcf5a8
--- /dev/null
+++ b/src/findduplicates.cpp
@@ -0,0 +1,444 @@
+/***************************************************************************
+ * Copyright (C) 2004-2009 by Thomas Fischer *
+ * fischer@unix-ag.uni-kl.de *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+ ***************************************************************************/
+#include <math.h>
+
+#include <qstring.h>
+#include <qstringlist.h>
+#include <qregexp.h>
+#include <qapplication.h>
+
+#include <kdebug.h>
+#include <klocale.h>
+#include <kmessagebox.h>
+#include <kprogress.h>
+
+#include <element.h>
+#include <entry.h>
+#include <file.h>
+#include <preamble.h>
+#include <macro.h>
+#include "findduplicates.h"
+
+#define max(a,b) ((a)>(b)?(a):(b))
+#define min(a,b) ((a)<(b)?(a):(b))
+
+namespace KBibTeX
+{
+ const unsigned int FindDuplicates::maxDistance = 0xffffff;
+
+ FindDuplicates::FindDuplicates( DuplicateCliqueList &result, unsigned int sensitivity, BibTeX::File *file, QWidget *parent )
+ : QObject( NULL, NULL ), m_doCancel( false )
+ {
+ if ( file->count() < 2 )
+ return;
+
+ int len = file->count() * ( file->count() - 1 ) / 2;
+ unsigned int *distVector = new unsigned int[len];
+ memset( distVector, 0xff, sizeof( unsigned int )*len );
+ QMap<BibTeX::Element*, int> mapElementToIndex;
+
+ QApplication::setOverrideCursor( Qt::waitCursor );
+ KProgressDialog *progDlg = new KProgressDialog( parent, NULL, i18n( "Find Duplicates" ), i18n( "Searching for duplicates..." ), true );
+ connect( progDlg, SIGNAL( cancelClicked() ), this, SLOT( slotCancel() ) );
+ progDlg->progressBar()->setTotalSteps( len );
+
+ determineDistances( file, distVector, mapElementToIndex, progDlg );
+ progDlg->progressBar()->setValue( len );
+
+ if ( !m_doCancel )
+ buildClique( result, file, distVector, mapElementToIndex, sensitivity );
+
+ delete progDlg;
+ delete[] distVector;
+ QApplication::restoreOverrideCursor();
+ }
+
+ /**
+ * Determine the distance between elements either from two different BibTeX
+ * files (merging operation) or within the same file (find duplicates).
+ * Inter-element distances will be written into the distance vector.
+ * @param file BibTeX file
+ * @param distVector inter-element distance vector
+ * @param mapElementToIndex map from elements to indices (will be written)
+ * @param progDlg progress dialog to write status information to
+ */
+ void FindDuplicates::determineDistances( BibTeX::File *file, unsigned int *distVector, QMap<BibTeX::Element*, int> &mapElementToIndex, KProgressDialog *progDlg )
+ {
+ int progress = 0, i = 0;
+ for ( BibTeX::File::ElementList::ConstIterator it1 = file->constBegin(); !m_doCancel && it1 != file->constEnd(); ++it1, ++i )
+ {
+ BibTeX::Entry *entryA = dynamic_cast<BibTeX::Entry*>( *it1 );
+ if ( entryA != NULL )
+ {
+ mapElementToIndex.insert( entryA, i );
+
+ int j = i + 1;
+ for ( BibTeX::File::ElementList::ConstIterator it2 = ++BibTeX::File::ElementList::ConstIterator( it1 ); !m_doCancel && it2 != file->constEnd(); ++it2, ++j )
+ {
+ BibTeX::Entry *entryB = dynamic_cast<BibTeX::Entry*>( *it2 );
+ if ( entryB == NULL ) continue;
+
+ unsigned int d = entryDistance( entryA, entryB );
+ distVector[arrayOffset( i, j )] = d;
+
+ progDlg->progressBar()->setValue( ++progress );
+ qApp->processEvents();
+ }
+ }
+ else
+ {
+ BibTeX::Macro *macroA = dynamic_cast<BibTeX::Macro*>( *it1 );
+ if ( macroA != NULL )
+ {
+ mapElementToIndex.insert( macroA, i );
+
+ int j = i + 1;
+ for ( BibTeX::File::ElementList::ConstIterator it2 = ++BibTeX::File::ElementList::ConstIterator( it1 ); !m_doCancel && it2 != file->constEnd(); ++it2, ++j )
+ {
+ BibTeX::Macro *macroB = dynamic_cast<BibTeX::Macro*>( *it2 );
+ if ( macroB == NULL ) continue;
+
+ distVector[arrayOffset( i, j )] = macroDistance( macroA, macroB );
+
+ progDlg->progressBar()->setValue( ++progress );
+ qApp->processEvents();
+ }
+ }
+ else
+ {
+ BibTeX::Preamble *preambleA = dynamic_cast<BibTeX::Preamble*>( *it1 );
+ if ( preambleA != NULL )
+ {
+ mapElementToIndex.insert( preambleA, i );
+
+ int j = i + 1;
+ for ( BibTeX::File::ElementList::ConstIterator it2 = ++BibTeX::File::ElementList::ConstIterator( it1 ); !m_doCancel && it2 != file->constEnd(); ++it2, ++j )
+ {
+ BibTeX::Preamble *preambleB = dynamic_cast<BibTeX::Preamble*>( *it2 );
+ if ( preambleB == NULL ) continue;
+
+ distVector[arrayOffset( i, j )] = preambleDistance( preambleA, preambleB );
+
+ progDlg->progressBar()->setValue( ++progress );
+ qApp->processEvents();
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Build a list of clique of BibTeX elements with a distance below the
+ * sensitivity threshold. The list of cliques is added to the cliqueList
+ * parameter.
+ * @param cliqueList List of cliques found in this function
+ * @param file BibTeX file
+ * @param distVector inter-element distance vector
+ * @param mapElementToIndex map from elements to indices
+ * @param sensitivity sensitivity threshold value
+ */
+ void FindDuplicates::buildClique( DuplicateCliqueList &cliqueList, BibTeX::File *file, unsigned int *distVector, QMap<BibTeX::Element*, int> &mapElementToIndex, unsigned int sensitivity )
+ {
+ int usedLen = file->count();
+ bool* used = new bool[usedLen];
+ memset( used, false, sizeof( bool ) * usedLen );
+ QValueList<BibTeX::Element*> queue;
+ for ( BibTeX::File::ElementList::ConstIterator it1 = file->constBegin(); it1 != file->constEnd(); ++it1 )
+ {
+ /** current element must be either entry, preamble, or macro */
+ BibTeX::Element *elem1 = dynamic_cast<BibTeX::Entry*>( *it1 );
+ if ( elem1 == NULL )
+ elem1 = dynamic_cast<BibTeX::Macro*>( *it1 );
+ if ( elem1 == NULL )
+ elem1 = dynamic_cast<BibTeX::Preamble*>( *it1 );
+ /** skip element otherwise or if already used */
+ if ( elem1 == NULL || used[mapElementToIndex[elem1]] ) continue;
+
+ DuplicateClique clique;
+
+ queue.clear();
+ queue.append( elem1 );
+ used[mapElementToIndex[elem1]] = true;
+
+ while ( !queue.isEmpty() )
+ {
+ elem1 = *( queue.begin() );
+ queue.remove( queue.begin() );
+ int curIndex = mapElementToIndex[elem1];
+ clique.append( elem1 );
+
+ for ( BibTeX::File::ElementList::ConstIterator it2 = file->constBegin(); it2 != file->constEnd(); ++it2 )
+ {
+ /** current element must be either entry, preamble, or macro */
+ BibTeX::Element *elem2 = dynamic_cast<BibTeX::Entry*>( *it2 );
+ int otherIndex=mapElementToIndex[elem2];
+ if ( elem2 == NULL )
+ elem2 = dynamic_cast<BibTeX::Macro*>( *it2 );
+ if ( elem2 == NULL )
+ elem2 = dynamic_cast<BibTeX::Preamble*>( *it2 );
+ /** skip element otherwise or if already used */
+ if ( elem2 == NULL || used[( otherIndex = mapElementToIndex[elem2] )] )
+ continue;
+
+ unsigned int distance = distVector[arrayOffset( curIndex, otherIndex )];
+
+ if ( distance <= sensitivity )
+ {
+ queue.append( elem2 );
+ used[otherIndex ] = true;
+ }
+ }
+ }
+
+ if ( clique.size() > 1 )
+ cliqueList.append( clique );
+ }
+ delete[] used;
+ }
+
+ /**
+ * Distance between two BibTeX entries, scaled by maxDistance.
+ */
+ unsigned int FindDuplicates::entryDistance( BibTeX::Entry *entryA, BibTeX::Entry *entryB )
+ {
+ double titleValue = levenshteinDistance( extractTitle( entryA ), extractTitle( entryB ) );
+ double authorValue = levenshteinDistance( authorsLastName( entryA ), authorsLastName( entryB ) );
+ double yearValue = extractYear( entryA ) - extractYear( entryB );
+ yearValue = min( 1.0, yearValue * yearValue / 100.0 );
+ unsigned int distance = ( unsigned int )( maxDistance * ( titleValue * 0.6 + authorValue * 0.3 + yearValue * 0.1 ) );
+
+ return distance;
+ }
+
+ /**
+ * Distance between two BibTeX macros, scaled by maxDistance.
+ */
+ unsigned int FindDuplicates::macroDistance( BibTeX::Macro *macroA, BibTeX::Macro *macroB )
+ {
+ double keyValue = levenshteinDistance( extractMacroKey( macroA ), extractMacroKey( macroB ) );
+ double valueValue = levenshteinDistance( extractMacroValue( macroA ), extractMacroValue( macroB ) );
+ unsigned int distance = ( unsigned int )( maxDistance * ( keyValue * 0.7 + valueValue * 0.3 ) );
+
+ return distance;
+ }
+
+ unsigned int FindDuplicates::preambleDistance( BibTeX::Preamble *preambleA, BibTeX::Preamble *preambleB )
+ {
+ return ( unsigned int )( maxDistance * levenshteinDistance( preambleA->value()->text(), preambleB->value()->text() ) );
+ }
+
+ FindDuplicates::~FindDuplicates()
+ {
+// nothing
+ }
+
+ /**
+ * Determine the Levenshtein distance between two sentences,
+ * where each sentence is in a string (not split into single words).
+ * See also http://en.wikipedia.org/wiki/Levenshtein_distance
+ * @param s first sentence
+ * @param t second sentence
+ * @return distance between both sentences
+ */
+ double FindDuplicates::levenshteinDistance( const QString &s, const QString &t )
+ {
+ const QRegExp nonWordRegExp( "[^a-zA-Z']+" );
+ if ( s == QString::null || t == QString::null ) return 1.0;
+ return levenshteinDistance( QStringList::split( nonWordRegExp, s ), QStringList::split( nonWordRegExp, t ) );
+ }
+
+ /**
+ * Determine the Levenshtein distance between two words.
+ * See also http://en.wikipedia.org/wiki/Levenshtein_distance
+ * @param s first word
+ * @param t second word
+ * @return distance between both words
+ */
+ double FindDuplicates::levenshteinDistanceWord( const QString &s, const QString &t )
+ {
+ QString mys = s.lower(), myt = t.lower();
+ int m = s.length(), n = t.length();
+ if ( m < 1 && n < 1 ) return 0.0;
+ if ( m < 1 || n < 1 ) return 1.0;
+
+ int **d = new int*[m+1];
+ for ( int i = 0; i <= m; ++i ) {d[i] = new int[n+1]; d[i][0] = i;}
+ for ( int i = 0; i <= n; ++i ) d[0][i] = i;
+
+ for ( int i = 1; i <= m;++i )
+ for ( int j = 1; j <= n;++j )
+ {
+ d[i][j] = d[i-1][j] + 1;
+ int c = d[i][j-1] + 1;
+ if ( c < d[i][j] ) d[i][j] = c;
+ c = d[i-1][j-1] + ( mys[i-1] == myt[j-1] ? 0 : 1 );
+ if ( c < d[i][j] ) d[i][j] = c;
+ }
+
+ double result = d[m][n];
+ for ( int i = 0; i <= m; ++i ) delete[] d[i];
+ delete [] d;
+
+ result = result / ( double )max( m, n );
+ result *= result;
+ return result;
+ }
+
+ /**
+ * Determine the Levenshtein distance between two sentences (list of words).
+ * See also http://en.wikipedia.org/wiki/Levenshtein_distance
+ * @param s first sentence
+ * @param t second sentence
+ * @return distance between both sentences
+ */
+ double FindDuplicates::levenshteinDistance( const QStringList &s, const QStringList &t )
+ {
+ int m = s.size(), n = t.size();
+ if ( m < 1 && n < 1 ) return 0.0;
+ if ( m < 1 || n < 1 ) return 1.0;
+
+ double **d = new double*[m+1];
+ for ( int i = 0; i <= m; ++i ) {d[i] = new double[n+1]; d[i][0] = i;}
+ for ( int i = 0; i <= n; ++i ) d[0][i] = i;
+
+ for ( int i = 1; i <= m;++i )
+ for ( int j = 1; j <= n;++j )
+ {
+ d[i][j] = d[i-1][j] + 1;
+ double c = d[i][j-1] + 1;
+ if ( c < d[i][j] ) d[i][j] = c;
+ c = d[i-1][j-1] + levenshteinDistanceWord( s[i-1], t[j-1] );
+ if ( c < d[i][j] ) d[i][j] = c;
+ }
+
+ double result = d[m][n];
+ for ( int i = 0; i <= m; ++i ) delete[] d[i];
+ delete [] d;
+
+ result = result / ( double )max( m, n );
+
+ return result;
+ }
+
+ /**
+ * Linearize a two-dimensional triangle matrix
+ */
+ int FindDuplicates::arrayOffset( int a, int b )
+ {
+ if ( a == b )
+ return -1;
+ else if ( b < a )
+ {
+ int swap = a;
+ a = b;
+ b = swap;
+ }
+
+ return ( b * ( b - 1 ) / 2 + a );
+ }
+
+ /**
+ * Determine title for a given entry
+ */
+ QString FindDuplicates::extractTitle( BibTeX::Entry *entry )
+ {
+ /** retrieve field holding title information for entry */
+ BibTeX::EntryField *field = entry->getField( BibTeX::EntryField::ftTitle );
+ if ( field == NULL )
+ return QString::null; /** no title field available */
+
+ /** *fetch value item holding title */
+ BibTeX::ValueItem *valueItem = field->value()->items.isEmpty() ? NULL : field->value()->items.first();
+ if ( valueItem == NULL )
+ return QString::null; /** no value item found or is empty */
+
+ return valueItem->text(); // TODO: Perform some postprocessing?
+ }
+
+ /**
+ * Determine list of authors for a given entry
+ */
+ QStringList FindDuplicates::authorsLastName( BibTeX::Entry *entry )
+ {
+ QStringList result;
+
+ /** retrieve field holding authors information for entry */
+ BibTeX::EntryField *field = entry->getField( BibTeX::EntryField::ftAuthor );
+ if ( field == NULL )
+ return result; /** no author field available */
+
+ /** fetch container holding list of author names */
+ BibTeX::PersonContainer *personContainer = field != NULL ? dynamic_cast<BibTeX::PersonContainer*>( field->value()->items.isEmpty() ? NULL : field->value()->items.first() ) : NULL;
+ if ( personContainer == NULL || personContainer->persons.isEmpty() )
+ return result; /** container not found or is empty */
+
+ /** iterate through container and fetch each author's last name */
+ for ( QValueList<BibTeX::Person*>::ConstIterator it = personContainer->persons.begin(); it != personContainer->persons.end(); ++it )
+ result.append(( *it )->lastName() );
+
+ return result;
+ }
+
+ /**
+ * Determine year for a given entry
+ */
+ int FindDuplicates::extractYear( BibTeX::Entry *entry )
+ {
+ /** retrieve field holding year information for entry */
+ BibTeX::EntryField *field = entry->getField( BibTeX::EntryField::ftYear );
+ if ( field == NULL )
+ return -1; /** no year field available */
+
+ /** *fetch value item holding year */
+ BibTeX::ValueItem *valueItem = field != NULL ? ( field->value()->items.isEmpty() ? NULL : field->value()->items.first() ) : NULL;
+ if ( valueItem == NULL )
+ return -1; /** no value item found or is empty */
+
+ /** parse value item's text */
+ bool ok = FALSE;
+ int year = QString( valueItem->text() ).toInt( &ok );
+ if ( !ok ) year = -1;
+
+ return year;
+ }
+
+ /**
+ * Determine key from a given macro
+ */
+ QString FindDuplicates::extractMacroKey( BibTeX::Macro *macro )
+ {
+ return macro->key();
+ }
+
+ /**
+ * Determine key from a given macro
+ */
+ QString FindDuplicates::extractMacroValue( BibTeX::Macro *macro )
+ {
+ return macro->value()->text();
+ }
+
+ void FindDuplicates::slotCancel()
+ {
+ m_doCancel = true;
+ }
+}
+#include "findduplicates.moc"