summaryrefslogtreecommitdiffstats
path: root/src/findduplicates.h
blob: e1a2fced7eb3ceac1581d15f86cb2643266245ba (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
/***************************************************************************
 *   Copyright (C) 2004-2009 by Thomas Fischer                             *
 *   fischer@unix-ag.uni-kl.de                                             *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 ***************************************************************************/
#ifndef KBIBTEXFINDDUPLICATES_H
#define KBIBTEXFINDDUPLICATES_H

#include <qobject.h>

class KProgressDialog;
class QWidget;

namespace KBibTeX
{

    /**
     * Finds duplicate elements in a BibTeX file (see the constructor).
     *
     * NOTE(review): this header uses BibTeX::Element, BibTeX::File,
     * BibTeX::Entry, BibTeX::Macro and BibTeX::Preamble but neither includes
     * nor forward-declares them; presumably every including file has to pull
     * in the BibTeX declarations first -- confirm.
     *
     * @author Thomas Fischer <fischer@unix-ag.uni-kl.de>
     */
    class FindDuplicates : public QObject
    {
        Q_OBJECT
    public:
        /** List of pointers to BibTeX elements; presumably one group of mutual duplicates -- confirm. */
        typedef QValueList<BibTeX::Element*> DuplicateClique;
        /** List of DuplicateClique lists. */
        typedef QValueList<DuplicateClique> DuplicateCliqueList;

        /**
         * Find duplicates in a given BibTeX file. The sensitivity parameter controls the distance between two elements where both elements are considered to be duplicates. The parent object is used as a progress dialog's parent.
         * @param result passed by non-const reference; presumably receives the cliques of duplicates found -- confirm against the implementation
         * @param sensitivity controls the distance between two elements at which both are considered to be duplicates
         * @param file the BibTeX file to search for duplicates
         * @param parent used as the progress dialog's parent
         */
        FindDuplicates( DuplicateCliqueList &result, unsigned int sensitivity, BibTeX::File *file, QWidget *parent );

        ~FindDuplicates();

        /**
         * Maximum sensitivity. Only declared here; the value is defined outside this header.
         */
        static const unsigned int maxDistance;

    protected:
        /**
         * NOTE(review): presumably computes the distances between the elements of @p file
         * and stores them in @p distVector, using @p mapElementToIndex to translate
         * elements into indices and @p progDlg to report progress -- implementation not
         * visible in this header, confirm.
         */
        void determineDistances( BibTeX::File *file, unsigned int *distVector, QMap<BibTeX::Element*, int> &mapElementToIndex, KProgressDialog *progDlg );
        /**
         * NOTE(review): presumably groups elements of @p file into cliques appended to
         * @p result, based on @p distVector and @p sensitivity -- confirm.
         */
        void buildClique( DuplicateCliqueList &result, BibTeX::File *file, unsigned int *distVector, QMap<BibTeX::Element*, int> &mapElementToIndex, unsigned int sensitivity );
        /* Per-type distance functions: each takes two elements of the same BibTeX type
           and returns an unsigned distance value. */
        unsigned int entryDistance( BibTeX::Entry *entryA, BibTeX::Entry *entryB );
        unsigned int macroDistance( BibTeX::Macro *macroA, BibTeX::Macro *macroB );
        unsigned int preambleDistance( BibTeX::Preamble *preambleA, BibTeX::Preamble *preambleB );

        /* Static helpers that pull a single piece of data out of an entry or macro;
           presumably the inputs of the distance functions above -- confirm. */
        static QString extractTitle( BibTeX::Entry *entry );
        static QStringList authorsLastName( BibTeX::Entry *entry );
        static int extractYear( BibTeX::Entry *entry );
        static QString extractMacroKey( BibTeX::Macro *macro );
        static QString extractMacroValue( BibTeX::Macro *macro );

    private:
        /** Cancellation flag; presumably set by slotCancel() to abort the search -- confirm. */
        bool m_doCancel;

        /* Levenshtein-distance helpers (going by their names) for string lists,
           strings and single words; each returns a double. */
        double levenshteinDistance( const QStringList &s, const QStringList &t );
        double levenshteinDistance( const QString &s, const QString &t );
        double levenshteinDistanceWord( const QString &s, const QString &t );
        /** NOTE(review): presumably maps the index pair (a, b) to an offset into the distance vector -- confirm. */
        int arrayOffset( int a, int b );
        /** Sorts the @p len elements of @p array; NOTE(review): ordering and algorithm not visible here. */
        void sort( unsigned int *array, int len );

    private slots:
        /** Slot without arguments; presumably connected to the progress dialog's cancel action -- confirm. */
        void slotCancel();
    };

}

#endif