// Source: kooka/ocrword.cpp (blob 9ef393e9cac273f487668746271785bdd4afeed0)
/***************************************************************************
                   ocrword.cpp  - ocr-result word and wordlist
                             -------------------
    begin                : Fri Jan 10 2003
    copyright            : (C) 2003 by Klaas Freitag
    email                : freitag@suse.de
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *  This file may be distributed and/or modified under the terms of the    *
 *  GNU General Public License version 2 as published by the Free Software *
 *  Foundation and appearing in the file COPYING included in the           *
 *  packaging of this file.                                                *
 *
 *  As a special exception, permission is given to link this program       *
 *  with any version of the KADMOS ocr/icr engine of reRecognition GmbH,   *
 *  Kreuzlingen and distribute the resulting executable without            *
 *  including the source code for KADMOS in the source distribution.       *
 *
 *  As a special exception, permission is given to link this program       *
 *  with any edition of TQt, and distribute the resulting executable,       *
 *  without including the source code for TQt in the source distribution.   *
 *                                                                         *
 ***************************************************************************/

#include <tqstring.h>
#include "ocrword.h"
#include <tqrect.h>
#include <tqptrlist.h>
#include <kdebug.h>
#include <tqregexp.h>

/* -------------------- ocrWord -------------------- */
/**
 * Builds an OCR word whose text is a copy of @p s.
 *
 * NOTE(review): only the TQString base is initialized here; whatever data
 * members ocrword.h declares are left to their own default initialization.
 * Confirm that this is intended.
 */
ocrWord::ocrWord( const TQString& s ) : TQString( s )
{
}

/**
 * Builds an OCR word with a default-constructed (null) text.
 */
ocrWord::ocrWord()
    : TQString()
{
}

#if 0
/* Disabled stub: would only ever return a default-constructed rectangle. */
TQRect ocrWord::boundingRect()
{
    return TQRect();
}
#endif

/* -------------------- ocrWordList ------------------ */
/**
 * Creates an empty word list with m_block set to 0.
 */
ocrWordList::ocrWordList()
    : TQValueList<ocrWord>()
    , m_block( 0 )
{
    // The setAutoDelete( true ) call that used to live here stays disabled.
    // setAutoDelete( true );
}

/**
 * Returns the text of every word in the list, in list order.
 *
 * The result always has exactly one entry per word of this list.
 *
 * @return string list with one entry per ocrWord
 */
TQStringList ocrWordList::stringList()
{
    TQStringList res;
    ocrWordList::iterator it;

    for ( it = begin(); it != end(); ++it )
    {
#if 0
        /* Disabled to prevent an error that occurs if the spellchecked
         * stringlist and the ocr_page wordlist do not have the same length.
         * For the ocrpage, words connected with a dash are one word, while the
         * code below splits them into two. That confuses the replacement code
         * once the user has decided on a correction.
         * Solution: KSpell should treat dash-linked words correctly.
         * We live with the problem here that dashes bring confusion ;-)
         *
         * The regular expression lives inside the disabled section so that
         * it is not constructed on every call while the split is switched off.
         */
        const TQRegExp rx("[,\\.-]");
        if( (*it).contains( rx ) )
        {
            res += TQStringList::split( rx, (*it) );
            continue;
        }
#endif
        res << *it;
    }
    return res;
}

/**
 * Replaces text in the first word of the list that contains @p from.
 *
 * The list is scanned front to back. In the first word that contains
 * @p from (case sensitive) the occurrences of @p from are replaced by @p to,
 * then the scan stops; later words are left untouched even if they match.
 *
 * The text is changed in place on the list entry, so everything else the
 * entry carries (for instance what rect() returns) is kept. Assigning a
 * freshly constructed ocrWord instead would drop that state, because the
 * constructor only takes over the string.
 *
 * @param from  text to look for
 * @param to    replacement text
 * @return true if a word was changed, false if no word contains @p from
 */
bool ocrWordList::updateOCRWord( const TQString& from, const TQString& to )
{
    ocrWordList::iterator it;

    for( it = begin(); it != end(); ++it )
    {
        // View the entry as plain text for logging and searching.
        const TQString& word = *it;
        kdDebug(28000) <<  "updateOCRWord in list: Comparing word " << word << endl;
        if( word.contains( from, true ) ) // case sensitive search
        {
            // Edit the entry itself rather than replacing the whole object.
            (*it).replace( from, to );
            return true;
        }
    }
    return false;
}

/**
 * Computes the union of the rectangles of all words in the list.
 *
 * @return the united rectangle; a default-constructed TQRect if the list
 *         is empty
 */
TQRect ocrWordList::wordListRect()
{
    TQRect united;
    ocrWordList::iterator cur = begin();

    while( cur != end() )
    {
        const TQRect wordRect = (*cur).rect();
        united = united.unite( wordRect );
        ++cur;
    }
    return united;
}


/*
 * since tdespell removes , - | / etc. from words while they remain in the words
 * in the ocr wordlist.
 * This search goes through the wordlist and tries to find the words without caring
 * for special chars. It simply removes all chars from the words that are not alphanumeric.
 */
bool ocrWordList::findFuzzyIndex( const TQString& word, ocrWord& resWord )
{
    ocrWordList::iterator it;
    bool res = false;

    for( it = begin(); it != end() && !res; ++it )
    {
        TQString fuzzyword = (*it);
        fuzzyword.remove( TQRegExp( "\\W" ));  // Remove all non-word characters.
        fuzzyword.remove( '_' );

        // kdDebug(28000) <<  "findFuzzy: Comparing word " << fuzzyword << " which was "
        //                << (*it) << " with " <<  word << endl;
        if( fuzzyword == word )
        {
            resWord = *it;
            res = true;
        }
    }
    return res;

}

void ocrWordList::setBlock( int b )
{
    m_block = b;
}

/*   */