diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htfuzzy/Metaphone.cc')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/htfuzzy/Metaphone.cc | 330 |
1 files changed, 330 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Metaphone.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/Metaphone.cc new file mode 100644 index 00000000..1f066a9d --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Metaphone.cc @@ -0,0 +1,330 @@ +// +// Metaphone.cc +// +// Metaphone: A fuzzy matching algorithm used to match words that +// sound alike in the English language. Probably not so +// good for foreign languages. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Metaphone.cc,v 1.12 2004/05/28 13:15:20 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <fcntl.h> + +#include "Metaphone.h" +#include "Dictionary.h" + +#include <ctype.h> + + +//***************************************************************************** +// Metaphone::Metaphone(const HtConfiguration& config_arg) +// +Metaphone::Metaphone(const HtConfiguration& config_arg) : + Fuzzy(config_arg) +{ + name = "metaphone"; +} + + +//***************************************************************************** +// Metaphone::~Metaphone() +// +Metaphone::~Metaphone() +{ +} + + +//***************************************************************************** +// void Metaphone::generateKey(char *word, String &key) +// +/* + * This code was copied from the slapd package developed at umich. + * it was debugged and cleaned up in February 1999 by Geoffrey Hutchison + * for the ht://Dig Project. + */ +/* + * Metaphone copied from C Gazette, June/July 1991, pp 56-57, + * author Gary A. Parker, with changes by Bernard Tiffany of the + * University of Michigan, and more changes by Tim Howes of the + * University of Michigan. + */ + +/* Character coding array */ +static char vsvfn[26] = { + 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, + /* A B C D E F G H I J K L M */ + 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0}; + /* N O P Q R S T U V W X Y Z */ + +/* Macros to access character coding array */ +#define vscode(x) ((x) >= 'A' && (x) <= 'Z' ? vsvfn[(x) - 'A'] : 0) +#define vowel(x) ((x) != '\0' && vscode(x) & 1) /* AEIOU */ +#define same(x) ((x) != '\0' && vscode(x) & 2) /* FJLMNR */ +#define varson(x) ((x) != '\0' && vscode(x) & 4) /* CGPST */ +#define frontv(x) ((x) != '\0' && vscode(x) & 8) /* EIY */ +#define noghf(x) ((x) != '\0' && vscode(x) & 16) /* BDH */ + +#define MAXPHONEMELEN 6 + +void +Metaphone::generateKey(char *word, String &key) +{ + if (!word || !*word) + return; + + char *n; + String ntrans; + + /* + * Copy Word to internal buffer, dropping non-alphabetic characters + * and converting to upper case + */ + + ntrans << "0000"; + + for (; *word; word++) + { + if (isalpha(*word)) + ntrans << *word; + } + ntrans.uppercase(); + + /* ntrans[0] will always be == 0 */ + n = ntrans.get(); + *n++ = 0; + *n++ = 0; + *n++ = 0; + *n = 0; /* Pad with nulls */ + n = ntrans.get() + 4; /* Assign pointer to start */ + + /* Check for PN, KN, GN, AE, WR, WH, and X at start */ + switch (*n) + { + case 'P': + case 'K': + case 'G': + /* 'PN', 'KN', 'GN' becomes 'N' */ + if (*(n + 1) == 'N') + *n++ = 0; + break; + case 'A': + /* 'AE' becomes 'E' */ + if (*(n + 1) == 'E') + *n++ = 0; + break; + case 'W': + /* 'WR' becomes 'R', and 'WH' to 'W' */ + if (*(n + 1) == 'R') + *n++ = 0; + else if (*(n + 1) == 'H') { + *(n + 1) = *n; + *n++ = 0; + } + break; + case 'X': + /* 'X' becomes 'S' */ + *n = 'S'; + break; + } + + /* + * Now, loop step through string, stopping at end of string or when + * the computed 'metaph' is MAXPHONEMELEN characters long + */ + + for (; *n && key.length() < MAXPHONEMELEN; n++) + { + /* Drop duplicates except for CC */ + if (*(n - 1) == *n && *n != 'C') + continue; + /* Check for F J L M N R or first letter vowel */ + if (same(*n) || *(n - 1) == '\0' && vowel(*n)) + key << *n; + else + { + switch (*n) + { + case 'B': + /* + * B unless in -MB + */ + if (*(n + 1) || *(n - 1) != 'M') + key << *n; + break; + case 'C': + /* + * X if in -CIA-, -CH- else S if in + * -CI-, -CE-, -CY- else dropped if + * in -SCI-, -SCE-, -SCY- else K + */ + if (*(n - 1) != 'S' || !frontv(*(n + 1))) + { + if (*(n + 1) == 'I' && *(n + 2) == 'A') + key << 'X'; + else if (frontv(*(n + 1))) + key << 'S'; + else if (*(n + 1) == 'H') + key << (((*(n - 1) == '\0' && !vowel(*(n + 2))) + || *(n - 1) == 'S') + ? 'K' : 'X'); + else + key << 'K'; + } + break; + case 'D': + /* + * J if in DGE or DGI or DGY else T + */ + key << ((*(n + 1) == 'G' && frontv(*(n + 2))) + ? (char) 'J' : (char) 'T'); + break; + case 'G': + /* + * F if in -GH and not B--GH, D--GH, + * -H--GH, -H---GH else dropped if + * -GNED, -GN, -DGE-, -DGI-, -DGY- + * else J if in -GE-, -GI-, -GY- and + * not GG else K + * + */ + if ((*(n + 1) != 'G' || vowel(*(n + 2))) && + (*(n + 1) != 'N' || (*(n + 1) && + (*(n + 2) != 'E' || + *(n + 3) != 'D'))) && + (*(n - 1) != 'D' || !frontv(*(n + 1)))) + if (frontv(*(n + 1)) && *(n + 2) != 'G') + key << 'J'; + else + key << 'K'; + else if (*(n + 1) == 'H' && !noghf(*(n - 3)) && + *(n - 4) != 'H') + key << 'F'; + break; + case 'H': + /* + * H if before a vowel and not after + * C, G, P, S, T else dropped + */ + if (!varson(*(n - 1)) && (!vowel(*(n - 1 + )) || + vowel(*(n + 1)))) + key << 'H'; + break; + case 'K': + /* + * dropped if after C else K + */ + if (*(n - 1) != 'C') + key << 'K'; + break; + case 'P': + /* + * F if before H, else P + */ + key << (*(n + 1) == 'H' ? + (char) 'F' : (char) 'P'); + break; + case 'Q': + /* + * K + */ + key << 'K'; + break; + case 'S': + /* + * X in -SH-, -SIO- or -SIA- else S + */ + key << ((*(n + 1) == 'H' || + (*(n + 1) == 'I' && (*(n + 2) == 'O' || + *(n + 2) == 'A'))) + ? (char) 'X' : (char) 'S'); + break; + case 'T': + /* + * X in -TIA- or -TIO- else 0 (zero) + * before H else dropped if in -TCH- + * else T + */ + if (*(n + 1) == 'I' && (*(n + 2) == 'O' || + *(n + 2) == 'A')) + key << 'X'; + else if (*(n + 1) == 'H') + key << '0'; + else if (*(n + 1) != 'C' || *(n + 2) != 'H') + key << 'T'; + break; + case 'V': + /* + * F + */ + key << 'F'; + break; + case 'W': + /* + * W after a vowel, else dropped + */ + case 'Y': + /* + * Y unless followed by a vowel + */ + if (vowel(*(n + 1))) + key << *n; + break; + case 'X': + /* + * KS + */ + if (*(n - 1) == '\0') + key << 'S'; + else + key << "KS"; /* Insert K, then S */ + break; + case 'Z': + /* + * S + */ + key << 'S'; + break; + } + } + } +} + + +//***************************************************************************** +// void Metaphone::addWord(char *word) +// +void +Metaphone::addWord(char *word) +{ + if (!dict) + { + dict = new Dictionary; + } + + String key; + generateKey(word, key); + + if (key.length() == 0) + return; + String *s = (String *) dict->Find(key); + if (s) + { + // if (mystrcasestr(s->get(), word) != 0) + (*s) << ' ' << word; + } + else + { + dict->Add(key, new String(word)); + } +} |