summaryrefslogtreecommitdiffstats
path: root/debian/htdig/htdig-3.2.0b6/htfuzzy/Metaphone.cc
diff options
context:
space:
mode:
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htfuzzy/Metaphone.cc')
-rw-r--r--debian/htdig/htdig-3.2.0b6/htfuzzy/Metaphone.cc330
1 files changed, 330 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/Metaphone.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/Metaphone.cc
new file mode 100644
index 00000000..1f066a9d
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/Metaphone.cc
@@ -0,0 +1,330 @@
+//
+// Metaphone.cc
+//
+// Metaphone: A fuzzy matching algorithm used to match words that
+// sound alike in the English language. Probably not so
+// good for foreign languages.
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: Metaphone.cc,v 1.12 2004/05/28 13:15:20 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include <fcntl.h>
+
+#include "Metaphone.h"
+#include "Dictionary.h"
+
+#include <ctype.h>
+
+
+//*****************************************************************************
+// Metaphone::Metaphone(const HtConfiguration& config_arg)
+//
+Metaphone::Metaphone(const HtConfiguration& config_arg)
+    : Fuzzy(config_arg)
+{
+    // Label this fuzzy algorithm; `name` is not declared in this file
+    // (presumably a member inherited from Fuzzy -- confirm in Fuzzy.h).
+    name = "metaphone";
+}
+
+
+//*****************************************************************************
+// Metaphone::~Metaphone()
+//
+Metaphone::~Metaphone()
+{
+    // Intentionally empty: this class performs no cleanup of its own here.
+}
+
+
+//*****************************************************************************
+// void Metaphone::generateKey(char *word, String &key)
+//
+/*
+ * This code was copied from the slapd package developed at the
+ * University of Michigan. It was debugged and cleaned up in February 1999
+ * by Geoffrey Hutchison for the ht://Dig Project.
+ */
+/*
+ * Metaphone copied from C Gazette, June/July 1991, pp 56-57,
+ * author Gary A. Parker, with changes by Bernard Tiffany of the
+ * University of Michigan, and more changes by Tim Howes of the
+ * University of Michigan.
+ */
+
+/*
+ * Character coding array: one byte of bit flags per letter 'A'..'Z',
+ * indexed by (letter - 'A').  The flags are tested by the macros below:
+ *
+ *    1 = vowel   (A E I O U)
+ *    2 = same    (F J L M N R) -- generateKey() copies these to the key as-is
+ *    4 = varson  (C G P S T)
+ *    8 = frontv  (E I Y)
+ *   16 = noghf   (B D H)
+ *
+ * E and I carry both the vowel and the frontv bit (1 + 8 = 9).
+ */
+static char vsvfn[26] = {
+ 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2,
+ /* A   B  C   D  E  F  G   H  I  J  K  L  M */
+ 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0};
+ /* N  O  P  Q  R  S  T  U  V  W  X  Y  Z */
+
+/*
+ * Macros to access character coding array.
+ *
+ * vscode(x) yields the flag byte for an upper-case letter and 0 for any
+ * other character (including '\0'), so the array is never indexed out of
+ * range.  The remaining macros each test one flag bit.
+ *
+ * Caution: every macro evaluates its argument more than once, so the
+ * argument must not have side effects (e.g. do not pass *n++).
+ */
+#define vscode(x) ((x) >= 'A' && (x) <= 'Z' ? vsvfn[(x) - 'A'] : 0)
+#define vowel(x) ((x) != '\0' && vscode(x) & 1) /* AEIOU */
+#define same(x) ((x) != '\0' && vscode(x) & 2) /* FJLMNR */
+#define varson(x) ((x) != '\0' && vscode(x) & 4) /* CGPST */
+#define frontv(x) ((x) != '\0' && vscode(x) & 8) /* EIY */
+#define noghf(x) ((x) != '\0' && vscode(x) & 16) /* BDH */
+
+/*
+ * generateKey() stops processing letters once the key it is building has
+ * reached this many characters.
+ */
+#define MAXPHONEMELEN 6
+
+/*
+ * Compute the Metaphone code of `word` and append it to `key`.
+ *
+ *   word  NUL-terminated input; a null pointer or an empty string leaves
+ *         `key` untouched.  Non-alphabetic characters are ignored.
+ *   key   receives the code.  It is appended to, never cleared, so callers
+ *         normally pass an empty String.
+ *
+ * The length limit (MAXPHONEMELEN) is checked once per input letter, before
+ * that letter is translated; because 'X' can contribute two characters
+ * ("KS"), the appended code may end up one character longer than the limit.
+ *
+ * The working copy of the word is preceded by four NUL bytes so that the
+ * look-behind tests below (down to n - 4) never read before the buffer.
+ * All look-ahead tests are short-circuited so that they never read past the
+ * terminating NUL.
+ */
+void
+Metaphone::generateKey(char *word, String &key)
+{
+    if (!word || !*word)
+        return;
+
+    char *n;
+    String ntrans;
+
+    /*
+     * Copy word to internal buffer, dropping non-alphabetic characters
+     * and converting to upper case.  The four leading characters are
+     * placeholders that are overwritten with NULs below.
+     */
+    ntrans << "0000";
+
+    for (; *word; word++)
+    {
+        /*
+         * The cast matters: handing isalpha() a negative value (any byte
+         * above 0x7F where plain char is signed) is undefined behaviour.
+         */
+        if (isalpha(static_cast<unsigned char>(*word)))
+            ntrans << *word;
+    }
+    ntrans.uppercase();
+
+    /* Turn the four placeholders into the NUL padding */
+    n = ntrans.get();
+    *n++ = 0;
+    *n++ = 0;
+    *n++ = 0;
+    *n = 0;
+    n = ntrans.get() + 4;       /* First real letter (or the terminator) */
+
+    /* Check for PN, KN, GN, AE, WR, WH, and X at start */
+    switch (*n)
+    {
+        case 'P':
+        case 'K':
+        case 'G':
+            /* 'PN', 'KN', 'GN' becomes 'N' */
+            if (*(n + 1) == 'N')
+                *n++ = 0;
+            break;
+        case 'A':
+            /* 'AE' becomes 'E' */
+            if (*(n + 1) == 'E')
+                *n++ = 0;
+            break;
+        case 'W':
+            /* 'WR' becomes 'R', and 'WH' becomes 'W' */
+            if (*(n + 1) == 'R')
+                *n++ = 0;
+            else if (*(n + 1) == 'H')
+            {
+                /* Overwrite the H with the W, then skip the old W */
+                *(n + 1) = *n;
+                *n++ = 0;
+            }
+            break;
+        case 'X':
+            /* 'X' becomes 'S' */
+            *n = 'S';
+            break;
+    }
+
+    /*
+     * Now step through the string, stopping at end of string or when
+     * the computed key is MAXPHONEMELEN characters long
+     */
+    for (; *n && key.length() < MAXPHONEMELEN; n++)
+    {
+        /* Drop duplicates except for CC */
+        if (*(n - 1) == *n && *n != 'C')
+            continue;
+
+        /* F J L M N R, or a vowel that is the first letter, map to themselves */
+        if (same(*n) || (*(n - 1) == '\0' && vowel(*n)))
+        {
+            key << *n;
+            continue;
+        }
+
+        /* Letters with no case below (e.g. non-initial vowels) are dropped */
+        switch (*n)
+        {
+            case 'B':
+                /*
+                 * B unless it is the last letter and follows M (-MB)
+                 */
+                if (*(n + 1) || *(n - 1) != 'M')
+                    key << *n;
+                break;
+            case 'C':
+                /*
+                 * X if in -CIA-, -CH- else S if in
+                 * -CI-, -CE-, -CY- else dropped if
+                 * in -SCI-, -SCE-, -SCY- else K
+                 * (-CH- gives K instead of X after S, or at the start
+                 * of the word when not followed by a vowel)
+                 */
+                if (*(n - 1) != 'S' || !frontv(*(n + 1)))
+                {
+                    if (*(n + 1) == 'I' && *(n + 2) == 'A')
+                        key << 'X';
+                    else if (frontv(*(n + 1)))
+                        key << 'S';
+                    else if (*(n + 1) == 'H')
+                        key << (((*(n - 1) == '\0' && !vowel(*(n + 2)))
+                                 || *(n - 1) == 'S')
+                                ? 'K' : 'X');
+                    else
+                        key << 'K';
+                }
+                break;
+            case 'D':
+                /*
+                 * J if in DGE or DGI or DGY else T
+                 */
+                key << ((*(n + 1) == 'G' && frontv(*(n + 2))) ? 'J' : 'T');
+                break;
+            case 'G':
+                /*
+                 * Dropped if in -GNED, in -DGE-, -DGI-, -DGY-, or if
+                 * followed by a G that is not itself followed by a vowel;
+                 * else J if in -GE-, -GI-, -GY- and not followed by
+                 * <frontv>G; else K.
+                 *
+                 * NOTE(review): the -GH- => 'F' branch below can never be
+                 * taken as written: when the next letter is 'H' all three
+                 * conditions of the first test hold, so 'K' is emitted
+                 * instead.  Likewise a plain -GN (not -GNED) yields K.
+                 * Left untouched on purpose, since changing it would alter
+                 * the keys this function has always produced.
+                 */
+                if ((*(n + 1) != 'G' || vowel(*(n + 2))) &&
+                    (*(n + 1) != 'N' || *(n + 2) != 'E' ||
+                     *(n + 3) != 'D') &&
+                    (*(n - 1) != 'D' || !frontv(*(n + 1))))
+                {
+                    if (frontv(*(n + 1)) && *(n + 2) != 'G')
+                        key << 'J';
+                    else
+                        key << 'K';
+                }
+                else if (*(n + 1) == 'H' && !noghf(*(n - 3)) &&
+                         *(n - 4) != 'H')
+                    key << 'F';
+                break;
+            case 'H':
+                /*
+                 * Dropped after C, G, P, S, T; otherwise H unless it
+                 * follows a vowel without also preceding one
+                 */
+                if (!varson(*(n - 1)) &&
+                    (!vowel(*(n - 1)) || vowel(*(n + 1))))
+                    key << 'H';
+                break;
+            case 'K':
+                /*
+                 * dropped if after C else K
+                 */
+                if (*(n - 1) != 'C')
+                    key << 'K';
+                break;
+            case 'P':
+                /*
+                 * F if before H, else P
+                 */
+                key << (*(n + 1) == 'H' ? 'F' : 'P');
+                break;
+            case 'Q':
+                /*
+                 * K
+                 */
+                key << 'K';
+                break;
+            case 'S':
+                /*
+                 * X in -SH-, -SIO- or -SIA- else S
+                 */
+                key << ((*(n + 1) == 'H' ||
+                         (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
+                                              *(n + 2) == 'A')))
+                        ? 'X' : 'S');
+                break;
+            case 'T':
+                /*
+                 * X in -TIA- or -TIO- else 0 (zero)
+                 * before H else dropped if in -TCH-
+                 * else T
+                 */
+                if (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
+                                        *(n + 2) == 'A'))
+                    key << 'X';
+                else if (*(n + 1) == 'H')
+                    key << '0';
+                else if (*(n + 1) != 'C' || *(n + 2) != 'H')
+                    key << 'T';
+                break;
+            case 'V':
+                /*
+                 * F
+                 */
+                key << 'F';
+                break;
+            case 'W':
+                /* FALLTHROUGH: W is handled exactly like Y */
+            case 'Y':
+                /*
+                 * W or Y is kept only when followed by a vowel,
+                 * else dropped
+                 */
+                if (vowel(*(n + 1)))
+                    key << *n;
+                break;
+            case 'X':
+                /*
+                 * S if nothing precedes it, else KS
+                 */
+                if (*(n - 1) == '\0')
+                    key << 'S';
+                else
+                    key << "KS";        /* Insert K, then S */
+                break;
+            case 'Z':
+                /*
+                 * S
+                 */
+                key << 'S';
+                break;
+        }
+    }
+}
+
+
+//*****************************************************************************
+// void Metaphone::addWord(char *word)
+//
+void
+Metaphone::addWord(char *word)
+{
+    // Create the dictionary on first use.
+    if (!dict)
+        dict = new Dictionary;
+
+    String metaKey;
+    generateKey(word, metaKey);
+
+    // Words that yield no key are not recorded.
+    if (metaKey.length() == 0)
+        return;
+
+    String *wordList = (String *) dict->Find(metaKey);
+    if (!wordList)
+    {
+        // First word seen for this key: start a new entry.
+        dict->Add(metaKey, new String(word));
+        return;
+    }
+
+    // Key already present: append the word, separated by a space.  No check
+    // is made for the word already being in the list (a mystrcasestr()-based
+    // test was left disabled here in the original code).
+    (*wordList) << ' ' << word;
+}