/* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ /* tdespell2 - adopted from enchant * Copyright (C) 2003 Dom Lachowicz * Copyright (C) 2004 Zack Rusin * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301, USA. * * In addition, as a special exception, Dom Lachowicz * gives permission to link the code of this program with * non-LGPL Spelling Provider libraries (eg: a MSFT Office * spell checker backend) and distribute linked combinations including * the two. You must obey the GNU General Public License in all * respects for all of the code used other than said providers. If you modify * this file, you may extend this exception to your version of the * file, but you are not obligated to do so. If you do not wish to * do so, delete this exception statement from your version. */ /* * lookup.c - see if a word appears in the dictionary * * Pace Willisson, 1983 * * Copyright 1987, 1988, 1989, 1992, 1993, Geoff Kuenning, Granada Hills, CA * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All modifications to the source code must be clearly marked as * such. Binary redistributions based on modified source code * must be clearly marked as modified versions in the documentation * and/or other materials provided with the distribution. * 4. All advertising materials mentioning features or use of this software * must display the following acknowledgment: * This product includes software developed by Geoff Kuenning and * other unpaid contributors. * 5. The name of Geoff Kuenning may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Log$ * Revision 1.1 2004/01/31 16:44:12 zrusin * ISpell plugin. * * Revision 1.7 2003/09/25 02:44:48 dom * bug 5813 * * Revision 1.6 2003/08/26 13:20:40 dom * ispell crasher fix, implement enchant_dictionary_release * * Revision 1.5 2003/08/26 13:08:03 uwog * Fix segfault when the requested dictionary couldn't be found. * * Revision 1.4 2003/08/14 16:27:36 dom * update some documentation * * Revision 1.3 2003/07/28 20:40:27 dom * fix up the license clause, further win32-registry proof some directory getting functions * * Revision 1.2 2003/07/16 22:52:47 dom * LGPL + exception license * * Revision 1.1 2003/07/15 01:15:07 dom * ispell enchant backend * * Revision 1.3 2003/01/29 05:50:12 hippietrail * * Fixed my mess in EncodingManager. * Changed many C casts to C++ casts. * * Revision 1.2 2003/01/25 03:16:05 hippietrail * * An UT_ICONV_INVALID fix which escaped the last commit. * * Revision 1.1 2003/01/24 05:52:34 hippietrail * * Refactored ispell code. Old ispell global variables had been put into * an allocated structure, a pointer to which was passed to many functions. * I have now made all such functions and variables private members of the * ISpellChecker class. It was C OO, now it's C++ OO. * * I've fixed the makefiles and tested compilation but am unable to test * operation. Please back out my changes if they cause problems which * are not obvious or easy to fix. * * Revision 1.12 2003/01/06 18:48:39 dom * ispell cleanup, start of using new 'add' save features * * Revision 1.11 2002/09/19 05:31:17 hippietrail * * More Ispell cleanup. Conditional globals and DEREF macros are removed. * K&R function declarations removed, converted to Doxygen style comments * where possible. No code has been changed (I hope). Compiles for me but * unable to test. * * Revision 1.10 2002/09/17 03:03:30 hippietrail * * After seeking permission on the developer list I've reformatted all the * spelling source which seemed to have parts which used 2, 3, 4, and 8 * spaces for tabs. It should all look good with our standard 4-space * tabs now. * I've concentrated just on indentation in the actual code. More prettying * could be done. * * NO code changes were made * * * Revision 1.9 2002/09/13 17:20:13 mpritchett * Fix more warnings for Linux build * * Revision 1.8 2002/05/03 09:49:43 fjfranklin * o hash downloader update (Gabriel Gerhardsson) * - Comment out the "Can't open " printf. * - Make the progressbar more clean at the begining of the download. * - Add support for tarballs that doesn't have the full path included * - Fix copyright headers on the newly added files (*HashDownloader.*) * * Revision 1.7 2001/08/27 19:06:30 dom * Lots of compilation fixes * * Revision 1.6 2001/08/10 18:32:40 dom * Spelling and iconv updates. god, i hate iconv * * Revision 1.5 2001/08/10 09:57:49 hub * Patch by sobomax@FreeBSD.org * #include "iconv.h" directive is missed from src/other/spell/xp/lookup.c and * src/wp/impexp/xp/ie_imp_RTF.cpp. * See bug 1823 * * Revision 1.4 2001/07/18 17:46:01 dom * Module changes, and fix compiler warnings * * Revision 1.3 2001/06/12 21:32:49 dom * More ispell work... * * Revision 1.2 2001/05/12 16:05:42 thomasf * Big pseudo changes to ispell to make it pass around a structure rather * than rely on all sorts of gloabals willy nilly here and there. Also * fixed our spelling class to work with accepting suggestions once more. * This code is dirty, gross and ugly (not to mention still not supporting * multiple hash sized just yet) but it works on my machine and will no * doubt break other machines. * * Revision 1.1 2001/04/15 16:01:24 tomas_f * moving to spell/xp * * Revision 1.7 1999/09/29 23:33:32 justin * Updates to the underlying ispell-based code to support suggested corrections. * * Revision 1.6 1999/04/13 17:12:51 jeff * Applied "Darren O. Benham" spell check changes. * Fixed crash on Win32 with the new code. * * Revision 1.5 1999/01/07 01:07:48 paul * Fixed spell leaks. * * Revision 1.5 1999/01/07 01:07:48 paul * Fixed spell leaks. * * Revision 1.4 1998/12/29 14:55:33 eric * * I've doctored the ispell code pretty extensively here. It is now * warning-free on Win32. It also *works* on Win32 now, since I * replaced all the I/O calls with ANSI standard ones. * * Revision 1.3 1998/12/28 23:11:30 eric * * modified spell code and integration to build on Windows. * This is still a hack. * * Actually, it doesn't yet WORK on Windows. It just builds. * SpellCheckInit is failing for some reason. * * Revision 1.2 1998/12/28 22:16:22 eric * * These changes begin to incorporate the spell checker into AbiWord. Most * of this is a hack. * * 1. added other/spell to the -I list in config/abi_defs * 2. replaced other/spell/Makefile with one which is more like * our build system. * 3. added other/spell to other/Makefile so that the build will now * dive down and build the spell check library. * 4. added the AbiSpell library to the Makefiles in wp/main * 5. added a call to SpellCheckInit in wp/main/unix/UnixMain.cpp. * This call is a HACK and should be replaced with something * proper later. * 6. added code to fv_View.cpp as follows: * whenever you double-click on a word, the spell checker * verifies that word and prints its status to stdout. * * Caveats: * 1. This will break the Windows build. I'm going to work on fixing it * now. * 2. This only works if your dictionary is in /usr/lib/ispell/american.hash. * The dictionary location is currently hard-coded. This will be * fixed as well. * * Anyway, such as it is, it works. * * Revision 1.1 1998/12/28 18:04:43 davet * Spell checker code stripped from ispell. At this point, there are * two external routines... the Init routine, and a check-a-word routine * which returns a boolean value, and takes a 16 bit char string. * The code resembles the ispell code as much as possible still. * * Revision 1.42 1995/01/08 23:23:42 geoff * Support MSDOS_BINARY_OPEN when opening the hash file to read it in. * * Revision 1.41 1994/01/25 07:11:51 geoff * Get rid of all old RCS log lines in preparation for the 3.1 release. * */ #include #include #include #include "ispell_checker.h" #include "msgs.h" #ifdef INDEXDUMP static void dumpindex P ((struct flagptr * indexp, int depth)); #endif /* INDEXDUMP */ int gnMaskBits = 64; /*! * \param hashname name of the hash file (dictionary) * * \return */ int ISpellChecker::linit (char *hashname) { FILE* fpHash; int i; struct dent * dp; struct flagent * entry; struct flagptr * ind; int nextchar, x; int viazero; ichar_t * cp; if ((fpHash = fopen (hashname, "rb")) == NULL) { return (-1); } m_hashsize = fread (reinterpret_cast(&m_hashheader), 1, sizeof m_hashheader, fpHash); if (m_hashsize < static_cast(sizeof(m_hashheader))) { if (m_hashsize < 0) fprintf (stderr, LOOKUP_C_CANT_READ, hashname); else if (m_hashsize == 0) fprintf (stderr, LOOKUP_C_NULL_HASH, hashname); else fprintf (stderr, LOOKUP_C_SHORT_HASH (m_hashname, m_hashsize, static_cast(sizeof m_hashheader))); return (-1); } else if (m_hashheader.magic != MAGIC) { fprintf (stderr, LOOKUP_C_BAD_MAGIC (hashname, static_cast(MAGIC), static_cast(m_hashheader.magic))); return (-1); } else if (m_hashheader.magic2 != MAGIC) { fprintf (stderr, LOOKUP_C_BAD_MAGIC2 (hashname, static_cast(MAGIC), static_cast(m_hashheader.magic2))); return (-1); } /* else if (hashheader.compileoptions != COMPILEOPTIONS*/ else if ( 1 != 1 || m_hashheader.maxstringchars != MAXSTRINGCHARS || m_hashheader.maxstringcharlen != MAXSTRINGCHARLEN) { fprintf (stderr, LOOKUP_C_BAD_OPTIONS (static_cast(m_hashheader.compileoptions), m_hashheader.maxstringchars, m_hashheader.maxstringcharlen, static_cast(COMPILEOPTIONS), MAXSTRINGCHARS, MAXSTRINGCHARLEN)); return (-1); } { m_hashtbl = (struct dent *) calloc (static_cast(m_hashheader.tblsize), sizeof (struct dent)); m_hashsize = m_hashheader.tblsize; m_hashstrings = static_cast(malloc(static_cast(m_hashheader.stringsize))); } m_numsflags = m_hashheader.stblsize; m_numpflags = m_hashheader.ptblsize; m_sflaglist = (struct flagent *) malloc ((m_numsflags + m_numpflags) * sizeof (struct flagent)); if (m_hashtbl == NULL || m_hashstrings == NULL || m_sflaglist == NULL) { fprintf (stderr, LOOKUP_C_NO_HASH_SPACE); return (-1); } m_pflaglist = m_sflaglist + m_numsflags; { if( fread ( m_hashstrings, 1, static_cast(m_hashheader.stringsize), fpHash) != static_cast(m_hashheader.stringsize) ) { fprintf (stderr, LOOKUP_C_BAD_FORMAT); fprintf (stderr, "stringsize err\n" ); return (-1); } if ( m_hashheader.compileoptions & 0x04 ) { if( fread (reinterpret_cast(m_hashtbl), 1, static_cast(m_hashheader.tblsize) * sizeof(struct dent), fpHash) != (static_cast(m_hashheader.tblsize * sizeof (struct dent)))) { fprintf (stderr, LOOKUP_C_BAD_FORMAT); return (-1); } } else { for( x=0; x(m_hashtbl+x), sizeof( struct dent)-sizeof( MASKTYPE ), 1, fpHash) != 1) { fprintf (stderr, LOOKUP_C_BAD_FORMAT); return (-1); } } /*for*/ } /*else*/ } if (fread (reinterpret_cast(m_sflaglist), 1, static_cast(m_numsflags+ m_numpflags) * sizeof (struct flagent), fpHash) != (m_numsflags + m_numpflags) * sizeof (struct flagent)) { fprintf (stderr, LOOKUP_C_BAD_FORMAT); return (-1); } fclose (fpHash); { for (i = m_hashsize, dp = m_hashtbl; --i >= 0; dp++) { if (dp->word == (char *) -1) dp->word = NULL; else dp->word = &m_hashstrings [ reinterpret_cast(dp->word) ]; if (dp->next == (struct dent *) -1) dp->next = NULL; else dp->next = &m_hashtbl [ reinterpret_cast(dp->next) ]; } } for (i = m_numsflags + m_numpflags, entry = m_sflaglist; --i >= 0; entry++) { if (entry->stripl) entry->strip = reinterpret_cast(&m_hashstrings[reinterpret_cast(entry->strip)]); else entry->strip = NULL; if (entry->affl) entry->affix = reinterpret_cast(&m_hashstrings[reinterpret_cast(entry->affix)]); else entry->affix = NULL; } /* ** Warning - 'entry' and 'i' are reset in the body of the loop ** below. Don't try to optimize it by (e.g.) moving the decrement ** of i into the loop condition. */ for (i = m_numsflags, entry = m_sflaglist; i > 0; i--, entry++) { if (entry->affl == 0) { cp = NULL; ind = &m_sflagindex[0]; viazero = 1; } else { cp = entry->affix + entry->affl - 1; ind = &m_sflagindex[*cp]; viazero = 0; while (ind->numents == 0 && ind->pu.fp != NULL) { if (cp == entry->affix) { ind = &ind->pu.fp[0]; viazero = 1; } else { ind = &ind->pu.fp[*--cp]; viazero = 0; } } } if (ind->numents == 0) ind->pu.ent = entry; ind->numents++; /* ** If this index entry has more than MAXSEARCH flags in ** it, we will split it into subentries to reduce the ** searching. However, the split doesn't make sense in ** two cases: (a) if we are already at the end of the ** current affix, or (b) if all the entries in the list ** have identical affixes. Since the list is sorted, (b) ** is true if the first and last affixes in the list ** are identical. */ if (!viazero && ind->numents >= MAXSEARCH && icharcmp (entry->affix, ind->pu.ent->affix) != 0) { /* Sneaky trick: back up and reprocess */ entry = ind->pu.ent - 1; /* -1 is for entry++ in loop */ i = m_numsflags - (entry - m_sflaglist); ind->pu.fp = (struct flagptr *) calloc (static_cast(SET_SIZE + m_hashheader.nstrchars), sizeof (struct flagptr)); if (ind->pu.fp == NULL) { fprintf (stderr, LOOKUP_C_NO_LANG_SPACE); return (-1); } ind->numents = 0; } } /* ** Warning - 'entry' and 'i' are reset in the body of the loop ** below. Don't try to optimize it by (e.g.) moving the decrement ** of i into the loop condition. */ for (i = m_numpflags, entry = m_pflaglist; i > 0; i--, entry++) { if (entry->affl == 0) { cp = NULL; ind = &m_pflagindex[0]; viazero = 1; } else { cp = entry->affix; ind = &m_pflagindex[*cp++]; viazero = 0; while (ind->numents == 0 && ind->pu.fp != NULL) { if (*cp == 0) { ind = &ind->pu.fp[0]; viazero = 1; } else { ind = &ind->pu.fp[*cp++]; viazero = 0; } } } if (ind->numents == 0) ind->pu.ent = entry; ind->numents++; /* ** If this index entry has more than MAXSEARCH flags in ** it, we will split it into subentries to reduce the ** searching. However, the split doesn't make sense in ** two cases: (a) if we are already at the end of the ** current affix, or (b) if all the entries in the list ** have identical affixes. Since the list is sorted, (b) ** is true if the first and last affixes in the list ** are identical. */ if (!viazero && ind->numents >= MAXSEARCH && icharcmp (entry->affix, ind->pu.ent->affix) != 0) { /* Sneaky trick: back up and reprocess */ entry = ind->pu.ent - 1; /* -1 is for entry++ in loop */ i = m_numpflags - (entry - m_pflaglist); ind->pu.fp = static_cast(calloc(SET_SIZE + m_hashheader.nstrchars, sizeof (struct flagptr))); if (ind->pu.fp == NULL) { fprintf (stderr, LOOKUP_C_NO_LANG_SPACE); return (-1); } ind->numents = 0; } } #ifdef INDEXDUMP fprintf (stderr, "Prefix index table:\n"); dumpindex (m_pflagindex, 0); fprintf (stderr, "Suffix index table:\n"); dumpindex (m_sflagindex, 0); #endif if (m_hashheader.nstrchartype == 0) m_chartypes = NULL; else { m_chartypes = (struct strchartype *) malloc (m_hashheader.nstrchartype * sizeof (struct strchartype)); if (m_chartypes == NULL) { fprintf (stderr, LOOKUP_C_NO_LANG_SPACE); return (-1); } for (i = 0, nextchar = m_hashheader.strtypestart; i < m_hashheader.nstrchartype; i++) { m_chartypes[i].name = &m_hashstrings[nextchar]; nextchar += strlen (m_chartypes[i].name) + 1; m_chartypes[i].deformatter = &m_hashstrings[nextchar]; nextchar += strlen (m_chartypes[i].deformatter) + 1; m_chartypes[i].suffixes = &m_hashstrings[nextchar]; while (m_hashstrings[nextchar] != '\0') nextchar += strlen (&m_hashstrings[nextchar]) + 1; nextchar++; } } initckch(NULL); return (0); } #ifndef FREEP #define FREEP(p) do { if (p) free(p); } while (0) #endif /*! * \param wchars Characters in -w option, if any */ void ISpellChecker::initckch (char *wchars) { ichar_t c; char num[4]; for (c = 0; c < static_cast(SET_SIZE+ m_hashheader.nstrchars); ++c) { if (iswordch (c)) { if (!mylower (c)) { m_Try[m_Trynum] = c; ++m_Trynum; } } else if (isboundarych (c)) { m_Try[m_Trynum] = c; ++m_Trynum; } } if (wchars != NULL) { while (m_Trynum < SET_SIZE && *wchars != '\0') { if (*wchars != 'n' && *wchars != '\\') { c = *wchars; ++wchars; } else { ++wchars; num[0] = '\0'; num[1] = '\0'; num[2] = '\0'; num[3] = '\0'; if (isdigit (wchars[0])) { num[0] = wchars[0]; if (isdigit (wchars[1])) { num[1] = wchars[1]; if (isdigit (wchars[2])) num[2] = wchars[2]; } } if (wchars[-1] == 'n') { wchars += strlen (num); c = atoi (num); } else { wchars += strlen (num); c = 0; if (num[0]) c = num[0] - '0'; if (num[1]) { c <<= 3; c += num[1] - '0'; } if (num[2]) { c <<= 3; c += num[2] - '0'; } } } /* c &= NOPARITY;*/ if (!m_hashheader.wordchars[c]) { m_hashheader.wordchars[c] = 1; m_hashheader.sortorder[c] = m_hashheader.sortval++; m_Try[m_Trynum] = c; ++m_Trynum; } } } } /* * \param indexp */ void ISpellChecker::clearindex (struct flagptr *indexp) { int i; for (i = 0; i < SET_SIZE + m_hashheader.nstrchars; i++, indexp++) { if (indexp->numents == 0 && indexp->pu.fp != NULL) { clearindex(indexp->pu.fp); free(indexp->pu.fp); } } } #ifdef INDEXDUMP static void dumpindex (indexp, depth) struct flagptr * indexp; int depth; { int i; int j; int k; char stripbuf[INPUTWORDLEN + 4 * MAXAFFIXLEN + 4]; for (i = 0; i < SET_SIZE + hashheader.nstrchars; i++, indexp++) { if (indexp->numents == 0 && indexp->pu.fp != NULL) { for (j = depth; --j >= 0; ) putc (' ', stderr); if (i >= ' ' && i <= '~') putc (i, stderr); else fprintf (stderr, "0x%x", i); putc ('\n', stderr); dumpindex (indexp->pu.fp, depth + 1); } else if (indexp->numents) { for (j = depth; --j >= 0; ) putc (' ', stderr); if (i >= ' ' && i <= '~') putc (i, stderr); else fprintf (stderr, "0x%x", i); fprintf (stderr, " -> %d entries\n", indexp->numents); for (k = 0; k < indexp->numents; k++) { for (j = depth; --j >= 0; ) putc (' ', stderr); if (indexp->pu.ent[k].stripl) { ichartostr (stripbuf, indexp->pu.ent[k].strip, sizeof stripbuf, 1); fprintf (stderr, " entry %d (-%s,%s)\n", &indexp->pu.ent[k] - sflaglist, stripbuf, indexp->pu.ent[k].affl ? ichartosstr (indexp->pu.ent[k].affix, 1) : "-"); } else fprintf (stderr, " entry %d (%s)\n", &indexp->pu.ent[k] - sflaglist, ichartosstr (indexp->pu.ent[k].affix, 1)); } } } } #endif /* n is length of s */ /* * \param s * \param dotree * * \return */ struct dent * ISpellChecker::ispell_lookup (ichar_t *s, int dotree) { struct dent * dp; char * s1; char schar[INPUTWORDLEN + MAXAFFIXLEN]; dp = &m_hashtbl[hash (s, m_hashsize)]; if (ichartostr (schar, s, sizeof schar, 1)) fprintf (stderr, WORD_TOO_LONG (schar)); for ( ; dp != NULL; dp = dp->next) { /* quick strcmp, but only for equality */ s1 = dp->word; if (s1 && s1[0] == schar[0] && strcmp (s1 + 1, schar + 1) == 0) return dp; #ifndef NO_CAPITALIZATION_SUPPORT while (dp->flagfield & MOREVARIANTS) /* Skip variations */ dp = dp->next; #endif } return NULL; } void ISpellChecker::alloc_ispell_struct() { m_translate_in = 0; } void ISpellChecker::free_ispell_struct() { }