1 files changed, 695 insertions, 0 deletions
diff --git a/src/translators/btparse/string_util.c b/src/translators/btparse/string_util.c
new file mode 100644
index 0000000..3713608
--- /dev/null
+++ b/src/translators/btparse/string_util.c
@@ -0,0 +1,695 @@
+/* ------------------------------------------------------------------------
+@NAME       : string_util.c
+@DESCRIPTION: Various string-processing utility functions:
+                bt_purify_string()
+                bt_change_case()
+
+              and their helpers:
+                foreign_letter()
+                purify_special_char()
+@GLOBALS    : 
+@CALLS      : 
+@CALLERS    : 
+@CREATED    : 1997/10/19, Greg Ward
+@MODIFIED   : 1997/11/25, GPW: renamed from purify.c to string_util.c
+                               added bt_change_case() and friends
+@VERSION    : $Id: string_util.c,v 1.10 1999/10/28 22:50:28 greg Rel $
+-------------------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+#include "error.h"
+#include "btparse.h"
+#include "bt_debug.h"
+
+
+/* 
+ * These definitions should be fixed to be consistent with HTML 
+ * entities, just for fun.  And perhaps I should add entries for
+ * accented letters (at least those supported by TeX and HTML).
+ */
+typedef enum                            /* identifies a foreign letter; the */
+{                                       /* value indexes uc_/lc_version[] */
+   L_OTHER,                             /* not a "foreign" letter */
+   L_OSLASH_L,                          /* Eastern European {\o} */
+   L_OSLASH_U,                          /* {\O} */
+   L_LSLASH_L,                          /* {\l} */
+   L_LSLASH_U,                          /* {\L} */
+   L_OELIG_L,                           /* Latin {\oe} ligature */
+   L_OELIG_U,                           /* {\OE} */
+   L_AELIG_L,                           /* {\ae} ligature */
+   L_AELIG_U,                           /* {\AE} */
+   L_SSHARP_L,                          /* German "sharp s" {\ss} */
+   L_SSHARP_U,                          /* {\SS} */
+   L_ACIRCLE_L,                         /* Nordic {\aa} */
+   L_ACIRCLE_U,                         /* {\AA} */
+   L_INODOT_L,                          /* undotted i: {\i} */
+   L_JNODOT_L                           /* undotted j: {\j} */
+} bt_letter;
+
+
+static const char * uc_version[] =      /* uppercase replacement text, */
+{                                       /* indexed by bt_letter value */
+   NULL,                                /* L_OTHER -- no replacement */
+   "\\O",                               /* L_OSLASH_L */
+   "\\O",                               /* L_OSLASH_U */
+   "\\L",                               /* L_LSLASH_L */
+   "\\L",                               /* L_LSLASH_U */
+   "\\OE",                              /* L_OELIG_L */
+   "\\OE",                              /* L_OELIG_U */
+   "\\AE",                              /* L_AELIG_L */
+   "\\AE",                              /* L_AELIG_U */
+   "SS",                                /* L_SSHARP_L -- for LaTeX 2.09 */
+   "\\SS",                              /* L_SSHARP_U */
+   "\\AA",                              /* L_ACIRCLE_L */
+   "\\AA",                              /* L_ACIRCLE_U */
+   "I",                                 /* L_INODOT_L -- a plain letter */
+   "J"                                  /* L_JNODOT_L -- a plain letter */
+};
+
+static const char * lc_version[] =      /* lowercase replacement text, */
+{                                       /* indexed by bt_letter value */
+   NULL,                                /* L_OTHER -- no replacement */
+   "\\o",                               /* L_OSLASH_L */
+   "\\o",                               /* L_OSLASH_U */
+   "\\l",                               /* L_LSLASH_L */
+   "\\l",                               /* L_LSLASH_U */
+   "\\oe",                              /* L_OELIG_L */
+   "\\oe",                              /* L_OELIG_U */
+   "\\ae",                              /* L_AELIG_L */
+   "\\ae",                              /* L_AELIG_U */
+   "\\ss",                              /* L_SSHARP_L */
+   "\\ss",                              /* L_SSHARP_U */
+   "\\aa",                              /* L_ACIRCLE_L */
+   "\\aa",                              /* L_ACIRCLE_U */
+   "\\i",                               /* L_INODOT_L */
+   "\\j"                                /* L_JNODOT_L */
+};
+
+
+
+/* ------------------------------------------------------------------------
+@NAME       : foreign_letter()
+@INPUT      : str    - string containing the control sequence
+              start  - index of the first character after the backslash
+              stop   - index one past the last character of the sequence
+@OUTPUT     : letter - if non-NULL, receives the bt_letter that was found
+                       (L_OTHER when the sequence is not a foreign letter)
+@RETURNS    : TRUE if the string delimited by start and stop is a foreign
+              letter control sequence, FALSE otherwise
+@DESCRIPTION: Determines if a character sequence is one of (La)TeX's
+              "foreign letter" control sequences (l, o, ae, oe, aa, ss,
+              plus uppercase versions, and the undotted i and j).  Only
+              sequences of exactly one or two characters can match, and
+              the match is case-sensitive: mixed-case sequences such as
+              "Oe" or "aE" are not foreign letters.
+@CALLS      : 
+@CALLERS    : purify_special_char()
+              convert_special_char()
+@CREATED    : 1997/10/19, GPW
+@MODIFIED   : 
+-------------------------------------------------------------------------- */
+static boolean
+foreign_letter (char *str, int start, int stop, bt_letter * letter)
+{
+   char      c1, c2;
+   bt_letter found;
+
+   /* 
+    * This is written for speed, not flexibility -- adding new foreign
+    * letters would be trying and vexatious.
+    * 
+    * N.B. my gold standard list of foreign letters is Kopka and Daly's
+    * *A Guide to LaTeX 2e*, section 2.5.6.
+    */
+
+   found = L_OTHER;                     /* assume not a "foreign" letter */
+
+   switch (stop - start)
+   {
+      case 1:                           /* one-character control sequences */
+         switch (str[start])            /* (\o, \l, \i and \j) */
+         {
+            case 'o': found = L_OSLASH_L; break;
+            case 'O': found = L_OSLASH_U; break;
+            case 'l': found = L_LSLASH_L; break;
+            case 'L': found = L_LSLASH_U; break;
+            case 'i': found = L_INODOT_L; break;
+            case 'j': found = L_JNODOT_L; break;
+            default:  break;
+         }
+         break;
+
+      case 2:                           /* two character control sequences */
+         /*
+          * The second character is only fetched here, where it is known
+          * to be part of the sequence -- a shorter sequence may be the
+          * very last thing in the string.
+          */
+         c1 = str[start+0];
+         c2 = str[start+1];
+
+         /*
+          * Every case ends in a break: an unmatched second character must
+          * not fall through and be tested against the next letter's rules.
+          */
+         switch (c1)                    /* (\oe, \ae, \aa, and \ss) */
+         {
+            case 'o':
+               if (c2 == 'e') found = L_OELIG_L;
+               break;
+            case 'O':
+               if (c2 == 'E') found = L_OELIG_U;
+               break;
+
+            /* BibTeX 0.99 does not handle \aa and \AA -- but I do!*/
+            case 'a':
+               if (c2 == 'e')      found = L_AELIG_L;
+               else if (c2 == 'a') found = L_ACIRCLE_L;
+               break;
+            case 'A':
+               if (c2 == 'E')      found = L_AELIG_U;
+               else if (c2 == 'A') found = L_ACIRCLE_U;
+               break;
+
+            /* uppercase sharp-s -- new with LaTeX 2e (so far all I do
+             * is recognize it as a "foreign" letter)
+             */
+            case 's':
+               if (c2 == 's') found = L_SSHARP_L;
+               break;
+            case 'S':
+               if (c2 == 'S') found = L_SSHARP_U;
+               break;
+
+            default:                    /* any other two-letter sequence */
+               break;                   /* (eg. \it, \em, \bf) */
+         }
+         break;
+
+      default:                          /* wrong length for a foreign letter */
+         break;
+   } /* switch on length of control sequence */
+
+   if (letter != NULL)
+      *letter = found;
+   return (found != L_OTHER) ? TRUE : FALSE;
+} /* foreign_letter */
+
+
+/* ------------------------------------------------------------------------
+@NAME       : purify_special_char()
+@INOUT      : str  - the string being purified (modified in place)
+              *src - in: index of the opening brace; out: index of the
+                     matching closing brace (or of the terminating NUL
+                     if the string ends first)
+              *dst - in/out: index of the next available spot for
+                     copying text to
+@RETURNS    : 
+@DESCRIPTION: "Purifies" a BibTeX special character.  On input, *src should
+              point to the opening brace of a special character (ie. the
+              brace must be at depth 0 of the whole string, and the
+              character immediately following it must be a backslash).
+              The opening brace and backslash are skipped; if the control
+              sequence is one of LaTeX's foreign letter sequences (as
+              determined by foreign_letter()), it is copied to *dst with
+              its second character (if any) downcased, otherwise it is
+              skipped.  In either case, text after the control sequence
+              is either copied (alphabetic characters) or skipped
+              (anything else, including hyphens, ties, and digits).
+@CALLS      : foreign_letter()
+@CALLERS    : bt_purify_string()
+@CREATED    : 1997/10/19, GPW
+@MODIFIED   : 
+-------------------------------------------------------------------------- */
+static void
+purify_special_char (char *str, int * src, int * dst)
+{
+   int    depth;
+   int    peek;
+
+   assert (str[*src] == '{' && str[*src + 1] == '\\');
+   depth = 1;
+
+   *src += 2;                           /* jump to start of control sequence */
+   if (str[*src] == 0)                  /* string ends right after "{\" -- */
+      return;                           /* don't step over the terminator */
+
+   /* <ctype.h> functions need unsigned char values, hence the casts */
+   peek = *src;                         /* scan to end of control sequence */
+   while (isalpha ((unsigned char) str[peek]))
+      peek++;
+   if (peek == *src)                    /* in case of single-char, non-alpha */
+      peek++;                           /* control sequence (eg. {\'e}) */
+
+   if (foreign_letter (str, *src, peek, NULL))
+   {
+      assert (peek - *src == 1 || peek - *src == 2);
+      str[(*dst)++] = str[(*src)++];    /* copy first char */
+      if (*src < peek)                  /* copy second char, downcasing */
+         str[(*dst)++] = (char) tolower ((unsigned char) str[(*src)++]);
+   }
+   else                                 /* not a foreign letter -- skip */
+   {                                    /* the control sequence entirely */
+      *src = peek;
+   }
+
+   while (str[*src])
+   {
+      switch (str[*src])
+      {
+         case '{':
+            depth++;
+            (*src)++;
+            break;
+         case '}':
+            depth--;
+            if (depth == 0) return;     /* done with special char */
+            (*src)++;
+            break;
+         default:                       /* copy alphabetic chars */
+            if (isalpha ((unsigned char) str[*src]))
+               str[(*dst)++] = str[(*src)++];
+            else                        /* skip everything else */
+               (*src)++;
+      }
+   }
+
+   /* 
+    * If we get here, we have unbalanced braces.  No warning, though: BibTeX
+    * doesn't warn either, and the lexer should already have warned.
+    */
+
+} /* purify_special_char() */
+
+
+/* ------------------------------------------------------------------------
+@NAME       : bt_purify_string()
+@INOUT      : string  - purified in place; the result is never longer
+@INPUT      : options - not used by this function
+@OUTPUT     : 
+@RETURNS    : nothing (the purified text is left in string)
+@DESCRIPTION: "Purifies" a BibTeX string.  This consists of copying
+              alphanumeric characters, converting hyphens and ties to
+              space, copying spaces, and skipping everything else.  (Well,
+              almost -- special characters are handled specially, of
+              course.  Basically, accented letters have the control
+              sequence skipped, while foreign letters have the control
+              sequence preserved in a reasonable manner.  See
+              purify_special_char() for details.)
+@CALLS      : purify_special_char()
+@CALLERS    : 
+@CREATED    : 1997/10/19, GPW
+@MODIFIED   : 
+-------------------------------------------------------------------------- */
+void
+bt_purify_string (char * string, ushort options)
+{
+   int    src,                          /* both indices into string */
+          dst;
+   int    depth;                        /* brace depth in string */
+   size_t orig_len;                     /* length before purification */
+
+   /* 
+    * Since purification always copies or deletes chars, the result
+    * is no longer than the input -- so it can be built in place, with
+    * dst never overtaking src.
+    */
+
+   depth = 0;
+   src = 0;
+   dst = 0;
+   orig_len = strlen (string);
+
+   DBG_ACTION (1, printf ("bt_purify_string(): input = %p (%s)\n", 
+                          (void *) string, string));
+
+   while (string[src] != (char) 0)
+   {
+      DBG_ACTION (2, printf ("  next: >%c<: ", string[src]));
+      switch (string[src])
+      {
+         case '~':                      /* "separator" characters -- */
+         case '-':                      /* replaced with space */
+         case ' ':                      /* and copy an actual space */
+            string[dst++] = ' ';
+            src++;
+            DBG_ACTION (2, printf ("replacing with space"));
+            break;
+         case '{':
+            if (depth == 0 && string[src+1] == '\\')
+            {
+               DBG_ACTION (2, printf ("special char found"));
+               purify_special_char (string, &src, &dst);
+            }                           /* src is left on the closing '}' */
+            else
+            {
+               DBG_ACTION (2, printf ("ordinary open brace"));
+               src++;
+            }
+            depth++;
+            break;
+         case '}':
+            DBG_ACTION (2, printf ("close brace"));
+            depth--;
+            src++;
+            break;
+         default:                       /* cast: ctype needs unsigned char */
+            if (isalnum ((unsigned char) string[src])) /* alphanumeric -- */
+            {
+               DBG_ACTION (2, printf ("alphanumeric -- copying"));
+               string[dst++] = string[src++]; /* copy it */
+            }
+            else                        /* anything else -- skip it */
+            {
+               DBG_ACTION (2, printf ("non-separator, non-brace, non-alpha"));
+               src++;
+            }
+      } /* switch string[src] */
+
+      DBG_ACTION (2, printf ("\n"));
+
+   } /* while string[src] */
+
+   DBG_ACTION (1, printf ("bt_purify_string(): depth on exit: %d\n", depth));
+
+   string[dst] = (char) 0;
+   assert (strlen (string) <= orig_len);
+} /* bt_purify_string() */
+
+
+/* ======================================================================
+ * Case-transformation stuff
+ */
+
+
+/* ------------------------------------------------------------------------
+@NAME       : convert_special_char()
+@INPUT      : transform - 'u' (upper), 'l' (lower) or 't' (title caps)
+@INOUT      : string    - converted in place
+              src, dst  - read and write indices into string; on return
+                          *src is past the closing brace (or at the NUL)
+              start_sentence, after_colon - title-caps state, cleared
+                          when a foreign letter is uppercased because of it
+@RETURNS    : 
+@DESCRIPTION: Does case conversion on a special character: foreign letters
+              become their uc_version/lc_version text, other control
+              sequences are copied as is, other text is upper/lowercased.
+@GLOBALS    : uc_version, lc_version
+@CALLS      : foreign_letter()
+@CALLERS    : bt_change_case()
+@CREATED    : 1997/11/25, GPW
+@MODIFIED   : 
+-------------------------------------------------------------------------- */
+static void
+convert_special_char (char transform, 
+                      char * string,
+                      int * src,
+                      int * dst, 
+                      boolean * start_sentence,
+                      boolean * after_colon)
+{
+   int       depth;
+   boolean   done_special;
+   int       cs_end;
+   int       cs_len;                    /* counting the backslash */
+   bt_letter letter;
+   const char *    repl;
+   int       repl_len;
+
+#ifndef ALLOW_WARNINGS
+   repl = NULL;                         /* silence "might be used" */
+                                        /* uninitialized" warning */
+#endif
+
+   /* First, copy just the opening brace */
+   string[(*dst)++] = string[(*src)++];
+
+   /* 
+    * Now loop over characters inside the braces -- stop when we reach
+    * the matching close brace, or when the string ends.
+    */
+   depth = 1;                           /* because we're in a special char */
+   done_special = FALSE;
+
+   while (string[*src] != 0 && !done_special)
+   {
+      switch (string[*src])
+      {
+         case '\\':                     /* a control sequence */
+         {
+            cs_end = *src+1;            /* scan over chars of c.s. */
+            while (isalpha ((unsigned char) string[cs_end])) 
+               cs_end++;
+
+            /* 
+             * Now *src points to the backslash and cs_end to the character
+             * just past the control sequence, so foreign_letter() examines
+             * [*src+1..cs_end-1], and the sequence (backslash included) is
+             * cs_end - *src characters long.
+             */
+
+            cs_len = cs_end - *src;     /* length of cs, counting backslash */
+            /* cs_len == 1: no letters follow the backslash, nothing to look up */
+            if (cs_len > 1 && foreign_letter (string, *src+1, cs_end, &letter))
+            {
+               if (letter == L_OTHER)
+                  internal_error ("impossible foreign letter");
+
+               switch (transform)
+               {
+                  case 'u':
+                     repl = uc_version[(int) letter];
+                     break;
+                  case 'l':
+                     repl = lc_version[(int) letter];
+                     break;
+                  case 't':
+                     if (*start_sentence || *after_colon)
+                     {
+                        repl = uc_version[(int) letter];
+                        *start_sentence = *after_colon = FALSE;
+                     }
+                     else
+                     {
+                        repl = lc_version[(int) letter];
+                     }
+                     break;
+                  default:
+                     internal_error ("impossible case transform \"%c\"",
+                                     transform);
+               }
+
+               repl_len = strlen (repl);
+               if (repl_len > cs_len)
+                  internal_error
+                     ("replacement text longer than original cs");
+
+               memcpy (string + *dst, repl, (size_t) repl_len);
+               *src = cs_end;
+               *dst += repl_len;
+            } /* control sequence is a foreign letter */
+            else
+            {
+               /* not a foreign letter -- just copy the control seq. as is */
+               /* (memmove: *dst may trail *src by less than cs_len, so */
+               /* the source and destination ranges can overlap) */
+               memmove (string + *dst, string + *src, (size_t) cs_len);
+               *src += cs_len;
+               assert (*src == cs_end);
+               *dst += cs_len;
+            } /* control sequence not a foreign letter */
+
+            break;
+         } /* case: '\\' */
+
+         case '{':
+         {
+            string[(*dst)++] = string[(*src)++];
+            depth++;
+            break;
+         }
+
+         case '}':
+         {
+            string[(*dst)++] = string[(*src)++];
+            depth--;
+            if (depth == 0)
+               done_special = TRUE;
+            break;
+         }
+
+         default:                       /* any other character */
+         {
+            switch (transform)
+            {
+               /* 
+                * Inside special chars, lowercase and title caps are same.
+                * (At least, that's bibtex's convention.  I might change this
+                * at some point to be a bit smarter.)
+                */
+               case 'l':
+               case 't':
+                  string[(*dst)++] = (char) tolower ((unsigned char) string[(*src)++]);
+                  break;
+               case 'u':
+                  string[(*dst)++] = (char) toupper ((unsigned char) string[(*src)++]);
+                  break;
+               default:
+                  internal_error ("impossible case transform \"%c\"",
+                                  transform);
+            }
+         } /* default char */
+
+      } /* switch: current char */
+
+   } /* while: string or special char not done */
+
+} /* convert_special_char() */
+
+
+/* ------------------------------------------------------------------------
+@NAME       : bt_change_case()
+@INPUT      : transform - 'u' (uppercase), 'l' (lowercase), 't' (title caps)
+              options   - not used by this function
+@INOUT      : string    - converted in place, and NUL-terminated again
+@RETURNS    : 
+@DESCRIPTION: Converts a string (in-place) to either uppercase, lowercase,
+              or "title capitalization".  Text inside braces is copied
+              unchanged, except for special characters ({\...} at depth 0).
+@GLOBALS    : 
+@CALLS      : convert_special_char()
+@CALLERS    : 
+@CREATED    : 1997/11/25, GPW
+@MODIFIED   : 
+-------------------------------------------------------------------------- */
+void
+bt_change_case (char   transform,
+                char * string,
+                ushort options)
+{
+   int    depth;
+   int    src, dst;                     /* indices into string */
+   boolean start_sentence;
+   boolean after_colon;
+
+   src = dst = 0;
+   depth = 0;
+
+   start_sentence = TRUE;
+   after_colon = FALSE;
+
+   while (string[src] != 0)
+   {
+      switch (string[src])
+      {
+         case '{': 
+            /* 
+             * At start of special character?  The entire special char.
+             * will be handled here, as follows:
+             *   - text at any brace-depth within the s.c. is case-mangled;
+             *     punctuation (sentence endings, colons) are ignored
+             *   - control sequences are left alone, unless they are
+             *     one of the "foreign letter" control sequences, in
+             *     which case they're converted to the appropriate string
+             *     according to the uc_version or lc_version tables.
+             */
+            if (depth == 0 && string[src+1] == '\\')
+            {
+               convert_special_char (transform, string, &src, &dst, 
+                                     &start_sentence, &after_colon);
+            }
+
+            /*
+             * Otherwise, it's just something in braces.  This is probably
+             * a proper noun or something encased in braces to protect it
+             * from case-mangling, so we do not case-mangle it.  However,
+             * we *do* switch out of start_sentence or after_colon mode if
+             * we happen to be there (otherwise we'll do the wrong thing
+             * once we're out of the braces).
+             */
+            else
+            {
+               string[dst++] = string[src++];
+               start_sentence = after_colon = FALSE;
+               depth++;
+            }
+            break;
+
+         case '}':
+            string[dst++] = string[src++];
+            depth--;
+            break;
+
+         /*
+          * Sentence-ending punctuation and colons are handled separately
+          * to allow for exact mimicking of BibTeX's behaviour.  I happen
+          * to think that this behaviour (capitalize first word of sentences
+          * in a title) is better than BibTeX's, but I want to keep my
+          * options open for a future goal of perfect compatibility.
+          */
+         case '.':
+         case '?':
+         case '!':
+            start_sentence = TRUE;
+            string[dst++] = string[src++];
+            break;
+
+         case ':':
+            after_colon = TRUE;
+            string[dst++] = string[src++];
+            break;
+
+         default:                       /* casts: ctype needs unsigned char */
+            if (isspace ((unsigned char) string[src]))
+            {
+               string[dst++] = string[src++];
+            }
+            else
+            {
+               if (depth == 0)
+               {
+                  switch (transform)
+                  {
+                     case 'u':
+                        string[dst++] = (char) toupper ((unsigned char) string[src++]);
+                        break;
+                     case 'l':
+                        string[dst++] = (char) tolower ((unsigned char) string[src++]);
+                        break;
+                     case 't':
+                        if (start_sentence || after_colon)
+                        {
+                           /* 
+                            * XXX BibTeX only preserves case of character
+                            * immediately after a colon; I do two things
+                            * differently: first, I pay attention to sentence
+                            * punctuation, and second I force uppercase
+                            * at start of sentence or after a colon.
+                            */
+                           string[dst++] = (char) toupper ((unsigned char) string[src++]);
+                           start_sentence = after_colon = FALSE;
+                        }
+                        else
+                        {
+                           string[dst++] = (char) tolower ((unsigned char) string[src++]);
+                        }
+                        break;
+                     default:
+                        internal_error ("impossible case transform \"%c\"",
+                                        transform);
+                  }
+               } /* depth == 0 */
+               else
+               {
+                  string[dst++] = string[src++];
+               }
+            } /* not blank */
+      } /* switch on current character */
+   } /* while not at end of string */
+
+   string[dst] = 0;                     /* dst can trail src (a foreign */
+                                        /* letter may shrink): cut here */
+} /* bt_change_case */