summaryrefslogtreecommitdiffstats
path: root/src/translators/btparse/string_util.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/translators/btparse/string_util.c')
-rw-r--r--src/translators/btparse/string_util.c695
1 files changed, 695 insertions, 0 deletions
diff --git a/src/translators/btparse/string_util.c b/src/translators/btparse/string_util.c
new file mode 100644
index 0000000..3713608
--- /dev/null
+++ b/src/translators/btparse/string_util.c
@@ -0,0 +1,695 @@
+/* ------------------------------------------------------------------------
+@NAME : string_util.c
+@DESCRIPTION: Various string-processing utility functions:
+ bt_purify_string()
+ bt_change_case()
+
+ and their helpers:
+ foreign_letter()
+ purify_special_char()
+@GLOBALS :
+@CALLS :
+@CALLERS :
+@CREATED : 1997/10/19, Greg Ward
+@MODIFIED : 1997/11/25, GPW: renamed to from purify.c to string_util.c
+ added bt_change_case() and friends
+@VERSION : $Id: string_util.c,v 1.10 1999/10/28 22:50:28 greg Rel $
+-------------------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+#include "error.h"
+#include "btparse.h"
+#include "bt_debug.h"
+
+
+/*
+ * These definitions should be fixed to be consistent with HTML
+ * entities, just for fun. And perhaps I should add entries for
+ * accented letters (at least those supported by TeX and HTML).
+ */
+typedef enum
+{
+ L_OTHER, /* not a "foreign" letter */
+ L_OSLASH_L, /* Eastern European {\o} */
+ L_OSLASH_U,
+ L_LSLASH_L, /* {\l} */
+ L_LSLASH_U,
+ L_OELIG_L, /* Latin {\oe} ligature */
+ L_OELIG_U,
+ L_AELIG_L, /* {\ae} ligature */
+ L_AELIG_U,
+ L_SSHARP_L, /* German "sharp s" {\ss} */
+ L_SSHARP_U,
+ L_ACIRCLE_L, /* Nordic {\aa} */
+ L_ACIRCLE_U,
+ L_INODOT_L, /* undotted i: {\i} */
+ L_JNODOT_L /* {\j} */
+} bt_letter;
+
+
+static const char * uc_version[] =
+{
+ NULL, /* L_OTHER */
+ "\\O", /* L_OSLASH_L */
+ "\\O", /* L_OSLASH_U */
+ "\\L", /* L_LSLASH_L */
+ "\\L", /* L_LSLASH_U */
+ "\\OE", /* L_OELIG_L */
+ "\\OE", /* L_OELIG_U */
+ "\\AE", /* L_AELIG_L */
+ "\\AE", /* L_AELIG_U */
+ "SS", /* L_SSHARP_L -- for LaTeX 2.09 */
+ "\\SS", /* L_SSHARP_U */
+ "\\AA", /* L_ACIRCLE_L */
+ "\\AA", /* L_ACIRCLE_U */
+ "I", /* L_INODOT_L */
+ "J" /* L_JNODOT_L */
+};
+
+static const char * lc_version[] =
+{
+ NULL, /* L_OTHER */
+ "\\o", /* L_OSLASH_L */
+ "\\o", /* L_OSLASH_U */
+ "\\l", /* L_LSLASH_L */
+ "\\l", /* L_LSLASH_U */
+ "\\oe", /* L_OELIG_L */
+ "\\oe", /* L_OELIG_U */
+ "\\ae", /* L_AELIG_L */
+ "\\ae", /* L_AELIG_U */
+ "\\ss", /* L_SSHARP_L */
+ "\\ss", /* L_SSHARP_U */
+ "\\aa", /* L_ACIRCLE_L */
+ "\\aa", /* L_ACIRCLE_U */
+ "\\i", /* L_INODOT_L */
+ "\\j" /* L_JNODOT_L */
+};
+
+
+
+/* ------------------------------------------------------------------------
+@NAME : foreign_letter()
+@INPUT : str
+ start
+ stop
+@OUTPUT : letter
+@RETURNS : TRUE if the string delimited by start and stop is a foreign
+ letter control sequence
+@DESCRIPTION: Determines if a character sequence is one of (La)TeX's
+ "foreign letter" control sequences (l, o, ae, oe, aa, ss, plus
+ uppercase versions). If `letter' is non-NULL, returns which
+ letter was found in it (as a bt_letter value).
+@CALLS :
+@CALLERS : purify_special_char()
+@CREATED : 1997/10/19, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+static boolean
+foreign_letter (char *str, int start, int stop, bt_letter * letter)
+{
+ char c1, c2;
+ bt_letter dummy;
+
+
+ /*
+ * This is written for speed, not flexibility -- adding new foreign
+ * letters would be trying and vexatious.
+ *
+ * N.B. my gold standard list of foreign letters is Kopka and Daly's
+ * *A Guide to LaTeX 2e*, section 2.5.6.
+ */
+
+ if (letter == NULL) /* so we can assign to *letter */
+ letter = &dummy; /* without compunctions */
+ *letter = L_OTHER; /* assume not a "foreign" letter */
+
+ c1 = str[start+0]; /* only two characters that we're */
+ c2 = str[start+1]; /* interested in */
+
+ switch (stop - start)
+ {
+ case 1: /* one-character control sequences */
+ switch (c1) /* (\o and \l) */
+ {
+ case 'o':
+ *letter = L_OSLASH_L; return TRUE;
+ case 'O':
+ *letter = L_OSLASH_U; return TRUE;
+ case 'l':
+ *letter = L_LSLASH_L; return TRUE;
+ case 'L':
+ *letter = L_LSLASH_L; return TRUE;
+ case 'i':
+ *letter = L_INODOT_L; return TRUE;
+ case 'j':
+ *letter = L_JNODOT_L; return TRUE;
+ default:
+ return FALSE;
+ }
+ break;
+ case 2: /* two character control sequences */
+ switch (c1) /* (\oe, \ae, \aa, and \ss) */
+ {
+ case 'o':
+ if (c2 == 'e') { *letter = L_OELIG_L; return TRUE; }
+ case 'O':
+ if (c2 == 'E') { *letter = L_OELIG_U; return TRUE; }
+
+ /* BibTeX 0.99 does not handle \aa and \AA -- but I do!*/
+ case 'a':
+ if (c2 == 'e')
+ { *letter = L_AELIG_L; return TRUE; }
+ else if (c2 == 'a')
+ { *letter = L_ACIRCLE_L; return TRUE; }
+ else
+ return FALSE;
+ case 'A':
+ if (c2 == 'E')
+ { *letter = L_AELIG_U; return TRUE; }
+ else if (c2 == 'A')
+ { *letter = L_ACIRCLE_U; return TRUE; }
+ else
+ return FALSE;
+
+ /* uppercase sharp-s -- new with LaTeX 2e (so far all I do
+ * is recognize it as a "foreign" letter)
+ */
+ case 's':
+ if (c2 == 's')
+ { *letter = L_SSHARP_L; return TRUE; }
+ else
+ return FALSE;
+ case 'S':
+ if (c2 == 'S')
+ { *letter = L_SSHARP_U; return TRUE; }
+ else
+ return FALSE;
+ }
+ break;
+ default:
+ return FALSE;
+ } /* switch on length of control sequence */
+
+ internal_error ("foreign_letter(): should never reach end of function");
+ return FALSE; /* to keep gcc -Wall happy */
+
+} /* foreign_letter */
+
+
+/* ------------------------------------------------------------------------
+@NAME : purify_special_char()
+@INPUT : *src, *dst - pointers into the input and output strings
+@OUTPUT : *src - updated to point to the closing brace of the
+ special char
+ *dst - updated to point to the next available spot
+ for copying text to
+@RETURNS :
+@DESCRIPTION: "Purifies" a BibTeX special character. On input, *src should
+ point to the opening brace of a special character (ie. the
+ brace must be at depth 0 of the whole string, and the
+ character immediately following it must be a backslash).
+ *dst should point to the next spot to copy into the output
+ (purified) string. purify_special_char() will skip over the
+ opening brace and backslash; if the control sequence is one
+ of LaTeX's foreign letter sequences (as determined by
+ foreign_letter()), then it is simply copied to *dst.
+ Otherwise the control sequence is skipped. In either case,
+ text after the control sequence is either copied (alphabetic
+ characters) or skipped (anything else, including hyphens,
+ ties, and digits).
+@CALLS : foreign_letter()
+@CALLERS : bt_purify_string()
+@CREATED : 1997/10/19, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+static void
+purify_special_char (char *str, int * src, int * dst)
+{
+ int depth;
+ int peek;
+
+ assert (str[*src] == '{' && str[*src + 1] == '\\');
+ depth = 1;
+
+ *src += 2; /* jump to start of control sequence */
+ peek = *src; /* scan to end of control sequence */
+ while (isalpha (str[peek]))
+ peek++;
+ if (peek == *src) /* in case of single-char, non-alpha */
+ peek++; /* control sequence (eg. {\'e}) */
+
+ if (foreign_letter (str, *src, peek, NULL))
+ {
+ assert (peek - *src == 1 || peek - *src == 2);
+ str[(*dst)++] = str[(*src)++]; /* copy first char */
+ if (*src < peek) /* copy second char, downcasing */
+ str[(*dst)++] = tolower (str[(*src)++]);
+ }
+ else /* not a foreign letter -- skip */
+ { /* the control sequence entirely */
+ *src = peek;
+ }
+
+ while (str[*src])
+ {
+ switch (str[*src])
+ {
+ case '{':
+ depth++;
+ (*src)++;
+ break;
+ case '}':
+ depth--;
+ if (depth == 0) return; /* done with special char */
+ (*src)++;
+ break;
+ default:
+ if (isalpha (str[*src])) /* copy alphabetic chars */
+ str[(*dst)++] = str[(*src)++];
+ else /* skip everything else */
+ (*src)++;
+ }
+ }
+
+ /*
+ * If we get here, we have unbalanced braces -- the '}' case should
+ * always hit a depth == 0 point if braces are balanced. No warning,
+ * though, because a) BibTeX doesn't warn about purifying unbalanced
+ * strings, and b) we (should have) already warned about it in the
+ * lexer.
+ */
+
+} /* purify_special_char() */
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_purify_string()
+@INOUT : instr
+@INPUT : options
+@OUTPUT :
+@RETURNS : instr - same as input string, but modified in place
+@DESCRIPTION: "Purifies" a BibTeX string. This consists of copying
+ alphanumeric characters, converting hyphens and ties to
+ space, copying spaces, and skipping everything else. (Well,
+ almost -- special characters are handled specially, of
+ course. Basically, accented letters have the control
+ sequence skipped, while foreign letters have the control
+ sequence preserved in a reasonable manner. See
+ purify_special_char() for details.)
+@CALLS : purify_special_char()
+@CALLERS :
+@CREATED : 1997/10/19, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+void
+bt_purify_string (char * string, ushort options)
+{
+ int src, /* both indeces into string */
+ dst;
+ int depth; /* brace depth in string */
+ unsigned orig_len;
+
+ /*
+ * Since purification always copies or deletes chars, outstr will
+ * be no longer than string -- so nothing fancy is required to put
+ * an upper bound on its eventual size.
+ */
+
+ depth = 0;
+ src = 0;
+ dst = 0;
+ orig_len = strlen (string);
+
+ DBG_ACTION (1, printf ("bt_purify_string(): input = %p (%s)\n",
+ string, string));
+
+ while (string[src] != (char) 0)
+ {
+ DBG_ACTION (2, printf (" next: >%c<: ", string[src]));
+ switch (string[src])
+ {
+ case '~': /* "separator" characters -- */
+ case '-': /* replaced with space */
+ case ' ': /* and copy an actual space */
+ string[dst++] = ' ';
+ src++;
+ DBG_ACTION (2, printf ("replacing with space"));
+ break;
+ case '{':
+ if (depth == 0 && string[src+1] == '\\')
+ {
+ DBG_ACTION (2, printf ("special char found"));
+ purify_special_char (string, &src, &dst);
+ }
+ else
+ {
+ DBG_ACTION (2, printf ("ordinary open brace"));
+ src++;
+ }
+ depth++;
+ break;
+ case '}':
+ DBG_ACTION (2, printf ("close brace"));
+ depth--;
+ src++;
+ break;
+ default:
+ if (isalnum (string[src])) /* any alphanumeric char -- */
+ {
+ DBG_ACTION (2, printf ("alphanumeric -- copying"));
+ string[dst++] = string[src++]; /* copy it */
+ }
+ else /* anything else -- skip it */
+ {
+ DBG_ACTION (2, printf ("non-separator, non-brace, non-alpha"));
+ src++;
+ }
+ } /* switch string[src] */
+
+ DBG_ACTION (2, printf ("\n"));
+
+ } /* while string[src] */
+
+ DBG_ACTION (1, printf ("bt_purify_string(): depth on exit: %d\n", depth));
+
+ string[dst] = (char) 0;
+ assert (strlen (string) <= orig_len);
+} /* bt_purify_string() */
+
+
+/* ======================================================================
+ * Case-transformation stuff
+ */
+
+
+/* ------------------------------------------------------------------------
+@NAME : convert_special_char()
+@INPUT : transform
+@INOUT : string
+ src
+ dst
+ start_sentence
+ after_colon
+@RETURNS :
+@DESCRIPTION: Does case conversion on a special character.
+@GLOBALS :
+@CALLS :
+@CALLERS :
+@CREATED : 1997/11/25, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+static void
+convert_special_char (char transform,
+ char * string,
+ int * src,
+ int * dst,
+ boolean * start_sentence,
+ boolean * after_colon)
+{
+ int depth;
+ boolean done_special;
+ int cs_end;
+ int cs_len; /* counting the backslash */
+ bt_letter letter;
+ const char * repl;
+ int repl_len;
+
+#ifndef ALLOW_WARNINGS
+ repl = NULL; /* silence "might be used" */
+ /* uninitialized" warning */
+#endif
+
+ /* First, copy just the opening brace */
+ string[(*dst)++] = string[(*src)++];
+
+ /*
+ * Now loop over characters inside the braces -- stop when we reach
+ * the matching close brace, or when the string ends.
+ */
+ depth = 1; /* because we're in a special char */
+ done_special = FALSE;
+
+ while (string[*src] != 0 && !done_special)
+ {
+ switch (string[*src])
+ {
+ case '\\': /* a control sequence */
+ {
+ cs_end = *src+1; /* scan over chars of c.s. */
+ while (isalpha (string[cs_end]))
+ cs_end++;
+
+ /*
+ * OK, now *src points to the backslash (so src+*1 points to
+ * first char. of control sequence), and cs_end points to
+ * character immediately following end of control sequence.
+ * Thus we analyze [*src+1..cs_end] to determine if the control
+ * sequence is a foreign letter, and use (cs_end - (*src+1) + 1)
+ * = (cs_end - *src) as the length of the control sequence.
+ */
+
+ cs_len = cs_end - *src; /* length of cs, counting backslash */
+
+ if (foreign_letter (string, *src+1, cs_end, &letter))
+ {
+ if (letter == L_OTHER)
+ internal_error ("impossible foreign letter");
+
+ switch (transform)
+ {
+ case 'u':
+ repl = uc_version[(int) letter];
+ break;
+ case 'l':
+ repl = lc_version[(int) letter];
+ break;
+ case 't':
+ if (*start_sentence || *after_colon)
+ {
+ repl = uc_version[(int) letter];
+ *start_sentence = *after_colon = FALSE;
+ }
+ else
+ {
+ repl = lc_version[(int) letter];
+ }
+ break;
+ default:
+ internal_error ("impossible case transform \"%c\"",
+ transform);
+ }
+
+ repl_len = strlen (repl);
+ if (repl_len > cs_len)
+ internal_error
+ ("replacement text longer than original cs");
+
+ strncpy (string + *dst, repl, repl_len);
+ *src = cs_end;
+ *dst += repl_len;
+ } /* control sequence is a foreign letter */
+ else
+ {
+ /* not a foreign letter -- just copy the control seq. as is */
+
+
+ strncpy (string + *dst, string + *src, cs_end - *src);
+ *src += cs_len;
+ assert (*src == cs_end);
+ *dst += cs_len;
+ } /* control sequence not a foreign letter */
+
+ break;
+ } /* case: '\\' */
+
+ case '{':
+ {
+ string[(*dst)++] = string[(*src)++];
+ depth++;
+ break;
+ }
+
+ case '}':
+ {
+ string[(*dst)++] = string[(*src)++];
+ depth--;
+ if (depth == 0)
+ done_special = TRUE;
+ break;
+ }
+
+ default: /* any other character */
+ {
+ switch (transform)
+ {
+ /*
+ * Inside special chars, lowercase and title caps are same.
+ * (At least, that's bibtex's convention. I might change this
+ * at some point to be a bit smarter.)
+ */
+ case 'l':
+ case 't':
+ string[(*dst)++] = tolower (string[(*src)++]);
+ break;
+ case 'u':
+ string[(*dst)++] = toupper (string[(*src)++]);
+ break;
+ default:
+ internal_error ("impossible case transform \"%c\"",
+ transform);
+ }
+ } /* default char */
+
+ } /* switch: current char */
+
+ } /* while: string or special char not done */
+
+} /* convert_special_char() */
+
+
+/* ------------------------------------------------------------------------
+@NAME : bt_change_case()
+@INPUT :
+@OUTPUT :
+@RETURNS :
+@DESCRIPTION: Converts a string (in-place) to either uppercase, lowercase,
+ or "title capitalization">
+@GLOBALS :
+@CALLS :
+@CALLERS :
+@CREATED : 1997/11/25, GPW
+@MODIFIED :
+-------------------------------------------------------------------------- */
+void
+bt_change_case (char transform,
+ char * string,
+ ushort options)
+{
+ int len;
+ int depth;
+ int src, dst; /* indeces into string */
+ boolean start_sentence;
+ boolean after_colon;
+
+ src = dst = 0;
+ len = strlen (string);
+ depth = 0;
+
+ start_sentence = TRUE;
+ after_colon = FALSE;
+
+ while (string[src] != 0)
+ {
+ switch (string[src])
+ {
+ case '{':
+
+ /*
+ * At start of special character? The entire special char.
+ * will be handled here, as follows:
+ * - text at any brace-depth within the s.c. is case-mangled;
+ * punctuation (sentence endings, colons) are ignored
+ * - control sequences are left alone, unless they are
+ * one of the "foreign letter" control sequences, in
+ * which case they're converted to the appropriate string
+ * according to the uc_version or lc_version tables.
+ */
+ if (depth == 0 && string[src+1] == '\\')
+ {
+ convert_special_char (transform, string, &src, &dst,
+ &start_sentence, &after_colon);
+ }
+
+ /*
+ * Otherwise, it's just something in braces. This is probably
+ * a proper noun or something encased in braces to protect it
+ * from case-mangling, so we do not case-mangle it. However,
+ * we *do* switch out of start_sentence or after_colon mode if
+ * we happen to be there (otherwise we'll do the wrong thing
+ * once we're out of the braces).
+ */
+ else
+ {
+ string[dst++] = string[src++];
+ start_sentence = after_colon = FALSE;
+ depth++;
+ }
+ break;
+
+ case '}':
+ string[dst++] = string[src++];
+ depth--;
+ break;
+
+ /*
+ * Sentence-ending punctuation and colons are handled separately
+ * to allow for exact mimicing of BibTeX's behaviour. I happen
+ * to think that this behaviour (capitalize first word of sentences
+ * in a title) is better than BibTeX's, but I want to keep my
+ * options open for a future goal of perfect compatability.
+ */
+ case '.':
+ case '?':
+ case '!':
+ start_sentence = TRUE;
+ string[dst++] = string[src++];
+ break;
+
+ case ':':
+ after_colon = TRUE;
+ string[dst++] = string[src++];
+ break;
+
+ default:
+ if (isspace (string[src]))
+ {
+ string[dst++] = string[src++];
+ }
+ else
+ {
+ if (depth == 0)
+ {
+ switch (transform)
+ {
+ case 'u':
+ string[dst++] = toupper (string[src++]);
+ break;
+ case 'l':
+ string[dst++] = tolower (string[src++]);
+ break;
+ case 't':
+ if (start_sentence || after_colon)
+ {
+ /*
+ * XXX BibTeX only preserves case of character
+ * immediately after a colon; I do two things
+ * differently: first, I pay attention to sentence
+ * punctuation, and second I force uppercase
+ * at start of sentence or after a colon.
+ */
+ string[dst++] = toupper (string[src++]);
+ start_sentence = after_colon = FALSE;
+ }
+ else
+ {
+ string[dst++] = tolower (string[src++]);
+ }
+ break;
+ default:
+ internal_error ("impossible case transform \"%c\"",
+ transform);
+ }
+ } /* depth == 0 */
+ else
+ {
+ string[dst++] = string[src++];
+ }
+ } /* not blank */
+ } /* switch on current character */
+
+ } /* while not at end of string */
+
+} /* bt_change_case */