summaryrefslogtreecommitdiffstats
path: root/kpdf/xpdf/xpdf/Lexer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'kpdf/xpdf/xpdf/Lexer.cpp')
-rw-r--r--kpdf/xpdf/xpdf/Lexer.cpp521
1 files changed, 521 insertions, 0 deletions
diff --git a/kpdf/xpdf/xpdf/Lexer.cpp b/kpdf/xpdf/xpdf/Lexer.cpp
new file mode 100644
index 00000000..04eadef2
--- /dev/null
+++ b/kpdf/xpdf/xpdf/Lexer.cpp
@@ -0,0 +1,521 @@
+//========================================================================
+//
+// Lexer.cpp
+//
+// Copyright 1996-2003 Glyph & Cog, LLC
+//
+//========================================================================
+
+#include <aconf.h>
+
+#ifdef USE_GCC_PRAGMAS
+#pragma implementation
+#endif
+
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <ctype.h>
+#include "Lexer.h"
+#include "Error.h"
+#include "XRef.h"
+
+//------------------------------------------------------------------------
+
+// A '1' in this array means the character is white space. A '1' or
+// '2' means the character ends a name or command.
+static char specialChars[256] = {
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, // 0x
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
+ 1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, // 2x
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, // 3x
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4x
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, // 5x
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 6x
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, // 7x
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ax
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // bx
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // cx
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // dx
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ex
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // fx
+};
+
+//------------------------------------------------------------------------
+// Lexer
+//------------------------------------------------------------------------
+
+Lexer::Lexer(XRef *xrefA, Stream *str) {
+ Object obj;
+
+ xref = xrefA;
+
+ curStr.initStream(str);
+ streams = new Array(xref);
+ streams->add(curStr.copy(&obj));
+ strPtr = 0;
+ freeArray = gTrue;
+ curStr.streamReset();
+}
+
+Lexer::Lexer(XRef *xrefA, Object *obj) {
+ Object obj2;
+
+ xref = xrefA;
+
+ if (obj->isStream()) {
+ streams = new Array(xref);
+ freeArray = gTrue;
+ streams->add(obj->copy(&obj2));
+ } else {
+ streams = obj->getArray();
+ freeArray = gFalse;
+ }
+ strPtr = 0;
+ if (streams->getLength() > 0) {
+ streams->get(strPtr, &curStr);
+ curStr.streamReset();
+ }
+}
+
+Lexer::~Lexer() {
+ if (!curStr.isNone()) {
+ curStr.streamClose();
+ curStr.free();
+ }
+ if (freeArray) {
+ delete streams;
+ }
+}
+
+int Lexer::getChar() {
+ int c;
+
+ c = EOF;
+ while (!curStr.isNone() && (c = curStr.streamGetChar()) == EOF) {
+ curStr.streamClose();
+ curStr.free();
+ ++strPtr;
+ if (strPtr < streams->getLength()) {
+ streams->get(strPtr, &curStr);
+ curStr.streamReset();
+ }
+ }
+ return c;
+}
+
+int Lexer::lookChar() {
+ if (curStr.isNone()) {
+ return EOF;
+ }
+ return curStr.streamLookChar();
+}
+
+Object *Lexer::getObj(Object *obj, int objNum) {
+ char *p;
+ int c, c2;
+ GBool comment, neg, done;
+ int numParen;
+ int xi;
+ double xf, scale;
+ GString *s;
+ int n, m;
+
+ // skip whitespace and comments
+ comment = gFalse;
+ while (1) {
+ if ((c = getChar()) == EOF) {
+ return obj->initEOF();
+ }
+ if (comment) {
+ if (c == '\r' || c == '\n')
+ comment = gFalse;
+ } else if (c == '%') {
+ comment = gTrue;
+ } else if (specialChars[c] != 1) {
+ break;
+ }
+ }
+
+ // start reading token
+ switch (c) {
+
+ // number
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ case '-': case '.':
+ neg = gFalse;
+ xi = 0;
+ if (c == '-') {
+ neg = gTrue;
+ } else if (c == '.') {
+ goto doReal;
+ } else {
+ xi = c - '0';
+ }
+ while (1) {
+ c = lookChar();
+ if (isdigit(c)) {
+ getChar();
+ xi = xi * 10 + (c - '0');
+ } else if (c == '.') {
+ getChar();
+ goto doReal;
+ } else {
+ break;
+ }
+ }
+ if (neg)
+ xi = -xi;
+ obj->initInt(xi);
+ break;
+ doReal:
+ xf = xi;
+ scale = 0.1;
+ while (1) {
+ c = lookChar();
+ if (c == '-') {
+ // ignore minus signs in the middle of numbers to match
+ // Adobe's behavior
+ error(getPos(), "Badly formatted number");
+ getChar();
+ continue;
+ }
+ if (!isdigit(c)) {
+ break;
+ }
+ getChar();
+ xf = xf + scale * (c - '0');
+ scale *= 0.1;
+ }
+ if (neg)
+ xf = -xf;
+ obj->initReal(xf);
+ break;
+
+ // string
+ case '(':
+ p = tokBuf;
+ n = 0;
+ numParen = 1;
+ done = gFalse;
+ s = NULL;
+ do {
+ c2 = EOF;
+ switch (c = getChar()) {
+
+ case EOF:
+#if 0
+ // This breaks some PDF files, e.g., ones from Photoshop.
+ case '\r':
+ case '\n':
+#endif
+ error(getPos(), "Unterminated string");
+ done = gTrue;
+ break;
+
+ case '(':
+ ++numParen;
+ c2 = c;
+ break;
+
+ case ')':
+ if (--numParen == 0) {
+ done = gTrue;
+ } else {
+ c2 = c;
+ }
+ break;
+
+ case '\\':
+ switch (c = getChar()) {
+ case 'n':
+ c2 = '\n';
+ break;
+ case 'r':
+ c2 = '\r';
+ break;
+ case 't':
+ c2 = '\t';
+ break;
+ case 'b':
+ c2 = '\b';
+ break;
+ case 'f':
+ c2 = '\f';
+ break;
+ case '\\':
+ case '(':
+ case ')':
+ c2 = c;
+ break;
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ c2 = c - '0';
+ c = lookChar();
+ if (c >= '0' && c <= '7') {
+ getChar();
+ c2 = (c2 << 3) + (c - '0');
+ c = lookChar();
+ if (c >= '0' && c <= '7') {
+ getChar();
+ c2 = (c2 << 3) + (c - '0');
+ }
+ }
+ break;
+ case '\r':
+ c = lookChar();
+ if (c == '\n') {
+ getChar();
+ }
+ break;
+ case '\n':
+ break;
+ case EOF:
+ error(getPos(), "Unterminated string");
+ done = gTrue;
+ break;
+ default:
+ c2 = c;
+ break;
+ }
+ break;
+
+ default:
+ c2 = c;
+ break;
+ }
+
+ if (c2 != EOF) {
+ if (n == tokBufSize) {
+ if (!s)
+ s = new GString(tokBuf, tokBufSize);
+ else
+ s->append(tokBuf, tokBufSize);
+ p = tokBuf;
+ n = 0;
+
+ // we are growing see if the document is not malformed and we are growing too much
+ if (objNum > 0 && xref != NULL)
+ {
+ int newObjNum = xref->getNumEntry(curStr.streamGetPos());
+ if (newObjNum != objNum)
+ {
+ error(getPos(), "Unterminated string");
+ done = gTrue;
+ delete s;
+ n = -2;
+ }
+ }
+ }
+ *p++ = (char)c2;
+ ++n;
+ }
+ } while (!done);
+ if (n >= 0) {
+ if (!s)
+ s = new GString(tokBuf, n);
+ else
+ s->append(tokBuf, n);
+ obj->initString(s);
+ } else {
+ obj->initEOF();
+ }
+ break;
+
+ // name
+ case '/':
+ p = tokBuf;
+ n = 0;
+ s = NULL;
+ while ((c = lookChar()) != EOF && !specialChars[c]) {
+ getChar();
+ if (c == '#') {
+ c2 = lookChar();
+ if (c2 >= '0' && c2 <= '9') {
+ c = c2 - '0';
+ } else if (c2 >= 'A' && c2 <= 'F') {
+ c = c2 - 'A' + 10;
+ } else if (c2 >= 'a' && c2 <= 'f') {
+ c = c2 - 'a' + 10;
+ } else {
+ goto notEscChar;
+ }
+ getChar();
+ c <<= 4;
+ c2 = getChar();
+ if (c2 >= '0' && c2 <= '9') {
+ c += c2 - '0';
+ } else if (c2 >= 'A' && c2 <= 'F') {
+ c += c2 - 'A' + 10;
+ } else if (c2 >= 'a' && c2 <= 'f') {
+ c += c2 - 'a' + 10;
+ } else {
+ error(getPos(), "Illegal digit in hex char in name");
+ }
+ }
+ notEscChar:
+ if (n == tokBufSize) {
+ if (!s)
+ s = new GString(tokBuf, tokBufSize);
+ else
+ {
+ // the spec says 127 is the maximum, we are already at 256 so bail out
+ error(getPos(), "Name token too long");
+ break;
+ }
+ p = tokBuf;
+ n = 0;
+ }
+ *p++ = c;
+ ++n;
+ }
+ *p = '\0';
+ if (s) {
+ s->append(tokBuf, n);
+ obj->initName(s->getCString());
+ delete s;
+ } else obj->initName(tokBuf);
+ break;
+
+ // array punctuation
+ case '[':
+ case ']':
+ tokBuf[0] = c;
+ tokBuf[1] = '\0';
+ obj->initCmd(tokBuf);
+ break;
+
+ // hex string or dict punctuation
+ case '<':
+ c = lookChar();
+
+ // dict punctuation
+ if (c == '<') {
+ getChar();
+ tokBuf[0] = tokBuf[1] = '<';
+ tokBuf[2] = '\0';
+ obj->initCmd(tokBuf);
+
+ // hex string
+ } else {
+ p = tokBuf;
+ m = n = 0;
+ c2 = 0;
+ s = NULL;
+ while (1) {
+ c = getChar();
+ if (c == '>') {
+ break;
+ } else if (c == EOF) {
+ error(getPos(), "Unterminated hex string");
+ break;
+ } else if (specialChars[c] != 1) {
+ c2 = c2 << 4;
+ if (c >= '0' && c <= '9')
+ c2 += c - '0';
+ else if (c >= 'A' && c <= 'F')
+ c2 += c - 'A' + 10;
+ else if (c >= 'a' && c <= 'f')
+ c2 += c - 'a' + 10;
+ else
+ error(getPos(), "Illegal character <%02x> in hex string", c);
+ if (++m == 2) {
+ if (n == tokBufSize) {
+ if (!s)
+ s = new GString(tokBuf, tokBufSize);
+ else
+ s->append(tokBuf, tokBufSize);
+ p = tokBuf;
+ n = 0;
+ }
+ *p++ = (char)c2;
+ ++n;
+ c2 = 0;
+ m = 0;
+ }
+ }
+ }
+ if (!s)
+ s = new GString(tokBuf, n);
+ else
+ s->append(tokBuf, n);
+ if (m == 1)
+ s->append((char)(c2 << 4));
+ obj->initString(s);
+ }
+ break;
+
+ // dict punctuation
+ case '>':
+ c = lookChar();
+ if (c == '>') {
+ getChar();
+ tokBuf[0] = tokBuf[1] = '>';
+ tokBuf[2] = '\0';
+ obj->initCmd(tokBuf);
+ } else {
+ error(getPos(), "Illegal character '>'");
+ obj->initError();
+ }
+ break;
+
+ // error
+ case ')':
+ case '{':
+ case '}':
+ error(getPos(), "Illegal character '%c'", c);
+ obj->initError();
+ break;
+
+ // command
+ default:
+ p = tokBuf;
+ *p++ = c;
+ n = 1;
+ while ((c = lookChar()) != EOF && !specialChars[c]) {
+ getChar();
+ if (++n == tokBufSize) {
+ error(getPos(), "Command token too long");
+ break;
+ }
+ *p++ = c;
+ }
+ *p = '\0';
+ if (tokBuf[0] == 't' && !strcmp(tokBuf, "true")) {
+ obj->initBool(gTrue);
+ } else if (tokBuf[0] == 'f' && !strcmp(tokBuf, "false")) {
+ obj->initBool(gFalse);
+ } else if (tokBuf[0] == 'n' && !strcmp(tokBuf, "null")) {
+ obj->initNull();
+ } else {
+ obj->initCmd(tokBuf);
+ }
+ break;
+ }
+
+ return obj;
+}
+
+void Lexer::skipToNextLine() {
+ int c;
+
+ while (1) {
+ c = getChar();
+ if (c == EOF || c == '\n') {
+ return;
+ }
+ if (c == '\r') {
+ if ((c = lookChar()) == '\n') {
+ getChar();
+ }
+ return;
+ }
+ }
+}
+
+GBool Lexer::isSpace(int c) {
+ return c >= 0 && c <= 0xff && specialChars[c] == 1;
+}