summaryrefslogtreecommitdiffstats
path: root/filters/kword/pdf/xpdf/xpdf/pdftotext.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'filters/kword/pdf/xpdf/xpdf/pdftotext.cpp')
-rw-r--r--filters/kword/pdf/xpdf/xpdf/pdftotext.cpp322
1 files changed, 322 insertions, 0 deletions
diff --git a/filters/kword/pdf/xpdf/xpdf/pdftotext.cpp b/filters/kword/pdf/xpdf/xpdf/pdftotext.cpp
new file mode 100644
index 000000000..b2e68b422
--- /dev/null
+++ b/filters/kword/pdf/xpdf/xpdf/pdftotext.cpp
@@ -0,0 +1,322 @@
+//========================================================================
+//
+// pdftotext.cpp
+//
+// Copyright 1997-2002 Glyph & Cog, LLC
+//
+//========================================================================
+
+#include <aconf.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include "parseargs.h"
+#include "GString.h"
+#include "gmem.h"
+#include "GlobalParams.h"
+#include "Object.h"
+#include "Stream.h"
+#include "Array.h"
+#include "Dict.h"
+#include "XRef.h"
+#include "Catalog.h"
+#include "Page.h"
+#include "PDFDoc.h"
+#include "TextOutputDev.h"
+#include "CharTypes.h"
+#include "UnicodeMap.h"
+#include "Error.h"
+#include "config.h"
+
+static void printInfoString(FILE *f, Dict *infoDict, char *key,
+ char *text1, char *text2, UnicodeMap *uMap);
+static void printInfoDate(FILE *f, Dict *infoDict, char *key, char *fmt);
+
+static int firstPage = 1;
+static int lastPage = 0;
+static GBool rawOrder = gFalse;
+static GBool htmlMeta = gFalse;
+static char textEncName[128] = "";
+static char textEOL[16] = "";
+static char ownerPassword[33] = "";
+static char userPassword[33] = "";
+static GBool quiet = gFalse;
+static char cfgFileName[256] = "";
+static GBool printVersion = gFalse;
+static GBool printHelp = gFalse;
+
+static ArgDesc argDesc[] = {
+ {"-f", argInt, &firstPage, 0,
+ "first page to convert"},
+ {"-l", argInt, &lastPage, 0,
+ "last page to convert"},
+ {"-raw", argFlag, &rawOrder, 0,
+ "keep strings in content stream order"},
+ {"-htmlmeta", argFlag, &htmlMeta, 0,
+ "generate a simple HTML file, including the meta information"},
+ {"-enc", argString, textEncName, sizeof(textEncName),
+ "output text encoding name"},
+ {"-eol", argString, textEOL, sizeof(textEOL),
+ "output end-of-line convention (unix, dos, or mac)"},
+ {"-opw", argString, ownerPassword, sizeof(ownerPassword),
+ "owner password (for encrypted files)"},
+ {"-upw", argString, userPassword, sizeof(userPassword),
+ "user password (for encrypted files)"},
+ {"-q", argFlag, &quiet, 0,
+ "don't print any messages or errors"},
+ {"-cfg", argString, cfgFileName, sizeof(cfgFileName),
+ "configuration file to use in place of .xpdfrc"},
+ {"-v", argFlag, &printVersion, 0,
+ "print copyright and version info"},
+ {"-h", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"-help", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"--help", argFlag, &printHelp, 0,
+ "print usage information"},
+ {"-?", argFlag, &printHelp, 0,
+ "print usage information"},
+ {NULL}
+};
+
+int main(int argc, char *argv[]) {
+ PDFDoc *doc;
+ GString *fileName;
+ GString *textFileName;
+ GString *ownerPW, *userPW;
+ TextOutputDev *textOut;
+ FILE *f;
+ UnicodeMap *uMap;
+ Object info;
+ GBool ok;
+ char *p;
+ int exitCode;
+
+ exitCode = 99;
+
+ // parse args
+ ok = parseArgs(argDesc, &argc, argv);
+ if (!ok || argc < 2 || argc > 3 || printVersion || printHelp) {
+ fprintf(stderr, "pdftotext version %s\n", xpdfVersion);
+ fprintf(stderr, "%s\n", xpdfCopyright);
+ if (!printVersion) {
+ printUsage("pdftotext", "<PDF-file> [<text-file>]", argDesc);
+ }
+ goto err0;
+ }
+ fileName = new GString(argv[1]);
+
+ // read config file
+ globalParams = new GlobalParams(cfgFileName);
+ if (textEncName[0]) {
+ globalParams->setTextEncoding(textEncName);
+ }
+ if (textEOL[0]) {
+ if (!globalParams->setTextEOL(textEOL)) {
+ fprintf(stderr, "Bad '-eol' value on command line\n");
+ }
+ }
+ if (quiet) {
+ globalParams->setErrQuiet(quiet);
+ }
+
+ // get mapping to output encoding
+ if (!(uMap = globalParams->getTextEncoding())) {
+ error(-1, "Couldn't get text encoding");
+ delete fileName;
+ goto err1;
+ }
+
+ // open PDF file
+ if (ownerPassword[0]) {
+ ownerPW = new GString(ownerPassword);
+ } else {
+ ownerPW = NULL;
+ }
+ if (userPassword[0]) {
+ userPW = new GString(userPassword);
+ } else {
+ userPW = NULL;
+ }
+ doc = new PDFDoc(fileName, ownerPW, userPW);
+ if (userPW) {
+ delete userPW;
+ }
+ if (ownerPW) {
+ delete ownerPW;
+ }
+ if (!doc->isOk()) {
+ exitCode = 1;
+ goto err2;
+ }
+
+ // check for copy permission
+ if (!doc->okToCopy()) {
+ error(-1, "Copying of text from this document is not allowed.");
+ exitCode = 3;
+ goto err2;
+ }
+
+ // construct text file name
+ if (argc == 3) {
+ textFileName = new GString(argv[2]);
+ } else {
+ p = fileName->getCString() + fileName->getLength() - 4;
+ if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF")) {
+ textFileName = new GString(fileName->getCString(),
+ fileName->getLength() - 4);
+ } else {
+ textFileName = fileName->copy();
+ }
+ textFileName->append(htmlMeta ? ".html" : ".txt");
+ }
+
+ // get page range
+ if (firstPage < 1) {
+ firstPage = 1;
+ }
+ if (lastPage < 1 || lastPage > doc->getNumPages()) {
+ lastPage = doc->getNumPages();
+ }
+
+ // write HTML header
+ if (htmlMeta) {
+ if (!textFileName->cmp("-")) {
+ f = stdout;
+ } else {
+ if (!(f = fopen(textFileName->getCString(), "wb"))) {
+ error(-1, "Couldn't open text file '%s'", textFileName->getCString());
+ exitCode = 2;
+ goto err3;
+ }
+ }
+ fputs("<html>\n", f);
+ fputs("<head>\n", f);
+ doc->getDocInfo(&info);
+ if (info.isDict()) {
+ printInfoString(f, info.getDict(), "Title", "<title>", "</title>\n",
+ uMap);
+ printInfoString(f, info.getDict(), "Subject",
+ "<meta name=\"Subject\" content=\"", "\">\n", uMap);
+ printInfoString(f, info.getDict(), "Keywords",
+ "<meta name=\"Keywords\" content=\"", "\">\n", uMap);
+ printInfoString(f, info.getDict(), "Author",
+ "<meta name=\"Author\" content=\"", "\">\n", uMap);
+ printInfoString(f, info.getDict(), "Creator",
+ "<meta name=\"Creator\" content=\"", "\">\n", uMap);
+ printInfoString(f, info.getDict(), "Producer",
+ "<meta name=\"Producer\" content=\"", "\">\n", uMap);
+ printInfoDate(f, info.getDict(), "CreationDate",
+ "<meta name=\"CreationDate\" content=\"\">\n");
+ printInfoDate(f, info.getDict(), "LastModifiedDate",
+ "<meta name=\"ModDate\" content=\"\">\n");
+ }
+ info.free();
+ fputs("</head>\n", f);
+ fputs("<body>\n", f);
+ fputs("<pre>\n", f);
+ if (f != stdout) {
+ fclose(f);
+ }
+ }
+
+ // write text file
+ textOut = new TextOutputDev(textFileName->getCString(), rawOrder, htmlMeta);
+ if (textOut->isOk()) {
+ doc->displayPages(textOut, firstPage, lastPage, 72, 0, gFalse);
+ } else {
+ delete textOut;
+ exitCode = 2;
+ goto err3;
+ }
+ delete textOut;
+
+ // write end of HTML file
+ if (htmlMeta) {
+ if (!textFileName->cmp("-")) {
+ f = stdout;
+ } else {
+ if (!(f = fopen(textFileName->getCString(), "ab"))) {
+ error(-1, "Couldn't open text file '%s'", textFileName->getCString());
+ exitCode = 2;
+ goto err3;
+ }
+ }
+ fputs("</pre>\n", f);
+ fputs("</body>\n", f);
+ fputs("</html>\n", f);
+ if (f != stdout) {
+ fclose(f);
+ }
+ }
+
+ exitCode = 0;
+
+ // clean up
+ err3:
+ delete textFileName;
+ err2:
+ delete doc;
+ uMap->decRefCnt();
+ err1:
+ delete globalParams;
+ err0:
+
+ // check for memory leaks
+ Object::memCheck(stderr);
+ gMemReport(stderr);
+
+ return exitCode;
+}
+
+static void printInfoString(FILE *f, Dict *infoDict, char *key,
+ char *text1, char *text2, UnicodeMap *uMap) {
+ Object obj;
+ GString *s1;
+ GBool isUnicode;
+ Unicode u;
+ char buf[8];
+ int i, n;
+
+ if (infoDict->lookup(key, &obj)->isString()) {
+ fputs(text1, f);
+ s1 = obj.getString();
+ if ((s1->getChar(0) & 0xff) == 0xfe &&
+ (s1->getChar(1) & 0xff) == 0xff) {
+ isUnicode = gTrue;
+ i = 2;
+ } else {
+ isUnicode = gFalse;
+ i = 0;
+ }
+ while (i < obj.getString()->getLength()) {
+ if (isUnicode) {
+ u = ((s1->getChar(i) & 0xff) << 8) |
+ (s1->getChar(i+1) & 0xff);
+ i += 2;
+ } else {
+ u = s1->getChar(i) & 0xff;
+ ++i;
+ }
+ n = uMap->mapUnicode(u, buf, sizeof(buf));
+ fwrite(buf, 1, n, f);
+ }
+ fputs(text2, f);
+ }
+ obj.free();
+}
+
+static void printInfoDate(FILE *f, Dict *infoDict, char *key, char *fmt) {
+ Object obj;
+ char *s;
+
+ if (infoDict->lookup(key, &obj)->isString()) {
+ s = obj.getString()->getCString();
+ if (s[0] == 'D' && s[1] == ':') {
+ s += 2;
+ }
+ fprintf(f, fmt, s);
+ }
+ obj.free();
+}