summaryrefslogtreecommitdiffstats
path: root/kpdf/xpdf/xpdf/TextOutputDev.h
diff options
context:
space:
mode:
Diffstat (limited to 'kpdf/xpdf/xpdf/TextOutputDev.h')
-rw-r--r--kpdf/xpdf/xpdf/TextOutputDev.h661
1 files changed, 661 insertions, 0 deletions
diff --git a/kpdf/xpdf/xpdf/TextOutputDev.h b/kpdf/xpdf/xpdf/TextOutputDev.h
new file mode 100644
index 00000000..3a424bb1
--- /dev/null
+++ b/kpdf/xpdf/xpdf/TextOutputDev.h
@@ -0,0 +1,661 @@
+//========================================================================
+//
+// TextOutputDev.h
+//
+// Copyright 1997-2003 Glyph & Cog, LLC
+//
+//========================================================================
+
+#ifndef TEXTOUTPUTDEV_H
+#define TEXTOUTPUTDEV_H
+
+#include <aconf.h>
+
+#ifdef USE_GCC_PRAGMAS
+#pragma interface
+#endif
+
+#include <stdio.h>
+#include "gtypes.h"
+#include "GfxFont.h"
+#include "OutputDev.h"
+
+class GString;
+class GList;
+class GfxFont;
+class GfxState;
+class UnicodeMap;
+class Link;
+
+class TextWord;
+class TextPool;
+class TextLine;
+class TextLineFrag;
+class TextBlock;
+class TextFlow;
+class TextWordList;
+class TextPage;
+
+//------------------------------------------------------------------------
+
+typedef void (*TextOutputFunc)(void *stream, char *text, int len);
+
+//------------------------------------------------------------------------
+// TextFontInfo
+//------------------------------------------------------------------------
+
+class TextFontInfo {
+public:
+
+ TextFontInfo(GfxState *state);
+ ~TextFontInfo();
+
+ GBool matches(GfxState *state);
+
+#if TEXTOUT_WORD_LIST
+ // Get the font name (which may be NULL).
+ GString *getFontName() { return fontName; }
+
+ // Get font descriptor flags.
+ GBool isFixedWidth() { return flags & fontFixedWidth; }
+ GBool isSerif() { return flags & fontSerif; }
+ GBool isSymbolic() { return flags & fontSymbolic; }
+ GBool isItalic() { return flags & fontItalic; }
+ GBool isBold() { return flags & fontBold; }
+#endif
+
+private:
+
+ GfxFont *gfxFont;
+#if TEXTOUT_WORD_LIST
+ GString *fontName;
+ int flags;
+#endif
+
+ friend class TextWord;
+ friend class TextPage;
+};
+
+//------------------------------------------------------------------------
+// TextWord
+//------------------------------------------------------------------------
+
+class TextWord {
+public:
+
+ // Constructor.
+ TextWord(GfxState *state, int rotA, double x0, double y0,
+ int charPosA, TextFontInfo *fontA, double fontSize);
+
+ // Destructor.
+ ~TextWord();
+
+ // Add a character to the word.
+ void addChar(GfxState *state, double x, double y,
+ double dx, double dy, Unicode u);
+
+ // Merge <word> onto the end of <this>.
+ void merge(TextWord *word);
+
+ // Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>),
+ // based on a primary-axis comparison, e.g., x ordering if rot=0.
+ int primaryCmp(TextWord *word);
+
+ // Return the distance along the primary axis between <this> and
+ // <word>.
+ double primaryDelta(TextWord *word);
+
+ static int cmpYX(const void *p1, const void *p2);
+
+ // Get the TextFontInfo object associated with this word.
+ TextFontInfo *getFontInfo() { return font; }
+
+ // Get the next TextWord on the linked list.
+ TextWord *getNext() { return next; }
+
+#if TEXTOUT_WORD_LIST
+ int getLength() { return len; }
+ Unicode getChar(int idx) { return text[idx]; }
+ GString *getText();
+ GString *getFontName() { return font->fontName; }
+ void getColor(double *r, double *g, double *b)
+ { *r = colorR; *g = colorG; *b = colorB; }
+ void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA)
+ { *xMinA = xMin; *yMinA = yMin; *xMaxA = xMax; *yMaxA = yMax; }
+ void getCharBBox(int charIdx, double *xMinA, double *yMinA,
+ double *xMaxA, double *yMaxA);
+ double getFontSize() { return fontSize; }
+ int getRotation() { return rot; }
+ int getCharPos() { return charPos; }
+ int getCharLen() { return charLen; }
+ GBool getSpaceAfter() { return spaceAfter; }
+#endif
+
+ GBool isUnderlined() { return underlined; }
+ Link *getLink() { return link; }
+
+private:
+
+ int rot; // rotation, multiple of 90 degrees
+ // (0, 1, 2, or 3)
+ double xMin, xMax; // bounding box x coordinates
+ double yMin, yMax; // bounding box y coordinates
+ double base; // baseline x or y coordinate
+ Unicode *text; // the text
+ double *edge; // "near" edge x or y coord of each char
+ // (plus one extra entry for the last char)
+ int len; // length of text and edge arrays
+ int size; // size of text and edge arrays
+ int charPos; // character position (within content stream)
+ int charLen; // number of content stream characters in
+ // this word
+ TextFontInfo *font; // font information
+ double fontSize; // font size
+ GBool spaceAfter; // set if there is a space between this
+ // word and the next word on the line
+ TextWord *next; // next word in line
+
+#if TEXTOUT_WORD_LIST
+ double colorR, // word color
+ colorG,
+ colorB;
+#endif
+
+ GBool underlined;
+ Link *link;
+
+ friend class TextPool;
+ friend class TextLine;
+ friend class TextBlock;
+ friend class TextFlow;
+ friend class TextWordList;
+ friend class TextPage;
+};
+
+//------------------------------------------------------------------------
+// TextPool
+//------------------------------------------------------------------------
+
+class TextPool {
+public:
+
+ TextPool();
+ ~TextPool();
+
+ TextWord *getPool(int baseIdx) { return pool[baseIdx - minBaseIdx]; }
+ void setPool(int baseIdx, TextWord *p) { pool[baseIdx - minBaseIdx] = p; }
+
+ int getBaseIdx(double base);
+
+ void addWord(TextWord *word);
+
+private:
+
+ int minBaseIdx; // min baseline bucket index
+ int maxBaseIdx; // max baseline bucket index
+ TextWord **pool; // array of linked lists, one for each
+ // baseline value (multiple of 4 pts)
+ TextWord *cursor; // pointer to last-accessed word
+ int cursorBaseIdx; // baseline bucket index of last-accessed word
+
+ friend class TextBlock;
+ friend class TextPage;
+};
+
+//------------------------------------------------------------------------
+// TextLine
+//------------------------------------------------------------------------
+
+class TextLine {
+public:
+
+ TextLine(TextBlock *blkA, int rotA, double baseA);
+ ~TextLine();
+
+ void addWord(TextWord *word);
+
+ // Return the distance along the primary axis between <this> and
+ // <line>.
+ double primaryDelta(TextLine *line);
+
+ // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
+ // based on a primary-axis comparison, e.g., x ordering if rot=0.
+ int primaryCmp(TextLine *line);
+
+ // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
+ // based on a secondary-axis comparison of the baselines, e.g., y
+ // ordering if rot=0.
+ int secondaryCmp(TextLine *line);
+
+ int cmpYX(TextLine *line);
+
+ static int cmpXY(const void *p1, const void *p2);
+
+ void coalesce(UnicodeMap *uMap);
+
+ // Get the head of the linked list of TextWords.
+ TextWord *getWords() { return words; }
+
+ // Get the next TextLine on the linked list.
+ TextLine *getNext() { return next; }
+
+ // Returns true if the last char of the line is a hyphen.
+ GBool isHyphenated() { return hyphenated; }
+
+private:
+
+ TextBlock *blk; // parent block
+ int rot; // text rotation
+ double xMin, xMax; // bounding box x coordinates
+ double yMin, yMax; // bounding box y coordinates
+ double base; // baseline x or y coordinate
+ TextWord *words; // words in this line
+ TextWord *lastWord; // last word in this line
+ Unicode *text; // Unicode text of the line, including
+ // spaces between words
+ double *edge; // "near" edge x or y coord of each char
+ // (plus one extra entry for the last char)
+ int *col; // starting column number of each Unicode char
+ int len; // number of Unicode chars
+ int convertedLen; // total number of converted characters
+ GBool hyphenated; // set if last char is a hyphen
+ TextLine *next; // next line in block
+
+ friend class TextLineFrag;
+ friend class TextBlock;
+ friend class TextFlow;
+ friend class TextWordList;
+ friend class TextPage;
+};
+
+//------------------------------------------------------------------------
+// TextBlock
+//------------------------------------------------------------------------
+
+class TextBlock {
+public:
+
+ TextBlock(TextPage *pageA, int rotA);
+ ~TextBlock();
+
+ void addWord(TextWord *word);
+
+ void coalesce(UnicodeMap *uMap);
+
+ // Update this block's priMin and priMax values, looking at <blk>.
+ void updatePriMinMax(TextBlock *blk);
+
+ static int cmpXYPrimaryRot(const void *p1, const void *p2);
+
+ static int cmpYXPrimaryRot(const void *p1, const void *p2);
+
+ int primaryCmp(TextBlock *blk);
+
+ double secondaryDelta(TextBlock *blk);
+
+ // Returns true if <this> is below <blk>, relative to the page's
+ // primary rotation.
+ GBool isBelow(TextBlock *blk);
+
+ // Get the head of the linked list of TextLines.
+ TextLine *getLines() { return lines; }
+
+ // Get the next TextBlock on the linked list.
+ TextBlock *getNext() { return next; }
+
+private:
+
+ TextPage *page; // the parent page
+ int rot; // text rotation
+ double xMin, xMax; // bounding box x coordinates
+ double yMin, yMax; // bounding box y coordinates
+ double priMin, priMax; // whitespace bounding box along primary axis
+
+ TextPool *pool; // pool of words (used only until lines
+ // are built)
+ TextLine *lines; // linked list of lines
+ TextLine *curLine; // most recently added line
+ int nLines; // number of lines
+ int charCount; // number of characters in the block
+ int col; // starting column
+ int nColumns; // number of columns in the block
+
+ TextBlock *next;
+ TextBlock *stackNext;
+
+ friend class TextLine;
+ friend class TextLineFrag;
+ friend class TextFlow;
+ friend class TextWordList;
+ friend class TextPage;
+};
+
+//------------------------------------------------------------------------
+// TextFlow
+//------------------------------------------------------------------------
+
+class TextFlow {
+public:
+
+ TextFlow(TextPage *pageA, TextBlock *blk);
+ ~TextFlow();
+
+ // Add a block to the end of this flow.
+ void addBlock(TextBlock *blk);
+
+ // Returns true if <blk> fits below <prevBlk> in the flow, i.e., (1)
+ // it uses a font no larger than the last block added to the flow,
+ // and (2) it fits within the flow's [priMin, priMax] along the
+ // primary axis.
+ GBool blockFits(TextBlock *blk, TextBlock *prevBlk);
+
+ // Get the head of the linked list of TextBlocks.
+ TextBlock *getBlocks() { return blocks; }
+
+ // Get the next TextFlow on the linked list.
+ TextFlow *getNext() { return next; }
+
+private:
+
+ TextPage *page; // the parent page
+ double xMin, xMax; // bounding box x coordinates
+ double yMin, yMax; // bounding box y coordinates
+ double priMin, priMax; // whitespace bounding box along primary axis
+ TextBlock *blocks; // blocks in flow
+ TextBlock *lastBlk; // last block in this flow
+ TextFlow *next;
+
+ friend class TextWordList;
+ friend class TextPage;
+};
+
+#if TEXTOUT_WORD_LIST
+
+//------------------------------------------------------------------------
+// TextWordList
+//------------------------------------------------------------------------
+
+class TextWordList {
+public:
+
+ // Build a flat word list, in content stream order (if
+ // text->rawOrder is true), physical layout order (if <physLayout>
+ // is true and text->rawOrder is false), or reading order (if both
+ // flags are false).
+ TextWordList(TextPage *text, GBool physLayout);
+
+ ~TextWordList();
+
+ // Return the number of words on the list.
+ int getLength();
+
+ // Return the <idx>th word from the list.
+ TextWord *get(int idx);
+
+private:
+
+ GList *words; // [TextWord]
+};
+
+#endif // TEXTOUT_WORD_LIST
+
+//------------------------------------------------------------------------
+// TextPage
+//------------------------------------------------------------------------
+
+class TextPage {
+public:
+
+ // Constructor.
+ TextPage(GBool rawOrderA);
+
+ // Destructor.
+ ~TextPage();
+
+ // Start a new page.
+ void startPage(GfxState *state);
+
+ // End the current page.
+ void endPage();
+
+ // Update the current font.
+ void updateFont(GfxState *state);
+
+ // Begin a new word.
+ void beginWord(GfxState *state, double x0, double y0);
+
+ // Add a character to the current word.
+ void addChar(GfxState *state, double x, double y,
+ double dx, double dy,
+ CharCode c, int nBytes, Unicode *u, int uLen);
+
+ // End the current word, sorting it into the list of words.
+ void endWord();
+
+ // Add a word, sorting it into the list of words.
+ void addWord(TextWord *word);
+
+ // Add a (potential) underline.
+ void addUnderline(double x0, double y0, double x1, double y1);
+
+ // Add a hyperlink.
+ void addLink(int xMin, int yMin, int xMax, int yMax, Link *link);
+
+ // Coalesce strings that look like parts of the same line.
+ void coalesce(GBool physLayout, GBool doHTML);
+
+ // Find a string. If <startAtTop> is true, starts looking at the
+ // top of the page; else if <startAtLast> is true, starts looking
+ // immediately after the last find result; else starts looking at
+ // <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the
+ // bottom of the page; else if <stopAtLast> is true, stops looking
+ // just before the last find result; else stops looking at
+ // <xMax>,<yMax>.
+ GBool findText(Unicode *s, int len,
+ GBool startAtTop, GBool stopAtBottom,
+ GBool startAtLast, GBool stopAtLast,
+ GBool caseSensitive, GBool backward,
+ double *xMin, double *yMin,
+ double *xMax, double *yMax);
+
+ // Get the text which is inside the specified rectangle.
+ GString *getText(double xMin, double yMin,
+ double xMax, double yMax);
+
+ // Find a string by character position and length. If found, sets
+ // the text bounding rectangle and returns true; otherwise returns
+ // false.
+ GBool findCharRange(int pos, int length,
+ double *xMin, double *yMin,
+ double *xMax, double *yMax);
+
+ // Dump contents of page to a file.
+ void dump(void *outputStream, TextOutputFunc outputFunc,
+ GBool physLayout);
+
+ // Get the head of the linked list of TextFlows.
+ TextFlow *getFlows() { return flows; }
+
+#if TEXTOUT_WORD_LIST
+ // Build a flat word list, in content stream order (if
+ // this->rawOrder is true), physical layout order (if <physLayout>
+ // is true and this->rawOrder is false), or reading order (if both
+ // flags are false).
+ TextWordList *makeWordList(GBool physLayout);
+#endif
+
+private:
+
+ void clear();
+ void assignColumns(TextLineFrag *frags, int nFrags, int rot);
+ int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GString *s);
+
+ GBool rawOrder; // keep text in content stream order
+
+ double pageWidth, pageHeight; // width and height of current page
+ TextWord *curWord; // currently active string
+ int charPos; // next character position (within content
+ // stream)
+ TextFontInfo *curFont; // current font
+ double curFontSize; // current font size
+ int nest; // current nesting level (for Type 3 fonts)
+ int nTinyChars; // number of "tiny" chars seen so far
+ GBool lastCharOverlap; // set if the last added char overlapped the
+ // previous char
+
+ TextPool *pools[4]; // a "pool" of TextWords for each rotation
+ TextFlow *flows; // linked list of flows
+ TextBlock **blocks; // array of blocks, in yx order
+ int nBlocks; // number of blocks
+ int primaryRot; // primary rotation
+ GBool primaryLR; // primary direction (true means L-to-R,
+ // false means R-to-L)
+ TextWord *rawWords; // list of words, in raw order (only if
+ // rawOrder is set)
+ TextWord *rawLastWord; // last word on rawWords list
+
+ GList *fonts; // all font info objects used on this
+ // page [TextFontInfo]
+
+ double lastFindXMin, // coordinates of the last "find" result
+ lastFindYMin;
+ GBool haveLastFind;
+
+ GList *underlines; // [TextUnderline]
+ GList *links; // [TextLink]
+
+ friend class TextLine;
+ friend class TextLineFrag;
+ friend class TextBlock;
+ friend class TextFlow;
+ friend class TextWordList;
+};
+
+//------------------------------------------------------------------------
+// TextOutputDev
+//------------------------------------------------------------------------
+
+class TextOutputDev: public OutputDev {
+public:
+
+ // Open a text output file. If <fileName> is NULL, no file is
+ // written (this is useful, e.g., for searching text). If
+ // <physLayoutA> is true, the original physical layout of the text
+ // is maintained. If <rawOrder> is true, the text is kept in
+ // content stream order.
+ TextOutputDev(char *fileName, GBool physLayoutA,
+ GBool rawOrderA, GBool append);
+
+ // Create a TextOutputDev which will write to a generic stream. If
+ // <physLayoutA> is true, the original physical layout of the text
+ // is maintained. If <rawOrder> is true, the text is kept in
+ // content stream order.
+ TextOutputDev(TextOutputFunc func, void *stream,
+ GBool physLayoutA, GBool rawOrderA);
+
+ // Destructor.
+ virtual ~TextOutputDev();
+
+ // Check if file was successfully created.
+ virtual GBool isOk() { return ok; }
+
+ //---- get info about output device
+
+ // Does this device use upside-down coordinates?
+ // (Upside-down means (0,0) is the top left corner of the page.)
+ virtual GBool upsideDown() { return gTrue; }
+
+ // Does this device use drawChar() or drawString()?
+ virtual GBool useDrawChar() { return gTrue; }
+
+ // Does this device use beginType3Char/endType3Char? Otherwise,
+ // text in Type 3 fonts will be drawn with drawChar/drawString.
+ virtual GBool interpretType3Chars() { return gFalse; }
+
+ // Does this device need non-text content?
+ virtual GBool needNonText() { return gFalse; }
+
+ //----- initialization and control
+
+ // Start a page.
+ virtual void startPage(int pageNum, GfxState *state);
+
+ // End a page.
+ virtual void endPage();
+
+ //----- update text state
+ virtual void updateFont(GfxState *state);
+
+ //----- text drawing
+ virtual void beginString(GfxState *state, GString *s);
+ virtual void endString(GfxState *state);
+ virtual void drawChar(GfxState *state, double x, double y,
+ double dx, double dy,
+ double originX, double originY,
+ CharCode c, int nBytes, Unicode *u, int uLen);
+
+ //----- path painting
+ virtual void stroke(GfxState *state);
+ virtual void fill(GfxState *state);
+ virtual void eoFill(GfxState *state);
+
+ //----- link borders
+ virtual void processLink(Link *link, Catalog *catalog);
+
+ //----- special access
+
+ // Find a string. If <startAtTop> is true, starts looking at the
+ // top of the page; else if <startAtLast> is true, starts looking
+ // immediately after the last find result; else starts looking at
+ // <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the
+ // bottom of the page; else if <stopAtLast> is true, stops looking
+ // just before the last find result; else stops looking at
+ // <xMax>,<yMax>.
+ GBool findText(Unicode *s, int len,
+ GBool startAtTop, GBool stopAtBottom,
+ GBool startAtLast, GBool stopAtLast,
+ GBool caseSensitive, GBool backward,
+ double *xMin, double *yMin,
+ double *xMax, double *yMax);
+
+ // Get the text which is inside the specified rectangle.
+ GString *getText(double xMin, double yMin,
+ double xMax, double yMax);
+
+ // Find a string by character position and length. If found, sets
+ // the text bounding rectangle and returns true; otherwise returns
+ // false.
+ GBool findCharRange(int pos, int length,
+ double *xMin, double *yMin,
+ double *xMax, double *yMax);
+
+#if TEXTOUT_WORD_LIST
+ // Build a flat word list, in content stream order (if
+ // this->rawOrder is true), physical layout order (if
+ // this->physLayout is true and this->rawOrder is false), or reading
+ // order (if both flags are false).
+ TextWordList *makeWordList();
+#endif
+
+ // Returns the TextPage object for the last rasterized page,
+ // transferring ownership to the caller.
+ TextPage *takeText();
+
+ // Turn extra processing for HTML conversion on or off.
+ void enableHTMLExtras(GBool doHTMLA) { doHTML = doHTMLA; }
+
+private:
+
+ TextOutputFunc outputFunc; // output function
+ void *outputStream; // output stream
+ GBool needClose; // need to close the output file?
+ // (only if outputStream is a FILE*)
+ TextPage *text; // text for the current page
+ GBool physLayout; // maintain original physical layout when
+ // dumping text
+ GBool rawOrder; // keep text in content stream order
+ GBool doHTML; // extra processing for HTML conversion
+ GBool ok; // set up ok?
+};
+
+#endif