1//========================================================================
2//
3// TextOutputDev.h
4//
5// Copyright 1997-2003 Glyph & Cog, LLC
6//
7//========================================================================
8
9//========================================================================
10//
11// Modified under the Poppler project - http://poppler.freedesktop.org
12//
13// All changes made under the Poppler project to this file are licensed
14// under GPL version 2 or later
15//
16// Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com>
17// Copyright (C) 2006 Ed Catmur <ed@catmur.co.uk>
18// Copyright (C) 2007, 2008, 2011, 2013 Carlos Garcia Campos <carlosgc@gnome.org>
19// Copyright (C) 2007, 2017 Adrian Johnson <ajohnson@redneon.com>
20// Copyright (C) 2008, 2010, 2015, 2016, 2018, 2019, 2021 Albert Astals Cid <aacid@kde.org>
21// Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com>
22// Copyright (C) 2012, 2013, 2015, 2016 Jason Crain <jason@aquaticape.us>
23// Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
24// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
25// Copyright (C) 2018 Sanchit Anand <sanxchit@gmail.com>
26// Copyright (C) 2018, 2020, 2021 Nelson Benítez León <nbenitezl@gmail.com>
27// Copyright (C) 2019, 2022 Oliver Sander <oliver.sander@tu-dresden.de>
28// Copyright (C) 2019 Dan Shea <dan.shea@logical-innovations.com>
29// Copyright (C) 2020 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
30// Copyright (C) 2024 Stefan Brüns <stefan.bruens@rwth-aachen.de>
31//
32// To see a description of the changes please see the Changelog file that
33// came with your tarball or type make ChangeLog if you are building from git
34//
35//========================================================================
36
37#ifndef TEXTOUTPUTDEV_H
38#define TEXTOUTPUTDEV_H
39
40#include "poppler-config.h"
41#include "poppler_private_export.h"
42#include <cstdio>
43#include "GfxFont.h"
44#include "GfxState.h"
45#include "OutputDev.h"
46
47class GooString;
48class Gfx;
49class GfxFont;
50class GfxState;
51class UnicodeMap;
52class AnnotLink;
53
54class TextWord;
55class TextPool;
56class TextLine;
57class TextLineFrag;
58class TextBlock;
59class TextFlow;
60class TextLink;
61class TextUnderline;
62class TextWordList;
63class TextPage;
64class TextSelectionVisitor;
65
66//------------------------------------------------------------------------
67
// Signature of the callback that receives dumped text: it is handed the
// caller-supplied <stream> handle, a character buffer <text>, and the
// buffer length <len>.
using TextOutputFunc = void (*)(void *stream, const char *text, int len);
69
// Granularity used when visiting or drawing a text selection.
enum SelectionStyle
{
    selectionStyleGlyph = 0, // select individual glyphs
    selectionStyleWord = 1, // select whole words
    selectionStyleLine = 2 // select whole lines
};
76
// End-of-line marker written between lines of dumped text.
enum EndOfLineKind
{
    eolUnix = 0, // LF
    eolDOS = 1, // CR+LF
    eolMac = 2 // CR
};
83
84//------------------------------------------------------------------------
85// TextFontInfo
86//------------------------------------------------------------------------
87
88class POPPLER_PRIVATE_EXPORT TextFontInfo
89{
90public:
91 explicit TextFontInfo(const GfxState *state);
92 ~TextFontInfo();
93
94 TextFontInfo(const TextFontInfo &) = delete;
95 TextFontInfo &operator=(const TextFontInfo &) = delete;
96
97 bool matches(const GfxState *state) const;
98 bool matches(const TextFontInfo *fontInfo) const;
99 bool matches(const Ref *ref) const;
100
101 // Get the font ascent, or a default value if the font is not set
102 double getAscent() const;
103
104 // Get the font descent, or a default value if the font is not set
105 double getDescent() const;
106
107 // Get the writing mode (0 or 1), or 0 if the font is not set
108 int getWMode() const;
109
110#ifdef TEXTOUT_WORD_LIST
111 // Get the font name (which may be NULL).
112 const GooString *getFontName() const { return fontName; }
113
114 // Get font descriptor flags.
115 bool isFixedWidth() const { return flags & fontFixedWidth; }
116 bool isSerif() const { return flags & fontSerif; }
117 bool isSymbolic() const { return flags & fontSymbolic; }
118 bool isItalic() const { return flags & fontItalic; }
119 bool isBold() const { return flags & fontBold; }
120#endif
121
122private:
123 std::shared_ptr<GfxFont> gfxFont;
124#ifdef TEXTOUT_WORD_LIST
125 GooString *fontName;
126 int flags;
127#endif
128
129 friend class TextWord;
130 friend class TextPage;
131 friend class TextSelectionPainter;
132};
133
134//------------------------------------------------------------------------
135// TextWord
136//------------------------------------------------------------------------
137
138class POPPLER_PRIVATE_EXPORT TextWord
139{
140public:
141 // Constructor.
142 TextWord(const GfxState *state, int rotA, double fontSize);
143
144 // Destructor.
145 ~TextWord();
146
147 TextWord(const TextWord &) = delete;
148 TextWord &operator=(const TextWord &) = delete;
149
150 // Add a character to the word.
151 void addChar(const GfxState *state, TextFontInfo *fontA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA);
152
153 // Attempt to add a character to the word as a combining character.
154 // Either character u or the last character in the word must be an
155 // acute, dieresis, or other combining character. Returns true if
156 // the character was added.
157 bool addCombining(const GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA);
158
159 // Merge <word> onto the end of <this>.
160 void merge(TextWord *word);
161
162 // Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>),
163 // based on a primary-axis comparison, e.g., x ordering if rot=0.
164 int primaryCmp(const TextWord *word) const;
165
166 // Return the distance along the primary axis between <this> and
167 // <word>.
168 double primaryDelta(const TextWord *word) const;
169
170 static int cmpYX(const void *p1, const void *p2);
171
172 void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style);
173
174 // Get the TextFontInfo object associated with a character.
175 const TextFontInfo *getFontInfo(int idx) const { return chars[idx].font; }
176
177 // Get the next TextWord on the linked list.
178 const TextWord *getNext() const { return next; }
179
180#ifdef TEXTOUT_WORD_LIST
181 int getLength() const { return chars.size(); }
182 const Unicode *getChar(int idx) const { return &chars[idx].text; }
183 GooString *getText() const;
184 const GooString *getFontName(int idx) const { return chars[idx].font->fontName; }
185 void getColor(double *r, double *g, double *b) const
186 {
187 *r = colorR;
188 *g = colorG;
189 *b = colorB;
190 }
191 void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const
192 {
193 *xMinA = xMin;
194 *yMinA = yMin;
195 *xMaxA = xMax;
196 *yMaxA = yMax;
197 }
198 void getCharBBox(int charIdx, double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const;
199 double getFontSize() const { return fontSize; }
200 int getRotation() const { return rot; }
201 int getCharPos() const { return chars.empty() ? 0 : chars.front().charPos; }
202 int getCharLen() const { return chars.empty() ? 0 : chars.back().charPos - chars.front().charPos; }
203 bool getSpaceAfter() const { return spaceAfter; }
204#endif
205 bool isUnderlined() const { return underlined; }
206 const AnnotLink *getLink() const { return link; }
207 double getEdge(int i) const { return chars[i].edge; }
208 double getBaseline() const { return base; }
209 bool hasSpaceAfter() const { return spaceAfter; }
210 const TextWord *nextWord() const { return next; };
211 auto len() const { return chars.size(); }
212
213private:
214 void setInitialBounds(TextFontInfo *fontA, double x, double y);
215
216 int rot; // rotation, multiple of 90 degrees
217 // (0, 1, 2, or 3)
218 int wMode; // horizontal (0) or vertical (1) writing mode
219 double xMin, xMax; // bounding box x coordinates
220 double yMin, yMax; // bounding box y coordinates
221 double base; // baseline x or y coordinate
222
223 double fontSize; // font size
224
225 struct CharInfo
226 {
227 Unicode text;
228 CharCode charcode;
229 int charPos;
230 double edge;
231 TextFontInfo *font;
232 Matrix textMat;
233 };
234 std::vector<CharInfo> chars;
235 int charPosEnd = 0;
236 double edgeEnd = 0;
237
238 bool spaceAfter; // set if there is a space between this
239 // word and the next word on the line
240 bool underlined;
241 bool invisible; // whether we are invisible (glyphless)
242 TextWord *next; // next word in line
243
244#ifdef TEXTOUT_WORD_LIST
245 double colorR, // word color
246 colorG, colorB;
247#endif
248
249 AnnotLink *link;
250
251 friend class TextPool;
252 friend class TextLine;
253 friend class TextBlock;
254 friend class TextFlow;
255 friend class TextWordList;
256 friend class TextPage;
257
258 friend class TextSelectionPainter;
259 friend class TextSelectionDumper;
260};
261
262//------------------------------------------------------------------------
263// TextPool
264//------------------------------------------------------------------------
265
266class TextPool
267{
268public:
269 TextPool();
270 ~TextPool();
271
272 TextPool(const TextPool &) = delete;
273 TextPool &operator=(const TextPool &) = delete;
274
275 TextWord *getPool(int baseIdx) { return pool[baseIdx - minBaseIdx]; }
276 void setPool(int baseIdx, TextWord *p) { pool[baseIdx - minBaseIdx] = p; }
277
278 int getBaseIdx(double base) const;
279
280 void addWord(TextWord *word);
281
282private:
283 int minBaseIdx; // min baseline bucket index
284 int maxBaseIdx; // max baseline bucket index
285 TextWord **pool; // array of linked lists, one for each
286 // baseline value (multiple of 4 pts)
287 TextWord *cursor; // pointer to last-accessed word
288 int cursorBaseIdx; // baseline bucket index of last-accessed word
289
290 friend class TextBlock;
291 friend class TextPage;
292};
293
294struct TextFlowData;
295
296//------------------------------------------------------------------------
297// TextLine
298//------------------------------------------------------------------------
299
300class TextLine
301{
302public:
303 TextLine(TextBlock *blkA, int rotA, double baseA);
304 ~TextLine();
305
306 TextLine(const TextLine &) = delete;
307 TextLine &operator=(const TextLine &) = delete;
308
309 void addWord(TextWord *word);
310
311 // Return the distance along the primary axis between <this> and
312 // <line>.
313 double primaryDelta(const TextLine *line) const;
314
315 // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
316 // based on a primary-axis comparison, e.g., x ordering if rot=0.
317 int primaryCmp(const TextLine *line) const;
318
319 // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
320 // based on a secondary-axis comparison of the baselines, e.g., y
321 // ordering if rot=0.
322 int secondaryCmp(const TextLine *line) const;
323
324 int cmpYX(const TextLine *line) const;
325
326 static int cmpXY(const void *p1, const void *p2);
327
328 void coalesce(const UnicodeMap *uMap);
329
330 void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style);
331
332 // Get the head of the linked list of TextWords.
333 const TextWord *getWords() const { return words; }
334
335 // Get the next TextLine on the linked list.
336 const TextLine *getNext() const { return next; }
337
338 // Returns true if the last char of the line is a hyphen.
339 bool isHyphenated() const { return hyphenated; }
340
341private:
342 TextBlock *blk; // parent block
343 int rot; // text rotation
344 double xMin, xMax; // bounding box x coordinates
345 double yMin, yMax; // bounding box y coordinates
346 double base; // baseline x or y coordinate
347 TextWord *words; // words in this line
348 TextWord *lastWord; // last word in this line
349 Unicode *text; // Unicode text of the line, including
350 // spaces between words
351 double *edge; // "near" edge x or y coord of each char
352 // (plus one extra entry for the last char)
353 int *col; // starting column number of each Unicode char
354 int len; // number of Unicode chars
355 int convertedLen; // total number of converted characters
356 bool hyphenated; // set if last char is a hyphen
357 TextLine *next; // next line in block
358 Unicode *normalized; // normalized form of Unicode text
359 int normalized_len; // number of normalized Unicode chars
360 int *normalized_idx; // indices of normalized chars into Unicode text
361 Unicode *ascii_translation; // ascii translation from the normalized text
362 int ascii_len; // length of ascii translation text
363 int *ascii_idx; // indices of ascii chars into Unicode text of line
364
365 friend class TextLineFrag;
366 friend class TextBlock;
367 friend class TextFlow;
368 friend class TextWordList;
369 friend class TextPage;
370
371 friend class TextSelectionPainter;
372 friend class TextSelectionSizer;
373 friend class TextSelectionDumper;
374};
375
376//------------------------------------------------------------------------
377// TextBlock
378//------------------------------------------------------------------------
379
380class TextBlock
381{
382public:
383 TextBlock(TextPage *pageA, int rotA);
384 ~TextBlock();
385
386 TextBlock(const TextBlock &) = delete;
387 TextBlock &operator=(const TextBlock &) = delete;
388
389 void addWord(TextWord *word);
390
391 void coalesce(const UnicodeMap *uMap, double fixedPitch);
392
393 // Update this block's priMin and priMax values, looking at <blk>.
394 void updatePriMinMax(const TextBlock *blk);
395
396 static int cmpXYPrimaryRot(const void *p1, const void *p2);
397
398 static int cmpYXPrimaryRot(const void *p1, const void *p2);
399
400 int primaryCmp(const TextBlock *blk) const;
401
402 double secondaryDelta(const TextBlock *blk) const;
403
404 // Returns true if <this> is below <blk>, relative to the page's
405 // primary rotation.
406 bool isBelow(const TextBlock *blk) const;
407
408 void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style);
409
410 // Get the head of the linked list of TextLines.
411 const TextLine *getLines() const { return lines; }
412
413 // Get the next TextBlock on the linked list.
414 const TextBlock *getNext() const { return next; }
415
416 void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const
417 {
418 *xMinA = xMin;
419 *yMinA = yMin;
420 *xMaxA = xMax;
421 *yMaxA = yMax;
422 }
423
424 int getLineCount() const { return nLines; }
425
426private:
427 bool isBeforeByRule1(const TextBlock *blk1);
428 bool isBeforeByRepeatedRule1(const TextBlock *blkList, const TextBlock *blk1);
429 bool isBeforeByRule2(const TextBlock *blk1);
430
431 int visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited);
432 int visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited, TextBlock **cache, int cacheSize);
433
434 TextPage *page; // the parent page
435 int rot; // text rotation
436 double xMin, xMax; // bounding box x coordinates
437 double yMin, yMax; // bounding box y coordinates
438 double priMin, priMax; // whitespace bounding box along primary axis
439 double ExMin, ExMax; // extended bounding box x coordinates
440 double EyMin, EyMax; // extended bounding box y coordinates
441 int tableId; // id of table to which this block belongs
442 bool tableEnd; // is this block at end of line of actual table
443
444 TextPool *pool; // pool of words (used only until lines
445 // are built)
446 TextLine *lines; // linked list of lines
447 TextLine *curLine; // most recently added line
448 int nLines; // number of lines
449 int charCount; // number of characters in the block
450 int col; // starting column
451 int nColumns; // number of columns in the block
452
453 TextBlock *next;
454 TextBlock *stackNext;
455
456 friend class TextLine;
457 friend class TextLineFrag;
458 friend class TextFlow;
459 friend class TextWordList;
460 friend class TextPage;
461 friend class TextSelectionPainter;
462 friend class TextSelectionDumper;
463};
464
465//------------------------------------------------------------------------
466// TextFlow
467//------------------------------------------------------------------------
468
469class TextFlow
470{
471public:
472 TextFlow(TextPage *pageA, TextBlock *blk);
473 ~TextFlow();
474
475 TextFlow(const TextFlow &) = delete;
476 TextFlow &operator=(const TextFlow &) = delete;
477
478 // Add a block to the end of this flow.
479 void addBlock(TextBlock *blk);
480
481 // Returns true if <blk> fits below <prevBlk> in the flow, i.e., (1)
482 // it uses a font no larger than the last block added to the flow,
483 // and (2) it fits within the flow's [priMin, priMax] along the
484 // primary axis.
485 bool blockFits(const TextBlock *blk, const TextBlock *prevBlk) const;
486
487 // Get the head of the linked list of TextBlocks.
488 const TextBlock *getBlocks() const { return blocks; }
489
490 // Get the next TextFlow on the linked list.
491 const TextFlow *getNext() const { return next; }
492
493private:
494 TextPage *page; // the parent page
495 double xMin, xMax; // bounding box x coordinates
496 double yMin, yMax; // bounding box y coordinates
497 double priMin, priMax; // whitespace bounding box along primary axis
498 TextBlock *blocks; // blocks in flow
499 TextBlock *lastBlk; // last block in this flow
500 TextFlow *next;
501
502 friend class TextWordList;
503 friend class TextPage;
504};
505
506#ifdef TEXTOUT_WORD_LIST
507
508//------------------------------------------------------------------------
509// TextWordList
510//------------------------------------------------------------------------
511
512class POPPLER_PRIVATE_EXPORT TextWordList
513{
514public:
515 // Build a flat word list, in content stream order (if
516 // text->rawOrder is true), physical layout order (if <physLayout>
517 // is true and text->rawOrder is false), or reading order (if both
518 // flags are false).
519 TextWordList(const TextPage *text, bool physLayout);
520
521 ~TextWordList();
522
523 TextWordList(const TextWordList &) = delete;
524 TextWordList &operator=(const TextWordList &) = delete;
525
526 // Return the number of words on the list.
527 int getLength() const;
528
529 // Return the <idx>th word from the list.
530 TextWord *get(int idx);
531
532private:
533 std::vector<TextWord *> words;
534};
535
536#endif // TEXTOUT_WORD_LIST
537
538class TextWordSelection
539{
540public:
541 TextWordSelection(const TextWord *wordA, int beginA, int endA) : word(wordA), begin(beginA), end(endA) { }
542
543 const TextWord *getWord() const { return word; }
544 int getBegin() const { return begin; }
545 int getEnd() const { return end; }
546
547private:
548 const TextWord *word;
549 int begin;
550 int end;
551
552 friend class TextSelectionPainter;
553 friend class TextSelectionDumper;
554};
555
556//------------------------------------------------------------------------
557// TextPage
558//------------------------------------------------------------------------
559
560class POPPLER_PRIVATE_EXPORT TextPage
561{
562public:
563 // Constructor.
564 explicit TextPage(bool rawOrderA, bool discardDiagA = false);
565
566 TextPage(const TextPage &) = delete;
567 TextPage &operator=(const TextPage &) = delete;
568
569 void incRefCnt();
570 void decRefCnt();
571
572 // Start a new page.
573 void startPage(const GfxState *state);
574
575 // End the current page.
576 void endPage();
577
578 // Update the current font.
579 void updateFont(const GfxState *state);
580
581 // Begin a new word.
582 void beginWord(const GfxState *state);
583
584 // Add a character to the current word.
585 void addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen);
586
587 // Add <nChars> invisible characters.
588 void incCharCount(int nChars);
589
590 // End the current word, sorting it into the list of words.
591 void endWord();
592
593 // Add a word, sorting it into the list of words.
594 void addWord(TextWord *word);
595
596 // Add a (potential) underline.
597 void addUnderline(double x0, double y0, double x1, double y1);
598
599 // Add a hyperlink.
600 void addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link);
601
602 // Coalesce strings that look like parts of the same line.
603 void coalesce(bool physLayout, double fixedPitch, bool doHTML);
604 void coalesce(bool physLayout, double fixedPitch, bool doHTML, double minColSpacing1);
605
606 // Find a string. If <startAtTop> is true, starts looking at the
607 // top of the page; else if <startAtLast> is true, starts looking
608 // immediately after the last find result; else starts looking at
609 // <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the
610 // bottom of the page; else if <stopAtLast> is true, stops looking
611 // just before the last find result; else stops looking at
612 // <xMax>,<yMax>.
613 bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax);
614
615 // Adds new parameter ignoreDiacritics, which will do diacritics
616 // insensitive search, i.e. ignore accents, umlauts, diaeresis,etc.
617 // while matching. This option will be ignored if <s> contains characters
618 // which are not pure ascii.
619 bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax,
620 double *yMax);
621
622 // Adds new parameter <matchAcrossLines>, which allows <s> to match on text
623 // spanning from end of a line to the next line. In that case, the rect for
624 // the part of match that falls on the next line will be stored in
625 // <continueMatch>, and if hyphenation (i.e. ignoring hyphen at end of line)
626 // was used while matching at the end of the line prior to <continueMatch>,
627 // then <ignoredHyphen> will be true, otherwise will be false.
628 // Only finding across two lines is supported, i.e. it won't match where <s>
629 // spans more than two lines.
630 //
631 // <matchAcrossLines> will be ignored if <backward> is true (as that
632 // combination has not been implemented yet).
633 bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool matchAcrossLines, bool backward, bool wholeWord, double *xMin, double *yMin,
634 double *xMax, double *yMax, PDFRectangle *continueMatch, bool *ignoredHyphen);
635
636 // Get the text which is inside the specified rectangle.
637 GooString *getText(double xMin, double yMin, double xMax, double yMax, EndOfLineKind textEOL) const;
638
639 void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style);
640
641 void drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color);
642
643 std::vector<PDFRectangle *> *getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale);
644
645 GooString *getSelectionText(const PDFRectangle *selection, SelectionStyle style);
646
647 std::vector<TextWordSelection *> **getSelectionWords(const PDFRectangle *selection, SelectionStyle style, int *nLines);
648
649 // Find a string by character position and length. If found, sets
650 // the text bounding rectangle and returns true; otherwise returns
651 // false.
652 bool findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const;
653
654 // Dump contents of page to a file.
655 void dump(void *outputStream, TextOutputFunc outputFunc, bool physLayout, EndOfLineKind textEOL, bool pageBreaks);
656
657 // Get the head of the linked list of TextFlows.
658 const TextFlow *getFlows() const { return flows; }
659
660 // If true, will combine characters when a base and combining
661 // character are drawn on eachother.
662 void setMergeCombining(bool merge);
663
664#ifdef TEXTOUT_WORD_LIST
665 // Build a flat word list, in content stream order (if
666 // this->rawOrder is true), physical layout order (if <physLayout>
667 // is true and this->rawOrder is false), or reading order (if both
668 // flags are false).
669 std::unique_ptr<TextWordList> makeWordList(bool physLayout);
670#endif
671
672private:
673 // Destructor.
674 ~TextPage();
675
676 void clear();
677 void assignColumns(TextLineFrag *frags, int nFrags, bool rot) const;
678 int dumpFragment(const Unicode *text, int len, const UnicodeMap *uMap, GooString *s) const;
679 void adjustRotation(TextLine *line, int start, int end, double *xMin, double *xMax, double *yMin, double *yMax);
680
681 bool rawOrder; // keep text in content stream order
682 bool discardDiag; // discard diagonal text
683 bool mergeCombining; // merge when combining and base characters
684 // are drawn on top of each other
685
686 double pageWidth, pageHeight; // width and height of current page
687 TextWord *curWord; // currently active string
688 int charPos; // next character position (within content
689 // stream)
690 TextFontInfo *curFont; // current font
691 double curFontSize; // current font size
692 int nest; // current nesting level (for Type 3 fonts)
693 int nTinyChars; // number of "tiny" chars seen so far
694 bool lastCharOverlap; // set if the last added char overlapped the
695 // previous char
696 bool diagonal; // whether the current text is diagonal
697
698 std::unique_ptr<TextPool> pools[4]; // a "pool" of TextWords for each rotation
699 TextFlow *flows; // linked list of flows
700 TextBlock **blocks; // array of blocks, in yx order
701 int nBlocks; // number of blocks
702 int primaryRot; // primary rotation
703 bool primaryLR; // primary direction (true means L-to-R,
704 // false means R-to-L)
705 TextWord *rawWords; // list of words, in raw order (only if
706 // rawOrder is set)
707 TextWord *rawLastWord; // last word on rawWords list
708
709 std::vector<std::unique_ptr<TextFontInfo>> fonts; // all font info objects used on this page
710
711 double lastFindXMin, // coordinates of the last "find" result
712 lastFindYMin;
713 bool haveLastFind;
714
715 std::vector<std::unique_ptr<TextUnderline>> underlines;
716 std::vector<std::unique_ptr<TextLink>> links;
717
718 int refCnt;
719
720 friend class TextLine;
721 friend class TextLineFrag;
722 friend class TextBlock;
723 friend class TextFlow;
724 friend class TextWordList;
725 friend class TextSelectionPainter;
726 friend class TextSelectionDumper;
727};
728
729//------------------------------------------------------------------------
730// ActualText
731//------------------------------------------------------------------------
732
733class POPPLER_PRIVATE_EXPORT ActualText
734{
735public:
736 // Create an ActualText
737 explicit ActualText(TextPage *out);
738 ~ActualText();
739
740 ActualText(const ActualText &) = delete;
741 ActualText &operator=(const ActualText &) = delete;
742
743 void addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen);
744 void begin(const GfxState *state, const GooString *text);
745 void end(const GfxState *state);
746
747private:
748 TextPage *text;
749
750 GooString *actualText; // replacement text for the span
751 double actualTextX0;
752 double actualTextY0;
753 double actualTextX1;
754 double actualTextY1;
755 int actualTextNBytes;
756};
757
758//------------------------------------------------------------------------
759// TextOutputDev
760//------------------------------------------------------------------------
761
762class POPPLER_PRIVATE_EXPORT TextOutputDev : public OutputDev
763{
764public:
765 static double minColSpacing1_default;
766
767 // Open a text output file. If <fileName> is NULL, no file is
768 // written (this is useful, e.g., for searching text). If
769 // <physLayoutA> is true, the original physical layout of the text
770 // is maintained. If <rawOrder> is true, the text is kept in
771 // content stream order. If <discardDiag> is true, diagonal text
772 // is removed from output.
773 TextOutputDev(const char *fileName, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool append, bool discardDiagA = false);
774
775 // Create a TextOutputDev which will write to a generic stream. If
776 // <physLayoutA> is true, the original physical layout of the text
777 // is maintained. If <rawOrder> is true, the text is kept in
778 // content stream order. If <discardDiag> is true, diagonal text
779 // is removed from output.
780 TextOutputDev(TextOutputFunc func, void *stream, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool discardDiagA = false);
781
782 // Destructor.
783 ~TextOutputDev() override;
784
785 // Check if file was successfully created.
786 virtual bool isOk() { return ok; }
787
788 //---- get info about output device
789
790 // Does this device use upside-down coordinates?
791 // (Upside-down means (0,0) is the top left corner of the page.)
792 bool upsideDown() override { return true; }
793
794 // Does this device use drawChar() or drawString()?
795 bool useDrawChar() override { return true; }
796
797 // Does this device use beginType3Char/endType3Char? Otherwise,
798 // text in Type 3 fonts will be drawn with drawChar/drawString.
799 bool interpretType3Chars() override { return false; }
800
801 // Does this device need non-text content?
802 bool needNonText() override { return false; }
803
804 // Does this device require incCharCount to be called for text on
805 // non-shown layers?
806 bool needCharCount() override { return true; }
807
808 //----- initialization and control
809
810 // Start a page.
811 void startPage(int pageNum, GfxState *state, XRef *xref) override;
812
813 // End a page.
814 void endPage() override;
815
816 //----- save/restore graphics state
817 void restoreState(GfxState *state) override;
818
819 //----- update text state
820 void updateFont(GfxState *state) override;
821
822 //----- text drawing
823 void beginString(GfxState *state, const GooString *s) override;
824 void endString(GfxState *state) override;
825 void drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode c, int nBytes, const Unicode *u, int uLen) override;
826 void incCharCount(int nChars) override;
827 void beginActualText(GfxState *state, const GooString *text) override;
828 void endActualText(GfxState *state) override;
829
830 //----- path painting
831 void stroke(GfxState *state) override;
832 void fill(GfxState *state) override;
833 void eoFill(GfxState *state) override;
834
835 //----- link borders
836 void processLink(AnnotLink *link) override;
837
838 //----- special access
839
840 // Find a string. If <startAtTop> is true, starts looking at the
841 // top of the page; else if <startAtLast> is true, starts looking
842 // immediately after the last find result; else starts looking at
843 // <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the
844 // bottom of the page; else if <stopAtLast> is true, stops looking
845 // just before the last find result; else stops looking at
846 // <xMax>,<yMax>.
847 bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax) const;
848
849 // Get the text which is inside the specified rectangle.
850 GooString *getText(double xMin, double yMin, double xMax, double yMax) const;
851
852 // Find a string by character position and length. If found, sets
853 // the text bounding rectangle and returns true; otherwise returns
854 // false.
855 bool findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const;
856
857 void drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color);
858
859 std::vector<PDFRectangle *> *getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale);
860
861 GooString *getSelectionText(const PDFRectangle *selection, SelectionStyle style);
862
863 // If true, will combine characters when a base and combining
864 // character are drawn on eachother.
865 void setMergeCombining(bool merge);
866
867#ifdef TEXTOUT_WORD_LIST
868 // Build a flat word list, in content stream order (if
869 // this->rawOrder is true), physical layout order (if
870 // this->physLayout is true and this->rawOrder is false), or reading
871 // order (if both flags are false).
872 std::unique_ptr<TextWordList> makeWordList();
873#endif
874
875 // Returns the TextPage object for the last rasterized page,
876 // transferring ownership to the caller.
877 TextPage *takeText();
878
879 // Turn extra processing for HTML conversion on or off.
880 void enableHTMLExtras(bool doHTMLA) { doHTML = doHTMLA; }
881
882 // Get the head of the linked list of TextFlows for the
883 // last rasterized page.
884 const TextFlow *getFlows() const;
885
886 static constexpr EndOfLineKind defaultEndOfLine()
887 {
888#if defined(_WIN32)
889 return eolDOS;
890#else
891 return eolUnix;
892#endif
893 }
894 void setTextEOL(EndOfLineKind textEOLA) { textEOL = textEOLA; }
895 void setTextPageBreaks(bool textPageBreaksA) { textPageBreaks = textPageBreaksA; }
896 double getMinColSpacing1() const { return minColSpacing1; }
897 void setMinColSpacing1(double val) { minColSpacing1 = val; }
898
899private:
900 TextOutputFunc outputFunc; // output function
901 void *outputStream; // output stream
902 bool needClose; // need to close the output file?
903 // (only if outputStream is a FILE*)
904 TextPage *text; // text for the current page
905 bool physLayout; // maintain original physical layout when
906 // dumping text
907 double fixedPitch; // if physLayout is true and this is non-zero,
908 // assume fixed-pitch characters with this
909 // width
910 double minColSpacing1; // see default value defined with same name at TextOutputDev.cc
911 bool rawOrder; // keep text in content stream order
912 bool discardDiag; // Diagonal text, i.e., text that is not close to one of the
913 // 0, 90, 180, or 270 degree axes, is discarded. This is useful
914 // to skip watermarks drawn on top of body text, etc.
915 bool doHTML; // extra processing for HTML conversion
916 bool ok; // set up ok?
917 bool textPageBreaks; // insert end-of-page markers?
918 EndOfLineKind textEOL; // type of EOL marker to use
919
920 ActualText *actualText;
921};
922
923#endif
924

// Source: poppler/poppler/TextOutputDev.h