1 | //======================================================================== |
2 | // |
3 | // TextOutputDev.h |
4 | // |
5 | // Copyright 1997-2003 Glyph & Cog, LLC |
6 | // |
7 | //======================================================================== |
8 | |
9 | //======================================================================== |
10 | // |
11 | // Modified under the Poppler project - http://poppler.freedesktop.org |
12 | // |
13 | // All changes made under the Poppler project to this file are licensed |
14 | // under GPL version 2 or later |
15 | // |
16 | // Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com> |
17 | // Copyright (C) 2006 Ed Catmur <ed@catmur.co.uk> |
18 | // Copyright (C) 2007, 2008, 2011, 2013 Carlos Garcia Campos <carlosgc@gnome.org> |
19 | // Copyright (C) 2007, 2017 Adrian Johnson <ajohnson@redneon.com> |
20 | // Copyright (C) 2008, 2010, 2015, 2016, 2018, 2019, 2021 Albert Astals Cid <aacid@kde.org> |
21 | // Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com> |
22 | // Copyright (C) 2012, 2013, 2015, 2016 Jason Crain <jason@aquaticape.us> |
23 | // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de> |
24 | // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich |
25 | // Copyright (C) 2018 Sanchit Anand <sanxchit@gmail.com> |
26 | // Copyright (C) 2018, 2020, 2021 Nelson Benítez León <nbenitezl@gmail.com> |
27 | // Copyright (C) 2019, 2022 Oliver Sander <oliver.sander@tu-dresden.de> |
28 | // Copyright (C) 2019 Dan Shea <dan.shea@logical-innovations.com> |
29 | // Copyright (C) 2020 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp> |
30 | // Copyright (C) 2024 Stefan Brüns <stefan.bruens@rwth-aachen.de> |
31 | // |
32 | // To see a description of the changes please see the Changelog file that |
33 | // came with your tarball or type make ChangeLog if you are building from git |
34 | // |
35 | //======================================================================== |
36 | |
37 | #ifndef TEXTOUTPUTDEV_H |
38 | #define TEXTOUTPUTDEV_H |
39 | |
40 | #include "poppler-config.h" |
41 | #include "poppler_private_export.h" |
42 | #include <cstdio> |
43 | #include "GfxFont.h" |
44 | #include "GfxState.h" |
45 | #include "OutputDev.h" |
46 | |
47 | class GooString; |
48 | class Gfx; |
49 | class GfxFont; |
50 | class GfxState; |
51 | class UnicodeMap; |
52 | class AnnotLink; |
53 | |
54 | class TextWord; |
55 | class TextPool; |
56 | class TextLine; |
57 | class TextLineFrag; |
58 | class TextBlock; |
59 | class TextFlow; |
60 | class TextLink; |
61 | class TextUnderline; |
62 | class TextWordList; |
63 | class TextPage; |
64 | class TextSelectionVisitor; |
65 | |
66 | //------------------------------------------------------------------------ |
67 | |
// Output callback signature: invoked with the caller-supplied opaque
// <stream> handle, a text buffer and that buffer's length.
using TextOutputFunc = void (*)(void *stream, const char *text, int len);
69 | |
// Selection unit handed to the visitSelection() / getSelection*() /
// drawSelection() entry points declared further down.
enum SelectionStyle
{
    selectionStyleGlyph = 0, // glyph
    selectionStyleWord = 1, // word
    selectionStyleLine = 2 // line
};
76 | |
// End-of-line convention used when text is emitted.
enum EndOfLineKind
{
    eolUnix = 0, // LF
    eolDOS = 1, // CR+LF
    eolMac = 2 // CR
};
83 | |
84 | //------------------------------------------------------------------------ |
85 | // TextFontInfo |
86 | //------------------------------------------------------------------------ |
87 | |
88 | class POPPLER_PRIVATE_EXPORT TextFontInfo |
89 | { |
90 | public: |
91 | explicit TextFontInfo(const GfxState *state); |
92 | ~TextFontInfo(); |
93 | |
94 | TextFontInfo(const TextFontInfo &) = delete; |
95 | TextFontInfo &operator=(const TextFontInfo &) = delete; |
96 | |
97 | bool matches(const GfxState *state) const; |
98 | bool matches(const TextFontInfo *fontInfo) const; |
99 | bool matches(const Ref *ref) const; |
100 | |
101 | // Get the font ascent, or a default value if the font is not set |
102 | double getAscent() const; |
103 | |
104 | // Get the font descent, or a default value if the font is not set |
105 | double getDescent() const; |
106 | |
107 | // Get the writing mode (0 or 1), or 0 if the font is not set |
108 | int getWMode() const; |
109 | |
110 | #ifdef TEXTOUT_WORD_LIST |
111 | // Get the font name (which may be NULL). |
112 | const GooString *getFontName() const { return fontName; } |
113 | |
114 | // Get font descriptor flags. |
115 | bool isFixedWidth() const { return flags & fontFixedWidth; } |
116 | bool isSerif() const { return flags & fontSerif; } |
117 | bool isSymbolic() const { return flags & fontSymbolic; } |
118 | bool isItalic() const { return flags & fontItalic; } |
119 | bool isBold() const { return flags & fontBold; } |
120 | #endif |
121 | |
122 | private: |
123 | std::shared_ptr<GfxFont> gfxFont; |
124 | #ifdef TEXTOUT_WORD_LIST |
125 | GooString *fontName; |
126 | int flags; |
127 | #endif |
128 | |
129 | friend class TextWord; |
130 | friend class TextPage; |
131 | friend class TextSelectionPainter; |
132 | }; |
133 | |
134 | //------------------------------------------------------------------------ |
135 | // TextWord |
136 | //------------------------------------------------------------------------ |
137 | |
138 | class POPPLER_PRIVATE_EXPORT TextWord |
139 | { |
140 | public: |
141 | // Constructor. |
142 | TextWord(const GfxState *state, int rotA, double fontSize); |
143 | |
144 | // Destructor. |
145 | ~TextWord(); |
146 | |
147 | TextWord(const TextWord &) = delete; |
148 | TextWord &operator=(const TextWord &) = delete; |
149 | |
150 | // Add a character to the word. |
151 | void addChar(const GfxState *state, TextFontInfo *fontA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA); |
152 | |
153 | // Attempt to add a character to the word as a combining character. |
154 | // Either character u or the last character in the word must be an |
155 | // acute, dieresis, or other combining character. Returns true if |
156 | // the character was added. |
157 | bool addCombining(const GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA); |
158 | |
159 | // Merge <word> onto the end of <this>. |
160 | void merge(TextWord *word); |
161 | |
162 | // Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>), |
163 | // based on a primary-axis comparison, e.g., x ordering if rot=0. |
164 | int primaryCmp(const TextWord *word) const; |
165 | |
166 | // Return the distance along the primary axis between <this> and |
167 | // <word>. |
168 | double primaryDelta(const TextWord *word) const; |
169 | |
170 | static int cmpYX(const void *p1, const void *p2); |
171 | |
172 | void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style); |
173 | |
174 | // Get the TextFontInfo object associated with a character. |
175 | const TextFontInfo *getFontInfo(int idx) const { return chars[idx].font; } |
176 | |
177 | // Get the next TextWord on the linked list. |
178 | const TextWord *getNext() const { return next; } |
179 | |
180 | #ifdef TEXTOUT_WORD_LIST |
181 | int getLength() const { return chars.size(); } |
182 | const Unicode *getChar(int idx) const { return &chars[idx].text; } |
183 | GooString *getText() const; |
184 | const GooString *getFontName(int idx) const { return chars[idx].font->fontName; } |
185 | void getColor(double *r, double *g, double *b) const |
186 | { |
187 | *r = colorR; |
188 | *g = colorG; |
189 | *b = colorB; |
190 | } |
191 | void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const |
192 | { |
193 | *xMinA = xMin; |
194 | *yMinA = yMin; |
195 | *xMaxA = xMax; |
196 | *yMaxA = yMax; |
197 | } |
198 | void getCharBBox(int charIdx, double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const; |
199 | double getFontSize() const { return fontSize; } |
200 | int getRotation() const { return rot; } |
201 | int getCharPos() const { return chars.empty() ? 0 : chars.front().charPos; } |
202 | int getCharLen() const { return chars.empty() ? 0 : chars.back().charPos - chars.front().charPos; } |
203 | bool getSpaceAfter() const { return spaceAfter; } |
204 | #endif |
205 | bool isUnderlined() const { return underlined; } |
206 | const AnnotLink *getLink() const { return link; } |
207 | double getEdge(int i) const { return chars[i].edge; } |
208 | double getBaseline() const { return base; } |
209 | bool hasSpaceAfter() const { return spaceAfter; } |
210 | const TextWord *nextWord() const { return next; }; |
211 | auto len() const { return chars.size(); } |
212 | |
213 | private: |
214 | void setInitialBounds(TextFontInfo *fontA, double x, double y); |
215 | |
216 | int rot; // rotation, multiple of 90 degrees |
217 | // (0, 1, 2, or 3) |
218 | int wMode; // horizontal (0) or vertical (1) writing mode |
219 | double xMin, xMax; // bounding box x coordinates |
220 | double yMin, yMax; // bounding box y coordinates |
221 | double base; // baseline x or y coordinate |
222 | |
223 | double fontSize; // font size |
224 | |
225 | struct CharInfo |
226 | { |
227 | Unicode text; |
228 | CharCode charcode; |
229 | int charPos; |
230 | double edge; |
231 | TextFontInfo *font; |
232 | Matrix textMat; |
233 | }; |
234 | std::vector<CharInfo> chars; |
235 | int charPosEnd = 0; |
236 | double edgeEnd = 0; |
237 | |
238 | bool spaceAfter; // set if there is a space between this |
239 | // word and the next word on the line |
240 | bool underlined; |
241 | bool invisible; // whether we are invisible (glyphless) |
242 | TextWord *next; // next word in line |
243 | |
244 | #ifdef TEXTOUT_WORD_LIST |
245 | double colorR, // word color |
246 | colorG, colorB; |
247 | #endif |
248 | |
249 | AnnotLink *link; |
250 | |
251 | friend class TextPool; |
252 | friend class TextLine; |
253 | friend class TextBlock; |
254 | friend class TextFlow; |
255 | friend class TextWordList; |
256 | friend class TextPage; |
257 | |
258 | friend class TextSelectionPainter; |
259 | friend class TextSelectionDumper; |
260 | }; |
261 | |
262 | //------------------------------------------------------------------------ |
263 | // TextPool |
264 | //------------------------------------------------------------------------ |
265 | |
266 | class TextPool |
267 | { |
268 | public: |
269 | TextPool(); |
270 | ~TextPool(); |
271 | |
272 | TextPool(const TextPool &) = delete; |
273 | TextPool &operator=(const TextPool &) = delete; |
274 | |
275 | TextWord *getPool(int baseIdx) { return pool[baseIdx - minBaseIdx]; } |
276 | void setPool(int baseIdx, TextWord *p) { pool[baseIdx - minBaseIdx] = p; } |
277 | |
278 | int getBaseIdx(double base) const; |
279 | |
280 | void addWord(TextWord *word); |
281 | |
282 | private: |
283 | int minBaseIdx; // min baseline bucket index |
284 | int maxBaseIdx; // max baseline bucket index |
285 | TextWord **pool; // array of linked lists, one for each |
286 | // baseline value (multiple of 4 pts) |
287 | TextWord *cursor; // pointer to last-accessed word |
288 | int cursorBaseIdx; // baseline bucket index of last-accessed word |
289 | |
290 | friend class TextBlock; |
291 | friend class TextPage; |
292 | }; |
293 | |
294 | struct TextFlowData; |
295 | |
296 | //------------------------------------------------------------------------ |
297 | // TextLine |
298 | //------------------------------------------------------------------------ |
299 | |
// A line of text inside a TextBlock: a linked list of TextWords plus
// per-character arrays (Unicode text, edges, columns) and normalized /
// ASCII variants of the text.  Lines are themselves chained through
// <next>.  Instances cannot be copied.
class TextLine
{
public:
    // <blkA> is the parent block, <rotA> the text rotation and <baseA>
    // the baseline coordinate.
    TextLine(TextBlock *blkA, int rotA, double baseA);
    ~TextLine();

    TextLine(const TextLine &) = delete;
    TextLine &operator=(const TextLine &) = delete;

    // Add <word> to this line.
    void addWord(TextWord *word);

    // Return the distance along the primary axis between <this> and
    // <line>.
    double primaryDelta(const TextLine *line) const;

    // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
    // based on a primary-axis comparison, e.g., x ordering if rot=0.
    int primaryCmp(const TextLine *line) const;

    // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
    // based on a secondary-axis comparison of the baselines, e.g., y
    // ordering if rot=0.
    int secondaryCmp(const TextLine *line) const;

    // Compares <this> to <line>.
    // NOTE(review): the name suggests y-then-x ordering -- confirm
    // against the implementation.
    int cmpYX(const TextLine *line) const;

    // Comparator taking two untyped pointers.
    // NOTE(review): presumably an x-then-y ordering for a qsort-style
    // sort -- confirm against the implementation.
    static int cmpXY(const void *p1, const void *p2);

    // Coalesce the words of this line, using <uMap>.
    // NOTE(review): presumably this is what fills the per-character
    // arrays below -- confirm against the implementation.
    void coalesce(const UnicodeMap *uMap);

    // Report the part of this line covered by <selection> to <visitor>,
    // using the selection unit <style>.
    void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style);

    // Get the head of the linked list of TextWords.
    const TextWord *getWords() const { return words; }

    // Get the next TextLine on the linked list.
    const TextLine *getNext() const { return next; }

    // Returns true if the last char of the line is a hyphen.
    bool isHyphenated() const { return hyphenated; }

private:
    TextBlock *blk; // parent block
    int rot; // text rotation
    double xMin, xMax; // bounding box x coordinates
    double yMin, yMax; // bounding box y coordinates
    double base; // baseline x or y coordinate
    TextWord *words; // words in this line
    TextWord *lastWord; // last word in this line
    Unicode *text; // Unicode text of the line, including
                   // spaces between words
    double *edge; // "near" edge x or y coord of each char
                  // (plus one extra entry for the last char)
    int *col; // starting column number of each Unicode char
    int len; // number of Unicode chars
    int convertedLen; // total number of converted characters
    bool hyphenated; // set if last char is a hyphen
    TextLine *next; // next line in block
    Unicode *normalized; // normalized form of Unicode text
    int normalized_len; // number of normalized Unicode chars
    int *normalized_idx; // indices of normalized chars into Unicode text
    Unicode *ascii_translation; // ascii translation from the normalized text
    int ascii_len; // length of ascii translation text
    int *ascii_idx; // indices of ascii chars into Unicode text of line

    friend class TextLineFrag;
    friend class TextBlock;
    friend class TextFlow;
    friend class TextWordList;
    friend class TextPage;

    friend class TextSelectionPainter;
    friend class TextSelectionSizer;
    friend class TextSelectionDumper;
};
375 | |
376 | //------------------------------------------------------------------------ |
377 | // TextBlock |
378 | //------------------------------------------------------------------------ |
379 | |
// A block of text on a TextPage: a linked list of TextLines with a
// bounding box, an extended bounding box and table bookkeeping.  Until
// the lines are built the words live in <pool>.  Blocks are chained
// through <next>.  Instances cannot be copied.
class TextBlock
{
public:
    // <pageA> is the parent page and <rotA> the text rotation.
    TextBlock(TextPage *pageA, int rotA);
    ~TextBlock();

    TextBlock(const TextBlock &) = delete;
    TextBlock &operator=(const TextBlock &) = delete;

    // Add <word> to this block.
    void addWord(TextWord *word);

    // Coalesce the contents of this block, using <uMap> and <fixedPitch>.
    // NOTE(review): presumably builds the lines from the word pool --
    // confirm against the implementation.
    void coalesce(const UnicodeMap *uMap, double fixedPitch);

    // Update this block's priMin and priMax values, looking at <blk>.
    void updatePriMinMax(const TextBlock *blk);

    // Comparators taking two untyped pointers.
    // NOTE(review): the names suggest x-then-y and y-then-x orderings
    // relative to the page's primary rotation -- confirm against the
    // implementation.
    static int cmpXYPrimaryRot(const void *p1, const void *p2);

    static int cmpYXPrimaryRot(const void *p1, const void *p2);

    // Compare <this> to <blk>.
    // NOTE(review): presumably along the primary axis with a
    // -1 / 0 / +1 result like TextLine::primaryCmp -- confirm.
    int primaryCmp(const TextBlock *blk) const;

    // Distance between <this> and <blk>.
    // NOTE(review): presumably along the secondary axis -- confirm.
    double secondaryDelta(const TextBlock *blk) const;

    // Returns true if <this> is below <blk>, relative to the page's
    // primary rotation.
    bool isBelow(const TextBlock *blk) const;

    // Report the part of this block covered by <selection> to <visitor>,
    // using the selection unit <style>.
    void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style);

    // Get the head of the linked list of TextLines.
    const TextLine *getLines() const { return lines; }

    // Get the next TextBlock on the linked list.
    const TextBlock *getNext() const { return next; }

    // Copy the block bounding box into the four output parameters.
    void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const
    {
        *xMinA = xMin;
        *yMinA = yMin;
        *xMaxA = xMax;
        *yMaxA = yMax;
    }

    // Number of lines in the block.
    int getLineCount() const { return nLines; }

private:
    // Ordering predicates between blocks; the rules themselves are
    // defined in the implementation file.
    bool isBeforeByRule1(const TextBlock *blk1);
    bool isBeforeByRepeatedRule1(const TextBlock *blkList, const TextBlock *blk1);
    bool isBeforeByRule2(const TextBlock *blk1);

    // Depth-first visit over <blkList> writing into <sorted> and marking
    // <visited>; the second overload additionally takes a cache.
    // NOTE(review): the meaning of the return value is not visible in
    // this header -- confirm against the implementation.
    int visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited);
    int visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited, TextBlock **cache, int cacheSize);

    TextPage *page; // the parent page
    int rot; // text rotation
    double xMin, xMax; // bounding box x coordinates
    double yMin, yMax; // bounding box y coordinates
    double priMin, priMax; // whitespace bounding box along primary axis
    double ExMin, ExMax; // extended bounding box x coordinates
    double EyMin, EyMax; // extended bounding box y coordinates
    int tableId; // id of table to which this block belongs
    bool tableEnd; // is this block at end of line of actual table

    TextPool *pool; // pool of words (used only until lines
                    // are built)
    TextLine *lines; // linked list of lines
    TextLine *curLine; // most recently added line
    int nLines; // number of lines
    int charCount; // number of characters in the block
    int col; // starting column
    int nColumns; // number of columns in the block

    TextBlock *next; // next block on the linked list
    TextBlock *stackNext; // NOTE(review): link for a secondary
                          // (stack) list -- confirm usage

    friend class TextLine;
    friend class TextLineFrag;
    friend class TextFlow;
    friend class TextWordList;
    friend class TextPage;
    friend class TextSelectionPainter;
    friend class TextSelectionDumper;
};
464 | |
465 | //------------------------------------------------------------------------ |
466 | // TextFlow |
467 | //------------------------------------------------------------------------ |
468 | |
// A flow: a linked list of TextBlocks on a TextPage, with a bounding
// box and a whitespace bounding box along the primary axis.  Flows are
// chained through <next>.  Instances cannot be copied.
class TextFlow
{
public:
    // <pageA> is the parent page; <blk> is a block for the new flow.
    // NOTE(review): presumably <blk> becomes the first block of the
    // flow -- confirm against the implementation.
    TextFlow(TextPage *pageA, TextBlock *blk);
    ~TextFlow();

    TextFlow(const TextFlow &) = delete;
    TextFlow &operator=(const TextFlow &) = delete;

    // Add a block to the end of this flow.
    void addBlock(TextBlock *blk);

    // Returns true if <blk> fits below <prevBlk> in the flow, i.e., (1)
    // it uses a font no larger than the last block added to the flow,
    // and (2) it fits within the flow's [priMin, priMax] along the
    // primary axis.
    bool blockFits(const TextBlock *blk, const TextBlock *prevBlk) const;

    // Get the head of the linked list of TextBlocks.
    const TextBlock *getBlocks() const { return blocks; }

    // Get the next TextFlow on the linked list.
    const TextFlow *getNext() const { return next; }

private:
    TextPage *page; // the parent page
    double xMin, xMax; // bounding box x coordinates
    double yMin, yMax; // bounding box y coordinates
    double priMin, priMax; // whitespace bounding box along primary axis
    TextBlock *blocks; // blocks in flow
    TextBlock *lastBlk; // last block in this flow
    TextFlow *next; // next flow on the linked list

    friend class TextWordList;
    friend class TextPage;
};
505 | |
506 | #ifdef TEXTOUT_WORD_LIST |
507 | |
508 | //------------------------------------------------------------------------ |
509 | // TextWordList |
510 | //------------------------------------------------------------------------ |
511 | |
// A flat list of the words of a TextPage, held as a vector of TextWord
// pointers.  Only available when TEXTOUT_WORD_LIST is defined.
// Instances cannot be copied.
class POPPLER_PRIVATE_EXPORT TextWordList
{
public:
    // Build a flat word list, in content stream order (if
    // text->rawOrder is true), physical layout order (if <physLayout>
    // is true and text->rawOrder is false), or reading order (if both
    // flags are false).
    TextWordList(const TextPage *text, bool physLayout);

    ~TextWordList();

    TextWordList(const TextWordList &) = delete;
    TextWordList &operator=(const TextWordList &) = delete;

    // Return the number of words on the list.
    int getLength() const;

    // Return the <idx>th word from the list.
    // NOTE(review): behaviour for an out-of-range <idx> is not visible
    // in this header -- confirm against the implementation.
    TextWord *get(int idx);

private:
    std::vector<TextWord *> words; // the words, in list order
};
535 | |
536 | #endif // TEXTOUT_WORD_LIST |
537 | |
538 | class TextWordSelection |
539 | { |
540 | public: |
541 | TextWordSelection(const TextWord *wordA, int beginA, int endA) : word(wordA), begin(beginA), end(endA) { } |
542 | |
543 | const TextWord *getWord() const { return word; } |
544 | int getBegin() const { return begin; } |
545 | int getEnd() const { return end; } |
546 | |
547 | private: |
548 | const TextWord *word; |
549 | int begin; |
550 | int end; |
551 | |
552 | friend class TextSelectionPainter; |
553 | friend class TextSelectionDumper; |
554 | }; |
555 | |
556 | //------------------------------------------------------------------------ |
557 | // TextPage |
558 | //------------------------------------------------------------------------ |
559 | |
// The text content of one page: words are fed in through beginWord() /
// addChar() / endWord(), organised by coalesce(), and then queried by
// the find, selection and dump functions.  The class carries a
// reference count (incRefCnt() / decRefCnt()) and its destructor is
// private, so it cannot be destroyed directly by outside code.
// Instances cannot be copied.
class POPPLER_PRIVATE_EXPORT TextPage
{
public:
    // Constructor.
    explicit TextPage(bool rawOrderA, bool discardDiagA = false);

    TextPage(const TextPage &) = delete;
    TextPage &operator=(const TextPage &) = delete;

    // Reference counting.
    // NOTE(review): presumably decRefCnt() destroys the page when the
    // count reaches zero -- confirm against the implementation.
    void incRefCnt();
    void decRefCnt();

    // Start a new page.
    void startPage(const GfxState *state);

    // End the current page.
    void endPage();

    // Update the current font.
    void updateFont(const GfxState *state);

    // Begin a new word.
    void beginWord(const GfxState *state);

    // Add a character to the current word.
    void addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen);

    // Add <nChars> invisible characters.
    void incCharCount(int nChars);

    // End the current word, sorting it into the list of words.
    void endWord();

    // Add a word, sorting it into the list of words.
    void addWord(TextWord *word);

    // Add a (potential) underline.
    void addUnderline(double x0, double y0, double x1, double y1);

    // Add a hyperlink.
    void addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link);

    // Coalesce strings that look like parts of the same line.
    void coalesce(bool physLayout, double fixedPitch, bool doHTML);
    void coalesce(bool physLayout, double fixedPitch, bool doHTML, double minColSpacing1);

    // Find a string.  If <startAtTop> is true, starts looking at the
    // top of the page; else if <startAtLast> is true, starts looking
    // immediately after the last find result; else starts looking at
    // <xMin>,<yMin>.  If <stopAtBottom> is true, stops looking at the
    // bottom of the page; else if <stopAtLast> is true, stops looking
    // just before the last find result; else stops looking at
    // <xMax>,<yMax>.
    bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax);

    // Adds new parameter <ignoreDiacritics>, which will do a
    // diacritics-insensitive search, i.e. ignore accents, umlauts,
    // diaeresis, etc. while matching.  This option will be ignored if
    // <s> contains characters which are not pure ASCII.
    bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax,
                  double *yMax);

    // Adds new parameter <matchAcrossLines>, which allows <s> to match on text
    // spanning from end of a line to the next line.  In that case, the rect for
    // the part of match that falls on the next line will be stored in
    // <continueMatch>, and if hyphenation (i.e. ignoring hyphen at end of line)
    // was used while matching at the end of the line prior to <continueMatch>,
    // then <ignoredHyphen> will be true, otherwise will be false.
    // Only finding across two lines is supported, i.e. it won't match where <s>
    // spans more than two lines.
    //
    // <matchAcrossLines> will be ignored if <backward> is true (as that
    // combination has not been implemented yet).
    bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool matchAcrossLines, bool backward, bool wholeWord, double *xMin, double *yMin,
                  double *xMax, double *yMax, PDFRectangle *continueMatch, bool *ignoredHyphen);

    // Get the text which is inside the specified rectangle.
    // NOTE(review): ownership of the returned string is not visible in
    // this header -- confirm against the implementation.
    GooString *getText(double xMin, double yMin, double xMax, double yMax, EndOfLineKind textEOL) const;

    // Report the part of the page covered by <selection> to <visitor>,
    // using the selection unit <style>.
    void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style);

    // Draw the selection <selection> on <out> with the given scale,
    // rotation and glyph / box colors.
    void drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color);

    // Get the region covered by the selection as a list of rectangles.
    // NOTE(review): ownership of the returned vector and of its elements
    // is not visible in this header -- confirm.
    std::vector<PDFRectangle *> *getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale);

    // Get the text covered by the selection.
    // NOTE(review): ownership of the returned string is not visible in
    // this header -- confirm.
    GooString *getSelectionText(const PDFRectangle *selection, SelectionStyle style);

    // Get the selected words as an array of per-line lists; the number
    // of lines is stored in <nLines>.
    // NOTE(review): ownership of the returned array and of its elements
    // is not visible in this header -- confirm.
    std::vector<TextWordSelection *> **getSelectionWords(const PDFRectangle *selection, SelectionStyle style, int *nLines);

    // Find a string by character position and length.  If found, sets
    // the text bounding rectangle and returns true; otherwise returns
    // false.
    bool findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const;

    // Dump contents of page to a file.
    void dump(void *outputStream, TextOutputFunc outputFunc, bool physLayout, EndOfLineKind textEOL, bool pageBreaks);

    // Get the head of the linked list of TextFlows.
    const TextFlow *getFlows() const { return flows; }

    // If true, will combine characters when a base and combining
    // character are drawn on each other.
    void setMergeCombining(bool merge);

#ifdef TEXTOUT_WORD_LIST
    // Build a flat word list, in content stream order (if
    // this->rawOrder is true), physical layout order (if <physLayout>
    // is true and this->rawOrder is false), or reading order (if both
    // flags are false).
    std::unique_ptr<TextWordList> makeWordList(bool physLayout);
#endif

private:
    // Destructor.  Private: outside code cannot delete a TextPage
    // directly.
    ~TextPage();

    // Internal helpers; their behaviour is defined in the implementation
    // file.
    void clear();
    void assignColumns(TextLineFrag *frags, int nFrags, bool rot) const;
    int dumpFragment(const Unicode *text, int len, const UnicodeMap *uMap, GooString *s) const;
    void adjustRotation(TextLine *line, int start, int end, double *xMin, double *xMax, double *yMin, double *yMax);

    bool rawOrder; // keep text in content stream order
    bool discardDiag; // discard diagonal text
    bool mergeCombining; // merge when combining and base characters
                         // are drawn on top of each other

    double pageWidth, pageHeight; // width and height of current page
    TextWord *curWord; // currently active string
    int charPos; // next character position (within content
                 // stream)
    TextFontInfo *curFont; // current font
    double curFontSize; // current font size
    int nest; // current nesting level (for Type 3 fonts)
    int nTinyChars; // number of "tiny" chars seen so far
    bool lastCharOverlap; // set if the last added char overlapped the
                          // previous char
    bool diagonal; // whether the current text is diagonal

    std::unique_ptr<TextPool> pools[4]; // a "pool" of TextWords for each rotation
    TextFlow *flows; // linked list of flows
    TextBlock **blocks; // array of blocks, in yx order
    int nBlocks; // number of blocks
    int primaryRot; // primary rotation
    bool primaryLR; // primary direction (true means L-to-R,
                    // false means R-to-L)
    TextWord *rawWords; // list of words, in raw order (only if
                        // rawOrder is set)
    TextWord *rawLastWord; // last word on rawWords list

    std::vector<std::unique_ptr<TextFontInfo>> fonts; // all font info objects used on this page

    double lastFindXMin, // coordinates of the last "find" result
            lastFindYMin;
    bool haveLastFind; // NOTE(review): presumably set once a find
                       // result has been recorded -- confirm

    std::vector<std::unique_ptr<TextUnderline>> underlines; // underlines added via addUnderline()
    std::vector<std::unique_ptr<TextLink>> links; // links added via addLink()

    int refCnt; // reference count (see incRefCnt / decRefCnt)

    friend class TextLine;
    friend class TextLineFrag;
    friend class TextBlock;
    friend class TextFlow;
    friend class TextWordList;
    friend class TextSelectionPainter;
    friend class TextSelectionDumper;
};
728 | |
729 | //------------------------------------------------------------------------ |
730 | // ActualText |
731 | //------------------------------------------------------------------------ |
732 | |
// Helper that tracks a span of replacement text ("ActualText") on
// behalf of a TextPage: begin() opens a span with its replacement
// string, addChar() is called for characters drawn inside it, and end()
// closes it.  Instances cannot be copied.
class POPPLER_PRIVATE_EXPORT ActualText
{
public:
    // Create an ActualText
    explicit ActualText(TextPage *out);
    ~ActualText();

    ActualText(const ActualText &) = delete;
    ActualText &operator=(const ActualText &) = delete;

    // Same parameter list as TextPage::addChar().
    // NOTE(review): presumably characters are accumulated while a span
    // is open and forwarded to the page otherwise -- confirm against the
    // implementation.
    void addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen);
    // Open a span whose replacement text is <text>.
    void begin(const GfxState *state, const GooString *text);
    // Close the current span.
    void end(const GfxState *state);

private:
    TextPage *text; // the page given to the constructor

    GooString *actualText; // replacement text for the span
    // NOTE(review): the four coordinates below are presumably the
    // bounding box of the characters seen in the current span, and
    // actualTextNBytes their total byte count -- confirm.
    double actualTextX0;
    double actualTextY0;
    double actualTextX1;
    double actualTextY1;
    int actualTextNBytes;
};
757 | |
758 | //------------------------------------------------------------------------ |
759 | // TextOutputDev |
760 | //------------------------------------------------------------------------ |
761 | |
762 | class POPPLER_PRIVATE_EXPORT TextOutputDev : public OutputDev |
763 | { |
764 | public: |
765 | static double minColSpacing1_default; |
766 | |
767 | // Open a text output file. If <fileName> is NULL, no file is |
768 | // written (this is useful, e.g., for searching text). If |
769 | // <physLayoutA> is true, the original physical layout of the text |
770 | // is maintained. If <rawOrder> is true, the text is kept in |
771 | // content stream order. If <discardDiag> is true, diagonal text |
772 | // is removed from output. |
773 | TextOutputDev(const char *fileName, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool append, bool discardDiagA = false); |
774 | |
775 | // Create a TextOutputDev which will write to a generic stream. If |
776 | // <physLayoutA> is true, the original physical layout of the text |
777 | // is maintained. If <rawOrder> is true, the text is kept in |
778 | // content stream order. If <discardDiag> is true, diagonal text |
779 | // is removed from output. |
780 | TextOutputDev(TextOutputFunc func, void *stream, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool discardDiagA = false); |
781 | |
782 | // Destructor. |
783 | ~TextOutputDev() override; |
784 | |
785 | // Check if file was successfully created. |
786 | virtual bool isOk() { return ok; } |
787 | |
788 | //---- get info about output device |
789 | |
790 | // Does this device use upside-down coordinates? |
791 | // (Upside-down means (0,0) is the top left corner of the page.) |
792 | bool upsideDown() override { return true; } |
793 | |
794 | // Does this device use drawChar() or drawString()? |
795 | bool useDrawChar() override { return true; } |
796 | |
797 | // Does this device use beginType3Char/endType3Char? Otherwise, |
798 | // text in Type 3 fonts will be drawn with drawChar/drawString. |
799 | bool interpretType3Chars() override { return false; } |
800 | |
801 | // Does this device need non-text content? |
802 | bool needNonText() override { return false; } |
803 | |
804 | // Does this device require incCharCount to be called for text on |
805 | // non-shown layers? |
806 | bool needCharCount() override { return true; } |
807 | |
808 | //----- initialization and control |
809 | |
810 | // Start a page. |
811 | void startPage(int pageNum, GfxState *state, XRef *xref) override; |
812 | |
813 | // End a page. |
814 | void endPage() override; |
815 | |
816 | //----- save/restore graphics state |
817 | void restoreState(GfxState *state) override; |
818 | |
819 | //----- update text state |
820 | void updateFont(GfxState *state) override; |
821 | |
822 | //----- text drawing |
823 | void beginString(GfxState *state, const GooString *s) override; |
824 | void endString(GfxState *state) override; |
825 | void drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode c, int nBytes, const Unicode *u, int uLen) override; |
826 | void incCharCount(int nChars) override; |
827 | void beginActualText(GfxState *state, const GooString *text) override; |
828 | void endActualText(GfxState *state) override; |
829 | |
830 | //----- path painting |
831 | void stroke(GfxState *state) override; |
832 | void fill(GfxState *state) override; |
833 | void eoFill(GfxState *state) override; |
834 | |
835 | //----- link borders |
836 | void processLink(AnnotLink *link) override; |
837 | |
838 | //----- special access |
839 | |
840 | // Find a string. If <startAtTop> is true, starts looking at the |
841 | // top of the page; else if <startAtLast> is true, starts looking |
842 | // immediately after the last find result; else starts looking at |
843 | // <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the |
844 | // bottom of the page; else if <stopAtLast> is true, stops looking |
845 | // just before the last find result; else stops looking at |
846 | // <xMax>,<yMax>. |
847 | bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax) const; |
848 | |
849 | // Get the text which is inside the specified rectangle. |
850 | GooString *getText(double xMin, double yMin, double xMax, double yMax) const; |
851 | |
852 | // Find a string by character position and length. If found, sets |
853 | // the text bounding rectangle and returns true; otherwise returns |
854 | // false. |
855 | bool findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const; |
856 | |
857 | void drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color); |
858 | |
859 | std::vector<PDFRectangle *> *getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale); |
860 | |
861 | GooString *getSelectionText(const PDFRectangle *selection, SelectionStyle style); |
862 | |
863 | // If true, will combine characters when a base and combining |
864 | // character are drawn on eachother. |
865 | void setMergeCombining(bool merge); |
866 | |
867 | #ifdef TEXTOUT_WORD_LIST |
868 | // Build a flat word list, in content stream order (if |
869 | // this->rawOrder is true), physical layout order (if |
870 | // this->physLayout is true and this->rawOrder is false), or reading |
871 | // order (if both flags are false). |
872 | std::unique_ptr<TextWordList> makeWordList(); |
873 | #endif |
874 | |
875 | // Returns the TextPage object for the last rasterized page, |
876 | // transferring ownership to the caller. |
877 | TextPage *takeText(); |
878 | |
879 | // Turn extra processing for HTML conversion on or off. |
880 | void (bool doHTMLA) { doHTML = doHTMLA; } |
881 | |
882 | // Get the head of the linked list of TextFlows for the |
883 | // last rasterized page. |
884 | const TextFlow *getFlows() const; |
885 | |
886 | static constexpr EndOfLineKind defaultEndOfLine() |
887 | { |
888 | #if defined(_WIN32) |
889 | return eolDOS; |
890 | #else |
891 | return eolUnix; |
892 | #endif |
893 | } |
894 | void setTextEOL(EndOfLineKind textEOLA) { textEOL = textEOLA; } |
895 | void setTextPageBreaks(bool textPageBreaksA) { textPageBreaks = textPageBreaksA; } |
896 | double getMinColSpacing1() const { return minColSpacing1; } |
897 | void setMinColSpacing1(double val) { minColSpacing1 = val; } |
898 | |
899 | private: |
900 | TextOutputFunc outputFunc; // output function |
901 | void *outputStream; // output stream |
902 | bool needClose; // need to close the output file? |
903 | // (only if outputStream is a FILE*) |
904 | TextPage *text; // text for the current page |
905 | bool physLayout; // maintain original physical layout when |
906 | // dumping text |
907 | double fixedPitch; // if physLayout is true and this is non-zero, |
908 | // assume fixed-pitch characters with this |
909 | // width |
910 | double minColSpacing1; // see default value defined with same name at TextOutputDev.cc |
911 | bool rawOrder; // keep text in content stream order |
912 | bool discardDiag; // Diagonal text, i.e., text that is not close to one of the |
913 | // 0, 90, 180, or 270 degree axes, is discarded. This is useful |
914 | // to skip watermarks drawn on top of body text, etc. |
915 | bool doHTML; // extra processing for HTML conversion |
916 | bool ok; // set up ok? |
917 | bool textPageBreaks; // insert end-of-page markers? |
918 | EndOfLineKind textEOL; // type of EOL marker to use |
919 | |
920 | ActualText *actualText; |
921 | }; |
922 | |
923 | #endif |
924 | |