1//========================================================================
2//
3// TextOutputDev.cc
4//
5// Copyright 1997-2003 Glyph & Cog, LLC
6//
7//========================================================================
8
9//========================================================================
10//
11// Modified under the Poppler project - http://poppler.freedesktop.org
12//
13// All changes made under the Poppler project to this file are licensed
14// under GPL version 2 or later
15//
16// Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com>
17// Copyright (C) 2005 Nickolay V. Shmyrev <nshmyrev@yandex.ru>
18// Copyright (C) 2006-2008, 2011-2013 Carlos Garcia Campos <carlosgc@gnome.org>
19// Copyright (C) 2006, 2007, 2013 Ed Catmur <ed@catmur.co.uk>
20// Copyright (C) 2006 Jeff Muizelaar <jeff@infidigm.net>
21// Copyright (C) 2007, 2008, 2012, 2017 Adrian Johnson <ajohnson@redneon.com>
22// Copyright (C) 2008 Koji Otani <sho@bbr.jp>
23// Copyright (C) 2008, 2010-2012, 2014-2022, 2024 Albert Astals Cid <aacid@kde.org>
24// Copyright (C) 2008 Pino Toscano <pino@kde.org>
25// Copyright (C) 2008, 2010 Hib Eris <hib@hiberis.nl>
26// Copyright (C) 2009 Ross Moore <ross@maths.mq.edu.au>
27// Copyright (C) 2009 Kovid Goyal <kovid@kovidgoyal.net>
28// Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com>
29// Copyright (C) 2010, 2021 Marek Kasik <mkasik@redhat.com>
30// Copyright (C) 2010, 2020 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
31// Copyright (C) 2011 Sam Liao <phyomh@gmail.com>
32// Copyright (C) 2012 Horst Prote <prote@fmi.uni-stuttgart.de>
33// Copyright (C) 2012, 2013-2018 Jason Crain <jason@aquaticape.us>
34// Copyright (C) 2012 Peter Breitenlohner <peb@mppmu.mpg.de>
35// Copyright (C) 2013 José Aliste <jaliste@src.gnome.org>
36// Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
37// Copyright (C) 2013 Ed Catmur <ed@catmur.co.uk>
38// Copyright (C) 2016 Khaled Hosny <khaledhosny@eglug.org>
39// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
40// Copyright (C) 2018 Sanchit Anand <sanxchit@gmail.com>
41// Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de>
42// Copyright (C) 2018-2022, 2024 Nelson Benítez León <nbenitezl@gmail.com>
43// Copyright (C) 2019 Christian Persch <chpe@src.gnome.org>
44// Copyright (C) 2019, 2022 Oliver Sander <oliver.sander@tu-dresden.de>
45// Copyright (C) 2019 Dan Shea <dan.shea@logical-innovations.com>
46// Copyright (C) 2021 Peter Williams <peter@newton.cx>
47// Copyright (C) 2024 Adam Sampson <ats@offog.org>
48// Copyright (C) 2024 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk>
49// Copyright (C) 2024 Stefan Brüns <stefan.bruens@rwth-aachen.de>
50//
51// To see a description of the changes please see the Changelog file that
52// came with your tarball or type make ChangeLog if you are building from git
53//
54//========================================================================
55
56#include <config.h>
57
58#include <cstdio>
59#include <cstdlib>
60#include <cstddef>
61#include <cmath>
62#include <cfloat>
63#include <cctype>
64#include <algorithm>
65#if defined(_WIN32) || defined(__CYGWIN__)
66# include <fcntl.h> // for O_BINARY
67# include <io.h> // for _setmode
68#endif
69#include "goo/gfile.h"
70#include "goo/gmem.h"
71#include "goo/GooString.h"
72#include "poppler-config.h"
73#include "Error.h"
74#include "GlobalParams.h"
75#include "UnicodeMap.h"
76#include "UnicodeTypeTable.h"
77#include "Link.h"
78#include "TextOutputDev.h"
79#include "Page.h"
80#include "Annot.h"
81#include "UTF.h"
82
83//------------------------------------------------------------------------
84// parameters
85//------------------------------------------------------------------------
86
87// Each bucket in a text pool includes baselines within a range of
88// this many points.
89#define textPoolStep 4
90
91// Inter-character space width which will cause addChar to start a new
92// word.
93#define minWordBreakSpace 0.1
94
95// Negative inter-character space width, i.e., overlap, which will
96// cause addChar to start a new word.
97#define minDupBreakOverlap 0.2
98
99// Max distance between baselines of two lines within a block, as a
100// fraction of the font size.
101#define maxLineSpacingDelta 1.5
102
103// Max difference in primary font sizes on two lines in the same
104// block. Delta1 is used when examining new lines above and below the
105// current block; delta2 is used when examining text that overlaps the
106// current block; delta3 is used when examining text to the left and
107// right of the current block.
108#define maxBlockFontSizeDelta1 0.05
109#define maxBlockFontSizeDelta2 0.6
110#define maxBlockFontSizeDelta3 0.2
111
112// Max difference in font sizes inside a word.
113#define maxWordFontSizeDelta 0.05
114
115// Maximum distance between baselines of two words on the same line,
116// e.g., distance between subscript or superscript and the primary
117// baseline, as a fraction of the font size.
118#define maxIntraLineDelta 0.5
119
120// Minimum inter-word spacing, as a fraction of the font size. (Only
121// used for raw ordering.)
122#define minWordSpacing 0.15
123
124// Maximum inter-word spacing, as a fraction of the font size.
125#define maxWordSpacing 1.5
126
127// Maximum horizontal spacing which will allow a word to be pulled
128// into a block, as a fraction of the font size.
129// This default value can be tweaked via API.
130double TextOutputDev::minColSpacing1_default = 0.7;
131
132// Minimum spacing between columns, as a fraction of the font size.
133#define minColSpacing2 1.0
134
135// Maximum vertical spacing between blocks within a flow, as a
136// multiple of the font size.
137#define maxBlockSpacing 2.5
138
139// Minimum spacing between characters within a word, as a fraction of
140// the font size.
141#define minCharSpacing -0.5
142
143// Maximum spacing between characters within a word, as a fraction of
144// the font size, when there is no obvious extra-wide character
145// spacing.
146#define maxCharSpacing 0.03
147
148// When extra-wide character spacing is detected, the inter-character
149// space threshold is set to the minimum inter-character space
150// multiplied by this constant.
151#define maxWideCharSpacingMul 1.3
152
153// Upper limit on spacing between characters in a word.
154#define maxWideCharSpacing 0.4
155
156// Max difference in primary,secondary coordinates (as a fraction of
157// the font size) allowed for duplicated text (fake boldface, drop
158// shadows) which is to be discarded.
159#define dupMaxPriDelta 0.1
160#define dupMaxSecDelta 0.2
161
162// Max width of underlines (in points).
163#define maxUnderlineWidth 3
164
165// Min distance between baseline and underline (in points).
166//~ this should be font-size-dependent
167#define minUnderlineGap -2
168
169// Max distance between baseline and underline (in points).
170//~ this should be font-size-dependent
171#define maxUnderlineGap 4
172
173// Max horizontal distance between edge of word and start of underline
174// (in points).
175//~ this should be font-size-dependent
176#define underlineSlack 1
177
178// Max distance between edge of text and edge of link border
179#define hyperlinkSlack 2
180
181// Max distance between characters when combining a base character and
182// combining character
183#define combMaxMidDelta 0.3
184#define combMaxBaseDelta 0.4
185
186// Text is considered diagonal if abs(tan(angle)) > diagonalThreshold.
187// (Or 1/tan(angle) for 90/270 degrees.)
188#define diagonalThreshold 0.1
189
190// How opaque a selection on a glyphless font should be. Since the font is
191// glyphless and overlaid over text in image form, this must enable users
192// to read the underlying image. Issue #157
193#define glyphlessSelectionOpacity 0.4
194
// Evaluates to true when x lies inside the closed interval spanned by a
// and b (the endpoints may be given in either order), false otherwise.
// NB: this is a macro, so every argument is evaluated more than once.
#define XBetweenAB(x, a, b) (!(((x) > (a) && (x) > (b)) || ((x) < (a) && (x) < (b))))
198
199namespace {
200
201inline bool isAscii7(Unicode uchar)
202{
203 return uchar < 128;
204}
205
206}
207
208static int reorderText(const Unicode *text, int len, const UnicodeMap *uMap, bool primaryLR, GooString *s, Unicode *u)
209{
210 char lre[8], rle[8], popdf[8], buf[8];
211 int lreLen = 0, rleLen = 0, popdfLen = 0, n;
212 int nCols, i, j, k;
213
214 nCols = 0;
215
216 if (s) {
217 lreLen = uMap->mapUnicode(u: 0x202a, buf: lre, bufSize: sizeof(lre));
218 rleLen = uMap->mapUnicode(u: 0x202b, buf: rle, bufSize: sizeof(rle));
219 popdfLen = uMap->mapUnicode(u: 0x202c, buf: popdf, bufSize: sizeof(popdf));
220 }
221
222 if (primaryLR) {
223 i = 0;
224 while (i < len) {
225 // output a left-to-right section
226 for (j = i; j < len && !unicodeTypeR(c: text[j]); ++j) {
227 ;
228 }
229 for (k = i; k < j; ++k) {
230 if (s) {
231 n = uMap->mapUnicode(u: text[k], buf, bufSize: sizeof(buf));
232 s->append(str: buf, lengthA: n);
233 }
234 if (u) {
235 u[nCols] = text[k];
236 }
237 ++nCols;
238 }
239 i = j;
240 // output a right-to-left section
241 for (j = i; j < len && !(unicodeTypeL(c: text[j]) || unicodeTypeNum(c: text[j])); ++j) {
242 ;
243 }
244 if (j > i) {
245 if (s) {
246 s->append(str: rle, lengthA: rleLen);
247 }
248 for (k = j - 1; k >= i; --k) {
249 if (s) {
250 n = uMap->mapUnicode(u: text[k], buf, bufSize: sizeof(buf));
251 s->append(str: buf, lengthA: n);
252 }
253 if (u) {
254 u[nCols] = text[k];
255 }
256 ++nCols;
257 }
258 if (s) {
259 s->append(str: popdf, lengthA: popdfLen);
260 }
261 i = j;
262 }
263 }
264 } else {
265 // Note: This code treats numeric characters (European and
266 // Arabic/Indic) as left-to-right, which isn't strictly correct
267 // (incurs extra LRE/POPDF pairs), but does produce correct
268 // visual formatting.
269 if (s) {
270 s->append(str: rle, lengthA: rleLen);
271 }
272 i = len - 1;
273 while (i >= 0) {
274 // output a right-to-left section
275 for (j = i; j >= 0 && !(unicodeTypeL(c: text[j]) || unicodeTypeNum(c: text[j])); --j) {
276 ;
277 }
278 for (k = i; k > j; --k) {
279 if (s) {
280 n = uMap->mapUnicode(u: text[k], buf, bufSize: sizeof(buf));
281 s->append(str: buf, lengthA: n);
282 }
283 if (u) {
284 u[nCols] = text[k];
285 }
286 ++nCols;
287 }
288 i = j;
289 // output a left-to-right section
290 for (j = i; j >= 0 && !unicodeTypeR(c: text[j]); --j) {
291 ;
292 }
293 if (j < i) {
294 if (s) {
295 s->append(str: lre, lengthA: lreLen);
296 }
297 for (k = j + 1; k <= i; ++k) {
298 if (s) {
299 n = uMap->mapUnicode(u: text[k], buf, bufSize: sizeof(buf));
300 s->append(str: buf, lengthA: n);
301 }
302 if (u) {
303 u[nCols] = text[k];
304 }
305 ++nCols;
306 }
307 if (s) {
308 s->append(str: popdf, lengthA: popdfLen);
309 }
310 i = j;
311 }
312 }
313 if (s) {
314 s->append(str: popdf, lengthA: popdfLen);
315 }
316 }
317
318 return nCols;
319}
320
321//------------------------------------------------------------------------
322// TextUnderline
323//------------------------------------------------------------------------
324
// A line segment from (x0, y0) to (x1, y1) recorded as a possible
// underline. horiz is true when both end points have the same y.
class TextUnderline
{
public:
    TextUnderline(double x0A, double y0A, double x1A, double y1A) : x0(x0A), y0(y0A), x1(x1A), y1(y1A), horiz(y0A == y1A) { }
    ~TextUnderline() = default;

    double x0, y0, x1, y1;
    bool horiz;
};
341
342//------------------------------------------------------------------------
343// TextLink
344//------------------------------------------------------------------------
345
346class TextLink
347{
348public:
349 TextLink(int xMinA, int yMinA, int xMaxA, int yMaxA, AnnotLink *linkA)
350 {
351 xMin = xMinA;
352 yMin = yMinA;
353 xMax = xMaxA;
354 yMax = yMaxA;
355 link = linkA;
356 }
357 ~TextLink() { }
358
359 int xMin, yMin, xMax, yMax;
360 AnnotLink *link;
361};
362
363//------------------------------------------------------------------------
364// TextFontInfo
365//------------------------------------------------------------------------
366
367TextFontInfo::TextFontInfo(const GfxState *state)
368{
369 gfxFont = state->getFont();
370#ifdef TEXTOUT_WORD_LIST
371 fontName = (gfxFont && gfxFont->getName()) ? new GooString(*gfxFont->getName()) : nullptr;
372 flags = gfxFont ? gfxFont->getFlags() : 0;
373#endif
374}
375
376TextFontInfo::~TextFontInfo()
377{
378#ifdef TEXTOUT_WORD_LIST
379 if (fontName) {
380 delete fontName;
381 }
382#endif
383}
384
385bool TextFontInfo::matches(const GfxState *state) const
386{
387 return state->getFont() == gfxFont;
388}
389
390bool TextFontInfo::matches(const TextFontInfo *fontInfo) const
391{
392 return gfxFont == fontInfo->gfxFont;
393}
394
395bool TextFontInfo::matches(const Ref *ref) const
396{
397 return gfxFont && (*(gfxFont->getID()) == *ref);
398}
399
400double TextFontInfo::getAscent() const
401{
402 return gfxFont ? gfxFont->getAscent() : 0.95;
403}
404
405double TextFontInfo::getDescent() const
406{
407 return gfxFont ? gfxFont->getDescent() : -0.35;
408}
409
410int TextFontInfo::getWMode() const
411{
412 return gfxFont ? gfxFont->getWMode() : 0;
413}
414
415//------------------------------------------------------------------------
416// TextWord
417//------------------------------------------------------------------------
418
419TextWord::TextWord(const GfxState *state, int rotA, double fontSizeA)
420{
421 rot = rotA;
422 fontSize = fontSizeA;
423 spaceAfter = false;
424 next = nullptr;
425 invisible = state->getRender() == 3;
426
427#ifdef TEXTOUT_WORD_LIST
428 GfxRGB rgb;
429
430 if ((state->getRender() & 3) == 1) {
431 state->getStrokeRGB(rgb: &rgb);
432 } else {
433 state->getFillRGB(rgb: &rgb);
434 }
435 colorR = colToDbl(x: rgb.r);
436 colorG = colToDbl(x: rgb.g);
437 colorB = colToDbl(x: rgb.b);
438#endif
439
440 underlined = false;
441 link = nullptr;
442}
443
444TextWord::~TextWord() { }
445
446void TextWord::addChar(const GfxState *state, TextFontInfo *fontA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA)
447{
448 chars.push_back(x: CharInfo { .text: u, .charcode: c, .charPos: charPosA, .edge: 0.0, .font: fontA, .textMat: textMatA });
449 charPosEnd = charPosA + charLen;
450
451 if (len() == 1) {
452 setInitialBounds(fontA, x, y);
453 }
454
455 if (wMode) { // vertical writing mode
456 // NB: the rotation value has been incremented by 1 (in
457 // TextPage::beginWord()) for vertical writing mode
458 switch (rot) {
459 case 0:
460 chars.back().edge = x - fontSize;
461 xMax = edgeEnd = x;
462 break;
463 case 1:
464 chars.back().edge = y - fontSize;
465 yMax = edgeEnd = y;
466 break;
467 case 2:
468 chars.back().edge = x + fontSize;
469 xMin = edgeEnd = x;
470 break;
471 case 3:
472 chars.back().edge = y + fontSize;
473 yMin = edgeEnd = y;
474 break;
475 }
476 } else { // horizontal writing mode
477 switch (rot) {
478 case 0:
479 chars.back().edge = x;
480 xMax = edgeEnd = x + dx;
481 break;
482 case 1:
483 chars.back().edge = y;
484 yMax = edgeEnd = y + dy;
485 break;
486 case 2:
487 chars.back().edge = x;
488 xMin = edgeEnd = x + dx;
489 break;
490 case 3:
491 chars.back().edge = y;
492 yMin = edgeEnd = y + dy;
493 break;
494 }
495 }
496}
497
498void TextWord::setInitialBounds(TextFontInfo *fontA, double x, double y)
499{
500 double ascent = fontA->getAscent() * fontSize;
501 double descent = fontA->getDescent() * fontSize;
502 wMode = fontA->getWMode();
503
504 if (wMode) { // vertical writing mode
505 // NB: the rotation value has been incremented by 1 (in
506 // TextPage::beginWord()) for vertical writing mode
507 switch (rot) {
508 case 0:
509 xMin = x - fontSize;
510 yMin = y - fontSize;
511 yMax = y;
512 base = y;
513 break;
514 case 1:
515 xMin = x;
516 yMin = y - fontSize;
517 xMax = x + fontSize;
518 base = x;
519 break;
520 case 2:
521 yMin = y;
522 xMax = x + fontSize;
523 yMax = y + fontSize;
524 base = y;
525 break;
526 case 3:
527 xMin = x - fontSize;
528 xMax = x;
529 yMax = y + fontSize;
530 base = x;
531 break;
532 }
533 } else { // horizontal writing mode
534 switch (rot) {
535 case 0:
536 xMin = x;
537 yMin = y - ascent;
538 yMax = y - descent;
539 if (yMin == yMax) {
540 // this is a sanity check for a case that shouldn't happen -- but
541 // if it does happen, we want to avoid dividing by zero later
542 yMin = y;
543 yMax = y + 1;
544 }
545 base = y;
546 break;
547 case 1:
548 xMin = x + descent;
549 yMin = y;
550 xMax = x + ascent;
551 if (xMin == xMax) {
552 // this is a sanity check for a case that shouldn't happen -- but
553 // if it does happen, we want to avoid dividing by zero later
554 xMin = x;
555 xMax = x + 1;
556 }
557 base = x;
558 break;
559 case 2:
560 yMin = y + descent;
561 xMax = x;
562 yMax = y + ascent;
563 if (yMin == yMax) {
564 // this is a sanity check for a case that shouldn't happen -- but
565 // if it does happen, we want to avoid dividing by zero later
566 yMin = y;
567 yMax = y + 1;
568 }
569 base = y;
570 break;
571 case 3:
572 xMin = x - ascent;
573 xMax = x - descent;
574 yMax = y;
575 if (xMin == xMax) {
576 // this is a sanity check for a case that shouldn't happen -- but
577 // if it does happen, we want to avoid dividing by zero later
578 xMin = x;
579 xMax = x + 1;
580 }
581 base = x;
582 break;
583 }
584 }
585}
586
587struct CombiningTable
588{
589 Unicode base;
590 Unicode comb;
591};
592
593static const struct CombiningTable combiningTable[] = {
594 { .base: 0x0060, .comb: 0x0300 }, // grave
595 { .base: 0x00a8, .comb: 0x0308 }, // dieresis
596 { .base: 0x00af, .comb: 0x0304 }, // macron
597 { .base: 0x00b4, .comb: 0x0301 }, // acute
598 { .base: 0x00b8, .comb: 0x0327 }, // cedilla
599 { .base: 0x02c6, .comb: 0x0302 }, // circumflex
600 { .base: 0x02c7, .comb: 0x030c }, // caron
601 { .base: 0x02d8, .comb: 0x0306 }, // breve
602 { .base: 0x02d9, .comb: 0x0307 }, // dotaccent
603 { .base: 0x02da, .comb: 0x030a }, // ring
604 { .base: 0x02dc, .comb: 0x0303 }, // tilde
605 { .base: 0x02dd, .comb: 0x030b } // hungarumlaut (double acute accent)
606};
607
608// returning combining versions of characters
609static Unicode getCombiningChar(Unicode u)
610{
611 for (const CombiningTable &combining : combiningTable) {
612 if (u == combining.base) {
613 return combining.comb;
614 }
615 }
616 return 0;
617}
618
619bool TextWord::addCombining(const GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA)
620{
621 if (chars.empty() || wMode != 0 || fontA->getWMode() != 0) {
622 return false;
623 }
624
625 Unicode cCurrent = getCombiningChar(u);
626 if (cCurrent != 0 && unicodeTypeAlphaNum(c: chars.back().text)) {
627 // Current is a combining character, previous is base character
628 double maxScaledMidDelta = fabs(x: edgeEnd - chars.back().edge) * combMaxMidDelta;
629 double charMid, charBase, maxScaledBaseDelta;
630
631 // Test if characters overlap
632 if (rot == 0 || rot == 2) {
633 charMid = x + (dx / 2);
634 charBase = y;
635 maxScaledBaseDelta = (yMax - yMin) * combMaxBaseDelta;
636 } else {
637 charMid = y + (dy / 2);
638 charBase = x;
639 maxScaledBaseDelta = (xMax - xMin) * combMaxBaseDelta;
640 }
641
642 double edgeMid = (chars.back().edge + edgeEnd) / 2;
643 if (fabs(x: charMid - edgeMid) >= maxScaledMidDelta || fabs(x: charBase - base) >= maxScaledBaseDelta) {
644 return false;
645 }
646
647 // Add character, but don't adjust edge / bounding box because
648 // combining character's positioning could be odd.
649 chars.emplace_back(args: CharInfo { .text: cCurrent, .charcode: c, .charPos: charPosA, .edge: edgeMid, .font: fontA, .textMat: textMatA });
650 charPosEnd = charPosA + charLen;
651
652 return true;
653 }
654
655 Unicode cPrev = getCombiningChar(u: chars.back().text);
656 if (cPrev != 0 && unicodeTypeAlphaNum(c: u)) {
657 // Previous is a combining character, current is base character
658 double maxScaledBaseDelta = (fontA->getAscent() - fontA->getDescent()) * fontSizeA * combMaxBaseDelta;
659 double charMid, charBase, maxScaledMidDelta;
660
661 // Test if characters overlap
662 if (rot == 0 || rot == 2) {
663 charMid = x + (dx / 2);
664 charBase = y;
665 maxScaledMidDelta = fabs(x: dx * combMaxMidDelta);
666 } else {
667 charMid = y + (dy / 2);
668 charBase = x;
669 maxScaledMidDelta = fabs(x: dy * combMaxMidDelta);
670 }
671
672 double edgeMid = (chars.back().edge + edgeEnd) / 2;
673 if (fabs(x: charMid - edgeMid) >= maxScaledMidDelta || fabs(x: charBase - base) >= maxScaledBaseDelta) {
674 return false;
675 }
676
677 fontSize = fontSizeA;
678 // move combining character to after base character
679 chars.emplace_back(args: CharInfo { .text: cPrev, .charcode: chars.back().charcode, .charPos: charPosA, .edge: edgeMid, .font: chars.back().font, .textMat: chars.back().textMat });
680
681 auto &lastChar = chars[chars.size() - 2];
682
683 charPosEnd = charPosA + charLen;
684 lastChar.text = u;
685 lastChar.charcode = c;
686 lastChar.font = fontA;
687 lastChar.textMat = textMatA;
688
689 if (len() == 2) {
690 setInitialBounds(fontA, x, y);
691 }
692
693 // Updated edges / bounding box because we changed the base
694 // character.
695 if (wMode) {
696 // FIXME unreachable, wMode == 0
697 switch (rot) {
698 case 0:
699 lastChar.edge = x - fontSize;
700 xMax = edgeEnd = x;
701 break;
702 case 1:
703 lastChar.edge = y - fontSize;
704 yMax = edgeEnd = y;
705 break;
706 case 2:
707 lastChar.edge = x + fontSize;
708 xMin = edgeEnd = x;
709 break;
710 case 3:
711 lastChar.edge = y + fontSize;
712 yMin = edgeEnd = y;
713 break;
714 }
715 } else {
716 switch (rot) {
717 case 0:
718 lastChar.edge = x;
719 xMax = edgeEnd = x + dx;
720 break;
721 case 1:
722 lastChar.edge = y;
723 yMax = edgeEnd = y + dy;
724 break;
725 case 2:
726 lastChar.edge = x;
727 xMin = edgeEnd = x + dx;
728 break;
729 case 3:
730 lastChar.edge = y;
731 yMin = edgeEnd = y + dy;
732 break;
733 }
734 }
735
736 chars.back().edge = (edgeEnd + lastChar.edge) / 2;
737 return true;
738 }
739 return false;
740}
741
742void TextWord::merge(TextWord *word)
743{
744 if (word->xMin < xMin) {
745 xMin = word->xMin;
746 }
747 if (word->yMin < yMin) {
748 yMin = word->yMin;
749 }
750 if (word->xMax > xMax) {
751 xMax = word->xMax;
752 }
753 if (word->yMax > yMax) {
754 yMax = word->yMax;
755 }
756 chars.insert(position: chars.end(), first: word->chars.begin(), last: word->chars.end());
757 edgeEnd = word->edgeEnd;
758 charPosEnd = word->charPosEnd;
759}
760
761inline int TextWord::primaryCmp(const TextWord *word) const
762{
763 double cmp;
764
765 cmp = 0; // make gcc happy
766 switch (rot) {
767 case 0:
768 cmp = xMin - word->xMin;
769 break;
770 case 1:
771 cmp = yMin - word->yMin;
772 break;
773 case 2:
774 cmp = word->xMax - xMax;
775 break;
776 case 3:
777 cmp = word->yMax - yMax;
778 break;
779 }
780 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
781}
782
783double TextWord::primaryDelta(const TextWord *word) const
784{
785 double delta;
786
787 delta = 0; // make gcc happy
788 switch (rot) {
789 case 0:
790 delta = word->xMin - xMax;
791 break;
792 case 1:
793 delta = word->yMin - yMax;
794 break;
795 case 2:
796 delta = xMin - word->xMax;
797 break;
798 case 3:
799 delta = yMin - word->yMax;
800 break;
801 }
802 return delta;
803}
804
805int TextWord::cmpYX(const void *p1, const void *p2)
806{
807 TextWord *word1 = *(TextWord **)p1;
808 TextWord *word2 = *(TextWord **)p2;
809 double cmp;
810
811 cmp = word1->yMin - word2->yMin;
812 if (cmp == 0) {
813 cmp = word1->xMin - word2->xMin;
814 }
815 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
816}
817
818#ifdef TEXTOUT_WORD_LIST
819
820GooString *TextWord::getText() const
821{
822 GooString *s;
823 const UnicodeMap *uMap;
824 char buf[8];
825
826 s = new GooString();
827 if (!(uMap = globalParams->getTextEncoding())) {
828 return s;
829 }
830 for (size_t i = 0; i < len(); ++i) {
831 auto n = uMap->mapUnicode(u: chars[i].text, buf, bufSize: sizeof(buf));
832 s->append(str: buf, lengthA: n);
833 }
834 return s;
835}
836
837void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const
838{
839 if (charIdx < 0) {
840 return;
841 }
842 size_t uCharIdx = charIdx;
843 if (uCharIdx >= len()) {
844 return;
845 }
846 auto startingEdge = chars[uCharIdx].edge;
847 auto endingEdge = (uCharIdx + 1 == len()) ? edgeEnd : chars[charIdx + 1].edge;
848 switch (rot) {
849 case 0:
850 *xMinA = startingEdge;
851 *xMaxA = endingEdge;
852 *yMinA = yMin;
853 *yMaxA = yMax;
854 break;
855 case 1:
856 *xMinA = xMin;
857 *xMaxA = xMax;
858 *yMinA = startingEdge;
859 *yMaxA = endingEdge;
860 break;
861 case 2:
862 *xMinA = endingEdge;
863 *xMaxA = startingEdge;
864 *yMinA = yMin;
865 *yMaxA = yMax;
866 break;
867 case 3:
868 *xMinA = xMin;
869 *xMaxA = xMax;
870 *yMinA = endingEdge;
871 *yMaxA = startingEdge;
872 break;
873 }
874}
875
876#endif // TEXTOUT_WORD_LIST
877
878//------------------------------------------------------------------------
879// TextPool
880//------------------------------------------------------------------------
881
882TextPool::TextPool()
883{
884 minBaseIdx = 0;
885 maxBaseIdx = -1;
886 pool = nullptr;
887 cursor = nullptr;
888 cursorBaseIdx = -1;
889}
890
891TextPool::~TextPool()
892{
893 int baseIdx;
894 TextWord *word, *word2;
895
896 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
897 for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
898 word2 = word->next;
899 delete word;
900 }
901 }
902 gfree(p: pool);
903}
904
905int TextPool::getBaseIdx(double base) const
906{
907 const double baseIdxDouble = base / textPoolStep;
908 if (std::isnan(x: baseIdxDouble) || baseIdxDouble < minBaseIdx) {
909 return minBaseIdx;
910 }
911 if (baseIdxDouble > maxBaseIdx) {
912 return maxBaseIdx;
913 }
914 return (int)baseIdxDouble;
915}
916
917void TextPool::addWord(TextWord *word)
918{
919 int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
920 TextWord *w0, *w1;
921
922 // expand the array if needed
923 wordBaseIdx = (int)(word->base / textPoolStep);
924 if (unlikely(wordBaseIdx <= INT_MIN + 128 || wordBaseIdx >= INT_MAX - 128)) {
925 error(category: errSyntaxWarning, pos: -1, msg: "wordBaseIdx out of range");
926 delete word;
927 return;
928 }
929 if (minBaseIdx > maxBaseIdx) {
930 minBaseIdx = wordBaseIdx - 128;
931 maxBaseIdx = wordBaseIdx + 128;
932 pool = (TextWord **)gmallocn(count: maxBaseIdx - minBaseIdx + 1, size: sizeof(TextWord *));
933 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
934 pool[baseIdx - minBaseIdx] = nullptr;
935 }
936 } else if (wordBaseIdx < minBaseIdx) {
937 newMinBaseIdx = wordBaseIdx - 128;
938 TextWord **newPool = (TextWord **)gmallocn_checkoverflow(count: maxBaseIdx - newMinBaseIdx + 1, size: sizeof(TextWord *));
939 if (unlikely(!newPool)) {
940 error(category: errSyntaxWarning, pos: -1, msg: "newPool would overflow");
941 delete word;
942 return;
943 }
944 for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
945 newPool[baseIdx - newMinBaseIdx] = nullptr;
946 }
947 memcpy(dest: &newPool[minBaseIdx - newMinBaseIdx], src: pool, n: (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *));
948 gfree(p: pool);
949 pool = newPool;
950 minBaseIdx = newMinBaseIdx;
951 } else if (wordBaseIdx > maxBaseIdx) {
952 newMaxBaseIdx = wordBaseIdx + 128;
953 TextWord **reallocatedPool = (TextWord **)greallocn(p: pool, count: newMaxBaseIdx - minBaseIdx + 1, size: sizeof(TextWord *), checkoverflow: true /*checkoverflow*/, free_p: false /*free_pool*/);
954 if (!reallocatedPool) {
955 error(category: errSyntaxWarning, pos: -1, msg: "new pool size would overflow");
956 delete word;
957 return;
958 }
959 pool = reallocatedPool;
960 for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) {
961 pool[baseIdx - minBaseIdx] = nullptr;
962 }
963 maxBaseIdx = newMaxBaseIdx;
964 }
965
966 // insert the new word
967 if (cursor && wordBaseIdx == cursorBaseIdx && word->primaryCmp(word: cursor) >= 0) {
968 w0 = cursor;
969 w1 = cursor->next;
970 } else {
971 w0 = nullptr;
972 w1 = pool[wordBaseIdx - minBaseIdx];
973 }
974 for (; w1 && word->primaryCmp(word: w1) > 0; w0 = w1, w1 = w1->next) {
975 ;
976 }
977 word->next = w1;
978 if (w0) {
979 w0->next = word;
980 } else {
981 pool[wordBaseIdx - minBaseIdx] = word;
982 }
983 cursor = word;
984 cursorBaseIdx = wordBaseIdx;
985}
986
987//------------------------------------------------------------------------
988// TextLine
989//------------------------------------------------------------------------
990
991TextLine::TextLine(TextBlock *blkA, int rotA, double baseA)
992{
993 blk = blkA;
994 rot = rotA;
995 base = baseA;
996 words = lastWord = nullptr;
997 text = nullptr;
998 edge = nullptr;
999 col = nullptr;
1000 len = 0;
1001 convertedLen = 0;
1002 hyphenated = false;
1003 next = nullptr;
1004 xMin = yMin = 0;
1005 xMax = yMax = -1;
1006 normalized = nullptr;
1007 normalized_len = 0;
1008 normalized_idx = nullptr;
1009 ascii_translation = nullptr;
1010 ascii_len = 0;
1011 ascii_idx = nullptr;
1012}
1013
1014TextLine::~TextLine()
1015{
1016 TextWord *word;
1017
1018 while (words) {
1019 word = words;
1020 words = words->next;
1021 delete word;
1022 }
1023 gfree(p: text);
1024 gfree(p: edge);
1025 gfree(p: col);
1026 if (normalized) {
1027 gfree(p: normalized);
1028 gfree(p: normalized_idx);
1029 }
1030 if (ascii_translation) {
1031 gfree(p: ascii_translation);
1032 gfree(p: ascii_idx);
1033 }
1034}
1035
1036void TextLine::addWord(TextWord *word)
1037{
1038 if (lastWord) {
1039 lastWord->next = word;
1040 } else {
1041 words = word;
1042 }
1043 lastWord = word;
1044
1045 if (xMin > xMax) {
1046 xMin = word->xMin;
1047 xMax = word->xMax;
1048 yMin = word->yMin;
1049 yMax = word->yMax;
1050 } else {
1051 if (word->xMin < xMin) {
1052 xMin = word->xMin;
1053 }
1054 if (word->xMax > xMax) {
1055 xMax = word->xMax;
1056 }
1057 if (word->yMin < yMin) {
1058 yMin = word->yMin;
1059 }
1060 if (word->yMax > yMax) {
1061 yMax = word->yMax;
1062 }
1063 }
1064}
1065
1066double TextLine::primaryDelta(const TextLine *line) const
1067{
1068 double delta;
1069
1070 delta = 0; // make gcc happy
1071 switch (rot) {
1072 case 0:
1073 delta = line->xMin - xMax;
1074 break;
1075 case 1:
1076 delta = line->yMin - yMax;
1077 break;
1078 case 2:
1079 delta = xMin - line->xMax;
1080 break;
1081 case 3:
1082 delta = yMin - line->yMax;
1083 break;
1084 }
1085 return delta;
1086}
1087
1088int TextLine::primaryCmp(const TextLine *line) const
1089{
1090 double cmp;
1091
1092 cmp = 0; // make gcc happy
1093 switch (rot) {
1094 case 0:
1095 cmp = xMin - line->xMin;
1096 break;
1097 case 1:
1098 cmp = yMin - line->yMin;
1099 break;
1100 case 2:
1101 cmp = line->xMax - xMax;
1102 break;
1103 case 3:
1104 cmp = line->yMax - yMax;
1105 break;
1106 }
1107 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1108}
1109
1110int TextLine::secondaryCmp(const TextLine *line) const
1111{
1112 double cmp;
1113
1114 cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base;
1115 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1116}
1117
1118int TextLine::cmpYX(const TextLine *line) const
1119{
1120 int cmp;
1121
1122 if ((cmp = secondaryCmp(line))) {
1123 return cmp;
1124 }
1125 return primaryCmp(line);
1126}
1127
1128int TextLine::cmpXY(const void *p1, const void *p2)
1129{
1130 TextLine *line1 = *(TextLine **)p1;
1131 TextLine *line2 = *(TextLine **)p2;
1132 int cmp;
1133
1134 if ((cmp = line1->primaryCmp(line: line2))) {
1135 return cmp;
1136 }
1137 return line1->secondaryCmp(line: line2);
1138}
1139
1140void TextLine::coalesce(const UnicodeMap *uMap)
1141{
1142 double space, delta, minSpace;
1143 bool isUnicode;
1144 char buf[8];
1145
1146 if (words->next) {
1147
1148 // compute the inter-word space threshold
1149 if (words->len() > 1 || words->next->len() > 1) {
1150 minSpace = 0;
1151 } else {
1152 minSpace = words->primaryDelta(word: words->next);
1153 for (auto word0 = words->next, word1 = word0->next; word1 && minSpace > 0; word0 = word1, word1 = word0->next) {
1154 if (word1->len() > 1) {
1155 minSpace = 0;
1156 }
1157 delta = word0->primaryDelta(word: word1);
1158 if (delta < minSpace) {
1159 minSpace = delta;
1160 }
1161 }
1162 }
1163 if (minSpace <= 0) {
1164 space = maxCharSpacing * words->fontSize;
1165 } else {
1166 space = maxWideCharSpacingMul * minSpace;
1167 if (space > maxWideCharSpacing * words->fontSize) {
1168 space = maxWideCharSpacing * words->fontSize;
1169 }
1170 }
1171
1172 // merge words
1173 auto word0 = words;
1174 auto word1 = words->next;
1175 while (word1) {
1176 if (word0->primaryDelta(word: word1) >= space) {
1177 word0->spaceAfter = true;
1178 word0 = word1;
1179 word1 = word1->next;
1180 } else if (word0->chars.back().font == word1->chars.front().font //
1181 && word0->underlined == word1->underlined //
1182 && fabs(x: word0->fontSize - word1->fontSize) < maxWordFontSizeDelta * words->fontSize //
1183 && word1->chars.front().charPos == word0->charPosEnd) {
1184 word0->merge(word: word1);
1185 word0->next = word1->next;
1186 delete word1;
1187 word1 = word0->next;
1188 } else {
1189 word0 = word1;
1190 word1 = word1->next;
1191 }
1192 }
1193 }
1194
1195 // build the line text
1196 isUnicode = uMap ? uMap->isUnicode() : false;
1197 len = 0;
1198 for (auto word1 = words; word1; word1 = word1->next) {
1199 len += word1->len();
1200 if (word1->spaceAfter) {
1201 ++len;
1202 }
1203 }
1204 text = (Unicode *)gmallocn(count: len, size: sizeof(Unicode));
1205 edge = (double *)gmallocn(count: len + 1, size: sizeof(double));
1206 size_t i = 0;
1207 for (auto word1 = words; word1; word1 = word1->next) {
1208 for (size_t j = 0; j < word1->len(); ++j) {
1209 text[i] = word1->chars[j].text;
1210 edge[i] = word1->chars[j].edge;
1211 ++i;
1212 }
1213 edge[i] = word1->edgeEnd;
1214 if (word1->spaceAfter) {
1215 text[i] = (Unicode)0x0020;
1216 ++i;
1217 }
1218 }
1219
1220 // compute convertedLen and set up the col array
1221 col = (int *)gmallocn(count: len + 1, size: sizeof(int));
1222 convertedLen = 0;
1223 for (int ci = 0; ci < len; ++ci) {
1224 col[ci] = convertedLen;
1225 if (isUnicode) {
1226 ++convertedLen;
1227 } else if (uMap) {
1228 convertedLen += uMap->mapUnicode(u: text[ci], buf, bufSize: sizeof(buf));
1229 }
1230 }
1231 col[len] = convertedLen;
1232
1233 // check for hyphen at end of line
1234 //~ need to check for other chars used as hyphens
1235 hyphenated = text[len - 1] == (Unicode)'-';
1236}
1237
1238//------------------------------------------------------------------------
1239// TextLineFrag
1240//------------------------------------------------------------------------
1241
1242class TextLineFrag
1243{
1244public:
1245 TextLine *line; // the line object
1246 int start, len; // offset and length of this fragment
1247 // (in Unicode chars)
1248 double xMin, xMax; // bounding box coordinates
1249 double yMin, yMax;
1250 double base; // baseline virtual coordinate
1251 int col; // first column
1252
1253 void init(TextLine *lineA, int startA, int lenA);
1254 void computeCoords(bool oneRot);
1255
1256 static int cmpYXPrimaryRot(const void *p1, const void *p2);
1257 static int cmpYXLineRot(const void *p1, const void *p2);
1258 static int cmpXYLineRot(const void *p1, const void *p2);
1259 static int cmpXYColumnPrimaryRot(const void *p1, const void *p2);
1260 static int cmpXYColumnLineRot(const void *p1, const void *p2);
1261};
1262
1263void TextLineFrag::init(TextLine *lineA, int startA, int lenA)
1264{
1265 line = lineA;
1266 start = startA;
1267 len = lenA;
1268 col = line->col[start];
1269}
1270
1271void TextLineFrag::computeCoords(bool oneRot)
1272{
1273 TextBlock *blk;
1274 double d0, d1, d2, d3, d4;
1275
1276 if (oneRot) {
1277
1278 switch (line->rot) {
1279 case 0:
1280 xMin = line->edge[start];
1281 xMax = line->edge[start + len];
1282 yMin = line->yMin;
1283 yMax = line->yMax;
1284 break;
1285 case 1:
1286 xMin = line->xMin;
1287 xMax = line->xMax;
1288 yMin = line->edge[start];
1289 yMax = line->edge[start + len];
1290 break;
1291 case 2:
1292 xMin = line->edge[start + len];
1293 xMax = line->edge[start];
1294 yMin = line->yMin;
1295 yMax = line->yMax;
1296 break;
1297 case 3:
1298 xMin = line->xMin;
1299 xMax = line->xMax;
1300 yMin = line->edge[start + len];
1301 yMax = line->edge[start];
1302 break;
1303 }
1304 base = line->base;
1305
1306 } else {
1307
1308 if (line->rot == 0 && line->blk->page->primaryRot == 0) {
1309
1310 xMin = line->edge[start];
1311 xMax = line->edge[start + len];
1312 yMin = line->yMin;
1313 yMax = line->yMax;
1314 base = line->base;
1315
1316 } else {
1317
1318 blk = line->blk;
1319 d0 = line->edge[start];
1320 d1 = line->edge[start + len];
1321 d2 = d3 = d4 = 0; // make gcc happy
1322
1323 switch (line->rot) {
1324 case 0:
1325 d2 = line->yMin;
1326 d3 = line->yMax;
1327 d4 = line->base;
1328 d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
1329 d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
1330 d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
1331 d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
1332 d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
1333 break;
1334 case 1:
1335 d2 = line->xMax;
1336 d3 = line->xMin;
1337 d4 = line->base;
1338 d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
1339 d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
1340 d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
1341 d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
1342 d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
1343 break;
1344 case 2:
1345 d2 = line->yMax;
1346 d3 = line->yMin;
1347 d4 = line->base;
1348 d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
1349 d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
1350 d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
1351 d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
1352 d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
1353 break;
1354 case 3:
1355 d2 = line->xMin;
1356 d3 = line->xMax;
1357 d4 = line->base;
1358 d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
1359 d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
1360 d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
1361 d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
1362 d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
1363 break;
1364 }
1365
1366 switch (line->blk->page->primaryRot) {
1367 case 0:
1368 xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
1369 xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
1370 yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
1371 yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
1372 base = blk->yMin + d4 * (blk->yMax - blk->yMin);
1373 break;
1374 case 1:
1375 xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
1376 xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
1377 yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
1378 yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
1379 base = blk->xMax - d4 * (blk->xMax - blk->xMin);
1380 break;
1381 case 2:
1382 xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
1383 xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
1384 yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
1385 yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
1386 base = blk->yMax - d4 * (blk->yMax - blk->yMin);
1387 break;
1388 case 3:
1389 xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
1390 xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
1391 yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
1392 yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
1393 base = blk->xMin + d4 * (blk->xMax - blk->xMin);
1394 break;
1395 }
1396 }
1397 }
1398}
1399
1400int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2)
1401{
1402 TextLineFrag *frag1 = (TextLineFrag *)p1;
1403 TextLineFrag *frag2 = (TextLineFrag *)p2;
1404 double cmp;
1405
1406 cmp = 0; // make gcc happy
1407 switch (frag1->line->blk->page->primaryRot) {
1408 case 0:
1409 if (fabs(x: cmp = frag1->yMin - frag2->yMin) < 0.01) {
1410 cmp = frag1->xMin - frag2->xMin;
1411 }
1412 break;
1413 case 1:
1414 if (fabs(x: cmp = frag2->xMax - frag1->xMax) < 0.01) {
1415 cmp = frag1->yMin - frag2->yMin;
1416 }
1417 break;
1418 case 2:
1419 if (fabs(x: cmp = frag2->yMin - frag1->yMin) < 0.01) {
1420 cmp = frag2->xMax - frag1->xMax;
1421 }
1422 break;
1423 case 3:
1424 if (fabs(x: cmp = frag1->xMax - frag2->xMax) < 0.01) {
1425 cmp = frag2->yMax - frag1->yMax;
1426 }
1427 break;
1428 }
1429 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1430}
1431
1432int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2)
1433{
1434 TextLineFrag *frag1 = (TextLineFrag *)p1;
1435 TextLineFrag *frag2 = (TextLineFrag *)p2;
1436 double cmp;
1437
1438 cmp = 0; // make gcc happy
1439 switch (frag1->line->rot) {
1440 case 0:
1441 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1442 cmp = frag1->xMin - frag2->xMin;
1443 }
1444 break;
1445 case 1:
1446 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1447 cmp = frag1->yMin - frag2->yMin;
1448 }
1449 break;
1450 case 2:
1451 if ((cmp = frag2->yMin - frag1->yMin) == 0) {
1452 cmp = frag2->xMax - frag1->xMax;
1453 }
1454 break;
1455 case 3:
1456 if ((cmp = frag1->xMax - frag2->xMax) == 0) {
1457 cmp = frag2->yMax - frag1->yMax;
1458 }
1459 break;
1460 }
1461 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1462}
1463
1464int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2)
1465{
1466 TextLineFrag *frag1 = (TextLineFrag *)p1;
1467 TextLineFrag *frag2 = (TextLineFrag *)p2;
1468 double cmp;
1469
1470 cmp = 0; // make gcc happy
1471 switch (frag1->line->rot) {
1472 case 0:
1473 if ((cmp = frag1->xMin - frag2->xMin) == 0) {
1474 cmp = frag1->yMin - frag2->yMin;
1475 }
1476 break;
1477 case 1:
1478 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1479 cmp = frag2->xMax - frag1->xMax;
1480 }
1481 break;
1482 case 2:
1483 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1484 cmp = frag2->yMin - frag1->yMin;
1485 }
1486 break;
1487 case 3:
1488 if ((cmp = frag2->yMax - frag1->yMax) == 0) {
1489 cmp = frag1->xMax - frag2->xMax;
1490 }
1491 break;
1492 }
1493 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1494}
1495
1496int TextLineFrag::cmpXYColumnPrimaryRot(const void *p1, const void *p2)
1497{
1498 TextLineFrag *frag1 = (TextLineFrag *)p1;
1499 TextLineFrag *frag2 = (TextLineFrag *)p2;
1500 double cmp;
1501
1502 // if columns overlap, compare y values
1503 if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] - frag2->line->col[frag2->start]) && frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start])) {
1504 cmp = 0; // make gcc happy
1505 switch (frag1->line->blk->page->primaryRot) {
1506 case 0:
1507 cmp = frag1->yMin - frag2->yMin;
1508 break;
1509 case 1:
1510 cmp = frag2->xMax - frag1->xMax;
1511 break;
1512 case 2:
1513 cmp = frag2->yMin - frag1->yMin;
1514 break;
1515 case 3:
1516 cmp = frag1->xMax - frag2->xMax;
1517 break;
1518 }
1519 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1520 }
1521
1522 // otherwise, compare starting column
1523 return frag1->col - frag2->col;
1524}
1525
1526int TextLineFrag::cmpXYColumnLineRot(const void *p1, const void *p2)
1527{
1528 TextLineFrag *frag1 = (TextLineFrag *)p1;
1529 TextLineFrag *frag2 = (TextLineFrag *)p2;
1530 double cmp;
1531
1532 // if columns overlap, compare y values
1533 if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] - frag2->line->col[frag2->start]) && frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start])) {
1534 cmp = 0; // make gcc happy
1535 switch (frag1->line->rot) {
1536 case 0:
1537 cmp = frag1->yMin - frag2->yMin;
1538 break;
1539 case 1:
1540 cmp = frag2->xMax - frag1->xMax;
1541 break;
1542 case 2:
1543 cmp = frag2->yMin - frag1->yMin;
1544 break;
1545 case 3:
1546 cmp = frag1->xMax - frag2->xMax;
1547 break;
1548 }
1549 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1550 }
1551
1552 // otherwise, compare starting column
1553 return frag1->col - frag2->col;
1554}
1555
1556//------------------------------------------------------------------------
1557// TextBlock
1558//------------------------------------------------------------------------
1559
1560TextBlock::TextBlock(TextPage *pageA, int rotA)
1561{
1562 page = pageA;
1563 rot = rotA;
1564 xMin = yMin = 0;
1565 xMax = yMax = -1;
1566 priMin = 0;
1567 priMax = page->pageWidth;
1568 pool = new TextPool();
1569 lines = nullptr;
1570 curLine = nullptr;
1571 next = nullptr;
1572 stackNext = nullptr;
1573 tableId = -1;
1574 tableEnd = false;
1575}
1576
1577TextBlock::~TextBlock()
1578{
1579 TextLine *line;
1580
1581 delete pool;
1582 while (lines) {
1583 line = lines;
1584 lines = lines->next;
1585 delete line;
1586 }
1587}
1588
1589void TextBlock::addWord(TextWord *word)
1590{
1591 pool->addWord(word);
1592 if (xMin > xMax) {
1593 xMin = word->xMin;
1594 xMax = word->xMax;
1595 yMin = word->yMin;
1596 yMax = word->yMax;
1597 } else {
1598 if (word->xMin < xMin) {
1599 xMin = word->xMin;
1600 }
1601 if (word->xMax > xMax) {
1602 xMax = word->xMax;
1603 }
1604 if (word->yMin < yMin) {
1605 yMin = word->yMin;
1606 }
1607 if (word->yMax > yMax) {
1608 yMax = word->yMax;
1609 }
1610 }
1611}
1612
1613void TextBlock::coalesce(const UnicodeMap *uMap, double fixedPitch)
1614{
1615 // discard duplicated text (fake boldface, drop shadows)
1616 for (int idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
1617 // Get the first LHS word from the pool
1618 TextWord *word0 = pool->getPool(baseIdx: idx0);
1619
1620 while (word0) {
1621 double priDelta = dupMaxPriDelta * word0->fontSize;
1622 double secDelta = dupMaxSecDelta * word0->fontSize;
1623 double xDelta = ((rot == 0) || (rot == 2)) ? priDelta : secDelta;
1624 double yDelta = ((rot == 0) || (rot == 2)) ? secDelta : priDelta;
1625
1626 int maxBaseIdx = pool->getBaseIdx(base: word0->base + secDelta);
1627
1628 for (int idx1 = idx0; idx1 <= maxBaseIdx; idx1++) {
1629 TextWord *prevWord;
1630 /* In case the RHS word is from the same pool as the LHS word,
1631 * start the inner loop with the word following the LHS word.
1632 * Otherwise, start with the second word from the subsequent pools
1633 * - the first word is compared at the end.
1634 */
1635 if (idx0 == idx1) {
1636 prevWord = word0;
1637 } else {
1638 prevWord = pool->getPool(baseIdx: idx1);
1639 if (!prevWord) {
1640 continue;
1641 }
1642 }
1643 TextWord *word1 = prevWord->next;
1644
1645 auto equalText = [](const TextWord &w1, const TextWord &w2) -> bool { //
1646 return std::equal(first1: w1.chars.begin(), last1: w1.chars.end(), first2: w2.chars.begin(), last2: w2.chars.end(), //
1647 binary_pred: [](auto c1, auto c2) { return c1.text == c2.text; });
1648 };
1649 auto match = [&equalText, xDelta, yDelta](const TextWord &w1, const TextWord &w2) -> bool {
1650 if (!equalText(w1, w2)) {
1651 return false;
1652 }
1653 return fabs(x: w1.xMin - w2.xMin) < xDelta && fabs(x: w1.xMax - w2.xMax) < xDelta //
1654 && fabs(x: w1.yMin - w2.yMin) < yDelta && fabs(x: w1.yMax - w2.yMax) < yDelta;
1655 };
1656
1657 while (word1) {
1658 if (match(*word0, *word1)) {
1659 prevWord->next = word1->next;
1660 delete word1;
1661 word1 = prevWord->next;
1662 } else {
1663 prevWord = word1;
1664 word1 = word1->next;
1665 }
1666 }
1667
1668 // Check the first word from each subsequent pool
1669 if (idx0 != idx1) {
1670 word1 = pool->getPool(baseIdx: idx1);
1671 }
1672 if (word1 && match(*word0, *word1)) {
1673 pool->setPool(baseIdx: idx1, p: word1->next);
1674 delete word1;
1675 }
1676 }
1677
1678 word0 = word0->next;
1679 }
1680 }
1681
1682 TextWord *word0, *word1;
1683 TextWord *bestWord0, *bestWord1, *lastWord;
1684 TextLine *line, *line0, *line1;
1685 TextLine **lineArray;
1686 int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
1687 int baseIdx, bestWordBaseIdx;
1688 double minBase, maxBase;
1689 double fontSize, wordSpacing, delta;
1690 bool overlap;
1691 int col1, col2;
1692 int i, j, k;
1693
1694 // build the lines
1695 curLine = nullptr;
1696 poolMinBaseIdx = pool->minBaseIdx;
1697 charCount = 0;
1698 nLines = 0;
1699 while (true) {
1700
1701 // find the first non-empty line in the pool
1702 for (; poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(baseIdx: poolMinBaseIdx); ++poolMinBaseIdx) {
1703 ;
1704 }
1705 if (poolMinBaseIdx > pool->maxBaseIdx) {
1706 break;
1707 }
1708
1709 // look for the left-most word in the first four lines of the
1710 // pool -- this avoids starting with a superscript word
1711 startBaseIdx = poolMinBaseIdx;
1712 for (baseIdx = poolMinBaseIdx + 1; baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx; ++baseIdx) {
1713 if (!pool->getPool(baseIdx)) {
1714 continue;
1715 }
1716 if (pool->getPool(baseIdx)->primaryCmp(word: pool->getPool(baseIdx: startBaseIdx)) < 0) {
1717 startBaseIdx = baseIdx;
1718 }
1719 }
1720
1721 // create a new line
1722 word0 = pool->getPool(baseIdx: startBaseIdx);
1723 pool->setPool(baseIdx: startBaseIdx, p: word0->next);
1724 word0->next = nullptr;
1725 line = new TextLine(this, word0->rot, word0->base);
1726 line->addWord(word: word0);
1727 lastWord = word0;
1728
1729 // compute the search range
1730 fontSize = word0->fontSize;
1731 minBase = word0->base - maxIntraLineDelta * fontSize;
1732 maxBase = word0->base + maxIntraLineDelta * fontSize;
1733 minBaseIdx = pool->getBaseIdx(base: minBase);
1734 maxBaseIdx = pool->getBaseIdx(base: maxBase);
1735 wordSpacing = fixedPitch ? fixedPitch : maxWordSpacing * fontSize;
1736
1737 // find the rest of the words in this line
1738 while (true) {
1739
1740 // find the left-most word whose baseline is in the range for
1741 // this line
1742 bestWordBaseIdx = 0;
1743 bestWord0 = bestWord1 = nullptr;
1744 overlap = false;
1745 for (baseIdx = minBaseIdx; !overlap && baseIdx <= maxBaseIdx; ++baseIdx) {
1746 for (word0 = nullptr, word1 = pool->getPool(baseIdx); word1; word0 = word1, word1 = word1->next) {
1747 if (word1->base >= minBase && word1->base <= maxBase) {
1748 delta = lastWord->primaryDelta(word: word1);
1749 if (delta < minCharSpacing * fontSize) {
1750 overlap = true;
1751 break;
1752 } else {
1753 if (delta < wordSpacing && (!bestWord1 || word1->primaryCmp(word: bestWord1) < 0)) {
1754 bestWordBaseIdx = baseIdx;
1755 bestWord0 = word0;
1756 bestWord1 = word1;
1757 }
1758 break;
1759 }
1760 }
1761 }
1762 }
1763 if (overlap || !bestWord1) {
1764 break;
1765 }
1766
1767 // remove it from the pool, and add it to the line
1768 if (bestWord0) {
1769 bestWord0->next = bestWord1->next;
1770 } else {
1771 pool->setPool(baseIdx: bestWordBaseIdx, p: bestWord1->next);
1772 }
1773 bestWord1->next = nullptr;
1774 line->addWord(word: bestWord1);
1775 lastWord = bestWord1;
1776 }
1777
1778 // add the line
1779 if (curLine && line->cmpYX(line: curLine) > 0) {
1780 line0 = curLine;
1781 line1 = curLine->next;
1782 } else {
1783 line0 = nullptr;
1784 line1 = lines;
1785 }
1786 for (; line1 && line->cmpYX(line: line1) > 0; line0 = line1, line1 = line1->next) {
1787 ;
1788 }
1789 if (line0) {
1790 line0->next = line;
1791 } else {
1792 lines = line;
1793 }
1794 line->next = line1;
1795 curLine = line;
1796 line->coalesce(uMap);
1797 charCount += line->len;
1798 ++nLines;
1799 }
1800
1801 // sort lines into xy order for column assignment
1802 lineArray = (TextLine **)gmallocn(count: nLines, size: sizeof(TextLine *));
1803 for (line = lines, i = 0; line; line = line->next, ++i) {
1804 lineArray[i] = line;
1805 }
1806 qsort(base: lineArray, nmemb: nLines, size: sizeof(TextLine *), compar: &TextLine::cmpXY);
1807
1808 // column assignment
1809 nColumns = 0;
1810 if (fixedPitch) {
1811 for (i = 0; i < nLines; ++i) {
1812 line0 = lineArray[i];
1813 col1 = 0; // make gcc happy
1814 switch (rot) {
1815 case 0:
1816 col1 = (int)((line0->xMin - xMin) / fixedPitch + 0.5);
1817 break;
1818 case 1:
1819 col1 = (int)((line0->yMin - yMin) / fixedPitch + 0.5);
1820 break;
1821 case 2:
1822 col1 = (int)((xMax - line0->xMax) / fixedPitch + 0.5);
1823 break;
1824 case 3:
1825 col1 = (int)((yMax - line0->yMax) / fixedPitch + 0.5);
1826 break;
1827 }
1828 for (k = 0; k <= line0->len; ++k) {
1829 line0->col[k] += col1;
1830 }
1831 if (line0->col[line0->len] > nColumns) {
1832 nColumns = line0->col[line0->len];
1833 }
1834 }
1835 } else {
1836 for (i = 0; i < nLines; ++i) {
1837 line0 = lineArray[i];
1838 col1 = 0;
1839 for (j = 0; j < i; ++j) {
1840 line1 = lineArray[j];
1841 if (line1->primaryDelta(line: line0) >= 0) {
1842 col2 = line1->col[line1->len] + 1;
1843 } else {
1844 k = 0; // make gcc happy
1845 switch (rot) {
1846 case 0:
1847 for (k = 0; k < line1->len && line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k) {
1848 ;
1849 }
1850 break;
1851 case 1:
1852 for (k = 0; k < line1->len && line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k) {
1853 ;
1854 }
1855 break;
1856 case 2:
1857 for (k = 0; k < line1->len && line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k) {
1858 ;
1859 }
1860 break;
1861 case 3:
1862 for (k = 0; k < line1->len && line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k) {
1863 ;
1864 }
1865 break;
1866 }
1867 col2 = line1->col[k];
1868 }
1869 if (col2 > col1) {
1870 col1 = col2;
1871 }
1872 }
1873 for (k = 0; k <= line0->len; ++k) {
1874 line0->col[k] += col1;
1875 }
1876 if (line0->col[line0->len] > nColumns) {
1877 nColumns = line0->col[line0->len];
1878 }
1879 }
1880 }
1881 gfree(p: lineArray);
1882}
1883
1884void TextBlock::updatePriMinMax(const TextBlock *blk)
1885{
1886 double newPriMin, newPriMax;
1887 bool gotPriMin, gotPriMax;
1888
1889 gotPriMin = gotPriMax = false;
1890 newPriMin = newPriMax = 0; // make gcc happy
1891 switch (page->primaryRot) {
1892 case 0:
1893 case 2:
1894 if (blk->yMin < yMax && blk->yMax > yMin) {
1895 if (blk->xMin < xMin) {
1896 newPriMin = blk->xMax;
1897 gotPriMin = true;
1898 }
1899 if (blk->xMax > xMax) {
1900 newPriMax = blk->xMin;
1901 gotPriMax = true;
1902 }
1903 }
1904 break;
1905 case 1:
1906 case 3:
1907 if (blk->xMin < xMax && blk->xMax > xMin) {
1908 if (blk->yMin < yMin) {
1909 newPriMin = blk->yMax;
1910 gotPriMin = true;
1911 }
1912 if (blk->yMax > yMax) {
1913 newPriMax = blk->yMin;
1914 gotPriMax = true;
1915 }
1916 }
1917 break;
1918 }
1919 if (gotPriMin) {
1920 if (newPriMin > xMin) {
1921 newPriMin = xMin;
1922 }
1923 if (newPriMin > priMin) {
1924 priMin = newPriMin;
1925 }
1926 }
1927 if (gotPriMax) {
1928 if (newPriMax < xMax) {
1929 newPriMax = xMax;
1930 }
1931 if (newPriMax < priMax) {
1932 priMax = newPriMax;
1933 }
1934 }
1935}
1936
1937int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2)
1938{
1939 TextBlock *blk1 = *(TextBlock **)p1;
1940 TextBlock *blk2 = *(TextBlock **)p2;
1941 double cmp;
1942
1943 cmp = 0; // make gcc happy
1944 switch (blk1->page->primaryRot) {
1945 case 0:
1946 if ((cmp = blk1->xMin - blk2->xMin) == 0) {
1947 cmp = blk1->yMin - blk2->yMin;
1948 }
1949 break;
1950 case 1:
1951 if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1952 cmp = blk2->xMax - blk1->xMax;
1953 }
1954 break;
1955 case 2:
1956 if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1957 cmp = blk2->yMin - blk1->yMin;
1958 }
1959 break;
1960 case 3:
1961 if ((cmp = blk2->yMax - blk1->yMax) == 0) {
1962 cmp = blk1->xMax - blk2->xMax;
1963 }
1964 break;
1965 }
1966 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1967}
1968
1969int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2)
1970{
1971 TextBlock *blk1 = *(TextBlock **)p1;
1972 TextBlock *blk2 = *(TextBlock **)p2;
1973 double cmp;
1974
1975 cmp = 0; // make gcc happy
1976 switch (blk1->page->primaryRot) {
1977 case 0:
1978 if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1979 cmp = blk1->xMin - blk2->xMin;
1980 }
1981 break;
1982 case 1:
1983 if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1984 cmp = blk1->yMin - blk2->yMin;
1985 }
1986 break;
1987 case 2:
1988 if ((cmp = blk2->yMin - blk1->yMin) == 0) {
1989 cmp = blk2->xMax - blk1->xMax;
1990 }
1991 break;
1992 case 3:
1993 if ((cmp = blk1->xMax - blk2->xMax) == 0) {
1994 cmp = blk2->yMax - blk1->yMax;
1995 }
1996 break;
1997 }
1998 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1999}
2000
2001int TextBlock::primaryCmp(const TextBlock *blk) const
2002{
2003 double cmp;
2004
2005 cmp = 0; // make gcc happy
2006 switch (rot) {
2007 case 0:
2008 cmp = xMin - blk->xMin;
2009 break;
2010 case 1:
2011 cmp = yMin - blk->yMin;
2012 break;
2013 case 2:
2014 cmp = blk->xMax - xMax;
2015 break;
2016 case 3:
2017 cmp = blk->yMax - yMax;
2018 break;
2019 }
2020 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
2021}
2022
2023double TextBlock::secondaryDelta(const TextBlock *blk) const
2024{
2025 double delta;
2026
2027 delta = 0; // make gcc happy
2028 switch (rot) {
2029 case 0:
2030 delta = blk->yMin - yMax;
2031 break;
2032 case 1:
2033 delta = xMin - blk->xMax;
2034 break;
2035 case 2:
2036 delta = yMin - blk->yMax;
2037 break;
2038 case 3:
2039 delta = blk->xMin - xMax;
2040 break;
2041 }
2042 return delta;
2043}
2044
2045bool TextBlock::isBelow(const TextBlock *blk) const
2046{
2047 bool below;
2048
2049 below = false; // make gcc happy
2050 switch (page->primaryRot) {
2051 case 0:
2052 below = xMin >= blk->priMin && xMax <= blk->priMax && yMin > blk->yMin;
2053 break;
2054 case 1:
2055 below = yMin >= blk->priMin && yMax <= blk->priMax && xMax < blk->xMax;
2056 break;
2057 case 2:
2058 below = xMin >= blk->priMin && xMax <= blk->priMax && yMax < blk->yMax;
2059 break;
2060 case 3:
2061 below = yMin >= blk->priMin && yMax <= blk->priMax && xMin > blk->xMin;
2062 break;
2063 }
2064
2065 return below;
2066}
2067
2068bool TextBlock::isBeforeByRule1(const TextBlock *blk1)
2069{
2070 bool before = false;
2071 bool overlap = false;
2072
2073 switch (this->page->primaryRot) {
2074 case 0:
2075 case 2:
2076 overlap = ((this->ExMin <= blk1->ExMin) && (blk1->ExMin <= this->ExMax)) || ((blk1->ExMin <= this->ExMin) && (this->ExMin <= blk1->ExMax));
2077 break;
2078 case 1:
2079 case 3:
2080 overlap = ((this->EyMin <= blk1->EyMin) && (blk1->EyMin <= this->EyMax)) || ((blk1->EyMin <= this->EyMin) && (this->EyMin <= blk1->EyMax));
2081 break;
2082 }
2083 switch (this->page->primaryRot) {
2084 case 0:
2085 before = overlap && this->EyMin < blk1->EyMin;
2086 break;
2087 case 1:
2088 before = overlap && this->ExMax > blk1->ExMax;
2089 break;
2090 case 2:
2091 before = overlap && this->EyMax > blk1->EyMax;
2092 break;
2093 case 3:
2094 before = overlap && this->ExMin < blk1->ExMin;
2095 break;
2096 }
2097 return before;
2098}
2099
2100bool TextBlock::isBeforeByRule2(const TextBlock *blk1)
2101{
2102 double cmp = 0;
2103 int rotLR = rot;
2104
2105 if (!page->primaryLR) {
2106 rotLR = (rotLR + 2) % 4;
2107 }
2108
2109 switch (rotLR) {
2110 case 0:
2111 cmp = ExMax - blk1->ExMin;
2112 break;
2113 case 1:
2114 cmp = EyMin - blk1->EyMax;
2115 break;
2116 case 2:
2117 cmp = blk1->ExMax - ExMin;
2118 break;
2119 case 3:
2120 cmp = blk1->EyMin - EyMax;
2121 break;
2122 }
2123 return cmp <= 0;
2124}
2125
2126// Sort into reading order by performing a topological sort using the rules
2127// given in "High Performance Document Layout Analysis", T.M. Breuel, 2003.
2128// See http://pubs.iupr.org/#2003-breuel-sdiut
2129// Topological sort is done by depth first search, see
2130// http://en.wikipedia.org/wiki/Topological_sorting
2131int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited, TextBlock **cache, int cacheSize)
2132{
2133 int pos2;
2134 TextBlock *blk1, *blk2, *blk3;
2135 bool before;
2136
2137 if (visited[pos1]) {
2138 return sortPos;
2139 }
2140
2141 blk1 = this;
2142
2143#if 0 // for debugging
2144 printf("visited: %d %.2f..%.2f %.2f..%.2f\n",
2145 sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2146#endif
2147 visited[pos1] = true;
2148 pos2 = -1;
2149 for (blk2 = blkList; blk2; blk2 = blk2->next) {
2150 pos2++;
2151 if (visited[pos2]) {
2152 // skip visited nodes
2153 continue;
2154 }
2155 before = false;
2156
2157 // is blk2 before blk1? (for table entries)
2158 if (blk1->tableId >= 0 && blk1->tableId == blk2->tableId) {
2159 if (page->primaryLR) {
2160 if (blk2->xMax <= blk1->xMin && blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin) {
2161 before = true;
2162 }
2163 } else {
2164 if (blk2->xMin >= blk1->xMax && blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin) {
2165 before = true;
2166 }
2167 }
2168
2169 if (blk2->yMax <= blk1->yMin) {
2170 before = true;
2171 }
2172 } else {
2173 if (blk2->isBeforeByRule1(blk1)) {
2174 // Rule (1) blk1 and blk2 overlap, and blk2 is above blk1.
2175 before = true;
2176#if 0 // for debugging
2177 printf("rule1: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
2178 blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax,
2179 blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2180#endif
2181 } else if (blk2->isBeforeByRule2(blk1)) {
2182 // Rule (2) blk2 left of blk1, and no intervening blk3
2183 // such that blk1 is before blk3 by rule 1,
2184 // and blk3 is before blk2 by rule 1.
2185 before = true;
2186 for (int i = 0; i < cacheSize && cache[i]; ++i) {
2187 if (blk1->isBeforeByRule1(blk1: cache[i]) && cache[i]->isBeforeByRule1(blk1: blk2)) {
2188 before = false;
2189 std::rotate(first: cache, middle: cache + i, last: cache + i + 1);
2190 break;
2191 }
2192 }
2193
2194 if (before) {
2195 for (blk3 = blkList; blk3; blk3 = blk3->next) {
2196 if (blk3 == blk2 || blk3 == blk1) {
2197 continue;
2198 }
2199 if (blk1->isBeforeByRule1(blk1: blk3) && blk3->isBeforeByRule1(blk1: blk2)) {
2200 before = false;
2201 std::copy_backward(first: cache, last: cache + cacheSize - 1, result: cache + cacheSize);
2202 cache[0] = blk3;
2203 break;
2204 }
2205 }
2206 }
2207#if 0 // for debugging
2208 if (before) {
2209 printf("rule2: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
2210 blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax,
2211 blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax);
2212 }
2213#endif
2214 }
2215 }
2216 if (before) {
2217 // blk2 is before blk1, so it needs to be visited
2218 // before we can add blk1 to the sorted list.
2219 sortPos = blk2->visitDepthFirst(blkList, pos1: pos2, sorted, sortPos, visited, cache, cacheSize);
2220 }
2221 }
2222#if 0 // for debugging
2223 printf("sorted: %d %.2f..%.2f %.2f..%.2f\n",
2224 sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2225#endif
2226 sorted[sortPos++] = blk1;
2227 return sortPos;
2228}
2229
2230int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited)
2231{
2232 const int blockCacheSize = 4;
2233 TextBlock *blockCache[blockCacheSize];
2234 std::fill(first: blockCache, last: blockCache + blockCacheSize, value: nullptr);
2235 return visitDepthFirst(blkList, pos1, sorted, sortPos, visited, cache: blockCache, cacheSize: blockCacheSize);
2236}
2237
2238//------------------------------------------------------------------------
2239// TextFlow
2240//------------------------------------------------------------------------
2241
2242TextFlow::TextFlow(TextPage *pageA, TextBlock *blk)
2243{
2244 page = pageA;
2245 xMin = blk->xMin;
2246 xMax = blk->xMax;
2247 yMin = blk->yMin;
2248 yMax = blk->yMax;
2249 priMin = blk->priMin;
2250 priMax = blk->priMax;
2251 blocks = lastBlk = blk;
2252 next = nullptr;
2253}
2254
2255TextFlow::~TextFlow()
2256{
2257 TextBlock *blk;
2258
2259 while (blocks) {
2260 blk = blocks;
2261 blocks = blocks->next;
2262 delete blk;
2263 }
2264}
2265
2266void TextFlow::addBlock(TextBlock *blk)
2267{
2268 if (lastBlk) {
2269 lastBlk->next = blk;
2270 } else {
2271 blocks = blk;
2272 }
2273 lastBlk = blk;
2274 if (blk->xMin < xMin) {
2275 xMin = blk->xMin;
2276 }
2277 if (blk->xMax > xMax) {
2278 xMax = blk->xMax;
2279 }
2280 if (blk->yMin < yMin) {
2281 yMin = blk->yMin;
2282 }
2283 if (blk->yMax > yMax) {
2284 yMax = blk->yMax;
2285 }
2286}
2287
2288bool TextFlow::blockFits(const TextBlock *blk, const TextBlock *prevBlk) const
2289{
2290 bool fits;
2291
2292 // lower blocks must use smaller fonts
2293 if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
2294 return false;
2295 }
2296
2297 fits = false; // make gcc happy
2298 switch (page->primaryRot) {
2299 case 0:
2300 fits = blk->xMin >= priMin && blk->xMax <= priMax;
2301 break;
2302 case 1:
2303 fits = blk->yMin >= priMin && blk->yMax <= priMax;
2304 break;
2305 case 2:
2306 fits = blk->xMin >= priMin && blk->xMax <= priMax;
2307 break;
2308 case 3:
2309 fits = blk->yMin >= priMin && blk->yMax <= priMax;
2310 break;
2311 }
2312 return fits;
2313}
2314
2315#ifdef TEXTOUT_WORD_LIST
2316
2317//------------------------------------------------------------------------
2318// TextWordList
2319//------------------------------------------------------------------------
2320
2321TextWordList::TextWordList(const TextPage *text, bool physLayout)
2322{
2323 TextFlow *flow;
2324 TextBlock *blk;
2325 TextLine *line;
2326 TextWord *word;
2327 TextWord **wordArray;
2328 int nWords, i;
2329
2330 if (text->rawOrder) {
2331 for (word = text->rawWords; word; word = word->next) {
2332 words.push_back(x: word);
2333 }
2334
2335 } else if (physLayout) {
2336 // this is inefficient, but it's also the least useful of these
2337 // three cases
2338 nWords = 0;
2339 for (flow = text->flows; flow; flow = flow->next) {
2340 for (blk = flow->blocks; blk; blk = blk->next) {
2341 for (line = blk->lines; line; line = line->next) {
2342 for (word = line->words; word; word = word->next) {
2343 ++nWords;
2344 }
2345 }
2346 }
2347 }
2348 wordArray = (TextWord **)gmallocn(count: nWords, size: sizeof(TextWord *));
2349 i = 0;
2350 for (flow = text->flows; flow; flow = flow->next) {
2351 for (blk = flow->blocks; blk; blk = blk->next) {
2352 for (line = blk->lines; line; line = line->next) {
2353 for (word = line->words; word; word = word->next) {
2354 wordArray[i++] = word;
2355 }
2356 }
2357 }
2358 }
2359 qsort(base: wordArray, nmemb: nWords, size: sizeof(TextWord *), compar: &TextWord::cmpYX);
2360 for (i = 0; i < nWords; ++i) {
2361 words.push_back(x: wordArray[i]);
2362 }
2363 gfree(p: wordArray);
2364
2365 } else {
2366 for (flow = text->flows; flow; flow = flow->next) {
2367 for (blk = flow->blocks; blk; blk = blk->next) {
2368 for (line = blk->lines; line; line = line->next) {
2369 for (word = line->words; word; word = word->next) {
2370 words.push_back(x: word);
2371 }
2372 }
2373 }
2374 }
2375 }
2376}
2377
2378TextWordList::~TextWordList() { }
2379
2380int TextWordList::getLength() const
2381{
2382 return words.size();
2383}
2384
2385TextWord *TextWordList::get(int idx)
2386{
2387 if (idx < 0 || idx >= (int)words.size()) {
2388 return nullptr;
2389 }
2390 return words[idx];
2391}
2392
2393#endif // TEXTOUT_WORD_LIST
2394
2395//------------------------------------------------------------------------
2396// TextPage
2397//------------------------------------------------------------------------
2398
2399TextPage::TextPage(bool rawOrderA, bool discardDiagA)
2400{
2401 int rot;
2402
2403 refCnt = 1;
2404 rawOrder = rawOrderA;
2405 discardDiag = discardDiagA;
2406 curWord = nullptr;
2407 charPos = 0;
2408 curFont = nullptr;
2409 curFontSize = 0;
2410 nest = 0;
2411 nTinyChars = 0;
2412 lastCharOverlap = false;
2413 if (!rawOrder) {
2414 for (rot = 0; rot < 4; ++rot) {
2415 pools[rot] = std::make_unique<TextPool>();
2416 }
2417 }
2418 flows = nullptr;
2419 blocks = nullptr;
2420 rawWords = nullptr;
2421 rawLastWord = nullptr;
2422 lastFindXMin = lastFindYMin = 0;
2423 haveLastFind = false;
2424 mergeCombining = true;
2425 diagonal = false;
2426}
2427
2428TextPage::~TextPage()
2429{
2430 clear();
2431}
2432
2433void TextPage::incRefCnt()
2434{
2435 refCnt++;
2436}
2437
2438void TextPage::decRefCnt()
2439{
2440 if (--refCnt == 0) {
2441 delete this;
2442 }
2443}
2444
2445void TextPage::startPage(const GfxState *state)
2446{
2447 clear();
2448 if (state) {
2449 pageWidth = state->getPageWidth();
2450 pageHeight = state->getPageHeight();
2451 } else {
2452 pageWidth = pageHeight = 0;
2453 }
2454}
2455
2456void TextPage::endPage()
2457{
2458 if (curWord) {
2459 endWord();
2460 }
2461}
2462
2463void TextPage::clear()
2464{
2465 int rot;
2466 TextFlow *flow;
2467 TextWord *word;
2468
2469 if (curWord) {
2470 delete curWord;
2471 curWord = nullptr;
2472 }
2473 if (rawOrder) {
2474 while (rawWords) {
2475 word = rawWords;
2476 rawWords = rawWords->next;
2477 delete word;
2478 }
2479 } else {
2480 for (rot = 0; rot < 4; ++rot) {
2481 pools[rot] = std::make_unique<TextPool>();
2482 }
2483 while (flows) {
2484 flow = flows;
2485 flows = flows->next;
2486 delete flow;
2487 }
2488 gfree(p: blocks);
2489 }
2490 fonts.clear();
2491 underlines.clear();
2492 links.clear();
2493
2494 diagonal = false;
2495 curWord = nullptr;
2496 charPos = 0;
2497 curFont = nullptr;
2498 curFontSize = 0;
2499 nest = 0;
2500 nTinyChars = 0;
2501 flows = nullptr;
2502 blocks = nullptr;
2503 rawWords = nullptr;
2504 rawLastWord = nullptr;
2505}
2506
2507void TextPage::updateFont(const GfxState *state)
2508{
2509 const double *fm;
2510 const char *name;
2511 int code, mCode, letterCode, anyCode;
2512 double w;
2513
2514 // get the font info object
2515 curFont = nullptr;
2516 for (const std::unique_ptr<TextFontInfo> &f : fonts) {
2517 if (f->matches(state)) {
2518 curFont = f.get();
2519 break;
2520 }
2521 }
2522 if (!curFont) {
2523 fonts.emplace_back(args: std::make_unique<TextFontInfo>(args&: state));
2524 curFont = fonts.back().get();
2525 }
2526
2527 // adjust the font size
2528 GfxFont *const gfxFont = state->getFont().get();
2529 curFontSize = state->getTransformedFontSize();
2530 if (gfxFont && gfxFont->getType() == fontType3) {
2531 // This is a hack which makes it possible to deal with some Type 3
2532 // fonts. The problem is that it's impossible to know what the
2533 // base coordinate system used in the font is without actually
2534 // rendering the font. This code tries to guess by looking at the
2535 // width of the character 'm' (which breaks if the font is a
2536 // subset that doesn't contain 'm').
2537 mCode = letterCode = anyCode = -1;
2538 for (code = 0; code < 256; ++code) {
2539 name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
2540 int nameLen = name ? strlen(s: name) : 0;
2541 bool nameOneChar = nameLen == 1 || (nameLen > 1 && name[1] == '\0');
2542 if (nameOneChar && name[0] == 'm') {
2543 mCode = code;
2544 }
2545 if (letterCode < 0 && nameOneChar && ((name[0] >= 'A' && name[0] <= 'Z') || (name[0] >= 'a' && name[0] <= 'z'))) {
2546 letterCode = code;
2547 }
2548 if (anyCode < 0 && name && ((Gfx8BitFont *)gfxFont)->getWidth(c: code) > 0) {
2549 anyCode = code;
2550 }
2551 }
2552 if (mCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(c: mCode)) > 0) {
2553 // 0.6 is a generic average 'm' width -- yes, this is a hack
2554 curFontSize *= w / 0.6;
2555 } else if (letterCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(c: letterCode)) > 0) {
2556 // even more of a hack: 0.5 is a generic letter width
2557 curFontSize *= w / 0.5;
2558 } else if (anyCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(c: anyCode)) > 0) {
2559 // better than nothing: 0.5 is a generic character width
2560 curFontSize *= w / 0.5;
2561 }
2562 fm = gfxFont->getFontMatrix();
2563 if (fm[0] != 0) {
2564 curFontSize *= fabs(x: fm[3] / fm[0]);
2565 }
2566 }
2567}
2568
2569void TextPage::beginWord(const GfxState *state)
2570{
2571 const double *fontm;
2572 double m[4], m2[4];
2573 int rot;
2574
2575 // This check is needed because Type 3 characters can contain
2576 // text-drawing operations (when TextPage is being used via
2577 // {X,Win}SplashOutputDev rather than TextOutputDev).
2578 if (curWord) {
2579 ++nest;
2580 return;
2581 }
2582
2583 // compute the rotation
2584 state->getFontTransMat(m11: &m[0], m12: &m[1], m21: &m[2], m22: &m[3]);
2585 std::shared_ptr<GfxFont> gfxFont = state->getFont();
2586 if (gfxFont && gfxFont->getType() == fontType3) {
2587 fontm = state->getFont()->getFontMatrix();
2588 m2[0] = fontm[0] * m[0] + fontm[1] * m[2];
2589 m2[1] = fontm[0] * m[1] + fontm[1] * m[3];
2590 m2[2] = fontm[2] * m[0] + fontm[3] * m[2];
2591 m2[3] = fontm[2] * m[1] + fontm[3] * m[3];
2592 m[0] = m2[0];
2593 m[1] = m2[1];
2594 m[2] = m2[2];
2595 m[3] = m2[3];
2596 }
2597 if (fabs(x: m[0] * m[3]) > fabs(x: m[1] * m[2])) {
2598 rot = (m[0] > 0 || m[3] < 0) ? 0 : 2;
2599 } else {
2600 rot = (m[2] > 0) ? 1 : 3;
2601 }
2602 if (fabs(x: m[0]) >= fabs(x: m[1])) {
2603 diagonal = fabs(x: m[1]) > diagonalThreshold * fabs(x: m[0]);
2604 } else {
2605 diagonal = fabs(x: m[0]) > diagonalThreshold * fabs(x: m[1]);
2606 }
2607
2608 // for vertical writing mode, the lines are effectively rotated 90
2609 // degrees
2610 if (gfxFont && gfxFont->getWMode()) {
2611 rot = (rot + 1) & 3;
2612 }
2613
2614 curWord = new TextWord(state, rot, curFontSize);
2615}
2616
2617void TextPage::addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen)
2618{
2619 double x1, y1, w1, h1, dx2, dy2, base, sp, delta;
2620 bool overlap;
2621 int i;
2622 int wMode;
2623 Matrix mat;
2624
2625 // subtract char and word spacing from the dx,dy values
2626 sp = state->getCharSpace();
2627 if (c == (CharCode)0x20) {
2628 sp += state->getWordSpace();
2629 }
2630 state->textTransformDelta(x1: sp * state->getHorizScaling(), y1: 0, x2: &dx2, y2: &dy2);
2631 dx -= dx2;
2632 dy -= dy2;
2633 state->transformDelta(x1: dx, y1: dy, x2: &w1, y2: &h1);
2634
2635 // throw away chars that aren't inside the page bounds
2636 // (and also do a sanity check on the character size)
2637 state->transform(x1: x, y1: y, x2: &x1, y2: &y1);
2638 if (x1 + w1 < 0 || x1 > pageWidth || y1 + h1 < 0 || y1 > pageHeight || std::isnan(x: x1) || std::isnan(x: y1) || std::isnan(x: w1) || std::isnan(x: h1)) {
2639 charPos += nBytes;
2640 return;
2641 }
2642
2643 // check the tiny chars limit
2644 if (fabs(x: w1) < 3 && fabs(x: h1) < 3) {
2645 if (++nTinyChars > 50000) {
2646 charPos += nBytes;
2647 return;
2648 }
2649 }
2650
2651 // break words at space character
2652 if (uLen == 1 && UnicodeIsWhitespace(ucs4: u[0])) {
2653 charPos += nBytes;
2654 endWord();
2655 return;
2656 } else if (uLen == 1 && u[0] == (Unicode)0x0) {
2657 // ignore null characters
2658 charPos += nBytes;
2659 return;
2660 }
2661
2662 state->getFontTransMat(m11: &mat.m[0], m12: &mat.m[1], m21: &mat.m[2], m22: &mat.m[3]);
2663 mat.m[0] *= state->getHorizScaling();
2664 mat.m[1] *= state->getHorizScaling();
2665 mat.m[4] = x1;
2666 mat.m[5] = y1;
2667
2668 if (mergeCombining && curWord && uLen == 1 && curWord->addCombining(state, fontA: curFont, fontSizeA: curFontSize, x: x1, y: y1, dx: w1, dy: h1, charPosA: charPos, charLen: nBytes, c, u: u[0], textMatA: mat)) {
2669 charPos += nBytes;
2670 return;
2671 }
2672
2673 // start a new word if:
2674 // (1) this character doesn't fall in the right place relative to
2675 // the end of the previous word (this places upper and lower
2676 // constraints on the position deltas along both the primary
2677 // and secondary axes), or
2678 // (2) this character overlaps the previous one (duplicated text), or
2679 // (3) the previous character was an overlap (we want each duplicated
2680 // character to be in a word by itself at this stage),
2681 // (4) the font size has changed
2682 // (5) the WMode changed
2683 if (curWord && curWord->len() > 0) {
2684 base = sp = delta = 0; // make gcc happy
2685 switch (curWord->rot) {
2686 case 0:
2687 base = y1;
2688 sp = x1 - curWord->xMax;
2689 delta = x1 - curWord->chars.back().edge;
2690 break;
2691 case 1:
2692 base = x1;
2693 sp = y1 - curWord->yMax;
2694 delta = y1 - curWord->chars.back().edge;
2695 break;
2696 case 2:
2697 base = y1;
2698 sp = curWord->xMin - x1;
2699 delta = curWord->chars.back().edge - x1;
2700 break;
2701 case 3:
2702 base = x1;
2703 sp = curWord->yMin - y1;
2704 delta = curWord->chars.back().edge - y1;
2705 break;
2706 }
2707 overlap = fabs(x: delta) < dupMaxPriDelta * curWord->fontSize && fabs(x: base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
2708 wMode = curFont->getWMode();
2709 if (overlap || lastCharOverlap || sp < -minDupBreakOverlap * curWord->fontSize || sp > minWordBreakSpace * curWord->fontSize || fabs(x: base - curWord->base) > 0.5 || curFontSize != curWord->fontSize || wMode != curWord->wMode) {
2710 endWord();
2711 }
2712 lastCharOverlap = overlap;
2713 } else {
2714 lastCharOverlap = false;
2715 }
2716
2717 if (uLen != 0) {
2718 // start a new word if needed
2719 if (!curWord) {
2720 beginWord(state);
2721 }
2722
2723 // throw away diagonal chars
2724 if (discardDiag && diagonal) {
2725 charPos += nBytes;
2726 return;
2727 }
2728
2729 // page rotation and/or transform matrices can cause text to be
2730 // drawn in reverse order -- in this case, swap the begin/end
2731 // coordinates and break text into individual chars
2732 if ((curWord->rot == 0 && w1 < 0) || (curWord->rot == 1 && h1 < 0) || (curWord->rot == 2 && w1 > 0) || (curWord->rot == 3 && h1 > 0)) {
2733 endWord();
2734 beginWord(state);
2735
2736 // throw away diagonal chars
2737 if (discardDiag && diagonal) {
2738 charPos += nBytes;
2739 return;
2740 }
2741
2742 x1 += w1;
2743 y1 += h1;
2744 w1 = -w1;
2745 h1 = -h1;
2746 }
2747
2748 // add the characters to the current word
2749 w1 /= uLen;
2750 h1 /= uLen;
2751 for (i = 0; i < uLen; ++i) {
2752 curWord->addChar(state, fontA: curFont, x: x1 + i * w1, y: y1 + i * h1, dx: w1, dy: h1, charPosA: charPos, charLen: nBytes, c, u: u[i], textMatA: mat);
2753 }
2754 }
2755 charPos += nBytes;
2756}
2757
2758void TextPage::incCharCount(int nChars)
2759{
2760 charPos += nChars;
2761}
2762
2763void TextPage::endWord()
2764{
2765 // This check is needed because Type 3 characters can contain
2766 // text-drawing operations (when TextPage is being used via
2767 // {X,Win}SplashOutputDev rather than TextOutputDev).
2768 if (nest > 0) {
2769 --nest;
2770 return;
2771 }
2772
2773 if (curWord) {
2774 addWord(word: curWord);
2775 curWord = nullptr;
2776 }
2777}
2778
2779void TextPage::addWord(TextWord *word)
2780{
2781 // throw away zero-length words -- they don't have valid xMin/xMax
2782 // values, and they're useless anyway
2783 if (word->len() == 0) {
2784 delete word;
2785 return;
2786 }
2787
2788 if (rawOrder) {
2789 if (rawLastWord) {
2790 rawLastWord->next = word;
2791 } else {
2792 rawWords = word;
2793 }
2794 rawLastWord = word;
2795 } else {
2796 pools[word->rot]->addWord(word);
2797 }
2798}
2799
2800void TextPage::addUnderline(double x0, double y0, double x1, double y1)
2801{
2802 underlines.emplace_back(args: std::make_unique<TextUnderline>(args&: x0, args&: y0, args&: x1, args&: y1));
2803}
2804
2805void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link)
2806{
2807 links.emplace_back(args: std::make_unique<TextLink>(args&: xMin, args&: yMin, args&: xMax, args&: yMax, args&: link));
2808}
2809
2810void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML)
2811{
2812 coalesce(physLayout, fixedPitch, doHTML, minColSpacing1: TextOutputDev::minColSpacing1_default);
2813}
2814
2815void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML, double minColSpacing1)
2816{
2817 TextWord *word0, *word1, *word2;
2818 TextLine *line;
2819 TextBlock *blkList, *blk, *lastBlk, *blk0, *blk1, *blk2;
2820 TextFlow *flow, *lastFlow;
2821 int rot, poolMinBaseIdx, baseIdx, startBaseIdx, endBaseIdx;
2822 double minBase, maxBase, newMinBase, newMaxBase;
2823 double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace;
2824 bool found;
2825 int count[4];
2826 int lrCount;
2827 int col1, col2;
2828 int j, n;
2829
2830 if (rawOrder) {
2831 primaryRot = 0;
2832 primaryLR = true;
2833 return;
2834 }
2835
2836 const UnicodeMap *uMap = globalParams->getTextEncoding();
2837 blkList = nullptr;
2838 lastBlk = nullptr;
2839 nBlocks = 0;
2840 primaryRot = 0;
2841
2842#if 0 // for debugging
2843 printf("*** initial words ***\n");
2844 for (rot = 0; rot < 4; ++rot) {
2845 pool = pools[rot];
2846 for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {
2847 for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {
2848 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d link=%p '",
2849 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2850 word0->base, word0->fontSize, rot*90, word0->link);
2851 for (i = 0; i < word0->len; ++i) {
2852 fputc(word0->text[i] & 0xff, stdout);
2853 }
2854 printf("'\n");
2855 }
2856 }
2857 }
2858 printf("\n");
2859#endif
2860
2861#if 0 //~ for debugging
2862 for (i = 0; i < underlines->getLength(); ++i) {
2863 underline = (TextUnderline *)underlines->get(i);
2864 printf("underline: x=%g..%g y=%g..%g horiz=%d\n",
2865 underline->x0, underline->x1, underline->y0, underline->y1,
2866 underline->horiz);
2867 }
2868#endif
2869
2870 if (doHTML) {
2871
2872 //----- handle underlining
2873 for (const std::unique_ptr<TextUnderline> &underline : underlines) {
2874 if (underline->horiz) {
2875 // rot = 0
2876 if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2877 startBaseIdx = pools[0]->getBaseIdx(base: underline->y0 + minUnderlineGap);
2878 endBaseIdx = pools[0]->getBaseIdx(base: underline->y0 + maxUnderlineGap);
2879 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2880 for (word0 = pools[0]->getPool(baseIdx: j); word0; word0 = word0->next) {
2881 //~ need to check the y value against the word baseline
2882 if (underline->x0 < word0->xMin + underlineSlack && word0->xMax - underlineSlack < underline->x1) {
2883 word0->underlined = true;
2884 }
2885 }
2886 }
2887 }
2888
2889 // rot = 2
2890 if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2891 startBaseIdx = pools[2]->getBaseIdx(base: underline->y0 - maxUnderlineGap);
2892 endBaseIdx = pools[2]->getBaseIdx(base: underline->y0 - minUnderlineGap);
2893 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2894 for (word0 = pools[2]->getPool(baseIdx: j); word0; word0 = word0->next) {
2895 if (underline->x0 < word0->xMin + underlineSlack && word0->xMax - underlineSlack < underline->x1) {
2896 word0->underlined = true;
2897 }
2898 }
2899 }
2900 }
2901 } else {
2902 // rot = 1
2903 if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2904 startBaseIdx = pools[1]->getBaseIdx(base: underline->x0 - maxUnderlineGap);
2905 endBaseIdx = pools[1]->getBaseIdx(base: underline->x0 - minUnderlineGap);
2906 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2907 for (word0 = pools[1]->getPool(baseIdx: j); word0; word0 = word0->next) {
2908 if (underline->y0 < word0->yMin + underlineSlack && word0->yMax - underlineSlack < underline->y1) {
2909 word0->underlined = true;
2910 }
2911 }
2912 }
2913 }
2914
2915 // rot = 3
2916 if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2917 startBaseIdx = pools[3]->getBaseIdx(base: underline->x0 + minUnderlineGap);
2918 endBaseIdx = pools[3]->getBaseIdx(base: underline->x0 + maxUnderlineGap);
2919 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2920 for (word0 = pools[3]->getPool(baseIdx: j); word0; word0 = word0->next) {
2921 if (underline->y0 < word0->yMin + underlineSlack && word0->yMax - underlineSlack < underline->y1) {
2922 word0->underlined = true;
2923 }
2924 }
2925 }
2926 }
2927 }
2928 }
2929
2930 //----- handle links
2931 for (const std::unique_ptr<TextLink> &link : links) {
2932 // rot = 0
2933 if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2934 startBaseIdx = pools[0]->getBaseIdx(base: link->yMin);
2935 endBaseIdx = pools[0]->getBaseIdx(base: link->yMax);
2936 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2937 for (word0 = pools[0]->getPool(baseIdx: j); word0; word0 = word0->next) {
2938 if (link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax && link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax) {
2939 word0->link = link->link;
2940 }
2941 }
2942 }
2943 }
2944
2945 // rot = 2
2946 if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2947 startBaseIdx = pools[2]->getBaseIdx(base: link->yMin);
2948 endBaseIdx = pools[2]->getBaseIdx(base: link->yMax);
2949 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2950 for (word0 = pools[2]->getPool(baseIdx: j); word0; word0 = word0->next) {
2951 if (link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax && link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax) {
2952 word0->link = link->link;
2953 }
2954 }
2955 }
2956 }
2957
2958 // rot = 1
2959 if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2960 startBaseIdx = pools[1]->getBaseIdx(base: link->xMin);
2961 endBaseIdx = pools[1]->getBaseIdx(base: link->xMax);
2962 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2963 for (word0 = pools[1]->getPool(baseIdx: j); word0; word0 = word0->next) {
2964 if (link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax && link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax) {
2965 word0->link = link->link;
2966 }
2967 }
2968 }
2969 }
2970
2971 // rot = 3
2972 if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2973 startBaseIdx = pools[3]->getBaseIdx(base: link->xMin);
2974 endBaseIdx = pools[3]->getBaseIdx(base: link->xMax);
2975 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2976 for (word0 = pools[3]->getPool(baseIdx: j); word0; word0 = word0->next) {
2977 if (link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax && link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax) {
2978 word0->link = link->link;
2979 }
2980 }
2981 }
2982 }
2983 }
2984 }
2985
2986 //----- assemble the blocks
2987
2988 //~ add an outer loop for writing mode (vertical text)
2989
2990 // build blocks for each rotation value
2991 for (rot = 0; rot < 4; ++rot) {
2992 std::unique_ptr<TextPool> &pool = pools[rot];
2993 poolMinBaseIdx = pool->minBaseIdx;
2994 count[rot] = 0;
2995
2996 // add blocks until no more words are left
2997 while (true) {
2998
2999 // find the first non-empty line in the pool
3000 for (; poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(baseIdx: poolMinBaseIdx); ++poolMinBaseIdx) {
3001 ;
3002 }
3003 if (poolMinBaseIdx > pool->maxBaseIdx) {
3004 break;
3005 }
3006
3007 // look for the left-most word in the first four lines of the
3008 // pool -- this avoids starting with a superscript word
3009 startBaseIdx = poolMinBaseIdx;
3010 for (baseIdx = poolMinBaseIdx + 1; baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx; ++baseIdx) {
3011 if (!pool->getPool(baseIdx)) {
3012 continue;
3013 }
3014 if (pool->getPool(baseIdx)->primaryCmp(word: pool->getPool(baseIdx: startBaseIdx)) < 0) {
3015 startBaseIdx = baseIdx;
3016 }
3017 }
3018
3019 // create a new block
3020 word0 = pool->getPool(baseIdx: startBaseIdx);
3021 pool->setPool(baseIdx: startBaseIdx, p: word0->next);
3022 word0->next = nullptr;
3023 blk = new TextBlock(this, rot);
3024 blk->addWord(word: word0);
3025
3026 fontSize = word0->fontSize;
3027 minBase = maxBase = word0->base;
3028 colSpace1 = minColSpacing1 * fontSize;
3029 colSpace2 = minColSpacing2 * fontSize;
3030 lineSpace = maxLineSpacingDelta * fontSize;
3031 intraLineSpace = maxIntraLineDelta * fontSize;
3032
3033 // add words to the block
3034 do {
3035 found = false;
3036
3037 // look for words on the line above the current top edge of
3038 // the block
3039 newMinBase = minBase;
3040 for (baseIdx = pool->getBaseIdx(base: minBase); baseIdx >= pool->getBaseIdx(base: minBase - lineSpace); --baseIdx) {
3041 word0 = nullptr;
3042 word1 = pool->getPool(baseIdx);
3043 while (word1) {
3044 if (word1->base < minBase && word1->base >= minBase - lineSpace && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) : (word1->yMin < blk->yMax && word1->yMax > blk->yMin))
3045 && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta1 * fontSize) {
3046 word2 = word1;
3047 if (word0) {
3048 word0->next = word1->next;
3049 } else {
3050 pool->setPool(baseIdx, p: word1->next);
3051 }
3052 word1 = word1->next;
3053 word2->next = nullptr;
3054 blk->addWord(word: word2);
3055 found = true;
3056 newMinBase = word2->base;
3057 } else {
3058 word0 = word1;
3059 word1 = word1->next;
3060 }
3061 }
3062 }
3063 minBase = newMinBase;
3064
3065 // look for words on the line below the current bottom edge of
3066 // the block
3067 newMaxBase = maxBase;
3068 for (baseIdx = pool->getBaseIdx(base: maxBase); baseIdx <= pool->getBaseIdx(base: maxBase + lineSpace); ++baseIdx) {
3069 word0 = nullptr;
3070 word1 = pool->getPool(baseIdx);
3071 while (word1) {
3072 if (word1->base > maxBase && word1->base <= maxBase + lineSpace && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) : (word1->yMin < blk->yMax && word1->yMax > blk->yMin))
3073 && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta1 * fontSize) {
3074 word2 = word1;
3075 if (word0) {
3076 word0->next = word1->next;
3077 } else {
3078 pool->setPool(baseIdx, p: word1->next);
3079 }
3080 word1 = word1->next;
3081 word2->next = nullptr;
3082 blk->addWord(word: word2);
3083 found = true;
3084 newMaxBase = word2->base;
3085 } else {
3086 word0 = word1;
3087 word1 = word1->next;
3088 }
3089 }
3090 }
3091 maxBase = newMaxBase;
3092
3093 // look for words that are on lines already in the block, and
3094 // that overlap the block horizontally
3095 for (baseIdx = pool->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) {
3096 word0 = nullptr;
3097 word1 = pool->getPool(baseIdx);
3098 while (word1) {
3099 if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3100 && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax + colSpace1 && word1->xMax > blk->xMin - colSpace1) : (word1->yMin < blk->yMax + colSpace1 && word1->yMax > blk->yMin - colSpace1))
3101 && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta2 * fontSize) {
3102 word2 = word1;
3103 if (word0) {
3104 word0->next = word1->next;
3105 } else {
3106 pool->setPool(baseIdx, p: word1->next);
3107 }
3108 word1 = word1->next;
3109 word2->next = nullptr;
3110 blk->addWord(word: word2);
3111 found = true;
3112 } else {
3113 word0 = word1;
3114 word1 = word1->next;
3115 }
3116 }
3117 }
3118
3119 // only check for outlying words (the next two chunks of code)
3120 // if we didn't find anything else
3121 if (found) {
3122 continue;
3123 }
3124
3125 // scan down the left side of the block, looking for words
3126 // that are near (but not overlapping) the block; if there are
3127 // three or fewer, add them to the block
3128 n = 0;
3129 for (baseIdx = pool->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) {
3130 word1 = pool->getPool(baseIdx);
3131 while (word1) {
3132 if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3133 && ((rot == 0 || rot == 2) ? (word1->xMax <= blk->xMin && word1->xMax > blk->xMin - colSpace2) : (word1->yMax <= blk->yMin && word1->yMax > blk->yMin - colSpace2))
3134 && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3135 ++n;
3136 break;
3137 }
3138 word1 = word1->next;
3139 }
3140 }
3141 if (n > 0 && n <= 3) {
3142 for (baseIdx = pool->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) {
3143 word0 = nullptr;
3144 word1 = pool->getPool(baseIdx);
3145 while (word1) {
3146 if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3147 && ((rot == 0 || rot == 2) ? (word1->xMax <= blk->xMin && word1->xMax > blk->xMin - colSpace2) : (word1->yMax <= blk->yMin && word1->yMax > blk->yMin - colSpace2))
3148 && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3149 word2 = word1;
3150 if (word0) {
3151 word0->next = word1->next;
3152 } else {
3153 pool->setPool(baseIdx, p: word1->next);
3154 }
3155 word1 = word1->next;
3156 word2->next = nullptr;
3157 blk->addWord(word: word2);
3158 if (word2->base < minBase) {
3159 minBase = word2->base;
3160 } else if (word2->base > maxBase) {
3161 maxBase = word2->base;
3162 }
3163 found = true;
3164 break;
3165 } else {
3166 word0 = word1;
3167 word1 = word1->next;
3168 }
3169 }
3170 }
3171 }
3172
3173 // scan down the right side of the block, looking for words
3174 // that are near (but not overlapping) the block; if there are
3175 // three or fewer, add them to the block
3176 n = 0;
3177 for (baseIdx = pool->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) {
3178 word1 = pool->getPool(baseIdx);
3179 while (word1) {
3180 if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3181 && ((rot == 0 || rot == 2) ? (word1->xMin >= blk->xMax && word1->xMin < blk->xMax + colSpace2) : (word1->yMin >= blk->yMax && word1->yMin < blk->yMax + colSpace2))
3182 && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3183 ++n;
3184 break;
3185 }
3186 word1 = word1->next;
3187 }
3188 }
3189 if (n > 0 && n <= 3) {
3190 for (baseIdx = pool->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) {
3191 word0 = nullptr;
3192 word1 = pool->getPool(baseIdx);
3193 while (word1) {
3194 if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3195 && ((rot == 0 || rot == 2) ? (word1->xMin >= blk->xMax && word1->xMin < blk->xMax + colSpace2) : (word1->yMin >= blk->yMax && word1->yMin < blk->yMax + colSpace2))
3196 && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3197 word2 = word1;
3198 if (word0) {
3199 word0->next = word1->next;
3200 } else {
3201 pool->setPool(baseIdx, p: word1->next);
3202 }
3203 word1 = word1->next;
3204 word2->next = nullptr;
3205 blk->addWord(word: word2);
3206 if (word2->base < minBase) {
3207 minBase = word2->base;
3208 } else if (word2->base > maxBase) {
3209 maxBase = word2->base;
3210 }
3211 found = true;
3212 break;
3213 } else {
3214 word0 = word1;
3215 word1 = word1->next;
3216 }
3217 }
3218 }
3219 }
3220
3221 } while (found);
3222
3223 //~ need to compute the primary writing mode (horiz/vert) in
3224 //~ addition to primary rotation
3225
3226 // coalesce the block, and add it to the list
3227 blk->coalesce(uMap, fixedPitch);
3228 if (lastBlk) {
3229 lastBlk->next = blk;
3230 } else {
3231 blkList = blk;
3232 }
3233 lastBlk = blk;
3234 count[rot] += blk->charCount;
3235 ++nBlocks;
3236 }
3237
3238 if (count[rot] > count[primaryRot]) {
3239 primaryRot = rot;
3240 }
3241 }
3242
3243#if 0 // for debugging
3244 printf("*** rotation ***\n");
3245 for (rot = 0; rot < 4; ++rot) {
3246 printf(" %d: %6d\n", rot, count[rot]);
3247 }
3248 printf(" primary rot = %d\n", primaryRot);
3249 printf("\n");
3250#endif
3251
3252#if 0 // for debugging
3253 printf("*** blocks ***\n");
3254 for (blk = blkList; blk; blk = blk->next) {
3255 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
3256 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax);
3257 for (line = blk->lines; line; line = line->next) {
3258 printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
3259 line->xMin, line->xMax, line->yMin, line->yMax, line->base);
3260 for (word0 = line->words; word0; word0 = word0->next) {
3261 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3262 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3263 word0->base, word0->fontSize, word0->spaceAfter);
3264 for (i = 0; i < word0->len; ++i) {
3265 fputc(word0->text[i] & 0xff, stdout);
3266 }
3267 printf("'\n");
3268 }
3269 }
3270 }
3271 printf("\n");
3272#endif
3273
3274 // determine the primary direction
3275 lrCount = 0;
3276 for (blk = blkList; blk; blk = blk->next) {
3277 for (line = blk->lines; line; line = line->next) {
3278 for (word0 = line->words; word0; word0 = word0->next) {
3279 for (size_t i = 0; i < word0->len(); ++i) {
3280 if (unicodeTypeL(c: word0->chars[i].text)) {
3281 ++lrCount;
3282 } else if (unicodeTypeR(c: word0->chars[i].text)) {
3283 --lrCount;
3284 }
3285 }
3286 }
3287 }
3288 }
3289 primaryLR = lrCount >= 0;
3290
3291#if 0 // for debugging
3292 printf("*** direction ***\n");
3293 printf("lrCount = %d\n", lrCount);
3294 printf("primaryLR = %d\n", primaryLR);
3295#endif
3296
3297 //----- column assignment
3298
3299 // sort blocks into xy order for column assignment
3300 if (blocks) {
3301 gfree(p: blocks);
3302 }
3303 if (physLayout && fixedPitch) {
3304
3305 blocks = (TextBlock **)gmallocn(count: nBlocks, size: sizeof(TextBlock *));
3306 int i;
3307 for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
3308 blocks[i] = blk;
3309 col1 = 0; // make gcc happy
3310 switch (primaryRot) {
3311 case 0:
3312 col1 = (int)(blk->xMin / fixedPitch + 0.5);
3313 break;
3314 case 1:
3315 col1 = (int)(blk->yMin / fixedPitch + 0.5);
3316 break;
3317 case 2:
3318 col1 = (int)((pageWidth - blk->xMax) / fixedPitch + 0.5);
3319 break;
3320 case 3:
3321 col1 = (int)((pageHeight - blk->yMax) / fixedPitch + 0.5);
3322 break;
3323 }
3324 blk->col = col1;
3325 for (line = blk->lines; line; line = line->next) {
3326 for (j = 0; j <= line->len; ++j) {
3327 line->col[j] += col1;
3328 }
3329 }
3330 }
3331
3332 } else {
3333
3334 // sort blocks into xy order for column assignment
3335 blocks = (TextBlock **)gmallocn(count: nBlocks, size: sizeof(TextBlock *));
3336 int i;
3337 for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
3338 blocks[i] = blk;
3339 }
3340 if (blocks) {
3341 qsort(base: blocks, nmemb: nBlocks, size: sizeof(TextBlock *), compar: &TextBlock::cmpXYPrimaryRot);
3342 }
3343
3344 // column assignment
3345 for (i = 0; i < nBlocks; ++i) {
3346 blk0 = blocks[i];
3347 col1 = 0;
3348 for (j = 0; j < i; ++j) {
3349 blk1 = blocks[j];
3350 col2 = 0; // make gcc happy
3351 switch (primaryRot) {
3352 case 0:
3353 if (blk0->xMin > blk1->xMax) {
3354 col2 = blk1->col + blk1->nColumns + 3;
3355 } else if (blk1->xMax == blk1->xMin) {
3356 col2 = blk1->col;
3357 } else {
3358 col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) / (blk1->xMax - blk1->xMin)) * blk1->nColumns);
3359 }
3360 break;
3361 case 1:
3362 if (blk0->yMin > blk1->yMax) {
3363 col2 = blk1->col + blk1->nColumns + 3;
3364 } else if (blk1->yMax == blk1->yMin) {
3365 col2 = blk1->col;
3366 } else {
3367 col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) / (blk1->yMax - blk1->yMin)) * blk1->nColumns);
3368 }
3369 break;
3370 case 2:
3371 if (blk0->xMax < blk1->xMin) {
3372 col2 = blk1->col + blk1->nColumns + 3;
3373 } else if (blk1->xMin == blk1->xMax) {
3374 col2 = blk1->col;
3375 } else {
3376 col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) / (blk1->xMin - blk1->xMax)) * blk1->nColumns);
3377 }
3378 break;
3379 case 3:
3380 if (blk0->yMax < blk1->yMin) {
3381 col2 = blk1->col + blk1->nColumns + 3;
3382 } else if (blk1->yMin == blk1->yMax) {
3383 col2 = blk1->col;
3384 } else {
3385 col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) / (blk1->yMin - blk1->yMax)) * blk1->nColumns);
3386 }
3387 break;
3388 }
3389 if (col2 > col1) {
3390 col1 = col2;
3391 }
3392 }
3393 blk0->col = col1;
3394 for (line = blk0->lines; line; line = line->next) {
3395 for (j = 0; j <= line->len; ++j) {
3396 line->col[j] += col1;
3397 }
3398 }
3399 }
3400 }
3401
3402#if 0 // for debugging
3403 printf("*** blocks, after column assignment ***\n");
3404 for (blk = blkList; blk; blk = blk->next) {
3405 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
3406 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
3407 blk->nColumns);
3408 for (line = blk->lines; line; line = line->next) {
3409 printf(" line: col[0]=%d\n", line->col[0]);
3410 for (word0 = line->words; word0; word0 = word0->next) {
3411 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3412 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3413 word0->base, word0->fontSize, word0->spaceAfter);
3414 for (i = 0; i < word0->len; ++i) {
3415 fputc(word0->text[i] & 0xff, stdout);
3416 }
3417 printf("'\n");
3418 }
3419 }
3420 }
3421 printf("\n");
3422#endif
3423
3424 //----- reading order sort
3425
3426 // compute space on left and right sides of each block
3427 for (int i = 0; i < nBlocks; ++i) {
3428 blk0 = blocks[i];
3429 for (j = 0; j < nBlocks; ++j) {
3430 blk1 = blocks[j];
3431 if (blk1 != blk0) {
3432 blk0->updatePriMinMax(blk: blk1);
3433 }
3434 }
3435 }
3436
3437#if 0 // for debugging
3438 printf("PAGE\n");
3439#endif
3440
3441 int sortPos = 0;
3442 bool *visited = (bool *)gmallocn(count: nBlocks, size: sizeof(bool));
3443 for (int i = 0; i < nBlocks; i++) {
3444 visited[i] = false;
3445 }
3446
3447 double bxMin0, byMin0, bxMin1, byMin1;
3448 int numTables = 0;
3449 int tableId = -1;
3450 int correspondenceX, correspondenceY;
3451 double xCentre1, yCentre1, xCentre2, yCentre2;
3452 double xCentre3, yCentre3, xCentre4, yCentre4;
3453 double deltaX, deltaY;
3454 TextBlock *fblk2 = nullptr, *fblk3 = nullptr, *fblk4 = nullptr;
3455
3456 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3457 blk1->ExMin = blk1->xMin;
3458 blk1->ExMax = blk1->xMax;
3459 blk1->EyMin = blk1->yMin;
3460 blk1->EyMax = blk1->yMax;
3461
3462 bxMin0 = DBL_MAX;
3463 byMin0 = DBL_MAX;
3464 bxMin1 = DBL_MAX;
3465 byMin1 = DBL_MAX;
3466
3467 fblk2 = nullptr;
3468 fblk3 = nullptr;
3469 fblk4 = nullptr;
3470
3471 /* find fblk2, fblk3 and fblk4 so that
3472 * fblk2 is on the right of blk1 and overlap with blk1 in y axis
3473 * fblk3 is under blk1 and overlap with blk1 in x axis
3474 * fblk4 is under blk1 and on the right of blk1
3475 * and they are closest to blk1
3476 */
3477 for (blk2 = blkList; blk2; blk2 = blk2->next) {
3478 if (blk2 != blk1) {
3479 if (blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin && blk2->xMin > blk1->xMax && blk2->xMin < bxMin0) {
3480 bxMin0 = blk2->xMin;
3481 fblk2 = blk2;
3482 } else if (blk2->xMin <= blk1->xMax && blk2->xMax >= blk1->xMin && blk2->yMin > blk1->yMax && blk2->yMin < byMin0) {
3483 byMin0 = blk2->yMin;
3484 fblk3 = blk2;
3485 } else if (blk2->xMin > blk1->xMax && blk2->xMin < bxMin1 && blk2->yMin > blk1->yMax && blk2->yMin < byMin1) {
3486 bxMin1 = blk2->xMin;
3487 byMin1 = blk2->yMin;
3488 fblk4 = blk2;
3489 }
3490 }
3491 }
3492
3493 /* fblk4 can not overlap with fblk3 in x and with fblk2 in y
3494 * fblk2 can not overlap with fblk3 in x and y
3495 * fblk4 has to overlap with fblk3 in y and with fblk2 in x
3496 */
3497 if (fblk2 != nullptr && fblk3 != nullptr && fblk4 != nullptr) {
3498 if (((fblk3->xMin <= fblk4->xMax && fblk3->xMax >= fblk4->xMin) || (fblk2->yMin <= fblk4->yMax && fblk2->yMax >= fblk4->yMin) || (fblk2->xMin <= fblk3->xMax && fblk2->xMax >= fblk3->xMin)
3499 || (fblk2->yMin <= fblk3->yMax && fblk2->yMax >= fblk3->yMin))
3500 || !(fblk4->xMin <= fblk2->xMax && fblk4->xMax >= fblk2->xMin && fblk4->yMin <= fblk3->yMax && fblk4->yMax >= fblk3->yMin)) {
3501 fblk2 = nullptr;
3502 fblk3 = nullptr;
3503 fblk4 = nullptr;
3504 }
3505 }
3506
3507 // if we found any then look whether they form a table
3508 if (fblk2 != nullptr && fblk3 != nullptr && fblk4 != nullptr) {
3509 tableId = -1;
3510 correspondenceX = 0;
3511 correspondenceY = 0;
3512 deltaX = 0.0;
3513 deltaY = 0.0;
3514
3515 if (blk1->lines && blk1->lines->words) {
3516 deltaX = blk1->lines->words->getFontSize();
3517 }
3518 if (fblk2->lines && fblk2->lines->words) {
3519 deltaX = deltaX < fblk2->lines->words->getFontSize() ? deltaX : fblk2->lines->words->getFontSize();
3520 }
3521 if (fblk3->lines && fblk3->lines->words) {
3522 deltaX = deltaX < fblk3->lines->words->getFontSize() ? deltaX : fblk3->lines->words->getFontSize();
3523 }
3524 if (fblk4->lines && fblk4->lines->words) {
3525 deltaX = deltaX < fblk4->lines->words->getFontSize() ? deltaX : fblk4->lines->words->getFontSize();
3526 }
3527
3528 deltaY = deltaX;
3529
3530 deltaX *= minColSpacing1;
3531 deltaY *= maxIntraLineDelta;
3532
3533 xCentre1 = (blk1->xMax + blk1->xMin) / 2.0;
3534 yCentre1 = (blk1->yMax + blk1->yMin) / 2.0;
3535 xCentre2 = (fblk2->xMax + fblk2->xMin) / 2.0;
3536 yCentre2 = (fblk2->yMax + fblk2->yMin) / 2.0;
3537 xCentre3 = (fblk3->xMax + fblk3->xMin) / 2.0;
3538 yCentre3 = (fblk3->yMax + fblk3->yMin) / 2.0;
3539 xCentre4 = (fblk4->xMax + fblk4->xMin) / 2.0;
3540 yCentre4 = (fblk4->yMax + fblk4->yMin) / 2.0;
3541
3542 // are blocks centrally aligned in x ?
3543 if (fabs(x: xCentre1 - xCentre3) <= deltaX && fabs(x: xCentre2 - xCentre4) <= deltaX) {
3544 correspondenceX++;
3545 }
3546
3547 // are blocks centrally aligned in y ?
3548 if (fabs(x: yCentre1 - yCentre2) <= deltaY && fabs(x: yCentre3 - yCentre4) <= deltaY) {
3549 correspondenceY++;
3550 }
3551
3552 // are blocks aligned to the left ?
3553 if (fabs(x: blk1->xMin - fblk3->xMin) <= deltaX && fabs(x: fblk2->xMin - fblk4->xMin) <= deltaX) {
3554 correspondenceX++;
3555 }
3556
3557 // are blocks aligned to the right ?
3558 if (fabs(x: blk1->xMax - fblk3->xMax) <= deltaX && fabs(x: fblk2->xMax - fblk4->xMax) <= deltaX) {
3559 correspondenceX++;
3560 }
3561
3562 // are blocks aligned to the top ?
3563 if (fabs(x: blk1->yMin - fblk2->yMin) <= deltaY && fabs(x: fblk3->yMin - fblk4->yMin) <= deltaY) {
3564 correspondenceY++;
3565 }
3566
3567 // are blocks aligned to the bottom ?
3568 if (fabs(x: blk1->yMax - fblk2->yMax) <= deltaY && fabs(x: fblk3->yMax - fblk4->yMax) <= deltaY) {
3569 correspondenceY++;
3570 }
3571
3572 // are blocks aligned in x and y ?
3573 if (correspondenceX > 0 && correspondenceY > 0) {
3574
3575 // find maximal tableId
3576 tableId = tableId < fblk4->tableId ? fblk4->tableId : tableId;
3577 tableId = tableId < fblk3->tableId ? fblk3->tableId : tableId;
3578 tableId = tableId < fblk2->tableId ? fblk2->tableId : tableId;
3579 tableId = tableId < blk1->tableId ? blk1->tableId : tableId;
3580
3581 // if the tableId is -1, then we found new table
3582 if (tableId < 0) {
3583 tableId = numTables;
3584 numTables++;
3585 }
3586
3587 blk1->tableId = tableId;
3588 fblk2->tableId = tableId;
3589 fblk3->tableId = tableId;
3590 fblk4->tableId = tableId;
3591 }
3592 }
3593 }
3594
3595 /* set extended bounding boxes of all table entries
3596 * so that they contain whole table
3597 * (we need to process whole table size when comparing it
3598 * with regular text blocks)
3599 */
3600 PDFRectangle *envelopes = new PDFRectangle[numTables];
3601 TextBlock **ending_blocks = new TextBlock *[numTables];
3602
3603 for (int i = 0; i < numTables; i++) {
3604 envelopes[i].x1 = DBL_MAX;
3605 envelopes[i].x2 = DBL_MIN;
3606 envelopes[i].y1 = DBL_MAX;
3607 envelopes[i].y2 = DBL_MIN;
3608 ending_blocks[i] = nullptr;
3609 }
3610
3611 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3612 if (blk1->tableId >= 0) {
3613 if (blk1->ExMin < envelopes[blk1->tableId].x1) {
3614 envelopes[blk1->tableId].x1 = blk1->ExMin;
3615 if (!blk1->page->primaryLR) {
3616 ending_blocks[blk1->tableId] = blk1;
3617 }
3618 }
3619
3620 if (blk1->ExMax > envelopes[blk1->tableId].x2) {
3621 envelopes[blk1->tableId].x2 = blk1->ExMax;
3622 if (blk1->page->primaryLR) {
3623 ending_blocks[blk1->tableId] = blk1;
3624 }
3625 }
3626
3627 envelopes[blk1->tableId].y1 = blk1->EyMin < envelopes[blk1->tableId].y1 ? blk1->EyMin : envelopes[blk1->tableId].y1;
3628 envelopes[blk1->tableId].y2 = blk1->EyMax > envelopes[blk1->tableId].y2 ? blk1->EyMax : envelopes[blk1->tableId].y2;
3629 }
3630 }
3631
3632 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3633 if (blk1->tableId >= 0 && ending_blocks[blk1->tableId] && blk1->xMin <= ending_blocks[blk1->tableId]->xMax && blk1->xMax >= ending_blocks[blk1->tableId]->xMin) {
3634 blk1->tableEnd = true;
3635 }
3636 }
3637
3638 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3639 if (blk1->tableId >= 0) {
3640 blk1->ExMin = envelopes[blk1->tableId].x1;
3641 blk1->ExMax = envelopes[blk1->tableId].x2;
3642 blk1->EyMin = envelopes[blk1->tableId].y1;
3643 blk1->EyMax = envelopes[blk1->tableId].y2;
3644 }
3645 }
3646 delete[] envelopes;
3647 delete[] ending_blocks;
3648
3649 /* set extended bounding boxes of all other blocks
3650 * so that they extend in x without hitting neighbours
3651 */
3652 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3653 if (!(blk1->tableId >= 0)) {
3654 double xMax = DBL_MAX;
3655 double xMin = DBL_MIN;
3656
3657 for (blk2 = blkList; blk2; blk2 = blk2->next) {
3658 if (blk2 == blk1) {
3659 continue;
3660 }
3661
3662 if (blk1->yMin <= blk2->yMax && blk1->yMax >= blk2->yMin) {
3663 if (blk2->xMin < xMax && blk2->xMin > blk1->xMax) {
3664 xMax = blk2->xMin;
3665 }
3666
3667 if (blk2->xMax > xMin && blk2->xMax < blk1->xMin) {
3668 xMin = blk2->xMax;
3669 }
3670 }
3671 }
3672
3673 for (blk2 = blkList; blk2; blk2 = blk2->next) {
3674 if (blk2 == blk1) {
3675 continue;
3676 }
3677
3678 if (blk2->xMax > blk1->ExMax && blk2->xMax <= xMax && blk2->yMin >= blk1->yMax) {
3679 blk1->ExMax = blk2->xMax;
3680 }
3681
3682 if (blk2->xMin < blk1->ExMin && blk2->xMin >= xMin && blk2->yMin >= blk1->yMax) {
3683 blk1->ExMin = blk2->xMin;
3684 }
3685 }
3686 }
3687 }
3688
3689 int i = -1;
3690 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3691 i++;
3692 sortPos = blk1->visitDepthFirst(blkList, pos1: i, sorted: blocks, sortPos, visited);
3693 }
3694 if (visited) {
3695 gfree(p: visited);
3696 }
3697
3698#if 0 // for debugging
3699 printf("*** blocks, after ro sort ***\n");
3700 for (i = 0; i < nBlocks; ++i) {
3701 blk = blocks[i];
3702 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
3703 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
3704 blk->priMin, blk->priMax);
3705 for (line = blk->lines; line; line = line->next) {
3706 printf(" line:\n");
3707 for (word0 = line->words; word0; word0 = word0->next) {
3708 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3709 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3710 word0->base, word0->fontSize, word0->spaceAfter);
3711 for (j = 0; j < word0->len; ++j) {
3712 fputc(word0->text[j] & 0xff, stdout);
3713 }
3714 printf("'\n");
3715 }
3716 }
3717 }
3718 printf("\n");
3719 fflush(stdout);
3720#endif
3721
3722 // build the flows
3723 //~ this needs to be adjusted for writing mode (vertical text)
3724 //~ this also needs to account for right-to-left column ordering
3725 while (flows) {
3726 flow = flows;
3727 flows = flows->next;
3728 delete flow;
3729 }
3730 flow = nullptr;
3731 flows = lastFlow = nullptr;
3732 // assume blocks are already in reading order,
3733 // and construct flows accordingly.
3734 for (i = 0; i < nBlocks; i++) {
3735 blk = blocks[i];
3736 blk->next = nullptr;
3737 if (flow) {
3738 blk1 = blocks[i - 1];
3739 blkSpace = maxBlockSpacing * blk1->lines->words->fontSize;
3740 if (blk1->secondaryDelta(blk) <= blkSpace && blk->isBelow(blk: blk1) && flow->blockFits(blk, prevBlk: blk1)) {
3741 flow->addBlock(blk);
3742 continue;
3743 }
3744 }
3745 flow = new TextFlow(this, blk);
3746 if (lastFlow) {
3747 lastFlow->next = flow;
3748 } else {
3749 flows = flow;
3750 }
3751 lastFlow = flow;
3752 }
3753
3754#if 0 // for debugging
3755 printf("*** flows ***\n");
3756 for (flow = flows; flow; flow = flow->next) {
3757 printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
3758 flow->xMin, flow->xMax, flow->yMin, flow->yMax,
3759 flow->priMin, flow->priMax);
3760 for (blk = flow->blocks; blk; blk = blk->next) {
3761 printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
3762 blk->rot, blk->ExMin, blk->ExMax, blk->EyMin, blk->EyMax,
3763 blk->priMin, blk->priMax);
3764 for (line = blk->lines; line; line = line->next) {
3765 printf(" line:\n");
3766 for (word0 = line->words; word0; word0 = word0->next) {
3767 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3768 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3769 word0->base, word0->fontSize, word0->spaceAfter);
3770 for (i = 0; i < word0->len; ++i) {
3771 fputc(word0->text[i] & 0xff, stdout);
3772 }
3773 printf("'\n");
3774 }
3775 }
3776 }
3777 }
3778 printf("\n");
3779#endif
3780}
3781
3782void TextPage::adjustRotation(TextLine *line, int start, int end, double *xMin, double *xMax, double *yMin, double *yMax)
3783{
3784 switch (line->rot) {
3785 case 0:
3786 *xMin = line->edge[start];
3787 *xMax = line->edge[end];
3788 *yMin = line->yMin;
3789 *yMax = line->yMax;
3790 break;
3791 case 1:
3792 *xMin = line->xMin;
3793 *xMax = line->xMax;
3794 *yMin = line->edge[start];
3795 *yMax = line->edge[end];
3796 break;
3797 case 2:
3798 *xMin = line->edge[end];
3799 *xMax = line->edge[start];
3800 *yMin = line->yMin;
3801 *yMax = line->yMax;
3802 break;
3803 case 3:
3804 *xMin = line->xMin;
3805 *xMax = line->xMax;
3806 *yMin = line->edge[end];
3807 *yMax = line->edge[start];
3808 break;
3809 }
3810}
3811
3812bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax)
3813{
3814 return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, ignoreDiacritics: false, matchAcrossLines: false, backward, wholeWord, xMin, yMin, xMax, yMax, continueMatch: nullptr, ignoredHyphen: nullptr);
3815}
3816
3817bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax,
3818 double *yMax)
3819{
3820 return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, ignoreDiacritics, matchAcrossLines: false, backward, wholeWord, xMin, yMin, xMax, yMax, continueMatch: nullptr, ignoredHyphen: nullptr);
3821}
3822
// Search the text of this page for the string s and report the rectangle of
// the selected match.  This is the full implementation; the shorter findText
// overloads delegate to it.
//
// Parameters:
//   s, len            search string and its length.  The string is passed
//                     through reorderText (using primaryLR) and
//                     unicodeNormalizeNFKC before matching; len is replaced
//                     by the normalized length.
//   startAtTop        if true there is no start limit.  Otherwise the start
//                     position is the previous find (startAtLast set and
//                     haveLastFind true), or else (*xMin, *yMin).
//   stopAtBottom      if true there is no stop limit.  Otherwise the stop
//                     position is the previous find (stopAtLast set and
//                     haveLastFind true), or else (*xMax, *yMax).
//   caseSensitive     if false, search string and line text are both mapped
//                     through unicodeToUpper before being compared.
//   ignoreDiacritics  compare against each line's ascii_translation (built
//                     with unicodeToAscii7) instead of its normalized text.
//                     Switched off if the search string contains a character
//                     for which isAscii7 is false, and for the remainder of
//                     the search once a line yields ascii_len == 0.
//   matchAcrossLines  allow a match to continue on the following line;
//                     forced off when backward is set.
//   backward          visit blocks and in-line positions in reverse, and keep
//                     the in-range match with the greatest (yMin, xMin)
//                     instead of the smallest.
//   wholeWord         require the match to be delimited on both sides by a
//                     line boundary or a character for which
//                     unicodeTypeAlphaNum is false.
//   xMin..yMax        in: limits as described above; out: rectangle of the
//                     match (only written when the function returns true).
//   continueMatch     optional (forward search only).  When the selected
//                     match continues on the next line it receives the
//                     rectangle of that continuation; otherwise its x1 is
//                     set to std::numeric_limits<double>::max().
//   ignoredHyphen     optional (forward search only).  Receives whether the
//                     selected cross-line match skipped an end-of-line '-'.
//
// Returns true if a match was found; in that case lastFindXMin, lastFindYMin
// and haveLastFind are updated.  Returns false if len == 0, if the page is
// in rawOrder, or if nothing matched.
3823bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool matchAcrossLines, bool backward, bool wholeWord, double *xMin,
3824 double *yMin, double *xMax, double *yMax, PDFRectangle *continueMatch, bool *ignoredHyphen)
3825{
3826 TextBlock *blk;
3827 TextLine *line;
3828 Unicode *s2, *txt, *reordered;
3829 Unicode *p;
3830 TextLine *nextline;
3831 Unicode *nextline_txt;
3832 int nextline_len;
3833 bool nextlineAfterHyphen = false;
3834 int txtSize, m, i, j, k;
3835 double xStart, yStart, xStop, yStop;
3836 double xMin0, yMin0, xMax0, yMax0;
3837 double xMin1, yMin1, xMax1, yMax1;
3838 double xMin2, yMin2, xMax2, yMax2;
3839 bool found;
3840
     // an empty search string never matches
3841 if (len == 0) {
3842 return false;
3843 }
3844
     // searching is not performed on pages kept in raw order
3845 if (rawOrder) {
3846 return false;
3847 }
3848
3849 if (matchAcrossLines && backward) {
3850 // matchAcrossLines is unimplemented for backward search
3851 matchAcrossLines = false;
3852 }
3853
3854 // handle right-to-left text
     // (reordered and s2 are both released at the end of the function)
3855 reordered = (Unicode *)gmallocn(count: len, size: sizeof(Unicode));
3856 reorderText(text: s, len, uMap: nullptr, primaryLR, s: nullptr, u: reordered);
3857
3858 // normalize the search string
     // note: len is overwritten with the length of the normalized string
3859 s2 = unicodeNormalizeNFKC(in: reordered, len, out_len: &len, indices: nullptr);
3860
3861 // if search string is not pure ascii then don't
3862 // use ignoreDiacritics (as they won't match)
3863 if (!caseSensitive) {
3864 // convert the search string to uppercase
     // (the ascii check is done on the uppercased character, and the loop
     // keeps going after ignoreDiacritics is cleared so that the whole
     // string gets uppercased)
3865 for (i = 0; i < len; ++i) {
3866 s2[i] = unicodeToUpper(c: s2[i]);
3867 if (ignoreDiacritics && !isAscii7(uchar: s2[i])) {
3868 ignoreDiacritics = false;
3869 }
3870 }
3871 } else if (ignoreDiacritics) {
3872 for (i = 0; i < len; ++i) {
3873 if (!isAscii7(uchar: s2[i])) {
3874 ignoreDiacritics = false;
3875 break;
3876 }
3877 }
3878 }
3879
     // txt is the comparison text for the current line: a private buffer of
     // txtSize elements (grown on demand) when folding case, otherwise an
     // alias of the line's own normalized / ascii text
3880 txt = nullptr;
3881 txtSize = 0;
3882
     // determine the start and stop positions; they stay at 0 when the
     // corresponding limit is not in effect
3883 xStart = yStart = xStop = yStop = 0;
3884 if (startAtLast && haveLastFind) {
3885 xStart = lastFindXMin;
3886 yStart = lastFindYMin;
3887 } else if (!startAtTop) {
3888 xStart = *xMin;
3889 yStart = *yMin;
3890 }
3891 if (stopAtLast && haveLastFind) {
3892 xStop = lastFindXMin;
3893 yStop = lastFindYMin;
3894 } else if (!stopAtBottom) {
3895 xStop = *xMax;
3896 yStop = *yMax;
3897 }
3898
     // (xMin0..yMax0) is the best match so far, (xMin1..yMax1) the candidate
     // currently being examined
3899 found = false;
3900 xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
3901 xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
3902
     // visit the blocks in array order, or in reverse for a backward search
3903 for (i = backward ? nBlocks - 1 : 0; backward ? i >= 0 : i < nBlocks; i += backward ? -1 : 1) {
3904 blk = blocks[i];
3905
3906 // check: is the block above the top limit?
3907 // (this only works if the page's primary rotation is zero --
3908 // otherwise the blocks won't be sorted in the useful order)
3909 if (!startAtTop && primaryRot == 0 && (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
3910 continue;
3911 }
3912
3913 // check: is the block below the bottom limit?
3914 // (this only works if the page's primary rotation is zero --
3915 // otherwise the blocks won't be sorted in the useful order)
3916 if (!stopAtBottom && primaryRot == 0 && (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
3917 break;
3918 }
3919
3920 for (line = blk->lines; line; line = line->next) {
3921
3922 // check: is the line above the top limit?
3923 // (this only works if the page's primary rotation is zero --
3924 // otherwise the lines won't be sorted in the useful order)
3925 if (!startAtTop && primaryRot == 0 && (backward ? line->yMin > yStart : line->yMin < yStart)) {
3926 continue;
3927 }
3928
3929 // check: is the line below the bottom limit?
3930 // (this only works if the page's primary rotation is zero --
3931 // otherwise the lines won't be sorted in the useful order)
3932 if (!stopAtBottom && primaryRot == 0 && (backward ? line->yMin < yStop : line->yMin > yStop)) {
3933 continue;
3934 }
3935
     // the normalized form of a line is computed on first use and kept on
     // the line object
3936 if (!line->normalized) {
3937 line->normalized = unicodeNormalizeNFKC(in: line->text, len: line->len, out_len: &line->normalized_len, indices: &line->normalized_idx, reverseRTL: true);
3938 }
3939
     // find the line that follows this one: the next line of the same block,
     // or else the first line of the adjacent block in iteration order
3940 nextline = nullptr;
3941 nextline_txt = nullptr;
3942 nextline_len = 0;
3943 if (line->next) {
3944 nextline = line->next;
3945 } else {
3946 // set nextline to first line of next block
3947 int ind = i + (backward ? -1 : 1);
3948 if ((backward && ind >= 0) || (!backward && ind < nBlocks)) {
3949 nextline = blocks[ind]->lines;
3950 }
3951 }
3952
3953 if (matchAcrossLines && nextline && !nextline->normalized) {
3954 nextline->normalized = unicodeNormalizeNFKC(in: nextline->text, len: nextline->len, out_len: &nextline->normalized_len, indices: &nextline->normalized_idx, reverseRTL: true);
3955 }
3956
3957 // convert the line to uppercase
     // m is the number of characters of this line that will be searched
3958 m = line->normalized_len;
3959
3960 if (ignoreDiacritics) {
3961 if (!line->ascii_translation) {
3962 unicodeToAscii7(in: line->normalized, len: line->normalized_len, ucs4_out: &line->ascii_translation, out_len: &line->ascii_len, in_idx: line->normalized_idx, indices: &line->ascii_idx);
3963 }
3964 if (line->ascii_len) {
3965 m = line->ascii_len;
3966 } else {
     // no ascii translation for this line: fall back to the normalized text;
     // the flag stays cleared for all lines visited afterwards
3967 ignoreDiacritics = false;
3968 }
3969
3970 if (matchAcrossLines && nextline && !nextline->ascii_translation) {
3971 unicodeToAscii7(in: nextline->normalized, len: nextline->normalized_len, ucs4_out: &nextline->ascii_translation, out_len: &nextline->ascii_len, in_idx: nextline->normalized_idx, indices: &nextline->ascii_idx);
3972 }
3973 }
3974 if (!caseSensitive) {
     // case-insensitive: compare against uppercased copies of the line and,
     // if needed, of the next line
3975 if (m > txtSize) {
3976 txt = (Unicode *)greallocn(p: txt, count: m, size: sizeof(Unicode));
3977 txtSize = m;
3978 }
3979 for (k = 0; k < m; ++k) {
3980 if (ignoreDiacritics) {
3981 txt[k] = unicodeToUpper(c: line->ascii_translation[k]);
3982 } else {
3983 txt[k] = unicodeToUpper(c: line->normalized[k]);
3984 }
3985 }
3986 if (matchAcrossLines && nextline) {
3987 nextline_len = ignoreDiacritics ? nextline->ascii_len : nextline->normalized_len;
     // allocated per line; released at the bottom of this loop iteration
3988 nextline_txt = (Unicode *)gmallocn(count: nextline_len, size: sizeof(Unicode));
3989 for (k = 0; k < nextline_len; ++k) {
3990 nextline_txt[k] = ignoreDiacritics ? unicodeToUpper(c: nextline->ascii_translation[k]) : unicodeToUpper(c: nextline->normalized[k]);
3991 }
3992 }
3993 } else {
     // case-sensitive: compare directly against the lines' own buffers
3994 if (ignoreDiacritics) {
3995 txt = line->ascii_translation;
3996 } else {
3997 txt = line->normalized;
3998 }
3999
4000 if (matchAcrossLines && nextline) {
4001 nextline_len = ignoreDiacritics ? nextline->ascii_len : nextline->normalized_len;
4002 nextline_txt = ignoreDiacritics ? nextline->ascii_translation : nextline->normalized;
4003 }
4004 }
4005
4006 // search each position in this line
     // j is the candidate start offset and p points at txt[j].  Without a
     // next line the candidate must fit entirely in this line (j <= m - len);
     // with a next line it may start at any character of this line.
4007 j = backward ? m - len : 0;
4008 p = txt + j;
4009 while (backward ? j >= 0 : j <= m - (nextline_txt ? 1 : len)) {
4010 bool wholeWordStartIsOk, wholeWordEndIsOk;
4011 if (wholeWord) {
4012 wholeWordStartIsOk = j == 0 || !unicodeTypeAlphaNum(c: txt[j - 1]);
4013 if (nextline_txt) {
4014 wholeWordEndIsOk = true; // word end may be in next line, so we'll check it later
4015 } else {
4016 wholeWordEndIsOk = j + len == m || !unicodeTypeAlphaNum(c: txt[j + len]);
4017 }
4018 }
4019 if (!wholeWord || (wholeWordStartIsOk && wholeWordEndIsOk)) {
     // n counts the characters of the next line consumed by the match
4020 int n = 0;
4021 bool spaceConsumedByNewline = false;
4022 bool found_it;
4023
4024 // compare the strings
4025 for (k = 0; k < len; ++k) {
4026 bool last_char_of_line = j + k == m - 1;
4027 bool last_char_of_search_term = k == len - 1;
4028 bool match_started = (bool)k;
4029
     // leave the plain comparison on a mismatch, or when the last character
     // of this line is reached while search characters remain and a next
     // line is available
4030 if (p[k] != s2[k] || (nextline_txt && last_char_of_line && !last_char_of_search_term)) {
4031 // now check if the comparison failed at the end-of-line hyphen,
4032 // and if so, keep on comparing at the next line
4033 nextlineAfterHyphen = false;
4034
4035 if (s2[k] == p[k]) {
     // the characters are equal, so the line ran out: carry on at the next
     // line only if this line ends in '-' or the next search character is
     // whitespace, then step past the matched character
4036 if (p[k] != (Unicode)'-' && !UnicodeIsWhitespace(ucs4: s2[k + 1])) {
4037 break;
4038 }
4039 k++;
4040 } else if (!match_started || p[k] != (Unicode)'-' || !last_char_of_line || UnicodeIsWhitespace(ucs4: s2[k])) {
     // genuine mismatch
4041 break;
4042 } else {
     // mismatch on a '-' that ends the line, after the match has started and
     // with a non-whitespace search character: skip the hyphen (k is not
     // advanced) and compare s2[k] with the start of the next line
4043 nextlineAfterHyphen = true;
4044 }
4045
     // compare the rest of the search string with the start of the next line.
     // A whitespace search character that does not match the first character
     // of the next line is absorbed once by the line break: n is set to -1 so
     // that the loop increment brings it back to 0 while k moves on.
4046 for (; n < nextline_len && k < len; ++k, ++n) {
4047 if (nextline_txt[n] != s2[k]) {
4048 if (!spaceConsumedByNewline && !n && UnicodeIsWhitespace(ucs4: s2[k])) {
4049 n = -1;
4050 spaceConsumedByNewline = true;
4051 continue;
4052 }
4053 break;
4054 }
4055 }
     // whatever the outcome, the comparison for this start offset is over;
     // k == len below tells whether everything matched
4056 break;
4057 }
4058 }
4059
4060 found_it = k == len;
4061 if (found_it && nextline_txt && wholeWord) { // check word end for nextline case
4062 if (n) { // Match ended at next line
4063 wholeWordEndIsOk = n == nextline_len || !unicodeTypeAlphaNum(c: nextline_txt[n]);
4064 } else { // Match ended on same line
4065 wholeWordEndIsOk = j + len == m || !unicodeTypeAlphaNum(c: txt[j + len]);
4066 }
4067
4068 if (!wholeWordEndIsOk) {
4069 found_it = false;
4070 }
4071 }
4072 // found it
4073 if (found_it) {
4074 bool nextLineMatch = (bool)n;
     // the search character absorbed by the line break does not correspond
     // to a character of either line, so take it out of k before computing
     // the end offset j + k - n in this line
4075 if (spaceConsumedByNewline) {
4076 k--;
4077 }
4078 // where s2 matches a subsequence of a compatibility equivalence
4079 // decomposition, highlight the entire glyph, since we don't know
4080 // the internal layout of subglyph components
     // normStart / normAfterEnd are taken from the line's index arrays and
     // passed to adjustRotation as edge indices (presumably indices into the
     // line's original text -- the arrays are filled in elsewhere)
4081 int normStart, normAfterEnd;
4082 if (ignoreDiacritics) {
4083 normStart = line->ascii_idx[j];
4084 if (nextline_txt) {
4085 normAfterEnd = line->ascii_idx[j + k - n];
4086 } else {
4087 normAfterEnd = line->ascii_idx[j + len - 1] + 1;
4088 }
4089 } else {
4090 normStart = line->normalized_idx[j];
4091 if (nextline_txt) {
4092 normAfterEnd = line->normalized_idx[j + k - n];
4093 } else {
4094 normAfterEnd = line->normalized_idx[j + len - 1] + 1;
4095 }
4096 }
4097
4098 adjustRotation(line, start: normStart, end: normAfterEnd, xMin: &xMin1, xMax: &xMax1, yMin: &yMin1, yMax: &yMax1);
4099
     // accept the candidate only if it lies strictly between the start and
     // stop positions (ordered by yMin, then xMin), and keep it only if it
     // is better than the best match so far: the greatest position for a
     // backward search, the smallest for a forward search
4100 if (backward) {
4101 if ((startAtTop || yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) && (stopAtBottom || yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) {
4102 if (!found || yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) {
4103 xMin0 = xMin1;
4104 xMax0 = xMax1;
4105 yMin0 = yMin1;
4106 yMax0 = yMax1;
4107 found = true;
4108 }
4109 }
4110 } else {
4111 if ((startAtTop || yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) && (stopAtBottom || yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) {
4112 if (!found || yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
4113 xMin0 = xMin1;
4114 xMax0 = xMax1;
4115 yMin0 = yMin1;
4116 yMax0 = yMax1;
4117 found = true;
4118 if (nextLineMatch) { // set the out parameters
4119 if (ignoredHyphen) {
4120 *ignoredHyphen = nextlineAfterHyphen;
4121 }
4122
4123 if (continueMatch) {
     // rectangle of the first n characters of the next line; note that y1
     // receives the maximum and y2 the minimum
4124 adjustRotation(line: nextline, start: 0, end: n, xMin: &xMin2, xMax: &xMax2, yMin: &yMin2, yMax: &yMax2);
4125 continueMatch->x1 = xMin2;
4126 continueMatch->y1 = yMax2;
4127 continueMatch->x2 = xMax2;
4128 continueMatch->y2 = yMin2;
4129 }
4130 } else if (continueMatch && continueMatch->x1 != std::numeric_limits<double>::max()) {
     // the new best match lies within one line: discard continuation data
     // recorded for a previous candidate, using x1 == max as the marker
4131 if (ignoredHyphen) {
4132 *ignoredHyphen = false;
4133 }
4134
4135 continueMatch->x1 = std::numeric_limits<double>::max();
4136 }
4137 }
4138 }
4139 }
4140 }
4141 }
     // move on to the next start offset, keeping p in step with j
4142 if (backward) {
4143 --j;
4144 --p;
4145 } else {
4146 ++j;
4147 ++p;
4148 }
4149 }
4150
     // free the uppercase copy made for this line; in the case-sensitive
     // path nextline_txt aliases one of the next line's own buffers and must
     // not be freed
4151 if (nextline_txt && nextline_txt != nextline->ascii_translation && nextline_txt != nextline->normalized) {
4152 gfree(p: nextline_txt);
4153 }
4154 }
4155 }
4156
4157 gfree(p: s2);
4158 gfree(p: reordered);
     // txt is owned by this function only when it was allocated for case
     // folding; otherwise it points into a line object
4159 if (!caseSensitive) {
4160 gfree(p: txt);
4161 }
4162
     // report the selected match and remember its position for later
     // startAtLast / stopAtLast searches
4163 if (found) {
4164 *xMin = xMin0;
4165 *xMax = xMax0;
4166 *yMin = yMin0;
4167 *yMax = yMax0;
4168 lastFindXMin = xMin0;
4169 lastFindYMin = yMin0;
4170 haveLastFind = true;
4171 return true;
4172 }
4173
4174 return false;
4175}
4176
4177GooString *TextPage::getText(double xMin, double yMin, double xMax, double yMax, EndOfLineKind textEOL) const
4178{
4179 GooString *s;
4180 const UnicodeMap *uMap;
4181 TextBlock *blk;
4182 TextLine *line;
4183 TextLineFrag *frags;
4184 int nFrags, fragsSize;
4185 TextLineFrag *frag;
4186 char space[8], eol[16];
4187 int spaceLen, eolLen;
4188 int lastRot;
4189 double x, y, delta;
4190 int col, idx0, idx1, i, j;
4191 bool multiLine, oneRot;
4192
4193 s = new GooString();
4194
4195 // get the output encoding
4196 if (!(uMap = globalParams->getTextEncoding())) {
4197 return s;
4198 }
4199
4200 if (rawOrder) {
4201 TextWord *word;
4202 char mbc[16];
4203 int mbc_len;
4204
4205 for (word = rawWords; word && word <= rawLastWord; word = word->next) {
4206 for (j = 0; j < word->getLength(); ++j) {
4207 double gXMin, gXMax, gYMin, gYMax;
4208 word->getCharBBox(charIdx: j, xMinA: &gXMin, yMinA: &gYMin, xMaxA: &gXMax, yMaxA: &gYMax);
4209 if (xMin <= gXMin && gXMax <= xMax && yMin <= gYMin && gYMax <= yMax) {
4210 mbc_len = uMap->mapUnicode(u: *(word->getChar(idx: j)), buf: mbc, bufSize: sizeof(mbc));
4211 s->append(str: mbc, lengthA: mbc_len);
4212 }
4213 }
4214 }
4215 return s;
4216 }
4217
4218 spaceLen = uMap->mapUnicode(u: 0x20, buf: space, bufSize: sizeof(space));
4219 eolLen = 0; // make gcc happy
4220 switch (textEOL) {
4221 case eolUnix:
4222 eolLen = uMap->mapUnicode(u: 0x0a, buf: eol, bufSize: sizeof(eol));
4223 break;
4224 case eolDOS:
4225 eolLen = uMap->mapUnicode(u: 0x0d, buf: eol, bufSize: sizeof(eol));
4226 eolLen += uMap->mapUnicode(u: 0x0a, buf: eol + eolLen, bufSize: sizeof(eol) - eolLen);
4227 break;
4228 case eolMac:
4229 eolLen = uMap->mapUnicode(u: 0x0d, buf: eol, bufSize: sizeof(eol));
4230 break;
4231 }
4232
4233 //~ writing mode (horiz/vert)
4234
4235 // collect the line fragments that are in the rectangle
4236 fragsSize = 256;
4237 frags = (TextLineFrag *)gmallocn(count: fragsSize, size: sizeof(TextLineFrag));
4238 nFrags = 0;
4239 lastRot = -1;
4240 oneRot = true;
4241 for (i = 0; i < nBlocks; ++i) {
4242 blk = blocks[i];
4243 if (xMin < blk->xMax && blk->xMin < xMax && yMin < blk->yMax && blk->yMin < yMax) {
4244 for (line = blk->lines; line; line = line->next) {
4245 if (xMin < line->xMax && line->xMin < xMax && yMin < line->yMax && line->yMin < yMax) {
4246 idx0 = idx1 = -1;
4247 switch (line->rot) {
4248 case 0:
4249 y = 0.5 * (line->yMin + line->yMax);
4250 if (yMin < y && y < yMax) {
4251 j = 0;
4252 while (j < line->len) {
4253 if (0.5 * (line->edge[j] + line->edge[j + 1]) > xMin) {
4254 idx0 = j;
4255 break;
4256 }
4257 ++j;
4258 }
4259 j = line->len - 1;
4260 while (j >= 0) {
4261 if (0.5 * (line->edge[j] + line->edge[j + 1]) < xMax) {
4262 idx1 = j;
4263 break;
4264 }
4265 --j;
4266 }
4267 }
4268 break;
4269 case 1:
4270 x = 0.5 * (line->xMin + line->xMax);
4271 if (xMin < x && x < xMax) {
4272 j = 0;
4273 while (j < line->len) {
4274 if (0.5 * (line->edge[j] + line->edge[j + 1]) > yMin) {
4275 idx0 = j;
4276 break;
4277 }
4278 ++j;
4279 }
4280 j = line->len - 1;
4281 while (j >= 0) {
4282 if (0.5 * (line->edge[j] + line->edge[j + 1]) < yMax) {
4283 idx1 = j;
4284 break;
4285 }
4286 --j;
4287 }
4288 }
4289 break;
4290 case 2:
4291 y = 0.5 * (line->yMin + line->yMax);
4292 if (yMin < y && y < yMax) {
4293 j = 0;
4294 while (j < line->len) {
4295 if (0.5 * (line->edge[j] + line->edge[j + 1]) < xMax) {
4296 idx0 = j;
4297 break;
4298 }
4299 ++j;
4300 }
4301 j = line->len - 1;
4302 while (j >= 0) {
4303 if (0.5 * (line->edge[j] + line->edge[j + 1]) > xMin) {
4304 idx1 = j;
4305 break;
4306 }
4307 --j;
4308 }
4309 }
4310 break;
4311 case 3:
4312 x = 0.5 * (line->xMin + line->xMax);
4313 if (xMin < x && x < xMax) {
4314 j = 0;
4315 while (j < line->len) {
4316 if (0.5 * (line->edge[j] + line->edge[j + 1]) < yMax) {
4317 idx0 = j;
4318 break;
4319 }
4320 ++j;
4321 }
4322 j = line->len - 1;
4323 while (j >= 0) {
4324 if (0.5 * (line->edge[j] + line->edge[j + 1]) > yMin) {
4325 idx1 = j;
4326 break;
4327 }
4328 --j;
4329 }
4330 }
4331 break;
4332 }
4333 if (idx0 >= 0 && idx1 >= 0) {
4334 if (nFrags == fragsSize) {
4335 fragsSize *= 2;
4336 frags = (TextLineFrag *)greallocn(p: frags, count: fragsSize, size: sizeof(TextLineFrag));
4337 }
4338 frags[nFrags].init(lineA: line, startA: idx0, lenA: idx1 - idx0 + 1);
4339 ++nFrags;
4340 if (lastRot >= 0 && line->rot != lastRot) {
4341 oneRot = false;
4342 }
4343 lastRot = line->rot;
4344 }
4345 }
4346 }
4347 }
4348 }
4349
4350 // sort the fragments and generate the string
4351 if (nFrags > 0) {
4352
4353 for (i = 0; i < nFrags; ++i) {
4354 frags[i].computeCoords(oneRot);
4355 }
4356 assignColumns(frags, nFrags, rot: oneRot);
4357
4358 // if all lines in the region have the same rotation, use it;
4359 // otherwise, use the page's primary rotation
4360 if (oneRot) {
4361 qsort(base: frags, nmemb: nFrags, size: sizeof(TextLineFrag), compar: &TextLineFrag::cmpYXLineRot);
4362 } else {
4363 qsort(base: frags, nmemb: nFrags, size: sizeof(TextLineFrag), compar: &TextLineFrag::cmpYXPrimaryRot);
4364 }
4365 i = 0;
4366 while (i < nFrags) {
4367 delta = maxIntraLineDelta * frags[i].line->words->fontSize;
4368 for (j = i + 1; j < nFrags && fabs(x: frags[j].base - frags[i].base) < delta; ++j) {
4369 ;
4370 }
4371 qsort(base: frags + i, nmemb: j - i, size: sizeof(TextLineFrag), compar: oneRot ? &TextLineFrag::cmpXYColumnLineRot : &TextLineFrag::cmpXYColumnPrimaryRot);
4372 i = j;
4373 }
4374
4375 col = 0;
4376 multiLine = false;
4377 for (i = 0; i < nFrags; ++i) {
4378 frag = &frags[i];
4379
4380 // insert a return
4381 if (frag->col < col || (i > 0 && fabs(x: frag->base - frags[i - 1].base) > maxIntraLineDelta * frags[i - 1].line->words->fontSize)) {
4382 s->append(str: eol, lengthA: eolLen);
4383 col = 0;
4384 multiLine = true;
4385 }
4386
4387 // column alignment
4388 for (; col < frag->col; ++col) {
4389 s->append(str: space, lengthA: spaceLen);
4390 }
4391
4392 // get the fragment text
4393 col += dumpFragment(text: frag->line->text + frag->start, len: frag->len, uMap, s);
4394 }
4395
4396 if (multiLine) {
4397 s->append(str: eol, lengthA: eolLen);
4398 }
4399 }
4400
4401 gfree(p: frags);
4402
4403 return s;
4404}
4405
// Abstract callback interface used while walking a selection.
// TextBlock::visitSelection() reports through visitBlock(),
// TextLine::visitSelection() through visitLine() and
// TextWord::visitSelection() through visitWord().
class TextSelectionVisitor
{
public:
    explicit TextSelectionVisitor(TextPage *page);
    virtual ~TextSelectionVisitor();
    // visitors cannot be copied
    TextSelectionVisitor(const TextSelectionVisitor &) = delete;
    TextSelectionVisitor &operator=(const TextSelectionVisitor &) = delete;
    // called with the block and the two lines chosen for the selection
    // end points
    virtual void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) = 0;
    // called with the line, the word range [begin, end) and the selected
    // range of line->edge[] indices [edge_begin, edge_end]
    virtual void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) = 0;
    // called with the word and the selected character range [begin, end)
    virtual void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) = 0;

protected:
    // the page handed to the constructor
    TextPage *page;
};
4420
4421TextSelectionVisitor::TextSelectionVisitor(TextPage *p) : page(p) { }
4422
// Out-of-line definition of the virtual destructor; nothing to release
// here (the visitor does not delete `page`).
TextSelectionVisitor::~TextSelectionVisitor() = default;
4424
// Visitor that collects the selected words, grouped into lines, and can
// either render them as text (getText) or hand the collected word lists
// to the caller (takeWordList).
class TextSelectionDumper : public TextSelectionVisitor
{
public:
    explicit TextSelectionDumper(TextPage *page);
    ~TextSelectionDumper() override;

    // blocks need no processing of their own
    void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) override {};
    // decides whether the visited line starts a new output line
    void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) override;
    // appends the selected part of a word to the current output line
    void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) override;
    // flushes the last pending line; call after the traversal
    void endPage();

    // returns a newly allocated string with the collected text
    GooString *getText();
    // transfers the array of collected lines to the caller and stores the
    // line count in *nLines; returns nullptr when nothing was collected
    std::vector<TextWordSelection *> **takeWordList(int *nLines);

private:
    void startLine();
    void finishLine();

    // gmalloc'ed array of finished lines; nLines entries used out of
    // linesSize allocated
    std::vector<TextWordSelection *> **lines;
    int nLines, linesSize;
    // line currently being filled, or nullptr
    std::vector<TextWordSelection *> *words;
    // tableId of the table being traversed, -1 when outside a table
    int tableId;
    // block of the most recently visited table line, nullptr outside tables
    TextBlock *currentBlock;
};
4449
4450TextSelectionDumper::TextSelectionDumper(TextPage *p) : TextSelectionVisitor(p)
4451{
4452 linesSize = 256;
4453 lines = (std::vector<TextWordSelection *> **)gmallocn(count: linesSize, size: sizeof(std::vector<TextWordSelection *> *));
4454 nLines = 0;
4455
4456 tableId = -1;
4457 currentBlock = nullptr;
4458 words = nullptr;
4459}
4460
4461TextSelectionDumper::~TextSelectionDumper()
4462{
4463 for (int i = 0; i < nLines; i++) {
4464 for (auto entry : *(lines[i])) {
4465 delete entry;
4466 }
4467 delete lines[i];
4468 }
4469 gfree(p: lines);
4470}
4471
4472void TextSelectionDumper::startLine()
4473{
4474 finishLine();
4475 words = new std::vector<TextWordSelection *>();
4476}
4477
4478void TextSelectionDumper::finishLine()
4479{
4480 if (nLines == linesSize) {
4481 linesSize *= 2;
4482 lines = (std::vector<TextWordSelection *> **)grealloc(p: lines, size: linesSize * sizeof(std::vector<TextWordSelection *> *));
4483 }
4484
4485 if (words && words->size() > 0) {
4486 // Reverse word order for RTL text. Fixes #53 for glib backend (Evince)
4487 if (!page->primaryLR) {
4488 std::reverse(first: words->begin(), last: words->end());
4489 }
4490
4491 lines[nLines++] = words;
4492 } else if (words) {
4493 delete words;
4494 }
4495 words = nullptr;
4496}
4497
4498void TextSelectionDumper::visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection)
4499{
4500 TextLineFrag frag;
4501
4502 frag.init(lineA: line, startA: edge_begin, lenA: edge_end - edge_begin);
4503
4504 if (tableId >= 0 && frag.line->blk->tableId < 0) {
4505 finishLine();
4506
4507 tableId = -1;
4508 currentBlock = nullptr;
4509 }
4510
4511 if (frag.line->blk->tableId >= 0) { // a table
4512 if (tableId == -1) {
4513 tableId = frag.line->blk->tableId;
4514 currentBlock = frag.line->blk;
4515 }
4516
4517 if (currentBlock == frag.line->blk) { // the same block
4518 startLine();
4519 } else { // another block
4520 if (currentBlock->tableEnd) { // previous block ended its row
4521 startLine();
4522 }
4523 currentBlock = frag.line->blk;
4524 }
4525 } else { // not a table
4526 startLine();
4527 }
4528}
4529
4530void TextSelectionDumper::visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection)
4531{
4532 words->push_back(x: new TextWordSelection(word, begin, end));
4533}
4534
4535void TextSelectionDumper::endPage()
4536{
4537 finishLine();
4538}
4539
4540GooString *TextSelectionDumper::getText()
4541{
4542 GooString *text;
4543 int i;
4544 const UnicodeMap *uMap;
4545 char space[8], eol[16];
4546 int spaceLen, eolLen;
4547
4548 text = new GooString();
4549
4550 if (!(uMap = globalParams->getTextEncoding())) {
4551 return text;
4552 }
4553
4554 spaceLen = uMap->mapUnicode(u: 0x20, buf: space, bufSize: sizeof(space));
4555 eolLen = uMap->mapUnicode(u: 0x0a, buf: eol, bufSize: sizeof(eol));
4556
4557 std::vector<Unicode> uText;
4558 for (i = 0; i < nLines; i++) {
4559 std::vector<TextWordSelection *> *lineWords = lines[i];
4560 for (std::size_t j = 0; j < lineWords->size(); j++) {
4561 TextWordSelection *sel = (*lineWords)[j];
4562
4563 uText.resize(new_size: sel->end - sel->begin);
4564 std::transform(first: sel->word->chars.begin() + sel->begin, last: sel->word->chars.begin() + sel->end, result: uText.begin(), unary_op: [](auto &c) { return c.text; });
4565 page->dumpFragment(text: uText.data(), len: uText.size(), uMap, s: text);
4566
4567 if (j < lineWords->size() - 1 && sel->word->spaceAfter) {
4568 text->append(str: space, lengthA: spaceLen);
4569 }
4570 }
4571 if (i < nLines - 1) {
4572 text->append(str: eol, lengthA: eolLen);
4573 }
4574 }
4575
4576 return text;
4577}
4578
4579std::vector<TextWordSelection *> **TextSelectionDumper::takeWordList(int *nLinesOut)
4580{
4581 std::vector<TextWordSelection *> **returnValue = lines;
4582
4583 *nLinesOut = nLines;
4584 if (nLines == 0) {
4585 return nullptr;
4586 }
4587
4588 nLines = 0;
4589 lines = nullptr;
4590
4591 return returnValue;
4592}
4593
// Visitor that records one scaled rectangle per selected line.
class TextSelectionSizer : public TextSelectionVisitor
{
public:
    TextSelectionSizer(TextPage *page, double scale);
    // NOTE(review): only the vector is deleted here, not the rectangles it
    // points to; they appear to be owned by whoever calls takeRegion() --
    // confirm.
    ~TextSelectionSizer() override { delete list; }

    // blocks need no processing of their own
    void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) override {};
    // appends the rectangle of the selected part of the line
    void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) override;
    // words need no processing of their own
    void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) override {};

    // transfers the rectangle list to the caller; the sizer keeps nullptr
    std::vector<PDFRectangle *> *takeRegion()
    {
        auto aux = list;
        list = nullptr;
        return aux;
    }

private:
    // rectangles collected so far (nullptr after takeRegion())
    std::vector<PDFRectangle *> *list;
    // factor applied to the line coordinates before rounding
    double scale;
};
4615
4616TextSelectionSizer::TextSelectionSizer(TextPage *p, double s) : TextSelectionVisitor(p), scale(s)
4617{
4618 list = new std::vector<PDFRectangle *>();
4619}
4620
4621void TextSelectionSizer::visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection)
4622{
4623 PDFRectangle *rect;
4624 double x1, y1, x2, y2, margin;
4625
4626 switch (line->rot) {
4627 default:
4628 case 0:
4629 margin = (line->yMax - line->yMin) / 8;
4630 x1 = line->edge[edge_begin];
4631 x2 = line->edge[edge_end];
4632 y1 = line->yMin - margin;
4633 y2 = line->yMax + margin;
4634 break;
4635 case 1:
4636 margin = (line->xMax - line->xMin) / 8;
4637 x1 = line->xMin - margin;
4638 x2 = line->xMax + margin;
4639 y1 = line->edge[edge_begin];
4640 y2 = line->edge[edge_end];
4641 break;
4642 case 2:
4643 margin = (line->yMax - line->yMin) / 8;
4644 x1 = line->edge[edge_end];
4645 x2 = line->edge[edge_begin];
4646 y1 = line->yMin - margin;
4647 y2 = line->yMax + margin;
4648 break;
4649 case 3:
4650 margin = (line->xMax - line->xMin) / 8;
4651 x1 = line->xMin - margin;
4652 x2 = line->xMax + margin;
4653 y1 = line->edge[edge_end];
4654 y2 = line->edge[edge_begin];
4655 break;
4656 }
4657
4658 rect = new PDFRectangle(floor(x: x1 * scale), floor(x: y1 * scale), ceil(x: x2 * scale), ceil(x: y2 * scale));
4659 list->push_back(x: rect);
4660}
4661
// Visitor that draws a selection on an OutputDev: the line rectangles
// are accumulated as a path in `state`, the selected words are kept in
// `selectionList`, and endPage() does the actual painting.
class TextSelectionPainter : public TextSelectionVisitor
{
public:
    TextSelectionPainter(TextPage *page, double scale, int rotation, OutputDev *out, const GfxColor *box_color, const GfxColor *glyph_color);
    ~TextSelectionPainter() override;

    // blocks need no processing of their own
    void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) override {};
    // adds the rectangle of the selected part of the line to the path
    void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) override;
    // remembers the selected part of the word for endPage()
    void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) override;
    // fills the accumulated path and redraws the selected glyphs
    void endPage();

private:
    // output device painted on (not deleted by the painter)
    OutputDev *out;
    // fill colour used for the redrawn glyphs
    const GfxColor *glyph_color;
    // graphics state created in the constructor, deleted in the destructor
    GfxState *state;
    // word selections collected by visitWord(); owned by the painter
    std::vector<TextWordSelection *> *selectionList;
    // CTM of `state` and its inverse
    Matrix ctm, ictm;
    // true if the first collected word is flagged invisible
    bool hasGlyphLessFont();
};
4681
4682TextSelectionPainter::TextSelectionPainter(TextPage *p, double scale, int rotation, OutputDev *outA, const GfxColor *box_color, const GfxColor *glyph_colorA) : TextSelectionVisitor(p), out(outA), glyph_color(glyph_colorA)
4683{
4684 PDFRectangle box(0, 0, p->pageWidth, p->pageHeight);
4685
4686 selectionList = new std::vector<TextWordSelection *>();
4687 state = new GfxState(72 * scale, 72 * scale, &box, rotation, false);
4688
4689 state->getCTM(m: &ctm);
4690 ctm.invertTo(other: &ictm);
4691
4692 out->startPage(pageNum: 0, state, xref: nullptr);
4693 out->setDefaultCTM(state->getCTM());
4694
4695 state->setFillColorSpace(new GfxDeviceRGBColorSpace());
4696 state->setFillColor(box_color);
4697 out->updateFillColor(state);
4698}
4699
4700TextSelectionPainter::~TextSelectionPainter()
4701{
4702 for (auto entry : *selectionList) {
4703 delete entry;
4704 }
4705 delete selectionList;
4706 delete state;
4707}
4708
4709void TextSelectionPainter::visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection)
4710{
4711 double x1, y1, x2, y2, margin;
4712
4713 switch (line->rot) {
4714 default:
4715 case 0:
4716 margin = (line->yMax - line->yMin) / 8;
4717 x1 = line->edge[edge_begin];
4718 x2 = line->edge[edge_end];
4719 y1 = line->yMin - margin;
4720 y2 = line->yMax + margin;
4721 break;
4722 case 1:
4723 margin = (line->xMax - line->xMin) / 8;
4724 x1 = line->xMin - margin;
4725 x2 = line->xMax + margin;
4726 y1 = line->edge[edge_begin];
4727 y2 = line->edge[edge_end];
4728 break;
4729 case 2:
4730 margin = (line->yMax - line->yMin) / 8;
4731 x1 = line->edge[edge_end];
4732 x2 = line->edge[edge_begin];
4733 y1 = line->yMin - margin;
4734 y2 = line->yMax + margin;
4735 break;
4736 case 3:
4737 margin = (line->xMax - line->xMin) / 8;
4738 x1 = line->xMin - margin;
4739 x2 = line->xMax + margin;
4740 y1 = line->edge[edge_end];
4741 y2 = line->edge[edge_begin];
4742 break;
4743 }
4744
4745 ctm.transform(x: x1, y: y1, tx: &x1, ty: &y1);
4746 ctm.transform(x: x2, y: y2, tx: &x2, ty: &y2);
4747
4748 if (x1 < x2) {
4749 x1 = floor(x: x1);
4750 x2 = ceil(x: x2);
4751 } else {
4752 x1 = ceil(x: x1);
4753 x2 = floor(x: x2);
4754 }
4755
4756 if (y1 < y2) {
4757 y1 = floor(x: y1);
4758 y2 = ceil(x: y2);
4759 } else {
4760 y1 = ceil(x: y1);
4761 y2 = floor(x: y2);
4762 }
4763
4764 ictm.transform(x: x1, y: y1, tx: &x1, ty: &y1);
4765 ictm.transform(x: x2, y: y2, tx: &x2, ty: &y2);
4766
4767 state->moveTo(x: x1, y: y1);
4768 state->lineTo(x: x2, y: y1);
4769 state->lineTo(x: x2, y: y2);
4770 state->lineTo(x: x1, y: y2);
4771 state->closePath();
4772}
4773
4774void TextSelectionPainter::visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection)
4775{
4776 selectionList->push_back(x: new TextWordSelection(word, begin, end));
4777}
4778
4779bool TextSelectionPainter::hasGlyphLessFont()
4780{
4781 if (selectionList && selectionList->size()) {
4782 TextWordSelection *sel = (*selectionList)[0];
4783 return sel->word->invisible;
4784 }
4785
4786 return false;
4787}
4788
4789void TextSelectionPainter::endPage()
4790{
4791 /* Take a shortcut for glyphless fonts (eg. Tesseract scanned documents)
4792 * cause we just paint a transparent fill over existent text.Issue #157 */
4793 if (hasGlyphLessFont()) {
4794 state->setFillOpacity(glyphlessSelectionOpacity);
4795 out->updateFillOpacity(state);
4796 out->fill(state);
4797 out->endPage();
4798 return;
4799 }
4800
4801 out->fill(state);
4802
4803 out->saveState(state);
4804 out->clip(state);
4805
4806 state->clearPath();
4807
4808 state->setFillColor(glyph_color);
4809
4810 out->updateFillColor(state);
4811
4812 GooString string;
4813 for (const TextWordSelection *sel : *selectionList) {
4814 int begin = sel->begin;
4815
4816 while (begin < sel->end) {
4817 TextFontInfo *font = sel->word->chars[begin].font;
4818 const Matrix *mat = &sel->word->chars[begin].textMat;
4819
4820 state->setTextMat(a: mat->m[0], b: mat->m[1], c: mat->m[2], d: mat->m[3], e: 0, f: 0);
4821 state->setFont(fontA: font->gfxFont, fontSizeA: 1);
4822 out->updateFont(state);
4823
4824 int fEnd = begin + 1;
4825 while (fEnd < sel->end && font->matches(fontInfo: sel->word->chars[fEnd].font) //
4826 && mat->m[0] == sel->word->chars[fEnd].textMat.m[0] && mat->m[1] == sel->word->chars[fEnd].textMat.m[1] //
4827 && mat->m[2] == sel->word->chars[fEnd].textMat.m[2] && mat->m[3] == sel->word->chars[fEnd].textMat.m[3]) {
4828 fEnd++;
4829 }
4830
4831 /* The only purpose of this string is to let the output device query
4832 * it's length. Might want to change this interface later. */
4833 string.clear();
4834 std::for_each(first: sel->word->chars.begin() + begin, last: sel->word->chars.begin() + fEnd, f: [&string](const auto c) { string.append(c.charcode); });
4835 out->beginString(state, &string);
4836
4837 for (int j = begin; j < fEnd; j++) {
4838 const auto &charJ = sel->word->chars[j];
4839 if (j != begin && charJ.charPos == sel->word->chars[j - 1].charPos) {
4840 continue;
4841 }
4842 out->drawChar(state, charJ.textMat.m[4], charJ.textMat.m[5], 0, 0, 0, 0, charJ.charcode, 1, nullptr, 0);
4843 }
4844 out->endString(state);
4845 begin = fEnd;
4846 }
4847 }
4848
4849 out->restoreState(state);
4850 out->endPage();
4851}
4852
4853void TextWord::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style)
4854{
4855 double mid, s1, s2;
4856
4857 if (rot == 0 || rot == 2) {
4858 s1 = selection->x1;
4859 s2 = selection->x2;
4860 } else {
4861 s1 = selection->y1;
4862 s2 = selection->y2;
4863 }
4864
4865 size_t begin = len();
4866 size_t end = 0;
4867 for (size_t i = 0; i < len(); i++) {
4868 if (i + 1 < len()) {
4869 mid = (chars[i].edge + chars[i + 1].edge) / 2;
4870 } else {
4871 mid = (chars[i].edge + edgeEnd) / 2;
4872 }
4873 if (XBetweenAB(mid, s1, s2)) {
4874 if (i < begin) {
4875 begin = i;
4876 }
4877
4878 end = i + 1;
4879 }
4880 }
4881
4882 /* Skip empty selection. */
4883 if (end <= begin) {
4884 return;
4885 }
4886
4887 visitor->visitWord(word: this, begin, end, selection);
4888}
4889
// Work out which words and which character edges of this line are
// covered by `selection`, report the line through visitor->visitLine()
// and then let every word in [begin, end) visit the (possibly widened)
// selection. Nothing is reported when no character midpoint is covered.
void TextLine::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style)
{
    TextWord *p, *begin, *end, *current;
    int i, edge_begin, edge_end;
    PDFRectangle child_selection;
    double s1, s2, pMin, pMax;

    // selection coordinates along the text axis (x for rot 0/2, else y)
    if (rot == 0 || rot == 2) {
        s1 = selection->x1;
        s2 = selection->x2;
    } else {
        s1 = selection->y1;
        s2 = selection->y2;
    }

    // begin: first word touched by the selection; end: word following the
    // last touched word (may be nullptr); current: last touched word
    begin = nullptr;
    end = nullptr;
    current = nullptr;
    for (p = words; p != nullptr; p = p->next) {
        if (rot == 0 || rot == 2) {
            pMin = p->xMin;
            pMax = p->xMax;
        } else {
            pMin = p->yMin;
            pMax = p->yMax;
        }

        // the comparisons are mirrored for pages whose primary direction
        // is not left-to-right
        if (blk->page->primaryLR) {
            if (((s1 < pMax) || (s2 < pMax)) && begin == nullptr) {
                begin = p;
            }

            if (((s1 > pMin) || (s2 > pMin)) && begin != nullptr) {
                end = p->next;
                current = p;
            }
        } else {
            if (((s1 > pMin) || (s2 > pMin)) && begin == nullptr) {
                begin = p;
            }

            if (((s1 < pMax) || (s2 < pMax)) && begin != nullptr) {
                end = p->next;
                current = p;
            }
        }
    }

    if (!current) {
        current = begin;
    }

    // for word-wise selection, widen the selection handed to the children
    // so that it spans the touched words completely
    child_selection = *selection;
    if (style == selectionStyleWord) {
        if (rot == 0 || rot == 2) {
            child_selection.x1 = begin ? begin->xMin : xMin;
            if (end && end->xMax != -1) {
                child_selection.x2 = current->xMax;
            } else {
                child_selection.x2 = xMax;
            }
        } else {
            child_selection.y1 = begin ? begin->yMin : yMin;
            if (end && end->yMax != -1) {
                child_selection.y2 = current->yMax;
            } else {
                child_selection.y2 = yMax;
            }
        }
    }

    // re-read the (possibly widened) coordinates along the text axis
    if (rot == 0 || rot == 2) {
        s1 = child_selection.x1;
        s2 = child_selection.x2;
    } else {
        s1 = child_selection.y1;
        s2 = child_selection.y2;
    }

    // a character is selected when the midpoint of its two edges lies
    // between s1 and s2; [edge_begin, edge_end] is the covered edge range
    edge_begin = len;
    edge_end = 0;
    for (i = 0; i < len; i++) {
        double mid = (edge[i] + edge[i + 1]) / 2;
        if (XBetweenAB(mid, s1, s2)) {
            if (i < edge_begin) {
                edge_begin = i;
            }

            edge_end = i + 1;
        }
    }

    /* Skip empty selection. */
    if (edge_end <= edge_begin) {
        return;
    }

    visitor->visitLine(line: this, begin, end, edge_begin, edge_end, selection: &child_selection);

    // descend into the touched words
    for (p = begin; p != end; p = p->next) {
        p->visitSelection(visitor, selection: &child_selection, style);
    }
}
4993
// Find the lines of this block nearest to the two selection end points,
// report the block through visitor->visitBlock() and let every line from
// the first to the last of them visit its share of the selection.
void TextBlock::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style)
{
    PDFRectangle child_selection;
    double x[2], y[2], d, best_d[2];
    TextLine *p, *best_line[2];
    int i, count = 0, best_count[2], start, stop;
    bool all[2];

    // index 0 is the (x1, y1) end point, index 1 the (x2, y2) end point
    x[0] = selection->x1;
    y[0] = selection->y1;
    x[1] = selection->x2;
    y[1] = selection->y2;

    for (i = 0; i < 2; i++) {
        // the first/last lines are often not nearest
        // the corners, so we have to force them to be
        // selected when the selection runs outside this
        // block.
        if (page->primaryLR) {
            // point beyond the bottom-right corner: take the last line
            all[i] = x[i] >= this->xMax && y[i] >= this->yMax;
            // point before the top-left corner: take the first line
            if (x[i] <= this->xMin && y[i] <= this->yMin) {
                best_line[i] = this->lines;
                best_count[i] = 1;
            } else {
                best_line[i] = nullptr;
                best_count[i] = 0;
            }
        } else {
            // same as above with left and right swapped
            all[i] = x[i] <= this->xMin && y[i] >= this->yMax;
            if (x[i] >= this->xMax && y[i] <= this->yMin) {
                best_line[i] = this->lines;
                best_count[i] = 1;
            } else {
                best_line[i] = nullptr;
                best_count[i] = 0;
            }
        }
        best_d[i] = 0;
    }

    // find the nearest line to the selection points
    // using the manhattan distance.
    for (p = this->lines; p; p = p->next) {
        count++;
        for (i = 0; i < 2; i++) {
            d = fmax(x: p->xMin - x[i], y: 0.0) + fmax(x: x[i] - p->xMax, y: 0.0) + fmax(x: p->yMin - y[i], y: 0.0) + fmax(x: y[i] - p->yMax, y: 0.0);
            // with all[i] set every line replaces the previous one, so the
            // last line wins; a line forced above has best_d 0 and is
            // never replaced by the distance test
            if (!best_line[i] || all[i] || d < best_d[i]) {
                best_line[i] = p;
                best_count[i] = count;
                best_d[i] = d;
            }
        }
    }
    // assert: best is always set.
    if (!best_line[0] || !best_line[1]) {
        return;
    }

    // Now decide which point was first.
    if (best_count[0] < best_count[1] || (best_count[0] == best_count[1] && y[0] < y[1])) {
        start = 0;
        stop = 1;
    } else {
        start = 1;
        stop = 0;
    }

    visitor->visitBlock(block: this, begin: best_line[start], end: best_line[stop], selection);

    for (p = best_line[start]; p; p = p->next) {
        // lines in between are selected completely
        if (page->primaryLR) {
            child_selection.x1 = p->xMin;
            child_selection.x2 = p->xMax;
        } else {
            child_selection.x1 = p->xMax;
            child_selection.x2 = p->xMin;
        }
        child_selection.y1 = p->yMin;
        child_selection.y2 = p->yMax;
        if (style == selectionStyleLine) {
            // line-wise selection: stretch the end lines to the page
            // origin / page size
            if (p == best_line[start]) {
                child_selection.x1 = 0;
                child_selection.y1 = 0;
            }
            if (p == best_line[stop]) {
                child_selection.x2 = page->pageWidth;
                child_selection.y2 = page->pageHeight;
            }
        } else {
            // otherwise use the end points, clamped to the line's box
            if (p == best_line[start]) {
                child_selection.x1 = fmax(x: p->xMin, y: fmin(x: p->xMax, y: x[start]));
                child_selection.y1 = fmax(x: p->yMin, y: fmin(x: p->yMax, y: y[start]));
            }
            if (p == best_line[stop]) {
                child_selection.x2 = fmax(x: p->xMin, y: fmin(x: p->xMax, y: x[stop]));
                child_selection.y2 = fmax(x: p->yMin, y: fmin(x: p->yMax, y: y[stop]));
            }
        }
        p->visitSelection(visitor, selection: &child_selection, style);
        // the stop line is the last one visited
        if (p == best_line[stop]) {
            return;
        }
    }
}
5098
// Find the blocks nearest to the two selection end points and let every
// block from the first to the last of them (following the flow/block
// lists) visit its share of the selection. Does nothing when the page
// has no flows.
void TextPage::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style)
{
    PDFRectangle child_selection;
    double x[2], y[2], d, best_d[2];
    double xMin, yMin, xMax, yMax;
    TextFlow *flow, *best_flow[2];
    TextBlock *blk, *best_block[2];
    int i, count = 0, best_count[2], start, stop;

    if (!flows) {
        return;
    }

    // index 0 is the (x1, y1) end point, index 1 the (x2, y2) end point
    x[0] = selection->x1;
    y[0] = selection->y1;
    x[1] = selection->x2;
    y[1] = selection->y2;

    // bounding box of all blocks, accumulated in the loop below
    xMin = pageWidth;
    yMin = pageHeight;
    xMax = 0.0;
    yMax = 0.0;

    for (i = 0; i < 2; i++) {
        best_block[i] = nullptr;
        best_flow[i] = nullptr;
        best_count[i] = 0;
        best_d[i] = 0;
    }

    // find the nearest blocks to the selection points
    // using the manhattan distance.
    for (flow = flows; flow; flow = flow->next) {
        for (blk = flow->blocks; blk; blk = blk->next) {
            count++;
            // the first/last blocks in reading order are
            // often not the closest to the page corners;
            // track the corners, force those blocks to
            // be selected if the selection runs across
            // multiple pages.
            xMin = fmin(x: xMin, y: blk->xMin);
            yMin = fmin(x: yMin, y: blk->yMin);
            xMax = fmax(x: xMax, y: blk->xMax);
            yMax = fmax(x: yMax, y: blk->yMax);
            for (i = 0; i < 2; i++) {
                d = fmax(x: blk->xMin - x[i], y: 0.0) + fmax(x: x[i] - blk->xMax, y: 0.0) + fmax(x: blk->yMin - y[i], y: 0.0) + fmax(x: y[i] - blk->yMax, y: 0.0);
                // the last clause forces the very last block when the
                // point lies at or beyond the bottom-right corner
                if (!best_block[i] || d < best_d[i] || (!blk->next && !flow->next && x[i] >= fmin(x: xMax, y: pageWidth) && y[i] >= fmin(x: yMax, y: pageHeight))) {
                    best_block[i] = blk;
                    best_flow[i] = flow;
                    best_count[i] = count;
                    best_d[i] = d;
                }
            }
        }
    }
    // force the very first block when the point lies before the text's
    // starting corner (top-left for left-to-right pages, else top-right)
    for (i = 0; i < 2; i++) {
        if (primaryLR) {
            if (x[i] < xMin && y[i] < yMin) {
                best_block[i] = flows->blocks;
                best_flow[i] = flows;
                best_count[i] = 1;
            }
        } else {
            if (x[i] > xMax && y[i] < yMin) {
                best_block[i] = flows->blocks;
                best_flow[i] = flows;
                best_count[i] = 1;
            }
        }
    }
    // assert: best is always set.
    if (!best_block[0] || !best_block[1]) {
        return;
    }

    // Now decide which point was first.
    if (best_count[0] < best_count[1] || (best_count[0] == best_count[1] && y[0] < y[1])) {
        start = 0;
        stop = 1;
    } else {
        start = 1;
        stop = 0;
    }

    for (flow = best_flow[start]; flow; flow = flow->next) {
        // the first flow is entered at the start block, later flows at
        // their first block
        if (flow == best_flow[start]) {
            blk = best_block[start];
        } else {
            blk = flow->blocks;
        }
        for (; blk; blk = blk->next) {
            // blocks in between are selected completely
            if (primaryLR) {
                child_selection.x1 = blk->xMin;
                child_selection.x2 = blk->xMax;
            } else {
                child_selection.x1 = blk->xMax;
                child_selection.x2 = blk->xMin;
            }
            child_selection.y1 = blk->yMin;
            child_selection.y2 = blk->yMax;
            // the end blocks use the end points, clamped to the block box
            if (blk == best_block[start]) {
                child_selection.x1 = fmax(x: blk->xMin, y: fmin(x: blk->xMax, y: x[start]));
                child_selection.y1 = fmax(x: blk->yMin, y: fmin(x: blk->yMax, y: y[start]));
            }
            if (blk == best_block[stop]) {
                child_selection.x2 = fmax(x: blk->xMin, y: fmin(x: blk->xMax, y: x[stop]));
                child_selection.y2 = fmax(x: blk->yMin, y: fmin(x: blk->yMax, y: y[stop]));
                // the stop block is the last one visited
                blk->visitSelection(visitor, selection: &child_selection, style);
                return;
            }
            blk->visitSelection(visitor, selection: &child_selection, style);
        }
    }
}
5213
5214void TextPage::drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color)
5215{
5216 TextSelectionPainter painter(this, scale, rotation, out, box_color, glyph_color);
5217
5218 visitSelection(visitor: &painter, selection, style);
5219 painter.endPage();
5220}
5221
5222std::vector<PDFRectangle *> *TextPage::getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale)
5223{
5224 TextSelectionSizer sizer(this, scale);
5225
5226 visitSelection(visitor: &sizer, selection, style);
5227
5228 return sizer.takeRegion();
5229}
5230
5231GooString *TextPage::getSelectionText(const PDFRectangle *selection, SelectionStyle style)
5232{
5233 TextSelectionDumper dumper(this);
5234
5235 visitSelection(visitor: &dumper, selection, style);
5236 dumper.endPage();
5237
5238 return dumper.getText();
5239}
5240
5241std::vector<TextWordSelection *> **TextPage::getSelectionWords(const PDFRectangle *selection, SelectionStyle style, int *nLines)
5242{
5243 TextSelectionDumper dumper(this);
5244
5245 visitSelection(visitor: &dumper, selection, style);
5246 dumper.endPage();
5247
5248 return dumper.takeWordList(nLinesOut: nLines);
5249}
5250
5251bool TextPage::findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const
5252{
5253 TextBlock *blk;
5254 TextLine *line;
5255 TextWord *word;
5256 double xMin0, xMax0, yMin0, yMax0;
5257 double xMin1, xMax1, yMin1, yMax1;
5258 bool first;
5259
5260 if (rawOrder) {
5261 return false;
5262 }
5263
5264 //~ this doesn't correctly handle ranges split across multiple lines
5265 //~ (the highlighted region is the bounding box of all the parts of
5266 //~ the range)
5267 first = true;
5268 xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
5269 xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
5270 for (int i = 0; i < nBlocks; ++i) {
5271 blk = blocks[i];
5272 for (line = blk->lines; line; line = line->next) {
5273 for (word = line->words; word; word = word->next) {
5274 if (pos < word->charPosEnd && pos + length > word->chars.front().charPos) {
5275 size_t j0, j1;
5276 for (j0 = 0; (j0 + 1) < word->len() && pos >= word->chars[j0 + 1].charPos; ++j0) {
5277 ;
5278 }
5279 for (j1 = word->len(); j1 > j0 && pos + length <= word->chars[j1].charPos; --j1) {
5280 ;
5281 }
5282 auto startingEdge = word->chars[j0].edge;
5283 auto endingEdge = (j1 + 1 == word->len()) ? word->edgeEnd : word->chars[j1 + 1].edge;
5284 switch (line->rot) {
5285 case 0:
5286 xMin1 = startingEdge;
5287 xMax1 = endingEdge;
5288 yMin1 = word->yMin;
5289 yMax1 = word->yMax;
5290 break;
5291 case 1:
5292 xMin1 = word->xMin;
5293 xMax1 = word->xMax;
5294 yMin1 = startingEdge;
5295 yMax1 = endingEdge;
5296 break;
5297 case 2:
5298 xMin1 = endingEdge;
5299 xMax1 = startingEdge;
5300 yMin1 = word->yMin;
5301 yMax1 = word->yMax;
5302 break;
5303 case 3:
5304 xMin1 = word->xMin;
5305 xMax1 = word->xMax;
5306 yMin1 = endingEdge;
5307 yMax1 = startingEdge;
5308 break;
5309 }
5310 if (first || xMin1 < xMin0) {
5311 xMin0 = xMin1;
5312 }
5313 if (first || xMax1 > xMax0) {
5314 xMax0 = xMax1;
5315 }
5316 if (first || yMin1 < yMin0) {
5317 yMin0 = yMin1;
5318 }
5319 if (first || yMax1 > yMax0) {
5320 yMax0 = yMax1;
5321 }
5322 first = false;
5323 }
5324 }
5325 }
5326 }
5327 if (!first) {
5328 *xMin = xMin0;
5329 *xMax = xMax0;
5330 *yMin = yMin0;
5331 *yMax = yMax0;
5332 return true;
5333 }
5334 return false;
5335}
5336
5337void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, bool physLayout, EndOfLineKind textEOL, bool pageBreaks)
5338{
5339 const UnicodeMap *uMap;
5340 TextFlow *flow;
5341 TextBlock *blk;
5342 TextLine *line;
5343 TextLineFrag *frags;
5344 TextWord *word;
5345 int nFrags, fragsSize;
5346 TextLineFrag *frag;
5347 char space[8], eol[16], eop[8];
5348 int spaceLen, eolLen, eopLen;
5349 double delta;
5350 int col, i, j, d, n;
5351
5352 // get the output encoding
5353 if (!(uMap = globalParams->getTextEncoding())) {
5354 return;
5355 }
5356 spaceLen = uMap->mapUnicode(u: 0x20, buf: space, bufSize: sizeof(space));
5357 eolLen = 0; // make gcc happy
5358 switch (textEOL) {
5359 case eolUnix:
5360 eolLen = uMap->mapUnicode(u: 0x0a, buf: eol, bufSize: sizeof(eol));
5361 break;
5362 case eolDOS:
5363 eolLen = uMap->mapUnicode(u: 0x0d, buf: eol, bufSize: sizeof(eol));
5364 eolLen += uMap->mapUnicode(u: 0x0a, buf: eol + eolLen, bufSize: sizeof(eol) - eolLen);
5365 break;
5366 case eolMac:
5367 eolLen = uMap->mapUnicode(u: 0x0d, buf: eol, bufSize: sizeof(eol));
5368 break;
5369 }
5370 eopLen = uMap->mapUnicode(u: 0x0c, buf: eop, bufSize: sizeof(eop));
5371
5372 //~ writing mode (horiz/vert)
5373
5374 // output the page in raw (content stream) order
5375 if (rawOrder) {
5376
5377 GooString s;
5378 std::vector<Unicode> uText;
5379
5380 for (word = rawWords; word; word = word->next) {
5381 s.clear();
5382 uText.resize(new_size: word->len());
5383 std::transform(first: word->chars.begin(), last: word->chars.end(), result: uText.begin(), unary_op: [](auto &c) { return c.text; });
5384 dumpFragment(text: uText.data(), len: uText.size(), uMap, s: &s);
5385 (*outputFunc)(outputStream, s.c_str(), s.getLength());
5386
5387 if (word->next && fabs(x: word->next->base - word->base) < maxIntraLineDelta * word->fontSize && word->next->xMin > word->xMax - minDupBreakOverlap * word->fontSize) {
5388 if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) {
5389 (*outputFunc)(outputStream, space, spaceLen);
5390 }
5391 } else {
5392 (*outputFunc)(outputStream, eol, eolLen);
5393 }
5394 }
5395
5396 // output the page, maintaining the original physical layout
5397 } else if (physLayout) {
5398
5399 // collect the line fragments for the page and sort them
5400 fragsSize = 256;
5401 frags = (TextLineFrag *)gmallocn(count: fragsSize, size: sizeof(TextLineFrag));
5402 nFrags = 0;
5403 for (i = 0; i < nBlocks; ++i) {
5404 blk = blocks[i];
5405 for (line = blk->lines; line; line = line->next) {
5406 if (nFrags == fragsSize) {
5407 fragsSize *= 2;
5408 frags = (TextLineFrag *)greallocn(p: frags, count: fragsSize, size: sizeof(TextLineFrag));
5409 }
5410 frags[nFrags].init(lineA: line, startA: 0, lenA: line->len);
5411 frags[nFrags].computeCoords(oneRot: true);
5412 ++nFrags;
5413 }
5414 }
5415 qsort(base: frags, nmemb: nFrags, size: sizeof(TextLineFrag), compar: &TextLineFrag::cmpYXPrimaryRot);
5416 i = 0;
5417 while (i < nFrags) {
5418 delta = maxIntraLineDelta * frags[i].line->words->fontSize;
5419 for (j = i + 1; j < nFrags && fabs(x: frags[j].base - frags[i].base) < delta; ++j) {
5420 ;
5421 }
5422 qsort(base: frags + i, nmemb: j - i, size: sizeof(TextLineFrag), compar: &TextLineFrag::cmpXYColumnPrimaryRot);
5423 i = j;
5424 }
5425
5426#if 0 // for debugging
5427 printf("*** line fragments ***\n");
5428 for (i = 0; i < nFrags; ++i) {
5429 frag = &frags[i];
5430 printf("frag: x=%.2f..%.2f y=%.2f..%.2f base=%.2f '",
5431 frag->xMin, frag->xMax, frag->yMin, frag->yMax, frag->base);
5432 for (n = 0; n < frag->len; ++n) {
5433 fputc(frag->line->text[frag->start + n] & 0xff, stdout);
5434 }
5435 printf("'\n");
5436 }
5437 printf("\n");
5438#endif
5439
5440 GooString s;
5441 // generate output
5442 col = 0;
5443 for (i = 0; i < nFrags; ++i) {
5444 frag = &frags[i];
5445
5446 // column alignment
5447 for (; col < frag->col; ++col) {
5448 (*outputFunc)(outputStream, space, spaceLen);
5449 }
5450
5451 // print the line
5452 s.clear();
5453 col += dumpFragment(text: frag->line->text + frag->start, len: frag->len, uMap, s: &s);
5454 (*outputFunc)(outputStream, s.c_str(), s.getLength());
5455
5456 // print one or more returns if necessary
5457 if (i == nFrags - 1 || frags[i + 1].col < col || fabs(x: frags[i + 1].base - frag->base) > maxIntraLineDelta * frag->line->words->fontSize) {
5458 if (i < nFrags - 1) {
5459 d = (int)((frags[i + 1].base - frag->base) / frag->line->words->fontSize);
5460 if (d < 1) {
5461 d = 1;
5462 } else if (d > 5) {
5463 d = 5;
5464 }
5465 } else {
5466 d = 1;
5467 }
5468 for (; d > 0; --d) {
5469 (*outputFunc)(outputStream, eol, eolLen);
5470 }
5471 col = 0;
5472 }
5473 }
5474
5475 gfree(p: frags);
5476
5477 // output the page, "undoing" the layout
5478 } else {
5479 for (flow = flows; flow; flow = flow->next) {
5480 for (blk = flow->blocks; blk; blk = blk->next) {
5481 for (line = blk->lines; line; line = line->next) {
5482 n = line->len;
5483 if (line->hyphenated && (line->next || blk->next)) {
5484 --n;
5485 }
5486 GooString s;
5487 dumpFragment(text: line->text, len: n, uMap, s: &s);
5488 (*outputFunc)(outputStream, s.c_str(), s.getLength());
5489 // output a newline when a hyphen is not suppressed
5490 if (n == line->len) {
5491 (*outputFunc)(outputStream, eol, eolLen);
5492 }
5493 }
5494 }
5495 (*outputFunc)(outputStream, eol, eolLen);
5496 }
5497 }
5498
5499 // end of page
5500 if (pageBreaks) {
5501 (*outputFunc)(outputStream, eop, eopLen);
5502 }
5503}
5504
5505void TextPage::setMergeCombining(bool merge)
5506{
5507 mergeCombining = merge;
5508}
5509
5510void TextPage::assignColumns(TextLineFrag *frags, int nFrags, bool oneRot) const
5511{
5512 TextLineFrag *frag0, *frag1;
5513 int rot, col1, col2, i, j, k;
5514
5515 // all text in the region has the same rotation -- recompute the
5516 // column numbers based only on the text in the region
5517 if (oneRot) {
5518 qsort(base: frags, nmemb: nFrags, size: sizeof(TextLineFrag), compar: &TextLineFrag::cmpXYLineRot);
5519 rot = frags[0].line->rot;
5520 for (i = 0; i < nFrags; ++i) {
5521 frag0 = &frags[i];
5522 col1 = 0;
5523 for (j = 0; j < i; ++j) {
5524 frag1 = &frags[j];
5525 col2 = 0; // make gcc happy
5526 switch (rot) {
5527 case 0:
5528 if (frag0->xMin >= frag1->xMax) {
5529 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1;
5530 } else {
5531 for (k = frag1->start; k < frag1->start + frag1->len && frag0->xMin >= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k + 1]); ++k) {
5532 ;
5533 }
5534 col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start];
5535 }
5536 break;
5537 case 1:
5538 if (frag0->yMin >= frag1->yMax) {
5539 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1;
5540 } else {
5541 for (k = frag1->start; k < frag1->start + frag1->len && frag0->yMin >= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k + 1]); ++k) {
5542 ;
5543 }
5544 col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start];
5545 }
5546 break;
5547 case 2:
5548 if (frag0->xMax <= frag1->xMin) {
5549 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1;
5550 } else {
5551 for (k = frag1->start; k < frag1->start + frag1->len && frag0->xMax <= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k + 1]); ++k) {
5552 ;
5553 }
5554 col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start];
5555 }
5556 break;
5557 case 3:
5558 if (frag0->yMax <= frag1->yMin) {
5559 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1;
5560 } else {
5561 for (k = frag1->start; k < frag1->start + frag1->len && frag0->yMax <= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k + 1]); ++k) {
5562 ;
5563 }
5564 col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start];
5565 }
5566 break;
5567 }
5568 if (col2 > col1) {
5569 col1 = col2;
5570 }
5571 }
5572 frag0->col = col1;
5573 }
5574
5575 // the region includes text at different rotations -- use the
5576 // globally assigned column numbers, offset by the minimum column
5577 // number (i.e., shift everything over to column 0)
5578 } else {
5579 col1 = frags[0].col;
5580 for (i = 1; i < nFrags; ++i) {
5581 if (frags[i].col < col1) {
5582 col1 = frags[i].col;
5583 }
5584 }
5585 for (i = 0; i < nFrags; ++i) {
5586 frags[i].col -= col1;
5587 }
5588 }
5589}
5590
5591int TextPage::dumpFragment(const Unicode *text, int len, const UnicodeMap *uMap, GooString *s) const
5592{
5593 if (uMap->isUnicode()) {
5594 return reorderText(text, len, uMap, primaryLR, s, u: nullptr);
5595 } else {
5596 int nCols = 0;
5597
5598 char buf[8];
5599 int buflen = 0;
5600
5601 for (int i = 0; i < len; ++i) {
5602 buflen = uMap->mapUnicode(u: text[i], buf, bufSize: sizeof(buf));
5603 s->append(str: buf, lengthA: buflen);
5604 nCols += buflen;
5605 }
5606
5607 return nCols;
5608 }
5609}
5610
#ifdef TEXTOUT_WORD_LIST
// Build a TextWordList over this page; physLayout is forwarded unchanged
// to the TextWordList constructor.
std::unique_ptr<TextWordList> TextPage::makeWordList(bool physLayout)
{
    auto words = std::make_unique<TextWordList>(this, physLayout);
    return words;
}
#endif
5617
5618//------------------------------------------------------------------------
5619// ActualText
5620//------------------------------------------------------------------------
5621ActualText::ActualText(TextPage *out)
5622{
5623 out->incRefCnt();
5624 text = out;
5625 actualText = nullptr;
5626 actualTextNBytes = 0;
5627}
5628
5629ActualText::~ActualText()
5630{
5631 if (actualText) {
5632 delete actualText;
5633 }
5634 text->decRefCnt();
5635}
5636
5637void ActualText::addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen)
5638{
5639 if (!actualText) {
5640 text->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
5641 return;
5642 }
5643
5644 // Inside ActualText span.
5645 if (!actualTextNBytes) {
5646 actualTextX0 = x;
5647 actualTextY0 = y;
5648 }
5649 actualTextX1 = x + dx;
5650 actualTextY1 = y + dy;
5651 actualTextNBytes += nBytes;
5652}
5653
5654void ActualText::begin(const GfxState *state, const GooString *t)
5655{
5656 if (actualText) {
5657 delete actualText;
5658 }
5659 actualText = new GooString(t);
5660 actualTextNBytes = 0;
5661}
5662
5663void ActualText::end(const GfxState *state)
5664{
5665 // ActualText span closed. Output the span text and the
5666 // extents of all the glyphs inside the span
5667
5668 if (actualTextNBytes) {
5669 // now that we have the position info for all of the text inside
5670 // the marked content span, we feed the "ActualText" back through
5671 // text->addChar()
5672 std::vector<Unicode> uni = TextStringToUCS4(textStr: actualText->toStr());
5673 text->addChar(state, x: actualTextX0, y: actualTextY0, dx: actualTextX1 - actualTextX0, dy: actualTextY1 - actualTextY0, c: 0, nBytes: actualTextNBytes, u: uni.data(), uLen: uni.size());
5674 }
5675
5676 delete actualText;
5677 actualText = nullptr;
5678 actualTextNBytes = 0;
5679}
5680
5681//------------------------------------------------------------------------
5682// TextOutputDev
5683//------------------------------------------------------------------------
5684
// Output callback that writes len bytes of text to a stdio stream.
//
//   stream - the destination, a FILE * passed as an opaque pointer
//   text   - bytes to write
//   len    - number of bytes
//
// Calls with a null stream, null text, or a non-positive length write
// nothing. A negative length would otherwise be converted to a huge
// unsigned count by fwrite(). Write errors cannot be reported through this
// void callback and are ignored.
static void TextOutputDev_outputToFile(void *stream, const char *text, int len)
{
    if (!stream || !text || len <= 0) {
        return;
    }
    fwrite(text, 1, static_cast<size_t>(len), static_cast<FILE *>(stream));
}
5689
5690TextOutputDev::TextOutputDev(const char *fileName, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool append, bool discardDiagA)
5691{
5692 text = nullptr;
5693 physLayout = physLayoutA;
5694 fixedPitch = physLayout ? fixedPitchA : 0;
5695 rawOrder = rawOrderA;
5696 discardDiag = discardDiagA;
5697 doHTML = false;
5698 textEOL = defaultEndOfLine();
5699 textPageBreaks = true;
5700 ok = true;
5701 minColSpacing1 = minColSpacing1_default;
5702
5703 // open file
5704 needClose = false;
5705 if (fileName) {
5706 if (!strcmp(s1: fileName, s2: "-")) {
5707 outputStream = stdout;
5708#if defined(_WIN32) || defined(__CYGWIN__)
5709 // keep DOS from munging the end-of-line characters
5710 _setmode(fileno(stdout), O_BINARY);
5711#endif
5712 } else if ((outputStream = openFile(path: fileName, mode: append ? "ab" : "wb"))) {
5713 needClose = true;
5714 } else {
5715 error(category: errIO, pos: -1, msg: "Couldn't open text file '{0:s}'", fileName);
5716 ok = false;
5717 actualText = nullptr;
5718 return;
5719 }
5720 outputFunc = &TextOutputDev_outputToFile;
5721 } else {
5722 outputStream = nullptr;
5723 }
5724
5725 // set up text object
5726 text = new TextPage(rawOrderA, discardDiagA);
5727 actualText = new ActualText(text);
5728}
5729
5730TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool discardDiagA)
5731{
5732 outputFunc = func;
5733 outputStream = stream;
5734 needClose = false;
5735 physLayout = physLayoutA;
5736 fixedPitch = physLayout ? fixedPitchA : 0;
5737 rawOrder = rawOrderA;
5738 discardDiag = discardDiagA;
5739 doHTML = false;
5740 text = new TextPage(rawOrderA, discardDiagA);
5741 actualText = new ActualText(text);
5742 textEOL = defaultEndOfLine();
5743 textPageBreaks = true;
5744 ok = true;
5745 minColSpacing1 = minColSpacing1_default;
5746}
5747
5748TextOutputDev::~TextOutputDev()
5749{
5750 if (needClose) {
5751 fclose(stream: (FILE *)outputStream);
5752 }
5753 if (text) {
5754 text->decRefCnt();
5755 }
5756 delete actualText;
5757}
5758
5759void TextOutputDev::startPage(int pageNum, GfxState *state, XRef *xref)
5760{
5761 text->startPage(state);
5762}
5763
5764void TextOutputDev::endPage()
5765{
5766 text->endPage();
5767 text->coalesce(physLayout, fixedPitch, doHTML, minColSpacing1);
5768 if (outputStream) {
5769 text->dump(outputStream, outputFunc, physLayout, textEOL, pageBreaks: textPageBreaks);
5770 }
5771}
5772
5773void TextOutputDev::restoreState(GfxState *state)
5774{
5775 text->updateFont(state);
5776}
5777
5778void TextOutputDev::updateFont(GfxState *state)
5779{
5780 text->updateFont(state);
5781}
5782
5783void TextOutputDev::beginString(GfxState *state, const GooString *s) { }
5784
5785void TextOutputDev::endString(GfxState *state) { }
5786
5787void TextOutputDev::drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode c, int nBytes, const Unicode *u, int uLen)
5788{
5789 actualText->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
5790}
5791
5792void TextOutputDev::incCharCount(int nChars)
5793{
5794 text->incCharCount(nChars);
5795}
5796
5797void TextOutputDev::beginActualText(GfxState *state, const GooString *t)
5798{
5799 actualText->begin(state, t);
5800}
5801
5802void TextOutputDev::endActualText(GfxState *state)
5803{
5804 actualText->end(state);
5805}
5806
5807void TextOutputDev::stroke(GfxState *state)
5808{
5809 double x[2], y[2];
5810
5811 if (!doHTML) {
5812 return;
5813 }
5814 const GfxPath *path = state->getPath();
5815 if (path->getNumSubpaths() != 1) {
5816 return;
5817 }
5818 const GfxSubpath *subpath = path->getSubpath(i: 0);
5819 if (subpath->getNumPoints() != 2) {
5820 return;
5821 }
5822 state->transform(x1: subpath->getX(i: 0), y1: subpath->getY(i: 0), x2: &x[0], y2: &y[0]);
5823 state->transform(x1: subpath->getX(i: 1), y1: subpath->getY(i: 1), x2: &x[1], y2: &y[1]);
5824
5825 // look for a vertical or horizontal line
5826 if (x[0] == x[1] || y[0] == y[1]) {
5827 text->addUnderline(x0: x[0], y0: y[0], x1: x[1], y1: y[1]);
5828 }
5829}
5830
5831void TextOutputDev::fill(GfxState *state)
5832{
5833 double x[5], y[5];
5834 double rx0, ry0, rx1, ry1, t;
5835 int i;
5836
5837 if (!doHTML) {
5838 return;
5839 }
5840 const GfxPath *path = state->getPath();
5841 if (path->getNumSubpaths() != 1) {
5842 return;
5843 }
5844 const GfxSubpath *subpath = path->getSubpath(i: 0);
5845 if (subpath->getNumPoints() != 5) {
5846 return;
5847 }
5848 for (i = 0; i < 5; ++i) {
5849 if (subpath->getCurve(i)) {
5850 return;
5851 }
5852 state->transform(x1: subpath->getX(i), y1: subpath->getY(i), x2: &x[i], y2: &y[i]);
5853 }
5854
5855 // look for a rectangle
5856 if (x[0] == x[1] && y[1] == y[2] && x[2] == x[3] && y[3] == y[4] && x[0] == x[4] && y[0] == y[4]) {
5857 rx0 = x[0];
5858 ry0 = y[0];
5859 rx1 = x[2];
5860 ry1 = y[1];
5861 } else if (y[0] == y[1] && x[1] == x[2] && y[2] == y[3] && x[3] == x[4] && x[0] == x[4] && y[0] == y[4]) {
5862 rx0 = x[0];
5863 ry0 = y[0];
5864 rx1 = x[1];
5865 ry1 = y[2];
5866 } else {
5867 return;
5868 }
5869 if (rx1 < rx0) {
5870 t = rx0;
5871 rx0 = rx1;
5872 rx1 = t;
5873 }
5874 if (ry1 < ry0) {
5875 t = ry0;
5876 ry0 = ry1;
5877 ry1 = t;
5878 }
5879
5880 // skinny horizontal rectangle
5881 if (ry1 - ry0 < rx1 - rx0) {
5882 if (ry1 - ry0 < maxUnderlineWidth) {
5883 ry0 = 0.5 * (ry0 + ry1);
5884 text->addUnderline(x0: rx0, y0: ry0, x1: rx1, y1: ry0);
5885 }
5886
5887 // skinny vertical rectangle
5888 } else {
5889 if (rx1 - rx0 < maxUnderlineWidth) {
5890 rx0 = 0.5 * (rx0 + rx1);
5891 text->addUnderline(x0: rx0, y0: ry0, x1: rx0, y1: ry1);
5892 }
5893 }
5894}
5895
5896void TextOutputDev::eoFill(GfxState *state)
5897{
5898 if (!doHTML) {
5899 return;
5900 }
5901 fill(state);
5902}
5903
5904void TextOutputDev::processLink(AnnotLink *link)
5905{
5906 double x1, y1, x2, y2;
5907 int xMin, yMin, xMax, yMax, x, y;
5908
5909 if (!doHTML) {
5910 return;
5911 }
5912 link->getRect(x1: &x1, y1: &y1, x2: &x2, y2: &y2);
5913 cvtUserToDev(ux: x1, uy: y1, dx: &x, dy: &y);
5914 xMin = xMax = x;
5915 yMin = yMax = y;
5916 cvtUserToDev(ux: x1, uy: y2, dx: &x, dy: &y);
5917 if (x < xMin) {
5918 xMin = x;
5919 } else if (x > xMax) {
5920 xMax = x;
5921 }
5922 if (y < yMin) {
5923 yMin = y;
5924 } else if (y > yMax) {
5925 yMax = y;
5926 }
5927 cvtUserToDev(ux: x2, uy: y1, dx: &x, dy: &y);
5928 if (x < xMin) {
5929 xMin = x;
5930 } else if (x > xMax) {
5931 xMax = x;
5932 }
5933 if (y < yMin) {
5934 yMin = y;
5935 } else if (y > yMax) {
5936 yMax = y;
5937 }
5938 cvtUserToDev(ux: x2, uy: y2, dx: &x, dy: &y);
5939 if (x < xMin) {
5940 xMin = x;
5941 } else if (x > xMax) {
5942 xMax = x;
5943 }
5944 if (y < yMin) {
5945 yMin = y;
5946 } else if (y > yMax) {
5947 yMax = y;
5948 }
5949 text->addLink(xMin, yMin, xMax, yMax, link);
5950}
5951
5952bool TextOutputDev::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax) const
5953{
5954 return text->findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, backward, wholeWord, xMin, yMin, xMax, yMax);
5955}
5956
5957GooString *TextOutputDev::getText(double xMin, double yMin, double xMax, double yMax) const
5958{
5959 return text->getText(xMin, yMin, xMax, yMax, textEOL);
5960}
5961
5962void TextOutputDev::drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color)
5963{
5964 text->drawSelection(out, scale, rotation, selection, style, glyph_color, box_color);
5965}
5966
5967std::vector<PDFRectangle *> *TextOutputDev::getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale)
5968{
5969 return text->getSelectionRegion(selection, style, scale);
5970}
5971
5972GooString *TextOutputDev::getSelectionText(const PDFRectangle *selection, SelectionStyle style)
5973{
5974 return text->getSelectionText(selection, style);
5975}
5976
5977bool TextOutputDev::findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const
5978{
5979 return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
5980}
5981
5982void TextOutputDev::setMergeCombining(bool merge)
5983{
5984 text->setMergeCombining(merge);
5985}
5986
#ifdef TEXTOUT_WORD_LIST
// Build a word list for the current page using this device's physLayout
// setting.
std::unique_ptr<TextWordList> TextOutputDev::makeWordList()
{
    std::unique_ptr<TextWordList> words = text->makeWordList(physLayout);
    return words;
}
#endif
5993
5994TextPage *TextOutputDev::takeText()
5995{
5996 TextPage *ret;
5997
5998 ret = text;
5999 text = new TextPage(rawOrder, discardDiag);
6000 delete actualText;
6001 actualText = new ActualText(text);
6002 return ret;
6003}
6004
6005const TextFlow *TextOutputDev::getFlows() const
6006{
6007 return text->getFlows();
6008}
6009

// source code of poppler/poppler/TextOutputDev.cc