1//========================================================================
2//
3// TextOutputDev.cc
4//
5// Copyright 1997-2003 Glyph & Cog, LLC
6//
7//========================================================================
8
9//========================================================================
10//
11// Modified under the Poppler project - http://poppler.freedesktop.org
12//
13// All changes made under the Poppler project to this file are licensed
14// under GPL version 2 or later
15//
16// Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com>
17// Copyright (C) 2005 Nickolay V. Shmyrev <nshmyrev@yandex.ru>
18// Copyright (C) 2006-2008, 2011-2013 Carlos Garcia Campos <carlosgc@gnome.org>
19// Copyright (C) 2006, 2007, 2013 Ed Catmur <ed@catmur.co.uk>
20// Copyright (C) 2006 Jeff Muizelaar <jeff@infidigm.net>
21// Copyright (C) 2007, 2008, 2012, 2017 Adrian Johnson <ajohnson@redneon.com>
22// Copyright (C) 2008 Koji Otani <sho@bbr.jp>
23// Copyright (C) 2008, 2010-2012, 2014-2022, 2024 Albert Astals Cid <aacid@kde.org>
24// Copyright (C) 2008 Pino Toscano <pino@kde.org>
25// Copyright (C) 2008, 2010 Hib Eris <hib@hiberis.nl>
26// Copyright (C) 2009 Ross Moore <ross@maths.mq.edu.au>
27// Copyright (C) 2009 Kovid Goyal <kovid@kovidgoyal.net>
28// Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com>
29// Copyright (C) 2010, 2021 Marek Kasik <mkasik@redhat.com>
30// Copyright (C) 2010, 2020 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
31// Copyright (C) 2011 Sam Liao <phyomh@gmail.com>
32// Copyright (C) 2012 Horst Prote <prote@fmi.uni-stuttgart.de>
33// Copyright (C) 2012, 2013-2018 Jason Crain <jason@aquaticape.us>
34// Copyright (C) 2012 Peter Breitenlohner <peb@mppmu.mpg.de>
35// Copyright (C) 2013 José Aliste <jaliste@src.gnome.org>
36// Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
37// Copyright (C) 2013 Ed Catmur <ed@catmur.co.uk>
38// Copyright (C) 2016 Khaled Hosny <khaledhosny@eglug.org>
39// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
40// Copyright (C) 2018 Sanchit Anand <sanxchit@gmail.com>
41// Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de>
42// Copyright (C) 2018-2022, 2024 Nelson Benítez León <nbenitezl@gmail.com>
43// Copyright (C) 2019 Christian Persch <chpe@src.gnome.org>
44// Copyright (C) 2019, 2022 Oliver Sander <oliver.sander@tu-dresden.de>
45// Copyright (C) 2019 Dan Shea <dan.shea@logical-innovations.com>
46// Copyright (C) 2021 Peter Williams <peter@newton.cx>
47// Copyright (C) 2024 Adam Sampson <ats@offog.org>
48// Copyright (C) 2024 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk>
49// Copyright (C) 2024 Stefan Brüns <stefan.bruens@rwth-aachen.de>
50//
51// To see a description of the changes please see the Changelog file that
52// came with your tarball or type make ChangeLog if you are building from git
53//
54//========================================================================
55
56#include <config.h>
57
58#include <cstdio>
59#include <cstdlib>
60#include <cstddef>
61#include <cmath>
62#include <cfloat>
63#include <cctype>
64#include <algorithm>
65#if defined(_WIN32) || defined(__CYGWIN__)
66# include <fcntl.h> // for O_BINARY
67# include <io.h> // for _setmode
68#endif
69#include "goo/gfile.h"
70#include "goo/gmem.h"
71#include "goo/GooString.h"
72#include "poppler-config.h"
73#include "Error.h"
74#include "GlobalParams.h"
75#include "UnicodeMap.h"
76#include "UnicodeTypeTable.h"
77#include "Link.h"
78#include "TextOutputDev.h"
79#include "Page.h"
80#include "Annot.h"
81#include "UTF.h"
82
83//------------------------------------------------------------------------
84// parameters
85//------------------------------------------------------------------------
86
87// Each bucket in a text pool includes baselines within a range of
88// this many points.
89#define textPoolStep 4
90
91// Inter-character space width which will cause addChar to start a new
92// word.
93#define minWordBreakSpace 0.1
94
95// Negative inter-character space width, i.e., overlap, which will
96// cause addChar to start a new word.
97#define minDupBreakOverlap 0.2
98
99// Max distance between baselines of two lines within a block, as a
100// fraction of the font size.
101#define maxLineSpacingDelta 1.5
102
103// Max difference in primary font sizes on two lines in the same
104// block. Delta1 is used when examining new lines above and below the
105// current block; delta2 is used when examining text that overlaps the
106// current block; delta3 is used when examining text to the left and
107// right of the current block.
108#define maxBlockFontSizeDelta1 0.05
109#define maxBlockFontSizeDelta2 0.6
110#define maxBlockFontSizeDelta3 0.2
111
112// Max difference in font sizes inside a word.
113#define maxWordFontSizeDelta 0.05
114
115// Maximum distance between baselines of two words on the same line,
116// e.g., distance between subscript or superscript and the primary
117// baseline, as a fraction of the font size.
118#define maxIntraLineDelta 0.5
119
120// Minimum inter-word spacing, as a fraction of the font size. (Only
121// used for raw ordering.)
122#define minWordSpacing 0.15
123
124// Maximum inter-word spacing, as a fraction of the font size.
125#define maxWordSpacing 1.5
126
127// Maximum horizontal spacing which will allow a word to be pulled
128// into a block, as a fraction of the font size.
129// This default value can be tweaked via API.
130double TextOutputDev::minColSpacing1_default = 0.7;
131
132// Minimum spacing between columns, as a fraction of the font size.
133#define minColSpacing2 1.0
134
135// Maximum vertical spacing between blocks within a flow, as a
136// multiple of the font size.
137#define maxBlockSpacing 2.5
138
139// Minimum spacing between characters within a word, as a fraction of
140// the font size.
141#define minCharSpacing -0.5
142
143// Maximum spacing between characters within a word, as a fraction of
144// the font size, when there is no obvious extra-wide character
145// spacing.
146#define maxCharSpacing 0.03
147
148// When extra-wide character spacing is detected, the inter-character
149// space threshold is set to the minimum inter-character space
150// multiplied by this constant.
151#define maxWideCharSpacingMul 1.3
152
153// Upper limit on spacing between characters in a word.
154#define maxWideCharSpacing 0.4
155
156// Max difference in primary,secondary coordinates (as a fraction of
157// the font size) allowed for duplicated text (fake boldface, drop
158// shadows) which is to be discarded.
159#define dupMaxPriDelta 0.1
160#define dupMaxSecDelta 0.2
161
162// Max width of underlines (in points).
163#define maxUnderlineWidth 3
164
165// Min distance between baseline and underline (in points).
166//~ this should be font-size-dependent
167#define minUnderlineGap -2
168
169// Max distance between baseline and underline (in points).
170//~ this should be font-size-dependent
171#define maxUnderlineGap 4
172
173// Max horizontal distance between edge of word and start of underline
174// (in points).
175//~ this should be font-size-dependent
176#define underlineSlack 1
177
178// Max distance between edge of text and edge of link border
179#define hyperlinkSlack 2
180
181// Max distance between characters when combining a base character and
182// combining character
183#define combMaxMidDelta 0.3
184#define combMaxBaseDelta 0.4
185
186// Text is considered diagonal if abs(tan(angle)) > diagonalThreshold.
187// (Or 1/tan(angle) for 90/270 degrees.)
188#define diagonalThreshold 0.1
189
190// How opaque a selection on a glyphless font should be. Since the font is
191// glyphless and overlaid over text in image form, this must enable users
192// to read the underlying image. Issue #157
193#define glyphlessSelectionOpacity 0.4
194
195// Returns whether x is between a and b or equal to a or b.
196// a and b don't need to be sorted.
197#define XBetweenAB(x, a, b) (!(((x) > (a) && (x) > (b)) || ((x) < (a) && (x) < (b))) ? true : false)
198
199namespace {
200
201inline bool isAscii7(Unicode uchar)
202{
203 return uchar < 128;
204}
205
206}
207
208static int reorderText(const Unicode *text, int len, const UnicodeMap *uMap, bool primaryLR, GooString *s, Unicode *u)
209{
210 char lre[8], rle[8], popdf[8], buf[8];
211 int lreLen = 0, rleLen = 0, popdfLen = 0, n;
212 int nCols, i, j, k;
213
214 nCols = 0;
215
216 if (s) {
217 lreLen = uMap->mapUnicode(u: 0x202a, buf: lre, bufSize: sizeof(lre));
218 rleLen = uMap->mapUnicode(u: 0x202b, buf: rle, bufSize: sizeof(rle));
219 popdfLen = uMap->mapUnicode(u: 0x202c, buf: popdf, bufSize: sizeof(popdf));
220 }
221
222 if (primaryLR) {
223 i = 0;
224 while (i < len) {
225 // output a left-to-right section
226 for (j = i; j < len && !unicodeTypeR(c: text[j]); ++j) {
227 ;
228 }
229 for (k = i; k < j; ++k) {
230 if (s) {
231 n = uMap->mapUnicode(u: text[k], buf, bufSize: sizeof(buf));
232 s->append(str: buf, lengthA: n);
233 }
234 if (u) {
235 u[nCols] = text[k];
236 }
237 ++nCols;
238 }
239 i = j;
240 // output a right-to-left section
241 for (j = i; j < len && !(unicodeTypeL(c: text[j]) || unicodeTypeNum(c: text[j])); ++j) {
242 ;
243 }
244 if (j > i) {
245 if (s) {
246 s->append(str: rle, lengthA: rleLen);
247 }
248 for (k = j - 1; k >= i; --k) {
249 if (s) {
250 n = uMap->mapUnicode(u: text[k], buf, bufSize: sizeof(buf));
251 s->append(str: buf, lengthA: n);
252 }
253 if (u) {
254 u[nCols] = text[k];
255 }
256 ++nCols;
257 }
258 if (s) {
259 s->append(str: popdf, lengthA: popdfLen);
260 }
261 i = j;
262 }
263 }
264 } else {
265 // Note: This code treats numeric characters (European and
266 // Arabic/Indic) as left-to-right, which isn't strictly correct
267 // (incurs extra LRE/POPDF pairs), but does produce correct
268 // visual formatting.
269 if (s) {
270 s->append(str: rle, lengthA: rleLen);
271 }
272 i = len - 1;
273 while (i >= 0) {
274 // output a right-to-left section
275 for (j = i; j >= 0 && !(unicodeTypeL(c: text[j]) || unicodeTypeNum(c: text[j])); --j) {
276 ;
277 }
278 for (k = i; k > j; --k) {
279 if (s) {
280 n = uMap->mapUnicode(u: text[k], buf, bufSize: sizeof(buf));
281 s->append(str: buf, lengthA: n);
282 }
283 if (u) {
284 u[nCols] = text[k];
285 }
286 ++nCols;
287 }
288 i = j;
289 // output a left-to-right section
290 for (j = i; j >= 0 && !unicodeTypeR(c: text[j]); --j) {
291 ;
292 }
293 if (j < i) {
294 if (s) {
295 s->append(str: lre, lengthA: lreLen);
296 }
297 for (k = j + 1; k <= i; ++k) {
298 if (s) {
299 n = uMap->mapUnicode(u: text[k], buf, bufSize: sizeof(buf));
300 s->append(str: buf, lengthA: n);
301 }
302 if (u) {
303 u[nCols] = text[k];
304 }
305 ++nCols;
306 }
307 if (s) {
308 s->append(str: popdf, lengthA: popdfLen);
309 }
310 i = j;
311 }
312 }
313 if (s) {
314 s->append(str: popdf, lengthA: popdfLen);
315 }
316 }
317
318 return nCols;
319}
320
321//------------------------------------------------------------------------
322// TextUnderline
323//------------------------------------------------------------------------
324
325class TextUnderline
326{
327public:
328 TextUnderline(double x0A, double y0A, double x1A, double y1A)
329 {
330 x0 = x0A;
331 y0 = y0A;
332 x1 = x1A;
333 y1 = y1A;
334 horiz = y0 == y1;
335 }
336 ~TextUnderline() { }
337
338 double x0, y0, x1, y1;
339 bool horiz;
340};
341
342//------------------------------------------------------------------------
343// TextLink
344//------------------------------------------------------------------------
345
346class TextLink
347{
348public:
349 TextLink(int xMinA, int yMinA, int xMaxA, int yMaxA, AnnotLink *linkA)
350 {
351 xMin = xMinA;
352 yMin = yMinA;
353 xMax = xMaxA;
354 yMax = yMaxA;
355 link = linkA;
356 }
357 ~TextLink() { }
358
359 int xMin, yMin, xMax, yMax;
360 AnnotLink *link;
361};
362
363//------------------------------------------------------------------------
364// TextFontInfo
365//------------------------------------------------------------------------
366
367TextFontInfo::TextFontInfo(const GfxState *state)
368{
369 gfxFont = state->getFont();
370#ifdef TEXTOUT_WORD_LIST
371 fontName = (gfxFont && gfxFont->getName()) ? new GooString(*gfxFont->getName()) : nullptr;
372 flags = gfxFont ? gfxFont->getFlags() : 0;
373#endif
374}
375
376TextFontInfo::~TextFontInfo()
377{
378#ifdef TEXTOUT_WORD_LIST
379 if (fontName) {
380 delete fontName;
381 }
382#endif
383}
384
385bool TextFontInfo::matches(const GfxState *state) const
386{
387 return state->getFont() == gfxFont;
388}
389
390bool TextFontInfo::matches(const TextFontInfo *fontInfo) const
391{
392 return gfxFont == fontInfo->gfxFont;
393}
394
395bool TextFontInfo::matches(const Ref *ref) const
396{
397 return gfxFont && (*(gfxFont->getID()) == *ref);
398}
399
400double TextFontInfo::getAscent() const
401{
402 return gfxFont ? gfxFont->getAscent() : 0.95;
403}
404
405double TextFontInfo::getDescent() const
406{
407 return gfxFont ? gfxFont->getDescent() : -0.35;
408}
409
410int TextFontInfo::getWMode() const
411{
412 return gfxFont ? gfxFont->getWMode() : 0;
413}
414
415//------------------------------------------------------------------------
416// TextWord
417//------------------------------------------------------------------------
418
419TextWord::TextWord(const GfxState *state, int rotA, double fontSizeA)
420{
421 rot = rotA;
422 fontSize = fontSizeA;
423 spaceAfter = false;
424 next = nullptr;
425 invisible = state->getRender() == 3;
426
427#ifdef TEXTOUT_WORD_LIST
428 GfxRGB rgb;
429
430 if ((state->getRender() & 3) == 1) {
431 state->getStrokeRGB(rgb: &rgb);
432 } else {
433 state->getFillRGB(rgb: &rgb);
434 }
435 colorR = colToDbl(x: rgb.r);
436 colorG = colToDbl(x: rgb.g);
437 colorB = colToDbl(x: rgb.b);
438#endif
439
440 underlined = false;
441 link = nullptr;
442}
443
444TextWord::~TextWord() { }
445
446void TextWord::addChar(const GfxState *state, TextFontInfo *fontA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA)
447{
448 chars.push_back(x: CharInfo { .text: u, .charcode: c, .charPos: charPosA, .edge: 0.0, .font: fontA, .textMat: textMatA });
449 charPosEnd = charPosA + charLen;
450
451 if (len() == 1) {
452 setInitialBounds(fontA, x, y);
453 }
454
455 if (wMode) { // vertical writing mode
456 // NB: the rotation value has been incremented by 1 (in
457 // TextPage::beginWord()) for vertical writing mode
458 switch (rot) {
459 case 0:
460 chars.back().edge = x - fontSize;
461 xMax = edgeEnd = x;
462 break;
463 case 1:
464 chars.back().edge = y - fontSize;
465 yMax = edgeEnd = y;
466 break;
467 case 2:
468 chars.back().edge = x + fontSize;
469 xMin = edgeEnd = x;
470 break;
471 case 3:
472 chars.back().edge = y + fontSize;
473 yMin = edgeEnd = y;
474 break;
475 }
476 } else { // horizontal writing mode
477 switch (rot) {
478 case 0:
479 chars.back().edge = x;
480 xMax = edgeEnd = x + dx;
481 break;
482 case 1:
483 chars.back().edge = y;
484 yMax = edgeEnd = y + dy;
485 break;
486 case 2:
487 chars.back().edge = x;
488 xMin = edgeEnd = x + dx;
489 break;
490 case 3:
491 chars.back().edge = y;
492 yMin = edgeEnd = y + dy;
493 break;
494 }
495 }
496}
497
498void TextWord::setInitialBounds(TextFontInfo *fontA, double x, double y)
499{
500 double ascent = fontA->getAscent() * fontSize;
501 double descent = fontA->getDescent() * fontSize;
502 wMode = fontA->getWMode();
503
504 if (wMode) { // vertical writing mode
505 // NB: the rotation value has been incremented by 1 (in
506 // TextPage::beginWord()) for vertical writing mode
507 switch (rot) {
508 case 0:
509 xMin = x - fontSize;
510 yMin = y - fontSize;
511 yMax = y;
512 base = y;
513 break;
514 case 1:
515 xMin = x;
516 yMin = y - fontSize;
517 xMax = x + fontSize;
518 base = x;
519 break;
520 case 2:
521 yMin = y;
522 xMax = x + fontSize;
523 yMax = y + fontSize;
524 base = y;
525 break;
526 case 3:
527 xMin = x - fontSize;
528 xMax = x;
529 yMax = y + fontSize;
530 base = x;
531 break;
532 }
533 } else { // horizontal writing mode
534 switch (rot) {
535 case 0:
536 xMin = x;
537 yMin = y - ascent;
538 yMax = y - descent;
539 if (yMin == yMax) {
540 // this is a sanity check for a case that shouldn't happen -- but
541 // if it does happen, we want to avoid dividing by zero later
542 yMin = y;
543 yMax = y + 1;
544 }
545 base = y;
546 break;
547 case 1:
548 xMin = x + descent;
549 yMin = y;
550 xMax = x + ascent;
551 if (xMin == xMax) {
552 // this is a sanity check for a case that shouldn't happen -- but
553 // if it does happen, we want to avoid dividing by zero later
554 xMin = x;
555 xMax = x + 1;
556 }
557 base = x;
558 break;
559 case 2:
560 yMin = y + descent;
561 xMax = x;
562 yMax = y + ascent;
563 if (yMin == yMax) {
564 // this is a sanity check for a case that shouldn't happen -- but
565 // if it does happen, we want to avoid dividing by zero later
566 yMin = y;
567 yMax = y + 1;
568 }
569 base = y;
570 break;
571 case 3:
572 xMin = x - ascent;
573 xMax = x - descent;
574 yMax = y;
575 if (xMin == xMax) {
576 // this is a sanity check for a case that shouldn't happen -- but
577 // if it does happen, we want to avoid dividing by zero later
578 xMin = x;
579 xMax = x + 1;
580 }
581 base = x;
582 break;
583 }
584 }
585}
586
587struct CombiningTable
588{
589 Unicode base;
590 Unicode comb;
591};
592
593static const struct CombiningTable combiningTable[] = {
594 { .base: 0x0060, .comb: 0x0300 }, // grave
595 { .base: 0x00a8, .comb: 0x0308 }, // dieresis
596 { .base: 0x00af, .comb: 0x0304 }, // macron
597 { .base: 0x00b4, .comb: 0x0301 }, // acute
598 { .base: 0x00b8, .comb: 0x0327 }, // cedilla
599 { .base: 0x02c6, .comb: 0x0302 }, // circumflex
600 { .base: 0x02c7, .comb: 0x030c }, // caron
601 { .base: 0x02d8, .comb: 0x0306 }, // breve
602 { .base: 0x02d9, .comb: 0x0307 }, // dotaccent
603 { .base: 0x02da, .comb: 0x030a }, // ring
604 { .base: 0x02dc, .comb: 0x0303 }, // tilde
605 { .base: 0x02dd, .comb: 0x030b } // hungarumlaut (double acute accent)
606};
607
608// returning combining versions of characters
609static Unicode getCombiningChar(Unicode u)
610{
611 for (const CombiningTable &combining : combiningTable) {
612 if (u == combining.base) {
613 return combining.comb;
614 }
615 }
616 return 0;
617}
618
619bool TextWord::addCombining(const GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA)
620{
621 if (chars.empty() || wMode != 0 || fontA->getWMode() != 0) {
622 return false;
623 }
624
625 Unicode cCurrent = getCombiningChar(u);
626 if (cCurrent != 0 && unicodeTypeAlphaNum(c: chars.back().text)) {
627 // Current is a combining character, previous is base character
628 double maxScaledMidDelta = fabs(x: edgeEnd - chars.back().edge) * combMaxMidDelta;
629 double charMid, charBase, maxScaledBaseDelta;
630
631 // Test if characters overlap
632 if (rot == 0 || rot == 2) {
633 charMid = x + (dx / 2);
634 charBase = y;
635 maxScaledBaseDelta = (yMax - yMin) * combMaxBaseDelta;
636 } else {
637 charMid = y + (dy / 2);
638 charBase = x;
639 maxScaledBaseDelta = (xMax - xMin) * combMaxBaseDelta;
640 }
641
642 double edgeMid = (chars.back().edge + edgeEnd) / 2;
643 if (fabs(x: charMid - edgeMid) >= maxScaledMidDelta || fabs(x: charBase - base) >= maxScaledBaseDelta) {
644 return false;
645 }
646
647 // Add character, but don't adjust edge / bounding box because
648 // combining character's positioning could be odd.
649 chars.emplace_back(args: CharInfo { .text: cCurrent, .charcode: c, .charPos: charPosA, .edge: edgeMid, .font: fontA, .textMat: textMatA });
650 charPosEnd = charPosA + charLen;
651
652 return true;
653 }
654
655 Unicode cPrev = getCombiningChar(u: chars.back().text);
656 if (cPrev != 0 && unicodeTypeAlphaNum(c: u)) {
657 // Previous is a combining character, current is base character
658 double maxScaledBaseDelta = (fontA->getAscent() - fontA->getDescent()) * fontSizeA * combMaxBaseDelta;
659 double charMid, charBase, maxScaledMidDelta;
660
661 // Test if characters overlap
662 if (rot == 0 || rot == 2) {
663 charMid = x + (dx / 2);
664 charBase = y;
665 maxScaledMidDelta = fabs(x: dx * combMaxMidDelta);
666 } else {
667 charMid = y + (dy / 2);
668 charBase = x;
669 maxScaledMidDelta = fabs(x: dy * combMaxMidDelta);
670 }
671
672 double edgeMid = (chars.back().edge + edgeEnd) / 2;
673 if (fabs(x: charMid - edgeMid) >= maxScaledMidDelta || fabs(x: charBase - base) >= maxScaledBaseDelta) {
674 return false;
675 }
676
677 fontSize = fontSizeA;
678 // move combining character to after base character
679 chars.emplace_back(args: CharInfo { .text: cPrev, .charcode: chars.back().charcode, .charPos: charPosA, .edge: edgeMid, .font: chars.back().font, .textMat: chars.back().textMat });
680
681 auto &lastChar = chars[chars.size() - 2];
682
683 charPosEnd = charPosA + charLen;
684 lastChar.text = u;
685 lastChar.charcode = c;
686 lastChar.font = fontA;
687 lastChar.textMat = textMatA;
688
689 if (len() == 2) {
690 setInitialBounds(fontA, x, y);
691 }
692
693 // Updated edges / bounding box because we changed the base
694 // character.
695 if (wMode) {
696 // FIXME unreachable, wMode == 0
697 switch (rot) {
698 case 0:
699 lastChar.edge = x - fontSize;
700 xMax = edgeEnd = x;
701 break;
702 case 1:
703 lastChar.edge = y - fontSize;
704 yMax = edgeEnd = y;
705 break;
706 case 2:
707 lastChar.edge = x + fontSize;
708 xMin = edgeEnd = x;
709 break;
710 case 3:
711 lastChar.edge = y + fontSize;
712 yMin = edgeEnd = y;
713 break;
714 }
715 } else {
716 switch (rot) {
717 case 0:
718 lastChar.edge = x;
719 xMax = edgeEnd = x + dx;
720 break;
721 case 1:
722 lastChar.edge = y;
723 yMax = edgeEnd = y + dy;
724 break;
725 case 2:
726 lastChar.edge = x;
727 xMin = edgeEnd = x + dx;
728 break;
729 case 3:
730 lastChar.edge = y;
731 yMin = edgeEnd = y + dy;
732 break;
733 }
734 }
735
736 chars.back().edge = (edgeEnd + lastChar.edge) / 2;
737 return true;
738 }
739 return false;
740}
741
742void TextWord::merge(TextWord *word)
743{
744 if (word->xMin < xMin) {
745 xMin = word->xMin;
746 }
747 if (word->yMin < yMin) {
748 yMin = word->yMin;
749 }
750 if (word->xMax > xMax) {
751 xMax = word->xMax;
752 }
753 if (word->yMax > yMax) {
754 yMax = word->yMax;
755 }
756 chars.insert(position: chars.end(), first: word->chars.begin(), last: word->chars.end());
757 edgeEnd = word->edgeEnd;
758 charPosEnd = word->charPosEnd;
759}
760
761inline int TextWord::primaryCmp(const TextWord *word) const
762{
763 double cmp;
764
765 cmp = 0; // make gcc happy
766 switch (rot) {
767 case 0:
768 cmp = xMin - word->xMin;
769 break;
770 case 1:
771 cmp = yMin - word->yMin;
772 break;
773 case 2:
774 cmp = word->xMax - xMax;
775 break;
776 case 3:
777 cmp = word->yMax - yMax;
778 break;
779 }
780 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
781}
782
783double TextWord::primaryDelta(const TextWord *word) const
784{
785 double delta;
786
787 delta = 0; // make gcc happy
788 switch (rot) {
789 case 0:
790 delta = word->xMin - xMax;
791 break;
792 case 1:
793 delta = word->yMin - yMax;
794 break;
795 case 2:
796 delta = xMin - word->xMax;
797 break;
798 case 3:
799 delta = yMin - word->yMax;
800 break;
801 }
802 return delta;
803}
804
805int TextWord::cmpYX(const void *p1, const void *p2)
806{
807 TextWord *word1 = *(TextWord **)p1;
808 TextWord *word2 = *(TextWord **)p2;
809 double cmp;
810
811 cmp = word1->yMin - word2->yMin;
812 if (cmp == 0) {
813 cmp = word1->xMin - word2->xMin;
814 }
815 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
816}
817
818#ifdef TEXTOUT_WORD_LIST
819
820GooString *TextWord::getText() const
821{
822 GooString *s;
823 const UnicodeMap *uMap;
824 char buf[8];
825
826 s = new GooString();
827 if (!(uMap = globalParams->getTextEncoding())) {
828 return s;
829 }
830 for (size_t i = 0; i < len(); ++i) {
831 auto n = uMap->mapUnicode(u: chars[i].text, buf, bufSize: sizeof(buf));
832 s->append(str: buf, lengthA: n);
833 }
834 return s;
835}
836
837void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const
838{
839 if (charIdx < 0) {
840 return;
841 }
842 size_t uCharIdx = charIdx;
843 if (uCharIdx >= len()) {
844 return;
845 }
846 auto startingEdge = chars[uCharIdx].edge;
847 auto endingEdge = (uCharIdx + 1 == len()) ? edgeEnd : chars[charIdx + 1].edge;
848 switch (rot) {
849 case 0:
850 *xMinA = startingEdge;
851 *xMaxA = endingEdge;
852 *yMinA = yMin;
853 *yMaxA = yMax;
854 break;
855 case 1:
856 *xMinA = xMin;
857 *xMaxA = xMax;
858 *yMinA = startingEdge;
859 *yMaxA = endingEdge;
860 break;
861 case 2:
862 *xMinA = endingEdge;
863 *xMaxA = startingEdge;
864 *yMinA = yMin;
865 *yMaxA = yMax;
866 break;
867 case 3:
868 *xMinA = xMin;
869 *xMaxA = xMax;
870 *yMinA = endingEdge;
871 *yMaxA = startingEdge;
872 break;
873 }
874}
875
876#endif // TEXTOUT_WORD_LIST
877
878//------------------------------------------------------------------------
879// TextPool
880//------------------------------------------------------------------------
881
882TextPool::TextPool()
883{
884 minBaseIdx = 0;
885 maxBaseIdx = -1;
886 pool = nullptr;
887 cursor = nullptr;
888 cursorBaseIdx = -1;
889}
890
891TextPool::~TextPool()
892{
893 int baseIdx;
894 TextWord *word, *word2;
895
896 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
897 for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
898 word2 = word->next;
899 delete word;
900 }
901 }
902 gfree(p: pool);
903}
904
905int TextPool::getBaseIdx(double base) const
906{
907 const double baseIdxDouble = base / textPoolStep;
908 if (std::isnan(x: baseIdxDouble) || baseIdxDouble < minBaseIdx) {
909 return minBaseIdx;
910 }
911 if (baseIdxDouble > maxBaseIdx) {
912 return maxBaseIdx;
913 }
914 return (int)baseIdxDouble;
915}
916
917void TextPool::addWord(TextWord *word)
918{
919 int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
920 TextWord *w0, *w1;
921
922 // expand the array if needed
923 wordBaseIdx = (int)(word->base / textPoolStep);
924 if (unlikely(wordBaseIdx <= INT_MIN + 128 || wordBaseIdx >= INT_MAX - 128)) {
925 error(category: errSyntaxWarning, pos: -1, msg: "wordBaseIdx out of range");
926 delete word;
927 return;
928 }
929 if (minBaseIdx > maxBaseIdx) {
930 minBaseIdx = wordBaseIdx - 128;
931 maxBaseIdx = wordBaseIdx + 128;
932 pool = (TextWord **)gmallocn(count: maxBaseIdx - minBaseIdx + 1, size: sizeof(TextWord *));
933 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
934 pool[baseIdx - minBaseIdx] = nullptr;
935 }
936 } else if (wordBaseIdx < minBaseIdx) {
937 newMinBaseIdx = wordBaseIdx - 128;
938 TextWord **newPool = (TextWord **)gmallocn_checkoverflow(count: maxBaseIdx - newMinBaseIdx + 1, size: sizeof(TextWord *));
939 if (unlikely(!newPool)) {
940 error(category: errSyntaxWarning, pos: -1, msg: "newPool would overflow");
941 delete word;
942 return;
943 }
944 for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
945 newPool[baseIdx - newMinBaseIdx] = nullptr;
946 }
947 memcpy(dest: &newPool[minBaseIdx - newMinBaseIdx], src: pool, n: (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *));
948 gfree(p: pool);
949 pool = newPool;
950 minBaseIdx = newMinBaseIdx;
951 } else if (wordBaseIdx > maxBaseIdx) {
952 newMaxBaseIdx = wordBaseIdx + 128;
953 TextWord **reallocatedPool = (TextWord **)greallocn(p: pool, count: newMaxBaseIdx - minBaseIdx + 1, size: sizeof(TextWord *), checkoverflow: true /*checkoverflow*/, free_p: false /*free_pool*/);
954 if (!reallocatedPool) {
955 error(category: errSyntaxWarning, pos: -1, msg: "new pool size would overflow");
956 delete word;
957 return;
958 }
959 pool = reallocatedPool;
960 for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) {
961 pool[baseIdx - minBaseIdx] = nullptr;
962 }
963 maxBaseIdx = newMaxBaseIdx;
964 }
965
966 // insert the new word
967 if (cursor && wordBaseIdx == cursorBaseIdx && word->primaryCmp(word: cursor) >= 0) {
968 w0 = cursor;
969 w1 = cursor->next;
970 } else {
971 w0 = nullptr;
972 w1 = pool[wordBaseIdx - minBaseIdx];
973 }
974 for (; w1 && word->primaryCmp(word: w1) > 0; w0 = w1, w1 = w1->next) {
975 ;
976 }
977 word->next = w1;
978 if (w0) {
979 w0->next = word;
980 } else {
981 pool[wordBaseIdx - minBaseIdx] = word;
982 }
983 cursor = word;
984 cursorBaseIdx = wordBaseIdx;
985}
986
987//------------------------------------------------------------------------
988// TextLine
989//------------------------------------------------------------------------
990
991TextLine::TextLine(TextBlock *blkA, int rotA, double baseA)
992{
993 blk = blkA;
994 rot = rotA;
995 base = baseA;
996 words = lastWord = nullptr;
997 text = nullptr;
998 edge = nullptr;
999 col = nullptr;
1000 len = 0;
1001 convertedLen = 0;
1002 hyphenated = false;
1003 next = nullptr;
1004 xMin = yMin = 0;
1005 xMax = yMax = -1;
1006 normalized = nullptr;
1007 normalized_len = 0;
1008 normalized_idx = nullptr;
1009 ascii_translation = nullptr;
1010 ascii_len = 0;
1011 ascii_idx = nullptr;
1012}
1013
1014TextLine::~TextLine()
1015{
1016 TextWord *word;
1017
1018 while (words) {
1019 word = words;
1020 words = words->next;
1021 delete word;
1022 }
1023 gfree(p: text);
1024 gfree(p: edge);
1025 gfree(p: col);
1026 if (normalized) {
1027 gfree(p: normalized);
1028 gfree(p: normalized_idx);
1029 }
1030 if (ascii_translation) {
1031 gfree(p: ascii_translation);
1032 gfree(p: ascii_idx);
1033 }
1034}
1035
1036void TextLine::addWord(TextWord *word)
1037{
1038 if (lastWord) {
1039 lastWord->next = word;
1040 } else {
1041 words = word;
1042 }
1043 lastWord = word;
1044
1045 if (xMin > xMax) {
1046 xMin = word->xMin;
1047 xMax = word->xMax;
1048 yMin = word->yMin;
1049 yMax = word->yMax;
1050 } else {
1051 if (word->xMin < xMin) {
1052 xMin = word->xMin;
1053 }
1054 if (word->xMax > xMax) {
1055 xMax = word->xMax;
1056 }
1057 if (word->yMin < yMin) {
1058 yMin = word->yMin;
1059 }
1060 if (word->yMax > yMax) {
1061 yMax = word->yMax;
1062 }
1063 }
1064}
1065
1066double TextLine::primaryDelta(const TextLine *line) const
1067{
1068 double delta;
1069
1070 delta = 0; // make gcc happy
1071 switch (rot) {
1072 case 0:
1073 delta = line->xMin - xMax;
1074 break;
1075 case 1:
1076 delta = line->yMin - yMax;
1077 break;
1078 case 2:
1079 delta = xMin - line->xMax;
1080 break;
1081 case 3:
1082 delta = yMin - line->yMax;
1083 break;
1084 }
1085 return delta;
1086}
1087
1088int TextLine::primaryCmp(const TextLine *line) const
1089{
1090 double cmp;
1091
1092 cmp = 0; // make gcc happy
1093 switch (rot) {
1094 case 0:
1095 cmp = xMin - line->xMin;
1096 break;
1097 case 1:
1098 cmp = yMin - line->yMin;
1099 break;
1100 case 2:
1101 cmp = line->xMax - xMax;
1102 break;
1103 case 3:
1104 cmp = line->yMax - yMax;
1105 break;
1106 }
1107 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1108}
1109
1110int TextLine::secondaryCmp(const TextLine *line) const
1111{
1112 double cmp;
1113
1114 cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base;
1115 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1116}
1117
1118int TextLine::cmpYX(const TextLine *line) const
1119{
1120 int cmp;
1121
1122 if ((cmp = secondaryCmp(line))) {
1123 return cmp;
1124 }
1125 return primaryCmp(line);
1126}
1127
1128int TextLine::cmpXY(const void *p1, const void *p2)
1129{
1130 TextLine *line1 = *(TextLine **)p1;
1131 TextLine *line2 = *(TextLine **)p2;
1132 int cmp;
1133
1134 if ((cmp = line1->primaryCmp(line: line2))) {
1135 return cmp;
1136 }
1137 return line1->secondaryCmp(line: line2);
1138}
1139
1140void TextLine::coalesce(const UnicodeMap *uMap)
1141{
1142 double space, delta, minSpace;
1143 bool isUnicode;
1144 char buf[8];
1145
1146 if (words->next) {
1147
1148 // compute the inter-word space threshold
1149 if (words->len() > 1 || words->next->len() > 1) {
1150 minSpace = 0;
1151 } else {
1152 minSpace = words->primaryDelta(word: words->next);
1153 for (auto word0 = words->next, word1 = word0->next; word1 && minSpace > 0; word0 = word1, word1 = word0->next) {
1154 if (word1->len() > 1) {
1155 minSpace = 0;
1156 }
1157 delta = word0->primaryDelta(word: word1);
1158 if (delta < minSpace) {
1159 minSpace = delta;
1160 }
1161 }
1162 }
1163 if (minSpace <= 0) {
1164 space = maxCharSpacing * words->fontSize;
1165 } else {
1166 space = maxWideCharSpacingMul * minSpace;
1167 if (space > maxWideCharSpacing * words->fontSize) {
1168 space = maxWideCharSpacing * words->fontSize;
1169 }
1170 }
1171
1172 // merge words
1173 auto word0 = words;
1174 auto word1 = words->next;
1175 while (word1) {
1176 if (word0->primaryDelta(word: word1) >= space) {
1177 word0->spaceAfter = true;
1178 word0 = word1;
1179 word1 = word1->next;
1180 } else if (word0->chars.back().font == word1->chars.front().font //
1181 && word0->underlined == word1->underlined //
1182 && fabs(x: word0->fontSize - word1->fontSize) < maxWordFontSizeDelta * words->fontSize //
1183 && word1->chars.front().charPos == word0->charPosEnd) {
1184 word0->merge(word: word1);
1185 word0->next = word1->next;
1186 delete word1;
1187 word1 = word0->next;
1188 } else {
1189 word0 = word1;
1190 word1 = word1->next;
1191 }
1192 }
1193 }
1194
1195 // build the line text
1196 isUnicode = uMap ? uMap->isUnicode() : false;
1197 len = 0;
1198 for (auto word1 = words; word1; word1 = word1->next) {
1199 len += word1->len();
1200 if (word1->spaceAfter) {
1201 ++len;
1202 }
1203 }
1204 text = (Unicode *)gmallocn(count: len, size: sizeof(Unicode));
1205 edge = (double *)gmallocn(count: len + 1, size: sizeof(double));
1206 size_t i = 0;
1207 for (auto word1 = words; word1; word1 = word1->next) {
1208 for (size_t j = 0; j < word1->len(); ++j) {
1209 text[i] = word1->chars[j].text;
1210 edge[i] = word1->chars[j].edge;
1211 ++i;
1212 }
1213 edge[i] = word1->edgeEnd;
1214 if (word1->spaceAfter) {
1215 text[i] = (Unicode)0x0020;
1216 ++i;
1217 }
1218 }
1219
1220 // compute convertedLen and set up the col array
1221 col = (int *)gmallocn(count: len + 1, size: sizeof(int));
1222 convertedLen = 0;
1223 for (int ci = 0; ci < len; ++ci) {
1224 col[ci] = convertedLen;
1225 if (isUnicode) {
1226 ++convertedLen;
1227 } else if (uMap) {
1228 convertedLen += uMap->mapUnicode(u: text[ci], buf, bufSize: sizeof(buf));
1229 }
1230 }
1231 col[len] = convertedLen;
1232
1233 // check for hyphen at end of line
1234 //~ need to check for other chars used as hyphens
1235 hyphenated = text[len - 1] == (Unicode)'-';
1236}
1237
1238//------------------------------------------------------------------------
1239// TextLineFrag
1240//------------------------------------------------------------------------
1241
1242class TextLineFrag
1243{
1244public:
1245 TextLine *line; // the line object
1246 int start, len; // offset and length of this fragment
1247 // (in Unicode chars)
1248 double xMin, xMax; // bounding box coordinates
1249 double yMin, yMax;
1250 double base; // baseline virtual coordinate
1251 int col; // first column
1252
1253 void init(TextLine *lineA, int startA, int lenA);
1254 void computeCoords(bool oneRot);
1255
1256 static int cmpYXPrimaryRot(const void *p1, const void *p2);
1257 static int cmpYXLineRot(const void *p1, const void *p2);
1258 static int cmpXYLineRot(const void *p1, const void *p2);
1259 static int cmpXYColumnPrimaryRot(const void *p1, const void *p2);
1260 static int cmpXYColumnLineRot(const void *p1, const void *p2);
1261};
1262
1263void TextLineFrag::init(TextLine *lineA, int startA, int lenA)
1264{
1265 line = lineA;
1266 start = startA;
1267 len = lenA;
1268 col = line->col[start];
1269}
1270
1271void TextLineFrag::computeCoords(bool oneRot)
1272{
1273 TextBlock *blk;
1274 double d0, d1, d2, d3, d4;
1275
1276 if (oneRot) {
1277
1278 switch (line->rot) {
1279 case 0:
1280 xMin = line->edge[start];
1281 xMax = line->edge[start + len];
1282 yMin = line->yMin;
1283 yMax = line->yMax;
1284 break;
1285 case 1:
1286 xMin = line->xMin;
1287 xMax = line->xMax;
1288 yMin = line->edge[start];
1289 yMax = line->edge[start + len];
1290 break;
1291 case 2:
1292 xMin = line->edge[start + len];
1293 xMax = line->edge[start];
1294 yMin = line->yMin;
1295 yMax = line->yMax;
1296 break;
1297 case 3:
1298 xMin = line->xMin;
1299 xMax = line->xMax;
1300 yMin = line->edge[start + len];
1301 yMax = line->edge[start];
1302 break;
1303 }
1304 base = line->base;
1305
1306 } else {
1307
1308 if (line->rot == 0 && line->blk->page->primaryRot == 0) {
1309
1310 xMin = line->edge[start];
1311 xMax = line->edge[start + len];
1312 yMin = line->yMin;
1313 yMax = line->yMax;
1314 base = line->base;
1315
1316 } else {
1317
1318 blk = line->blk;
1319 d0 = line->edge[start];
1320 d1 = line->edge[start + len];
1321 d2 = d3 = d4 = 0; // make gcc happy
1322
1323 switch (line->rot) {
1324 case 0:
1325 d2 = line->yMin;
1326 d3 = line->yMax;
1327 d4 = line->base;
1328 d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
1329 d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
1330 d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
1331 d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
1332 d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
1333 break;
1334 case 1:
1335 d2 = line->xMax;
1336 d3 = line->xMin;
1337 d4 = line->base;
1338 d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
1339 d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
1340 d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
1341 d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
1342 d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
1343 break;
1344 case 2:
1345 d2 = line->yMax;
1346 d3 = line->yMin;
1347 d4 = line->base;
1348 d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
1349 d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
1350 d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
1351 d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
1352 d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
1353 break;
1354 case 3:
1355 d2 = line->xMin;
1356 d3 = line->xMax;
1357 d4 = line->base;
1358 d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
1359 d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
1360 d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
1361 d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
1362 d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
1363 break;
1364 }
1365
1366 switch (line->blk->page->primaryRot) {
1367 case 0:
1368 xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
1369 xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
1370 yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
1371 yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
1372 base = blk->yMin + d4 * (blk->yMax - blk->yMin);
1373 break;
1374 case 1:
1375 xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
1376 xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
1377 yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
1378 yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
1379 base = blk->xMax - d4 * (blk->xMax - blk->xMin);
1380 break;
1381 case 2:
1382 xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
1383 xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
1384 yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
1385 yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
1386 base = blk->yMax - d4 * (blk->yMax - blk->yMin);
1387 break;
1388 case 3:
1389 xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
1390 xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
1391 yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
1392 yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
1393 base = blk->xMin + d4 * (blk->xMax - blk->xMin);
1394 break;
1395 }
1396 }
1397 }
1398}
1399
1400int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2)
1401{
1402 TextLineFrag *frag1 = (TextLineFrag *)p1;
1403 TextLineFrag *frag2 = (TextLineFrag *)p2;
1404 double cmp;
1405
1406 cmp = 0; // make gcc happy
1407 switch (frag1->line->blk->page->primaryRot) {
1408 case 0:
1409 if (fabs(x: cmp = frag1->yMin - frag2->yMin) < 0.01) {
1410 cmp = frag1->xMin - frag2->xMin;
1411 }
1412 break;
1413 case 1:
1414 if (fabs(x: cmp = frag2->xMax - frag1->xMax) < 0.01) {
1415 cmp = frag1->yMin - frag2->yMin;
1416 }
1417 break;
1418 case 2:
1419 if (fabs(x: cmp = frag2->yMin - frag1->yMin) < 0.01) {
1420 cmp = frag2->xMax - frag1->xMax;
1421 }
1422 break;
1423 case 3:
1424 if (fabs(x: cmp = frag1->xMax - frag2->xMax) < 0.01) {
1425 cmp = frag2->yMax - frag1->yMax;
1426 }
1427 break;
1428 }
1429 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1430}
1431
1432int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2)
1433{
1434 TextLineFrag *frag1 = (TextLineFrag *)p1;
1435 TextLineFrag *frag2 = (TextLineFrag *)p2;
1436 double cmp;
1437
1438 cmp = 0; // make gcc happy
1439 switch (frag1->line->rot) {
1440 case 0:
1441 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1442 cmp = frag1->xMin - frag2->xMin;
1443 }
1444 break;
1445 case 1:
1446 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1447 cmp = frag1->yMin - frag2->yMin;
1448 }
1449 break;
1450 case 2:
1451 if ((cmp = frag2->yMin - frag1->yMin) == 0) {
1452 cmp = frag2->xMax - frag1->xMax;
1453 }
1454 break;
1455 case 3:
1456 if ((cmp = frag1->xMax - frag2->xMax) == 0) {
1457 cmp = frag2->yMax - frag1->yMax;
1458 }
1459 break;
1460 }
1461 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1462}
1463
1464int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2)
1465{
1466 TextLineFrag *frag1 = (TextLineFrag *)p1;
1467 TextLineFrag *frag2 = (TextLineFrag *)p2;
1468 double cmp;
1469
1470 cmp = 0; // make gcc happy
1471 switch (frag1->line->rot) {
1472 case 0:
1473 if ((cmp = frag1->xMin - frag2->xMin) == 0) {
1474 cmp = frag1->yMin - frag2->yMin;
1475 }
1476 break;
1477 case 1:
1478 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1479 cmp = frag2->xMax - frag1->xMax;
1480 }
1481 break;
1482 case 2:
1483 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1484 cmp = frag2->yMin - frag1->yMin;
1485 }
1486 break;
1487 case 3:
1488 if ((cmp = frag2->yMax - frag1->yMax) == 0) {
1489 cmp = frag1->xMax - frag2->xMax;
1490 }
1491 break;
1492 }
1493 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1494}
1495
1496int TextLineFrag::cmpXYColumnPrimaryRot(const void *p1, const void *p2)
1497{
1498 TextLineFrag *frag1 = (TextLineFrag *)p1;
1499 TextLineFrag *frag2 = (TextLineFrag *)p2;
1500 double cmp;
1501
1502 // if columns overlap, compare y values
1503 if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] - frag2->line->col[frag2->start]) && frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start])) {
1504 cmp = 0; // make gcc happy
1505 switch (frag1->line->blk->page->primaryRot) {
1506 case 0:
1507 cmp = frag1->yMin - frag2->yMin;
1508 break;
1509 case 1:
1510 cmp = frag2->xMax - frag1->xMax;
1511 break;
1512 case 2:
1513 cmp = frag2->yMin - frag1->yMin;
1514 break;
1515 case 3:
1516 cmp = frag1->xMax - frag2->xMax;
1517 break;
1518 }
1519 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1520 }
1521
1522 // otherwise, compare starting column
1523 return frag1->col - frag2->col;
1524}
1525
1526int TextLineFrag::cmpXYColumnLineRot(const void *p1, const void *p2)
1527{
1528 TextLineFrag *frag1 = (TextLineFrag *)p1;
1529 TextLineFrag *frag2 = (TextLineFrag *)p2;
1530 double cmp;
1531
1532 // if columns overlap, compare y values
1533 if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] - frag2->line->col[frag2->start]) && frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start])) {
1534 cmp = 0; // make gcc happy
1535 switch (frag1->line->rot) {
1536 case 0:
1537 cmp = frag1->yMin - frag2->yMin;
1538 break;
1539 case 1:
1540 cmp = frag2->xMax - frag1->xMax;
1541 break;
1542 case 2:
1543 cmp = frag2->yMin - frag1->yMin;
1544 break;
1545 case 3:
1546 cmp = frag1->xMax - frag2->xMax;
1547 break;
1548 }
1549 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1550 }
1551
1552 // otherwise, compare starting column
1553 return frag1->col - frag2->col;
1554}
1555
1556//------------------------------------------------------------------------
1557// TextBlock
1558//------------------------------------------------------------------------
1559
1560TextBlock::TextBlock(TextPage *pageA, int rotA)
1561{
1562 page = pageA;
1563 rot = rotA;
1564 xMin = yMin = 0;
1565 xMax = yMax = -1;
1566 priMin = 0;
1567 priMax = page->pageWidth;
1568 pool = new TextPool();
1569 lines = nullptr;
1570 curLine = nullptr;
1571 next = nullptr;
1572 stackNext = nullptr;
1573 tableId = -1;
1574 tableEnd = false;
1575}
1576
1577TextBlock::~TextBlock()
1578{
1579 TextLine *line;
1580
1581 delete pool;
1582 while (lines) {
1583 line = lines;
1584 lines = lines->next;
1585 delete line;
1586 }
1587}
1588
1589void TextBlock::addWord(TextWord *word)
1590{
1591 pool->addWord(word);
1592 if (xMin > xMax) {
1593 xMin = word->xMin;
1594 xMax = word->xMax;
1595 yMin = word->yMin;
1596 yMax = word->yMax;
1597 } else {
1598 if (word->xMin < xMin) {
1599 xMin = word->xMin;
1600 }
1601 if (word->xMax > xMax) {
1602 xMax = word->xMax;
1603 }
1604 if (word->yMin < yMin) {
1605 yMin = word->yMin;
1606 }
1607 if (word->yMax > yMax) {
1608 yMax = word->yMax;
1609 }
1610 }
1611}
1612
1613void TextBlock::coalesce(const UnicodeMap *uMap, double fixedPitch)
1614{
1615 // discard duplicated text (fake boldface, drop shadows)
1616 for (int idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
1617 // Get the first LHS word from the pool
1618 TextWord *word0 = pool->getPool(baseIdx: idx0);
1619
1620 while (word0) {
1621 double priDelta = dupMaxPriDelta * word0->fontSize;
1622 double secDelta = dupMaxSecDelta * word0->fontSize;
1623 double xDelta = ((rot == 0) || (rot == 2)) ? priDelta : secDelta;
1624 double yDelta = ((rot == 0) || (rot == 2)) ? secDelta : priDelta;
1625
1626 int maxBaseIdx = pool->getBaseIdx(base: word0->base + secDelta);
1627
1628 for (int idx1 = idx0; idx1 <= maxBaseIdx; idx1++) {
1629 TextWord *prevWord;
1630 /* In case the RHS word is from the same pool as the LHS word,
1631 * start the inner loop with the word following the LHS word.
1632 * Otherwise, start with the second word from the subsequent pools
1633 * - the first word is compared at the end.
1634 */
1635 if (idx0 == idx1) {
1636 prevWord = word0;
1637 } else {
1638 prevWord = pool->getPool(baseIdx: idx1);
1639 if (!prevWord) {
1640 continue;
1641 }
1642 }
1643 TextWord *word1 = prevWord->next;
1644
1645 auto equalText = [](const TextWord &w1, const TextWord &w2) -> bool { //
1646 return std::equal(first1: w1.chars.begin(), last1: w1.chars.end(), first2: w2.chars.begin(), last2: w2.chars.end(), //
1647 binary_pred: [](auto c1, auto c2) { return c1.text == c2.text; });
1648 };
1649 auto match = [&equalText, xDelta, yDelta](const TextWord &w1, const TextWord &w2) -> bool {
1650 if (!equalText(w1, w2)) {
1651 return false;
1652 }
1653 return fabs(x: w1.xMin - w2.xMin) < xDelta && fabs(x: w1.xMax - w2.xMax) < xDelta //
1654 && fabs(x: w1.yMin - w2.yMin) < yDelta && fabs(x: w1.yMax - w2.yMax) < yDelta;
1655 };
1656
1657 while (word1) {
1658 if (match(*word0, *word1)) {
1659 prevWord->next = word1->next;
1660 delete word1;
1661 word1 = prevWord->next;
1662 } else {
1663 prevWord = word1;
1664 word1 = word1->next;
1665 }
1666 }
1667
1668 // Check the first word from each subsequent pool
1669 if (idx0 != idx1) {
1670 word1 = pool->getPool(baseIdx: idx1);
1671 }
1672 if (word1 && match(*word0, *word1)) {
1673 pool->setPool(baseIdx: idx1, p: word1->next);
1674 delete word1;
1675 }
1676 }
1677
1678 word0 = word0->next;
1679 }
1680 }
1681
1682 TextWord *word0, *word1;
1683 TextWord *bestWord0, *bestWord1, *lastWord;
1684 TextLine *line, *line0, *line1;
1685 TextLine **lineArray;
1686 int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
1687 int baseIdx, bestWordBaseIdx;
1688 double minBase, maxBase;
1689 double fontSize, wordSpacing, delta;
1690 bool overlap;
1691 int col1, col2;
1692 int i, j, k;
1693
1694 // build the lines
1695 curLine = nullptr;
1696 poolMinBaseIdx = pool->minBaseIdx;
1697 charCount = 0;
1698 nLines = 0;
1699 while (true) {
1700
1701 // find the first non-empty line in the pool
1702 for (; poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(baseIdx: poolMinBaseIdx); ++poolMinBaseIdx) {
1703 ;
1704 }
1705 if (poolMinBaseIdx > pool->maxBaseIdx) {
1706 break;
1707 }
1708
1709 // look for the left-most word in the first four lines of the
1710 // pool -- this avoids starting with a superscript word
1711 startBaseIdx = poolMinBaseIdx;
1712 for (baseIdx = poolMinBaseIdx + 1; baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx; ++baseIdx) {
1713 if (!pool->getPool(baseIdx)) {
1714 continue;
1715 }
1716 if (pool->getPool(baseIdx)->primaryCmp(word: pool->getPool(baseIdx: startBaseIdx)) < 0) {
1717 startBaseIdx = baseIdx;
1718 }
1719 }
1720
1721 // create a new line
1722 word0 = pool->getPool(baseIdx: startBaseIdx);
1723 pool->setPool(baseIdx: startBaseIdx, p: word0->next);
1724 word0->next = nullptr;
1725 line = new TextLine(this, word0->rot, word0->base);
1726 line->addWord(word: word0);
1727 lastWord = word0;
1728
1729 // compute the search range
1730 fontSize = word0->fontSize;
1731 minBase = word0->base - maxIntraLineDelta * fontSize;
1732 maxBase = word0->base + maxIntraLineDelta * fontSize;
1733 minBaseIdx = pool->getBaseIdx(base: minBase);
1734 maxBaseIdx = pool->getBaseIdx(base: maxBase);
1735 wordSpacing = fixedPitch ? fixedPitch : maxWordSpacing * fontSize;
1736
1737 // find the rest of the words in this line
1738 while (true) {
1739
1740 // find the left-most word whose baseline is in the range for
1741 // this line
1742 bestWordBaseIdx = 0;
1743 bestWord0 = bestWord1 = nullptr;
1744 overlap = false;
1745 for (baseIdx = minBaseIdx; !overlap && baseIdx <= maxBaseIdx; ++baseIdx) {
1746 for (word0 = nullptr, word1 = pool->getPool(baseIdx); word1; word0 = word1, word1 = word1->next) {
1747 if (word1->base >= minBase && word1->base <= maxBase) {
1748 delta = lastWord->primaryDelta(word: word1);
1749 if (delta < minCharSpacing * fontSize) {
1750 overlap = true;
1751 break;
1752 } else {
1753 if (delta < wordSpacing && (!bestWord1 || word1->primaryCmp(word: bestWord1) < 0)) {
1754 bestWordBaseIdx = baseIdx;
1755 bestWord0 = word0;
1756 bestWord1 = word1;
1757 }
1758 break;
1759 }
1760 }
1761 }
1762 }
1763 if (overlap || !bestWord1) {
1764 break;
1765 }
1766
1767 // remove it from the pool, and add it to the line
1768 if (bestWord0) {
1769 bestWord0->next = bestWord1->next;
1770 } else {
1771 pool->setPool(baseIdx: bestWordBaseIdx, p: bestWord1->next);
1772 }
1773 bestWord1->next = nullptr;
1774 line->addWord(word: bestWord1);
1775 lastWord = bestWord1;
1776 }
1777
1778 // add the line
1779 if (curLine && line->cmpYX(line: curLine) > 0) {
1780 line0 = curLine;
1781 line1 = curLine->next;
1782 } else {
1783 line0 = nullptr;
1784 line1 = lines;
1785 }
1786 for (; line1 && line->cmpYX(line: line1) > 0; line0 = line1, line1 = line1->next) {
1787 ;
1788 }
1789 if (line0) {
1790 line0->next = line;
1791 } else {
1792 lines = line;
1793 }
1794 line->next = line1;
1795 curLine = line;
1796 line->coalesce(uMap);
1797 charCount += line->len;
1798 ++nLines;
1799 }
1800
1801 // sort lines into xy order for column assignment
1802 lineArray = (TextLine **)gmallocn(count: nLines, size: sizeof(TextLine *));
1803 for (line = lines, i = 0; line; line = line->next, ++i) {
1804 lineArray[i] = line;
1805 }
1806 qsort(base: lineArray, nmemb: nLines, size: sizeof(TextLine *), compar: &TextLine::cmpXY);
1807
1808 // column assignment
1809 nColumns = 0;
1810 if (fixedPitch) {
1811 for (i = 0; i < nLines; ++i) {
1812 line0 = lineArray[i];
1813 col1 = 0; // make gcc happy
1814 switch (rot) {
1815 case 0:
1816 col1 = (int)((line0->xMin - xMin) / fixedPitch + 0.5);
1817 break;
1818 case 1:
1819 col1 = (int)((line0->yMin - yMin) / fixedPitch + 0.5);
1820 break;
1821 case 2:
1822 col1 = (int)((xMax - line0->xMax) / fixedPitch + 0.5);
1823 break;
1824 case 3:
1825 col1 = (int)((yMax - line0->yMax) / fixedPitch + 0.5);
1826 break;
1827 }
1828 for (k = 0; k <= line0->len; ++k) {
1829 line0->col[k] += col1;
1830 }
1831 if (line0->col[line0->len] > nColumns) {
1832 nColumns = line0->col[line0->len];
1833 }
1834 }
1835 } else {
1836 for (i = 0; i < nLines; ++i) {
1837 line0 = lineArray[i];
1838 col1 = 0;
1839 for (j = 0; j < i; ++j) {
1840 line1 = lineArray[j];
1841 if (line1->primaryDelta(line: line0) >= 0) {
1842 col2 = line1->col[line1->len] + 1;
1843 } else {
1844 k = 0; // make gcc happy
1845 switch (rot) {
1846 case 0:
1847 for (k = 0; k < line1->len && line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k) {
1848 ;
1849 }
1850 break;
1851 case 1:
1852 for (k = 0; k < line1->len && line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k) {
1853 ;
1854 }
1855 break;
1856 case 2:
1857 for (k = 0; k < line1->len && line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k) {
1858 ;
1859 }
1860 break;
1861 case 3:
1862 for (k = 0; k < line1->len && line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k) {
1863 ;
1864 }
1865 break;
1866 }
1867 col2 = line1->col[k];
1868 }
1869 if (col2 > col1) {
1870 col1 = col2;
1871 }
1872 }
1873 for (k = 0; k <= line0->len; ++k) {
1874 line0->col[k] += col1;
1875 }
1876 if (line0->col[line0->len] > nColumns) {
1877 nColumns = line0->col[line0->len];
1878 }
1879 }
1880 }
1881 gfree(p: lineArray);
1882}
1883
1884void TextBlock::updatePriMinMax(const TextBlock *blk)
1885{
1886 double newPriMin, newPriMax;
1887 bool gotPriMin, gotPriMax;
1888
1889 gotPriMin = gotPriMax = false;
1890 newPriMin = newPriMax = 0; // make gcc happy
1891 switch (page->primaryRot) {
1892 case 0:
1893 case 2:
1894 if (blk->yMin < yMax && blk->yMax > yMin) {
1895 if (blk->xMin < xMin) {
1896 newPriMin = blk->xMax;
1897 gotPriMin = true;
1898 }
1899 if (blk->xMax > xMax) {
1900 newPriMax = blk->xMin;
1901 gotPriMax = true;
1902 }
1903 }
1904 break;
1905 case 1:
1906 case 3:
1907 if (blk->xMin < xMax && blk->xMax > xMin) {
1908 if (blk->yMin < yMin) {
1909 newPriMin = blk->yMax;
1910 gotPriMin = true;
1911 }
1912 if (blk->yMax > yMax) {
1913 newPriMax = blk->yMin;
1914 gotPriMax = true;
1915 }
1916 }
1917 break;
1918 }
1919 if (gotPriMin) {
1920 if (newPriMin > xMin) {
1921 newPriMin = xMin;
1922 }
1923 if (newPriMin > priMin) {
1924 priMin = newPriMin;
1925 }
1926 }
1927 if (gotPriMax) {
1928 if (newPriMax < xMax) {
1929 newPriMax = xMax;
1930 }
1931 if (newPriMax < priMax) {
1932 priMax = newPriMax;
1933 }
1934 }
1935}
1936
1937int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2)
1938{
1939 TextBlock *blk1 = *(TextBlock **)p1;
1940 TextBlock *blk2 = *(TextBlock **)p2;
1941 double cmp;
1942
1943 cmp = 0; // make gcc happy
1944 switch (blk1->page->primaryRot) {
1945 case 0:
1946 if ((cmp = blk1->xMin - blk2->xMin) == 0) {
1947 cmp = blk1->yMin - blk2->yMin;
1948 }
1949 break;
1950 case 1:
1951 if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1952 cmp = blk2->xMax - blk1->xMax;
1953 }
1954 break;
1955 case 2:
1956 if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1957 cmp = blk2->yMin - blk1->yMin;
1958 }
1959 break;
1960 case 3:
1961 if ((cmp = blk2->yMax - blk1->yMax) == 0) {
1962 cmp = blk1->xMax - blk2->xMax;
1963 }
1964 break;
1965 }
1966 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1967}
1968
1969int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2)
1970{
1971 TextBlock *blk1 = *(TextBlock **)p1;
1972 TextBlock *blk2 = *(TextBlock **)p2;
1973 double cmp;
1974
1975 cmp = 0; // make gcc happy
1976 switch (blk1->page->primaryRot) {
1977 case 0:
1978 if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1979 cmp = blk1->xMin - blk2->xMin;
1980 }
1981 break;
1982 case 1:
1983 if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1984 cmp = blk1->yMin - blk2->yMin;
1985 }
1986 break;
1987 case 2:
1988 if ((cmp = blk2->yMin - blk1->yMin) == 0) {
1989 cmp = blk2->xMax - blk1->xMax;
1990 }
1991 break;
1992 case 3:
1993 if ((cmp = blk1->xMax - blk2->xMax) == 0) {
1994 cmp = blk2->yMax - blk1->yMax;
1995 }
1996 break;
1997 }
1998 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1999}
2000
2001int TextBlock::primaryCmp(const TextBlock *blk) const
2002{
2003 double cmp;
2004
2005 cmp = 0; // make gcc happy
2006 switch (rot) {
2007 case 0:
2008 cmp = xMin - blk->xMin;
2009 break;
2010 case 1:
2011 cmp = yMin - blk->yMin;
2012 break;
2013 case 2:
2014 cmp = blk->xMax - xMax;
2015 break;
2016 case 3:
2017 cmp = blk->yMax - yMax;
2018 break;
2019 }
2020 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
2021}
2022
2023double TextBlock::secondaryDelta(const TextBlock *blk) const
2024{
2025 double delta;
2026
2027 delta = 0; // make gcc happy
2028 switch (rot) {
2029 case 0:
2030 delta = blk->yMin - yMax;
2031 break;
2032 case 1:
2033 delta = xMin - blk->xMax;
2034 break;
2035 case 2:
2036 delta = yMin - blk->yMax;
2037 break;
2038 case 3:
2039 delta = blk->xMin - xMax;
2040 break;
2041 }
2042 return delta;
2043}
2044
2045bool TextBlock::isBelow(const TextBlock *blk) const
2046{
2047 bool below;
2048
2049 below = false; // make gcc happy
2050 switch (page->primaryRot) {
2051 case 0:
2052 below = xMin >= blk->priMin && xMax <= blk->priMax && yMin > blk->yMin;
2053 break;
2054 case 1:
2055 below = yMin >= blk->priMin && yMax <= blk->priMax && xMax < blk->xMax;
2056 break;
2057 case 2:
2058 below = xMin >= blk->priMin && xMax <= blk->priMax && yMax < blk->yMax;
2059 break;
2060 case 3:
2061 below = yMin >= blk->priMin && yMax <= blk->priMax && xMin > blk->xMin;
2062 break;
2063 }
2064
2065 return below;
2066}
2067
2068bool TextBlock::isBeforeByRule1(const TextBlock *blk1)
2069{
2070 bool before = false;
2071 bool overlap = false;
2072
2073 switch (this->page->primaryRot) {
2074 case 0:
2075 case 2:
2076 overlap = ((this->ExMin <= blk1->ExMin) && (blk1->ExMin <= this->ExMax)) || ((blk1->ExMin <= this->ExMin) && (this->ExMin <= blk1->ExMax));
2077 break;
2078 case 1:
2079 case 3:
2080 overlap = ((this->EyMin <= blk1->EyMin) && (blk1->EyMin <= this->EyMax)) || ((blk1->EyMin <= this->EyMin) && (this->EyMin <= blk1->EyMax));
2081 break;
2082 }
2083 switch (this->page->primaryRot) {
2084 case 0:
2085 before = overlap && this->EyMin < blk1->EyMin;
2086 break;
2087 case 1:
2088 before = overlap && this->ExMax > blk1->ExMax;
2089 break;
2090 case 2:
2091 before = overlap && this->EyMax > blk1->EyMax;
2092 break;
2093 case 3:
2094 before = overlap && this->ExMin < blk1->ExMin;
2095 break;
2096 }
2097 return before;
2098}
2099
2100bool TextBlock::isBeforeByRule2(const TextBlock *blk1)
2101{
2102 double cmp = 0;
2103 int rotLR = rot;
2104
2105 if (!page->primaryLR) {
2106 rotLR = (rotLR + 2) % 4;
2107 }
2108
2109 switch (rotLR) {
2110 case 0:
2111 cmp = ExMax - blk1->ExMin;
2112 break;
2113 case 1:
2114 cmp = EyMin - blk1->EyMax;
2115 break;
2116 case 2:
2117 cmp = blk1->ExMax - ExMin;
2118 break;
2119 case 3:
2120 cmp = blk1->EyMin - EyMax;
2121 break;
2122 }
2123 return cmp <= 0;
2124}
2125
2126// Sort into reading order by performing a topological sort using the rules
2127// given in "High Performance Document Layout Analysis", T.M. Breuel, 2003.
2128// See http://pubs.iupr.org/#2003-breuel-sdiut
2129// Topological sort is done by depth first search, see
2130// http://en.wikipedia.org/wiki/Topological_sorting
2131int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited, TextBlock **cache, int cacheSize)
2132{
2133 int pos2;
2134 TextBlock *blk1, *blk2, *blk3;
2135 bool before;
2136
2137 if (visited[pos1]) {
2138 return sortPos;
2139 }
2140
2141 blk1 = this;
2142
2143#if 0 // for debugging
2144 printf("visited: %d %.2f..%.2f %.2f..%.2f\n",
2145 sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2146#endif
2147 visited[pos1] = true;
2148 pos2 = -1;
2149 for (blk2 = blkList; blk2; blk2 = blk2->next) {
2150 pos2++;
2151 if (visited[pos2]) {
2152 // skip visited nodes
2153 continue;
2154 }
2155 before = false;
2156
2157 // is blk2 before blk1? (for table entries)
2158 if (blk1->tableId >= 0 && blk1->tableId == blk2->tableId) {
2159 if (page->primaryLR) {
2160 if (blk2->xMax <= blk1->xMin && blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin) {
2161 before = true;
2162 }
2163 } else {
2164 if (blk2->xMin >= blk1->xMax && blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin) {
2165 before = true;
2166 }
2167 }
2168
2169 if (blk2->yMax <= blk1->yMin) {
2170 before = true;
2171 }
2172 } else {
2173 if (blk2->isBeforeByRule1(blk1)) {
2174 // Rule (1) blk1 and blk2 overlap, and blk2 is above blk1.
2175 before = true;
2176#if 0 // for debugging
2177 printf("rule1: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
2178 blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax,
2179 blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2180#endif
2181 } else if (blk2->isBeforeByRule2(blk1)) {
2182 // Rule (2) blk2 left of blk1, and no intervening blk3
2183 // such that blk1 is before blk3 by rule 1,
2184 // and blk3 is before blk2 by rule 1.
2185 before = true;
2186 for (int i = 0; i < cacheSize && cache[i]; ++i) {
2187 if (blk1->isBeforeByRule1(blk1: cache[i]) && cache[i]->isBeforeByRule1(blk1: blk2)) {
2188 before = false;
2189 std::rotate(first: cache, middle: cache + i, last: cache + i + 1);
2190 break;
2191 }
2192 }
2193
2194 if (before) {
2195 for (blk3 = blkList; blk3; blk3 = blk3->next) {
2196 if (blk3 == blk2 || blk3 == blk1) {
2197 continue;
2198 }
2199 if (blk1->isBeforeByRule1(blk1: blk3) && blk3->isBeforeByRule1(blk1: blk2)) {
2200 before = false;
2201 std::copy_backward(first: cache, last: cache + cacheSize - 1, result: cache + cacheSize);
2202 cache[0] = blk3;
2203 break;
2204 }
2205 }
2206 }
2207#if 0 // for debugging
2208 if (before) {
2209 printf("rule2: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
2210 blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax,
2211 blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax);
2212 }
2213#endif
2214 }
2215 }
2216 if (before) {
2217 // blk2 is before blk1, so it needs to be visited
2218 // before we can add blk1 to the sorted list.
2219 sortPos = blk2->visitDepthFirst(blkList, pos1: pos2, sorted, sortPos, visited, cache, cacheSize);
2220 }
2221 }
2222#if 0 // for debugging
2223 printf("sorted: %d %.2f..%.2f %.2f..%.2f\n",
2224 sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2225#endif
2226 sorted[sortPos++] = blk1;
2227 return sortPos;
2228}
2229
2230int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited)
2231{
2232 const int blockCacheSize = 4;
2233 TextBlock *blockCache[blockCacheSize];
2234 std::fill(first: blockCache, last: blockCache + blockCacheSize, value: nullptr);
2235 return visitDepthFirst(blkList, pos1, sorted, sortPos, visited, cache: blockCache, cacheSize: blockCacheSize);
2236}
2237
2238//------------------------------------------------------------------------
2239// TextFlow
2240//------------------------------------------------------------------------
2241
2242TextFlow::TextFlow(TextPage *pageA, TextBlock *blk)
2243{
2244 page = pageA;
2245 xMin = blk->xMin;
2246 xMax = blk->xMax;
2247 yMin = blk->yMin;
2248 yMax = blk->yMax;
2249 priMin = blk->priMin;
2250 priMax = blk->priMax;
2251 blocks = lastBlk = blk;
2252 next = nullptr;
2253}
2254
2255TextFlow::~TextFlow()
2256{
2257 TextBlock *blk;
2258
2259 while (blocks) {
2260 blk = blocks;
2261 blocks = blocks->next;
2262 delete blk;
2263 }
2264}
2265
2266void TextFlow::addBlock(TextBlock *blk)
2267{
2268 if (lastBlk) {
2269 lastBlk->next = blk;
2270 } else {
2271 blocks = blk;
2272 }
2273 lastBlk = blk;
2274 if (blk->xMin < xMin) {
2275 xMin = blk->xMin;
2276 }
2277 if (blk->xMax > xMax) {
2278 xMax = blk->xMax;
2279 }
2280 if (blk->yMin < yMin) {
2281 yMin = blk->yMin;
2282 }
2283 if (blk->yMax > yMax) {
2284 yMax = blk->yMax;
2285 }
2286}
2287
2288bool TextFlow::blockFits(const TextBlock *blk, const TextBlock *prevBlk) const
2289{
2290 bool fits;
2291
2292 // lower blocks must use smaller fonts
2293 if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
2294 return false;
2295 }
2296
2297 fits = false; // make gcc happy
2298 switch (page->primaryRot) {
2299 case 0:
2300 fits = blk->xMin >= priMin && blk->xMax <= priMax;
2301 break;
2302 case 1:
2303 fits = blk->yMin >= priMin && blk->yMax <= priMax;
2304 break;
2305 case 2:
2306 fits = blk->xMin >= priMin && blk->xMax <= priMax;
2307 break;
2308 case 3:
2309 fits = blk->yMin >= priMin && blk->yMax <= priMax;
2310 break;
2311 }
2312 return fits;
2313}
2314
2315#ifdef TEXTOUT_WORD_LIST
2316
2317//------------------------------------------------------------------------
2318// TextWordList
2319//------------------------------------------------------------------------
2320
2321TextWordList::TextWordList(const TextPage *text, bool physLayout)
2322{
2323 TextFlow *flow;
2324 TextBlock *blk;
2325 TextLine *line;
2326 TextWord *word;
2327 TextWord **wordArray;
2328 int nWords, i;
2329
2330 if (text->rawOrder) {
2331 for (word = text->rawWords; word; word = word->next) {
2332 words.push_back(x: word);
2333 }
2334
2335 } else if (physLayout) {
2336 // this is inefficient, but it's also the least useful of these
2337 // three cases
2338 nWords = 0;
2339 for (flow = text->flows; flow; flow = flow->next) {
2340 for (blk = flow->blocks; blk; blk = blk->next) {
2341 for (line = blk->lines; line; line = line->next) {
2342 for (word = line->words; word; word = word->next) {
2343 ++nWords;
2344 }
2345 }
2346 }
2347 }
2348 wordArray = (TextWord **)gmallocn(count: nWords, size: sizeof(TextWord *));
2349 i = 0;
2350 for (flow = text->flows; flow; flow = flow->next) {
2351 for (blk = flow->blocks; blk; blk = blk->next) {
2352 for (line = blk->lines; line; line = line->next) {
2353 for (word = line->words; word; word = word->next) {
2354 wordArray[i++] = word;
2355 }
2356 }
2357 }
2358 }
2359 qsort(base: wordArray, nmemb: nWords, size: sizeof(TextWord *), compar: &TextWord::cmpYX);
2360 for (i = 0; i < nWords; ++i) {
2361 words.push_back(x: wordArray[i]);
2362 }
2363 gfree(p: wordArray);
2364
2365 } else {
2366 for (flow = text->flows; flow; flow = flow->next) {
2367 for (blk = flow->blocks; blk; blk = blk->next) {
2368 for (line = blk->lines; line; line = line->next) {
2369 for (word = line->words; word; word = word->next) {
2370 words.push_back(x: word);
2371 }
2372 }
2373 }
2374 }
2375 }
2376}
2377
2378TextWordList::~TextWordList() { }
2379
2380int TextWordList::getLength() const
2381{
2382 return words.size();
2383}
2384
2385TextWord *TextWordList::get(int idx)
2386{
2387 if (idx < 0 || idx >= (int)words.size()) {
2388 return nullptr;
2389 }
2390 return words[idx];
2391}
2392
2393#endif // TEXTOUT_WORD_LIST
2394
2395//------------------------------------------------------------------------
2396// TextPage
2397//------------------------------------------------------------------------
2398
2399TextPage::TextPage(bool rawOrderA, bool discardDiagA)
2400{
2401 int rot;
2402
2403 refCnt = 1;
2404 rawOrder = rawOrderA;
2405 discardDiag = discardDiagA;
2406 curWord = nullptr;
2407 charPos = 0;
2408 curFont = nullptr;
2409 curFontSize = 0;
2410 nest = 0;
2411 nTinyChars = 0;
2412 lastCharOverlap = false;
2413 if (!rawOrder) {
2414 for (rot = 0; rot < 4; ++rot) {
2415 pools[rot] = std::make_unique<TextPool>();
2416 }
2417 }
2418 flows = nullptr;
2419 blocks = nullptr;
2420 rawWords = nullptr;
2421 rawLastWord = nullptr;
2422 lastFindXMin = lastFindYMin = 0;
2423 haveLastFind = false;
2424 mergeCombining = true;
2425 diagonal = false;
2426}
2427
2428TextPage::~TextPage()
2429{
2430 clear();
2431}
2432
2433void TextPage::incRefCnt()
2434{
2435 refCnt++;
2436}
2437
2438void TextPage::decRefCnt()
2439{
2440 if (--refCnt == 0) {
2441 delete this;
2442 }
2443}
2444
2445void TextPage::startPage(const GfxState *state)
2446{
2447 clear();
2448 if (state) {
2449 pageWidth = state->getPageWidth();
2450 pageHeight = state->getPageHeight();
2451 } else {
2452 pageWidth = pageHeight = 0;
2453 }
2454}
2455
2456void TextPage::endPage()
2457{
2458 if (curWord) {
2459 endWord();
2460 }
2461}
2462
2463void TextPage::clear()
2464{
2465 int rot;
2466 TextFlow *flow;
2467 TextWord *word;
2468
2469 if (curWord) {
2470 delete curWord;
2471 curWord = nullptr;
2472 }
2473 if (rawOrder) {
2474 while (rawWords) {
2475 word = rawWords;
2476 rawWords = rawWords->next;
2477 delete word;
2478 }
2479 } else {
2480 for (rot = 0; rot < 4; ++rot) {
2481 pools[rot] = std::make_unique<TextPool>();
2482 }
2483 while (flows) {
2484 flow = flows;
2485 flows = flows->next;
2486 delete flow;
2487 }
2488 gfree(p: blocks);
2489 }
2490 fonts.clear();
2491 underlines.clear();
2492 links.clear();
2493
2494 diagonal = false;
2495 curWord = nullptr;
2496 charPos = 0;
2497 curFont = nullptr;
2498 curFontSize = 0;
2499 nest = 0;
2500 nTinyChars = 0;
2501 flows = nullptr;
2502 blocks = nullptr;
2503 rawWords = nullptr;
2504 rawLastWord = nullptr;
2505}
2506
2507void TextPage::updateFont(const GfxState *state)
2508{
2509 const double *fm;
2510 const char *name;
2511 int code, mCode, letterCode, anyCode;
2512 double w;
2513
2514 // get the font info object
2515 curFont = nullptr;
2516 for (const std::unique_ptr<TextFontInfo> &f : fonts) {
2517 if (f->matches(state)) {
2518 curFont = f.get();
2519 break;
2520 }
2521 }
2522 if (!curFont) {
2523 fonts.emplace_back(args: std::make_unique<TextFontInfo>(args&: state));
2524 curFont = fonts.back().get();
2525 }
2526
2527 // adjust the font size
2528 GfxFont *const gfxFont = state->getFont().get();
2529 curFontSize = state->getTransformedFontSize();
2530 if (gfxFont && gfxFont->getType() == fontType3) {
2531 // This is a hack which makes it possible to deal with some Type 3
2532 // fonts. The problem is that it's impossible to know what the
2533 // base coordinate system used in the font is without actually
2534 // rendering the font. This code tries to guess by looking at the
2535 // width of the character 'm' (which breaks if the font is a
2536 // subset that doesn't contain 'm').
2537 mCode = letterCode = anyCode = -1;
2538 for (code = 0; code < 256; ++code) {
2539 name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
2540 int nameLen = name ? strlen(s: name) : 0;
2541 bool nameOneChar = nameLen == 1 || (nameLen > 1 && name[1] == '\0');
2542 if (nameOneChar && name[0] == 'm') {
2543 mCode = code;
2544 }
2545 if (letterCode < 0 && nameOneChar && ((name[0] >= 'A' && name[0] <= 'Z') || (name[0] >= 'a' && name[0] <= 'z'))) {
2546 letterCode = code;
2547 }
2548 if (anyCode < 0 && name && ((Gfx8BitFont *)gfxFont)->getWidth(c: code) > 0) {
2549 anyCode = code;
2550 }
2551 }
2552 if (mCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(c: mCode)) > 0) {
2553 // 0.6 is a generic average 'm' width -- yes, this is a hack
2554 curFontSize *= w / 0.6;
2555 } else if (letterCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(c: letterCode)) > 0) {
2556 // even more of a hack: 0.5 is a generic letter width
2557 curFontSize *= w / 0.5;
2558 } else if (anyCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(c: anyCode)) > 0) {
2559 // better than nothing: 0.5 is a generic character width
2560 curFontSize *= w / 0.5;
2561 }
2562 fm = gfxFont->getFontMatrix();
2563 if (fm[0] != 0) {
2564 curFontSize *= fabs(x: fm[3] / fm[0]);
2565 }
2566 }
2567}
2568
2569void TextPage::beginWord(const GfxState *state)
2570{
2571 const double *fontm;
2572 double m[4], m2[4];
2573 int rot;
2574
2575 // This check is needed because Type 3 characters can contain
2576 // text-drawing operations (when TextPage is being used via
2577 // {X,Win}SplashOutputDev rather than TextOutputDev).
2578 if (curWord) {
2579 ++nest;
2580 return;
2581 }
2582
2583 // compute the rotation
2584 state->getFontTransMat(m11: &m[0], m12: &m[1], m21: &m[2], m22: &m[3]);
2585 std::shared_ptr<GfxFont> gfxFont = state->getFont();
2586 if (gfxFont && gfxFont->getType() == fontType3) {
2587 fontm = state->getFont()->getFontMatrix();
2588 m2[0] = fontm[0] * m[0] + fontm[1] * m[2];
2589 m2[1] = fontm[0] * m[1] + fontm[1] * m[3];
2590 m2[2] = fontm[2] * m[0] + fontm[3] * m[2];
2591 m2[3] = fontm[2] * m[1] + fontm[3] * m[3];
2592 m[0] = m2[0];
2593 m[1] = m2[1];
2594 m[2] = m2[2];
2595 m[3] = m2[3];
2596 }
2597 if (fabs(x: m[0] * m[3]) > fabs(x: m[1] * m[2])) {
2598 rot = (m[0] > 0 || m[3] < 0) ? 0 : 2;
2599 } else {
2600 rot = (m[2] > 0) ? 1 : 3;
2601 }
2602 if (fabs(x: m[0]) >= fabs(x: m[1])) {
2603 diagonal = fabs(x: m[1]) > diagonalThreshold * fabs(x: m[0]);
2604 } else {
2605 diagonal = fabs(x: m[0]) > diagonalThreshold * fabs(x: m[1]);
2606 }
2607
2608 // for vertical writing mode, the lines are effectively rotated 90
2609 // degrees
2610 if (gfxFont && gfxFont->getWMode()) {
2611 rot = (rot + 1) & 3;
2612 }
2613
2614 curWord = new TextWord(state, rot, curFontSize);
2615}
2616
2617void TextPage::addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen)
2618{
2619 double x1, y1, w1, h1, dx2, dy2, base, sp, delta;
2620 bool overlap;
2621 int i;
2622 int wMode;
2623 Matrix mat;
2624
2625 // subtract char and word spacing from the dx,dy values
2626 sp = state->getCharSpace();
2627 if (c == (CharCode)0x20) {
2628 sp += state->getWordSpace();
2629 }
2630 state->textTransformDelta(x1: sp * state->getHorizScaling(), y1: 0, x2: &dx2, y2: &dy2);
2631 dx -= dx2;
2632 dy -= dy2;
2633 state->transformDelta(x1: dx, y1: dy, x2: &w1, y2: &h1);
2634
2635 // throw away chars that aren't inside the page bounds
2636 // (and also do a sanity check on the character size)
2637 state->transform(x1: x, y1: y, x2: &x1, y2: &y1);
2638 if (x1 + w1 < 0 || x1 > pageWidth || y1 + h1 < 0 || y1 > pageHeight || std::isnan(x: x1) || std::isnan(x: y1) || std::isnan(x: w1) || std::isnan(x: h1)) {
2639 charPos += nBytes;
2640 return;
2641 }
2642
2643 // check the tiny chars limit
2644 if (fabs(x: w1) < 3 && fabs(x: h1) < 3) {
2645 if (++nTinyChars > 50000) {
2646 charPos += nBytes;
2647 return;
2648 }
2649 }
2650
2651 // break words at space character
2652 if (uLen == 1 && UnicodeIsWhitespace(ucs4: u[0])) {
2653 charPos += nBytes;
2654 endWord();
2655 return;
2656 } else if (uLen == 1 && u[0] == (Unicode)0x0) {
2657 // ignore null characters
2658 charPos += nBytes;
2659 return;
2660 }
2661
2662 state->getFontTransMat(m11: &mat.m[0], m12: &mat.m[1], m21: &mat.m[2], m22: &mat.m[3]);
2663 mat.m[0] *= state->getHorizScaling();
2664 mat.m[1] *= state->getHorizScaling();
2665 mat.m[4] = x1;
2666 mat.m[5] = y1;
2667
2668 if (mergeCombining && curWord && uLen == 1 && curWord->addCombining(state, fontA: curFont, fontSizeA: curFontSize, x: x1, y: y1, dx: w1, dy: h1, charPosA: charPos, charLen: nBytes, c, u: u[0], textMatA: mat)) {
2669 charPos += nBytes;
2670 return;
2671 }
2672
2673 // start a new word if:
2674 // (1) this character doesn't fall in the right place relative to
2675 // the end of the previous word (this places upper and lower
2676 // constraints on the position deltas along both the primary
2677 // and secondary axes), or
2678 // (2) this character overlaps the previous one (duplicated text), or
2679 // (3) the previous character was an overlap (we want each duplicated
2680 // character to be in a word by itself at this stage),
2681 // (4) the font size has changed
2682 // (5) the WMode changed
2683 if (curWord && curWord->len() > 0) {
2684 base = sp = delta = 0; // make gcc happy
2685 switch (curWord->rot) {
2686 case 0:
2687 base = y1;
2688 sp = x1 - curWord->xMax;
2689 delta = x1 - curWord->chars.back().edge;
2690 break;
2691 case 1:
2692 base = x1;
2693 sp = y1 - curWord->yMax;
2694 delta = y1 - curWord->chars.back().edge;
2695 break;
2696 case 2:
2697 base = y1;
2698 sp = curWord->xMin - x1;
2699 delta = curWord->chars.back().edge - x1;
2700 break;
2701 case 3:
2702 base = x1;
2703 sp = curWord->yMin - y1;
2704 delta = curWord->chars.back().edge - y1;
2705 break;
2706 }
2707 overlap = fabs(x: delta) < dupMaxPriDelta * curWord->fontSize && fabs(x: base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
2708 wMode = curFont->getWMode();
2709 if (overlap || lastCharOverlap || sp < -minDupBreakOverlap * curWord->fontSize || sp > minWordBreakSpace * curWord->fontSize || fabs(x: base - curWord->base) > 0.5 || curFontSize != curWord->fontSize || wMode != curWord->wMode) {
2710 endWord();
2711 }
2712 lastCharOverlap = overlap;
2713 } else {
2714 lastCharOverlap = false;
2715 }
2716
2717 if (uLen != 0) {
2718 // start a new word if needed
2719 if (!curWord) {
2720 beginWord(state);
2721 }
2722
2723 // throw away diagonal chars
2724 if (discardDiag && diagonal) {
2725 charPos += nBytes;
2726 return;
2727 }
2728
2729 // page rotation and/or transform matrices can cause text to be
2730 // drawn in reverse order -- in this case, swap the begin/end
2731 // coordinates and break text into individual chars
2732 if ((curWord->rot == 0 && w1 < 0) || (curWord->rot == 1 && h1 < 0) || (curWord->rot == 2 && w1 > 0) || (curWord->rot == 3 && h1 > 0)) {
2733 endWord();
2734 beginWord(state);
2735
2736 // throw away diagonal chars
2737 if (discardDiag && diagonal) {
2738 charPos += nBytes;
2739 return;
2740 }
2741
2742 x1 += w1;
2743 y1 += h1;
2744 w1 = -w1;
2745 h1 = -h1;
2746 }
2747
2748 // add the characters to the current word
2749 w1 /= uLen;
2750 h1 /= uLen;
2751 for (i = 0; i < uLen; ++i) {
2752 curWord->addChar(state, fontA: curFont, x: x1 + i * w1, y: y1 + i * h1, dx: w1, dy: h1, charPosA: charPos, charLen: nBytes, c, u: u[i], textMatA: mat);
2753 }
2754 }
2755 charPos += nBytes;
2756}
2757
2758void TextPage::incCharCount(int nChars)
2759{
2760 charPos += nChars;
2761}
2762
2763void TextPage::endWord()
2764{
2765 // This check is needed because Type 3 characters can contain
2766 // text-drawing operations (when TextPage is being used via
2767 // {X,Win}SplashOutputDev rather than TextOutputDev).
2768 if (nest > 0) {
2769 --nest;
2770 return;
2771 }
2772
2773 if (curWord) {
2774 addWord(word: curWord);
2775 curWord = nullptr;
2776 }
2777}
2778
2779void TextPage::addWord(TextWord *word)
2780{
2781 // throw away zero-length words -- they don't have valid xMin/xMax
2782 // values, and they're useless anyway
2783 if (word->len() == 0) {
2784 delete word;
2785 return;
2786 }
2787
2788 if (rawOrder) {
2789 if (rawLastWord) {
2790 rawLastWord->next = word;
2791 } else {
2792 rawWords = word;
2793 }
2794 rawLastWord = word;
2795 } else {
2796 pools[word->rot]->addWord(word);
2797 }
2798}
2799
2800void TextPage::addUnderline(double x0, double y0, double x1, double y1)
2801{
2802 underlines.emplace_back(args: std::make_unique<TextUnderline>(args&: x0, args&: y0, args&: x1, args&: y1));
2803}
2804
2805void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link)
2806{
2807 links.emplace_back(args: std::make_unique<TextLink>(args&: xMin, args&: yMin, args&: xMax, args&: yMax, args&: link));
2808}
2809
2810void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML)
2811{
2812 coalesce(physLayout, fixedPitch, doHTML, minColSpacing1: TextOutputDev::minColSpacing1_default);
2813}
2814
2815void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML, double minColSpacing1)
2816{
2817 TextWord *word0, *word1, *word2;
2818 TextLine *line;
2819 TextBlock *blkList, *blk, *lastBlk, *blk0, *blk1, *blk2;
2820 TextFlow *flow, *lastFlow;
2821 int rot, poolMinBaseIdx, baseIdx, startBaseIdx, endBaseIdx;
2822 double minBase, maxBase, newMinBase, newMaxBase;
2823 double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace;
2824 bool found;
2825 int count[4];
2826 int lrCount;
2827 int col1, col2;
2828 int j, n;
2829
2830 if (rawOrder) {
2831 primaryRot = 0;
2832 primaryLR = true;
2833 return;
2834 }
2835
2836 const UnicodeMap *uMap = globalParams->getTextEncoding();
2837 blkList = nullptr;
2838 lastBlk = nullptr;
2839 nBlocks = 0;
2840 primaryRot = 0;
2841
2842#if 0 // for debugging
2843 printf("*** initial words ***\n");
2844 for (rot = 0; rot < 4; ++rot) {
2845 pool = pools[rot];
2846 for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {
2847 for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {
2848 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d link=%p '",
2849 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2850 word0->base, word0->fontSize, rot*90, word0->link);
2851 for (i = 0; i < word0->len; ++i) {
2852 fputc(word0->text[i] & 0xff, stdout);
2853 }
2854 printf("'\n");
2855 }
2856 }
2857 }
2858 printf("\n");
2859#endif
2860
2861#if 0 //~ for debugging
2862 for (i = 0; i < underlines->getLength(); ++i) {
2863 underline = (TextUnderline *)underlines->get(i);
2864 printf("underline: x=%g..%g y=%g..%g horiz=%d\n",
2865 underline->x0, underline->x1, underline->y0, underline->y1,
2866 underline->horiz);
2867 }
2868#endif
2869
2870 if (doHTML) {
2871
2872 //----- handle underlining
2873 for (const std::unique_ptr<TextUnderline> &underline : underlines) {
2874 if (underline->horiz) {
2875 // rot = 0
2876 if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2877 startBaseIdx = pools[0]->getBaseIdx(base: underline->y0 + minUnderlineGap);
2878 endBaseIdx = pools[0]->getBaseIdx(base: underline->y0 + maxUnderlineGap);
2879 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2880 for (word0 = pools[0]->getPool(baseIdx: j); word0; word0 = word0->next) {
2881 //~ need to check the y value against the word baseline
2882 if (underline->x0 < word0->xMin + underlineSlack && word0->xMax - underlineSlack < underline->x1) {
2883 word0->underlined = true;
2884 }
2885 }
2886 }
2887 }
2888
2889 // rot = 2
2890 if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2891 startBaseIdx = pools[2]->getBaseIdx(base: underline->y0 - maxUnderlineGap);
2892 endBaseIdx = pools[2]->getBaseIdx(base: underline->y0 - minUnderlineGap);
2893 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2894 for (word0 = pools[2]->getPool(baseIdx: j); word0; word0 = word0->next) {
2895 if (underline->x0 < word0->xMin + underlineSlack && word0->xMax - underlineSlack < underline->x1) {
2896 word0->underlined = true;
2897 }
2898 }
2899 }
2900 }
2901 } else {
2902 // rot = 1
2903 if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2904 startBaseIdx = pools[1]->getBaseIdx(base: underline->x0 - maxUnderlineGap);
2905 endBaseIdx = pools[1]->getBaseIdx(base: underline->x0 - minUnderlineGap);
2906 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2907 for (word0 = pools[1]->getPool(baseIdx: j); word0; word0 = word0->next) {
2908 if (underline->y0 < word0->yMin + underlineSlack && word0->yMax - underlineSlack < underline->y1) {
2909 word0->underlined = true;
2910 }
2911 }
2912 }
2913 }
2914
2915 // rot = 3
2916 if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2917 startBaseIdx = pools[3]->getBaseIdx(base: underline->x0 + minUnderlineGap);
2918 endBaseIdx = pools[3]->getBaseIdx(base: underline->x0 + maxUnderlineGap);
2919 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2920 for (word0 = pools[3]->getPool(baseIdx: j); word0; word0 = word0->next) {
2921 if (underline->y0 < word0->yMin + underlineSlack && word0->yMax - underlineSlack < underline->y1) {
2922 word0->underlined = true;
2923 }
2924 }
2925 }
2926 }
2927 }
2928 }
2929
2930 //----- handle links
2931 for (const std::unique_ptr<TextLink> &link : links) {
2932 // rot = 0
2933 if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2934 startBaseIdx = pools[0]->getBaseIdx(base: link->yMin);
2935 endBaseIdx = pools[0]->getBaseIdx(base: link->yMax);
2936 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2937 for (word0 = pools[0]->getPool(baseIdx: j); word0; word0 = word0->next) {
2938 if (link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax && link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax) {
2939 word0->link = link->link;
2940 }
2941 }
2942 }
2943 }
2944
2945 // rot = 2
2946 if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2947 startBaseIdx = pools[2]->getBaseIdx(base: link->yMin);
2948 endBaseIdx = pools[2]->getBaseIdx(base: link->yMax);
2949 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2950 for (word0 = pools[2]->getPool(baseIdx: j); word0; word0 = word0->next) {
2951 if (link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax && link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax) {
2952 word0->link = link->link;
2953 }
2954 }
2955 }
2956 }
2957
2958 // rot = 1
2959 if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2960 startBaseIdx = pools[1]->getBaseIdx(base: link->xMin);
2961 endBaseIdx = pools[1]->getBaseIdx(base: link->xMax);
2962 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2963 for (word0 = pools[1]->getPool(baseIdx: j); word0; word0 = word0->next) {
2964 if (link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax && link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax) {
2965 word0->link = link->link;
2966 }
2967 }
2968 }
2969 }
2970
2971 // rot = 3
2972 if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2973 startBaseIdx = pools[3]->getBaseIdx(base: link->xMin);
2974 endBaseIdx = pools[3]->getBaseIdx(base: link->xMax);
2975 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2976 for (word0 = pools[3]->getPool(baseIdx: j); word0; word0 = word0->next) {
2977 if (link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax && link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax) {
2978 word0->link = link->link;
2979 }
2980 }
2981 }
2982 }
2983 }
2984 }
2985
2986 //----- assemble the blocks
2987
2988 //~ add an outer loop for writing mode (vertical text)
2989
2990 // build blocks for each rotation value
2991 for (rot = 0; rot < 4; ++rot) {
2992 std::unique_ptr<TextPool> &pool = pools[rot];
2993 poolMinBaseIdx = pool->minBaseIdx;
2994 count[rot] = 0;
2995
2996 // add blocks until no more words are left
2997 while (true) {
2998
2999 // find the first non-empty line in the pool
3000 for (; poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(baseIdx: poolMinBaseIdx); ++poolMinBaseIdx) {
3001 ;
3002 }
3003 if (poolMinBaseIdx > pool->maxBaseIdx) {
3004 break;
3005 }
3006
3007 // look for the left-most word in the first four lines of the
3008 // pool -- this avoids starting with a superscript word
3009 startBaseIdx = poolMinBaseIdx;
3010 for (baseIdx = poolMinBaseIdx + 1; baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx; ++baseIdx) {
3011 if (!pool->getPool(baseIdx)) {
3012 continue;
3013 }
3014 if (pool->getPool(baseIdx)->primaryCmp(word: pool->getPool(baseIdx: startBaseIdx)) < 0) {
3015 startBaseIdx = baseIdx;
3016 }
3017 }
3018
3019 // create a new block
3020 word0 = pool->getPool(baseIdx: startBaseIdx);
3021 pool->setPool(baseIdx: startBaseIdx, p: word0->next);
3022 word0->next = nullptr;
3023 blk = new TextBlock(this, rot);
3024 blk->addWord(word: word0);
3025
3026 fontSize = word0->fontSize;
3027 minBase = maxBase = word0->base;
3028 colSpace1 = minColSpacing1 * fontSize;
3029 colSpace2 = minColSpacing2 * fontSize;
3030 lineSpace = maxLineSpacingDelta * fontSize;
3031 intraLineSpace = maxIntraLineDelta * fontSize;
3032
3033 // add words to the block
3034 do {
3035 found = false;
3036
3037 // look for words on the line above the current top edge of
3038 // the block
3039 newMinBase = minBase;
3040 for (baseIdx = pool->getBaseIdx(base: minBase); baseIdx >= pool->getBaseIdx(base: minBase - lineSpace); --baseIdx) {
3041 word0 = nullptr;
3042 word1 = pool->getPool(baseIdx);
3043 while (word1) {
3044 if (word1->base < minBase && word1->base >= minBase - lineSpace && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) : (word1->yMin < blk->yMax && word1->yMax > blk->yMin))
3045 && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta1 * fontSize) {
3046 word2 = word1;
3047 if (word0) {
3048 word0->next = word1->next;
3049 } else {
3050 pool->setPool(baseIdx, p: word1->next);
3051 }
3052 word1 = word1->next;
3053 word2->next = nullptr;
3054 blk->addWord(word: word2);
3055 found = true;
3056 newMinBase = word2->base;
3057 } else {
3058 word0 = word1;
3059 word1 = word1->next;
3060 }
3061 }
3062 }
3063 minBase = newMinBase;
3064
3065 // look for words on the line below the current bottom edge of
3066 // the block
3067 newMaxBase = maxBase;
3068 for (baseIdx = pool->getBaseIdx(base: maxBase); baseIdx <= pool->getBaseIdx(base: maxBase + lineSpace); ++baseIdx) {
3069 word0 = nullptr;
3070 word1 = pool->getPool(baseIdx);
3071 while (word1) {
3072 if (word1->base > maxBase && word1->base <= maxBase + lineSpace && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) : (word1->yMin < blk->yMax && word1->yMax > blk->yMin))
3073 && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta1 * fontSize) {
3074 word2 = word1;
3075 if (word0) {
3076 word0->next = word1->next;
3077 } else {
3078 pool->setPool(baseIdx, p: word1->next);
3079 }
3080 word1 = word1->next;
3081 word2->next = nullptr;
3082 blk->addWord(word: word2);
3083 found = true;
3084 newMaxBase = word2->base;
3085 } else {
3086 word0 = word1;
3087 word1 = word1->next;
3088 }
3089 }
3090 }
3091 maxBase = newMaxBase;
3092
3093 // look for words that are on lines already in the block, and
3094 // that overlap the block horizontally
3095 for (baseIdx = pool->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) {
3096 word0 = nullptr;
3097 word1 = pool->getPool(baseIdx);
3098 while (word1) {
3099 if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3100 && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax + colSpace1 && word1->xMax > blk->xMin - colSpace1) : (word1->yMin < blk->yMax + colSpace1 && word1->yMax > blk->yMin - colSpace1))
3101 && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta2 * fontSize) {
3102 word2 = word1;
3103 if (word0) {
3104 word0->next = word1->next;
3105 } else {
3106 pool->setPool(baseIdx, p: word1->next);
3107 }
3108 word1 = word1->next;
3109 word2->next = nullptr;
3110 blk->addWord(word: word2);
3111 found = true;
3112 } else {
3113 word0 = word1;
3114 word1 = word1->next;
3115 }
3116 }
3117 }
3118
3119 // only check for outlying words (the next two chunks of code)
3120 // if we didn't find anything else
3121 if (found) {
3122 continue;
3123 }
3124
3125 // scan down the left side of the block, looking for words
3126 // that are near (but not overlapping) the block; if there are
3127 // three or fewer, add them to the block
3128 n = 0;
3129 for (baseIdx = pool->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) {
3130 word1 = pool->getPool(baseIdx);
3131 while (word1) {
3132 if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3133 && ((rot == 0 || rot == 2) ? (word1->xMax <= blk->xMin && word1->xMax > blk->xMin - colSpace2) : (word1->yMax <= blk->yMin && word1->yMax > blk->yMin - colSpace2))
3134 && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3135 ++n;
3136 break;
3137 }
3138 word1 = word1->next;
3139 }
3140 }
3141 if (n > 0 && n <= 3) {
3142 for (baseIdx = pool->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) {
3143 word0 = nullptr;
3144 word1 = pool->getPool(baseIdx);
3145 while (word1) {
3146 if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3147 && ((rot == 0 || rot == 2) ? (word1->xMax <= blk->xMin && word1->xMax > blk->xMin - colSpace2) : (word1->yMax <= blk->yMin && word1->yMax > blk->yMin - colSpace2))
3148 && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3149 word2 = word1;
3150 if (word0) {
3151 word0->next = word1->next;
3152 } else {
3153 pool->setPool(baseIdx, p: word1->next);
3154 }
3155 word1 = word1->next;
3156 word2->next = nullptr;
3157 blk->addWord(word: word2);
3158 if (word2->base < minBase) {
3159 minBase = word2->base;
3160 } else if (word2->base > maxBase) {
3161 maxBase = word2->base;
3162 }
3163 found = true;
3164 break;
3165 } else {
3166 word0 = word1;
3167 word1 = word1->next;
3168 }
3169 }
3170 }
3171 }
3172
3173 // scan down the right side of the block, looking for words
3174 // that are near (but not overlapping) the block; if there are
3175 // three or fewer, add them to the block
3176 n = 0;
3177 for (baseIdx = pool->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) {
3178 word1 = pool->getPool(baseIdx);
3179 while (word1) {
3180 if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3181 && ((rot == 0 || rot == 2) ? (word1->xMin >= blk->xMax && word1->xMin < blk->xMax + colSpace2) : (word1->yMin >= blk->yMax && word1->yMin < blk->yMax + colSpace2))
3182 && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3183 ++n;
3184 break;
3185 }
3186 word1 = word1->next;
3187 }
3188 }
3189 if (n > 0 && n <= 3) {
3190 for (baseIdx = pool->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) {
3191 word0 = nullptr;
3192 word1 = pool->getPool(baseIdx);
3193 while (word1) {
3194 if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3195 && ((rot == 0 || rot == 2) ? (word1->xMin >= blk->xMax && word1->xMin < blk->xMax + colSpace2) : (word1->yMin >= blk->yMax && word1->yMin < blk->yMax + colSpace2))
3196 && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3197 word2 = word1;
3198 if (word0) {
3199 word0->next = word1->next;
3200 } else {
3201 pool->setPool(baseIdx, p: word1->next);
3202 }
3203 word1 = word1->next;
3204 word2->next = nullptr;
3205 blk->addWord(word: word2);
3206 if (word2->base < minBase) {
3207 minBase = word2->base;
3208 } else if (word2->base > maxBase) {
3209 maxBase = word2->base;
3210 }
3211 found = true;
3212 break;
3213 } else {
3214 word0 = word1;
3215 word1 = word1->next;
3216 }
3217 }
3218 }
3219 }
3220
3221 } while (found);
3222
3223 //~ need to compute the primary writing mode (horiz/vert) in
3224 //~ addition to primary rotation
3225
3226 // coalesce the block, and add it to the list
3227 blk->coalesce(uMap, fixedPitch);
3228 if (lastBlk) {
3229 lastBlk->next = blk;
3230 } else {
3231 blkList = blk;
3232 }
3233 lastBlk = blk;
3234 count[rot] += blk->charCount;
3235 ++nBlocks;
3236 }
3237
3238 if (count[rot] > count[primaryRot]) {
3239 primaryRot = rot;
3240 }
3241 }
3242
3243#if 0 // for debugging
3244 printf("*** rotation ***\n");
3245 for (rot = 0; rot < 4; ++rot) {
3246 printf(" %d: %6d\n", rot, count[rot]);
3247 }
3248 printf(" primary rot = %d\n", primaryRot);
3249 printf("\n");
3250#endif
3251
3252#if 0 // for debugging
3253 printf("*** blocks ***\n");
3254 for (blk = blkList; blk; blk = blk->next) {
3255 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
3256 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax);
3257 for (line = blk->lines; line; line = line->next) {
3258 printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
3259 line->xMin, line->xMax, line->yMin, line->yMax, line->base);
3260 for (word0 = line->words; word0; word0 = word0->next) {
3261 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3262 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3263 word0->base, word0->fontSize, word0->spaceAfter);
3264 for (i = 0; i < word0->len; ++i) {
3265 fputc(word0->text[i] & 0xff, stdout);
3266 }
3267 printf("'\n");
3268 }
3269 }
3270 }
3271 printf("\n");
3272#endif
3273
3274 // determine the primary direction
3275 lrCount = 0;
3276 for (blk = blkList; blk; blk = blk->next) {
3277 for (line = blk->lines; line; line = line->next) {
3278 for (word0 = line->words; word0; word0 = word0->next) {
3279 for (size_t i = 0; i < word0->len(); ++i) {
3280 if (unicodeTypeL(c: word0->chars[i].text)) {
3281 ++lrCount;
3282 } else if (unicodeTypeR(c: word0->chars[i].text)) {
3283 --lrCount;
3284 }
3285 }
3286 }
3287 }
3288 }
3289 primaryLR = lrCount >= 0;
3290
3291#if 0 // for debugging
3292 printf("*** direction ***\n");
3293 printf("lrCount = %d\n", lrCount);
3294 printf("primaryLR = %d\n", primaryLR);
3295#endif
3296
3297 //----- column assignment
3298
3299 // sort blocks into xy order for column assignment
3300 if (blocks) {
3301 gfree(p: blocks);
3302 }
3303 if (physLayout && fixedPitch) {
3304
3305 blocks = (TextBlock **)gmallocn(count: nBlocks, size: sizeof(TextBlock *));
3306 int i;
3307 for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
3308 blocks[i] = blk;
3309 col1 = 0; // make gcc happy
3310 switch (primaryRot) {
3311 case 0:
3312 col1 = (int)(blk->xMin / fixedPitch + 0.5);
3313 break;
3314 case 1:
3315 col1 = (int)(blk->yMin / fixedPitch + 0.5);
3316 break;
3317 case 2:
3318 col1 = (int)((pageWidth - blk->xMax) / fixedPitch + 0.5);
3319 break;
3320 case 3:
3321 col1 = (int)((pageHeight - blk->yMax) / fixedPitch + 0.5);
3322 break;
3323 }
3324 blk->col = col1;
3325 for (line = blk->lines; line; line = line->next) {
3326 for (j = 0; j <= line->len; ++j) {
3327 line->col[j] += col1;
3328 }
3329 }
3330 }
3331
3332 } else {
3333
3334 // sort blocks into xy order for column assignment
3335 blocks = (TextBlock **)gmallocn(count: nBlocks, size: sizeof(TextBlock *));
3336 int i;
3337 for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
3338 blocks[i] = blk;
3339 }
3340 if (blocks) {
3341 qsort(base: blocks, nmemb: nBlocks, size: sizeof(TextBlock *), compar: &TextBlock::cmpXYPrimaryRot);
3342 }
3343
3344 // column assignment
3345 for (i = 0; i < nBlocks; ++i) {
3346 blk0 = blocks[i];
3347 col1 = 0;
3348 for (j = 0; j < i; ++j) {
3349 blk1 = blocks[j];
3350 col2 = 0; // make gcc happy
3351 switch (primaryRot) {
3352 case 0:
3353 if (blk0->xMin > blk1->xMax) {
3354 col2 = blk1->col + blk1->nColumns + 3;
3355 } else if (blk1->xMax == blk1->xMin) {
3356 col2 = blk1->col;
3357 } else {
3358 col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) / (blk1->xMax - blk1->xMin)) * blk1->nColumns);
3359 }
3360 break;
3361 case 1:
3362 if (blk0->yMin > blk1->yMax) {
3363 col2 = blk1->col + blk1->nColumns + 3;
3364 } else if (blk1->yMax == blk1->yMin) {
3365 col2 = blk1->col;
3366 } else {
3367 col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) / (blk1->yMax - blk1->yMin)) * blk1->nColumns);
3368 }
3369 break;
3370 case 2:
3371 if (blk0->xMax < blk1->xMin) {
3372 col2 = blk1->col + blk1->nColumns + 3;
3373 } else if (blk1->xMin == blk1->xMax) {
3374 col2 = blk1->col;
3375 } else {
3376 col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) / (blk1->xMin - blk1->xMax)) * blk1->nColumns);
3377 }
3378 break;
3379 case 3:
3380 if (blk0->yMax < blk1->yMin) {
3381 col2 = blk1->col + blk1->nColumns + 3;
3382 } else if (blk1->yMin == blk1->yMax) {
3383 col2 = blk1->col;
3384 } else {
3385 col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) / (blk1->yMin - blk1->yMax)) * blk1->nColumns);
3386 }
3387 break;
3388 }
3389 if (col2 > col1) {
3390 col1 = col2;
3391 }
3392 }
3393 blk0->col = col1;
3394 for (line = blk0->lines; line; line = line->next) {
3395 for (j = 0; j <= line->len; ++j) {
3396 line->col[j] += col1;
3397 }
3398 }
3399 }
3400 }
3401
3402#if 0 // for debugging
3403 printf("*** blocks, after column assignment ***\n");
3404 for (blk = blkList; blk; blk = blk->next) {
3405 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
3406 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
3407 blk->nColumns);
3408 for (line = blk->lines; line; line = line->next) {
3409 printf(" line: col[0]=%d\n", line->col[0]);
3410 for (word0 = line->words; word0; word0 = word0->next) {
3411 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3412 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3413 word0->base, word0->fontSize, word0->spaceAfter);
3414 for (i = 0; i < word0->len; ++i) {
3415 fputc(word0->text[i] & 0xff, stdout);
3416 }
3417 printf("'\n");
3418 }
3419 }
3420 }
3421 printf("\n");
3422#endif
3423
3424 //----- reading order sort
3425
3426 // compute space on left and right sides of each block
3427 for (int i = 0; i < nBlocks; ++i) {
3428 blk0 = blocks[i];
3429 for (j = 0; j < nBlocks; ++j) {
3430 blk1 = blocks[j];
3431 if (blk1 != blk0) {
3432 blk0->updatePriMinMax(blk: blk1);
3433 }
3434 }
3435 }
3436
3437#if 0 // for debugging
3438 printf("PAGE\n");
3439#endif
3440
3441 int sortPos = 0;
3442 bool *visited = (bool *)gmallocn(count: nBlocks, size: sizeof(bool));
3443 for (int i = 0; i < nBlocks; i++) {
3444 visited[i] = false;
3445 }
3446
3447 double bxMin0, byMin0, bxMin1, byMin1;
3448 int numTables = 0;
3449 int tableId = -1;
3450 int correspondenceX, correspondenceY;
3451 double xCentre1, yCentre1, xCentre2, yCentre2;
3452 double xCentre3, yCentre3, xCentre4, yCentre4;
3453 double deltaX, deltaY;
3454 TextBlock *fblk2 = nullptr, *fblk3 = nullptr, *fblk4 = nullptr;
3455
3456 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3457 blk1->ExMin = blk1->xMin;
3458 blk1->ExMax = blk1->xMax;
3459 blk1->EyMin = blk1->yMin;
3460 blk1->EyMax = blk1->yMax;
3461
3462 bxMin0 = DBL_MAX;
3463 byMin0 = DBL_MAX;
3464 bxMin1 = DBL_MAX;
3465 byMin1 = DBL_MAX;
3466
3467 fblk2 = nullptr;
3468 fblk3 = nullptr;
3469 fblk4 = nullptr;
3470
3471 /* find fblk2, fblk3 and fblk4 so that
3472 * fblk2 is on the right of blk1 and overlap with blk1 in y axis
3473 * fblk3 is under blk1 and overlap with blk1 in x axis
3474 * fblk4 is under blk1 and on the right of blk1
3475 * and they are closest to blk1
3476 */
3477 for (blk2 = blkList; blk2; blk2 = blk2->next) {
3478 if (blk2 != blk1) {
3479 if (blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin && blk2->xMin > blk1->xMax && blk2->xMin < bxMin0) {
3480 bxMin0 = blk2->xMin;
3481 fblk2 = blk2;
3482 } else if (blk2->xMin <= blk1->xMax && blk2->xMax >= blk1->xMin && blk2->yMin > blk1->yMax && blk2->yMin < byMin0) {
3483 byMin0 = blk2->yMin;
3484 fblk3 = blk2;
3485 } else if (blk2->xMin > blk1->xMax && blk2->xMin < bxMin1 && blk2->yMin > blk1->yMax && blk2->yMin < byMin1) {
3486 bxMin1 = blk2->xMin;
3487 byMin1 = blk2->yMin;
3488 fblk4 = blk2;
3489 }
3490 }
3491 }
3492
3493 /* fblk4 can not overlap with fblk3 in x and with fblk2 in y
3494 * fblk2 can not overlap with fblk3 in x and y
3495 * fblk4 has to overlap with fblk3 in y and with fblk2 in x
3496 */
3497 if (fblk2 != nullptr && fblk3 != nullptr && fblk4 != nullptr) {
3498 if (((fblk3->xMin <= fblk4->xMax && fblk3->xMax >= fblk4->xMin) || (fblk2->yMin <= fblk4->yMax && fblk2->yMax >= fblk4->yMin) || (fblk2->xMin <= fblk3->xMax && fblk2->xMax >= fblk3->xMin)
3499 || (fblk2->yMin <= fblk3->yMax && fblk2->yMax >= fblk3->yMin))
3500 || !(fblk4->xMin <= fblk2->xMax && fblk4->xMax >= fblk2->xMin && fblk4->yMin <= fblk3->yMax && fblk4->yMax >= fblk3->yMin)) {
3501 fblk2 = nullptr;
3502 fblk3 = nullptr;
3503 fblk4 = nullptr;
3504 }
3505 }
3506
3507 // if we found any then look whether they form a table
3508 if (fblk2 != nullptr && fblk3 != nullptr && fblk4 != nullptr) {
3509 tableId = -1;
3510 correspondenceX = 0;
3511 correspondenceY = 0;
3512 deltaX = 0.0;
3513 deltaY = 0.0;
3514
3515 if (blk1->lines && blk1->lines->words) {
3516 deltaX = blk1->lines->words->getFontSize();
3517 }
3518 if (fblk2->lines && fblk2->lines->words) {
3519 deltaX = deltaX < fblk2->lines->words->getFontSize() ? deltaX : fblk2->lines->words->getFontSize();
3520 }
3521 if (fblk3->lines && fblk3->lines->words) {
3522 deltaX = deltaX < fblk3->lines->words->getFontSize() ? deltaX : fblk3->lines->words->getFontSize();
3523 }
3524 if (fblk4->lines && fblk4->lines->words) {
3525 deltaX = deltaX < fblk4->lines->words->getFontSize() ? deltaX : fblk4->lines->words->getFontSize();
3526 }
3527
3528 deltaY = deltaX;
3529
3530 deltaX *= minColSpacing1;
3531 deltaY *= maxIntraLineDelta;
3532
3533 xCentre1 = (blk1->xMax + blk1->xMin) / 2.0;
3534 yCentre1 = (blk1->yMax + blk1->yMin) / 2.0;
3535 xCentre2 = (fblk2->xMax + fblk2->xMin) / 2.0;
3536 yCentre2 = (fblk2->yMax + fblk2->yMin) / 2.0;
3537 xCentre3 = (fblk3->xMax + fblk3->xMin) / 2.0;
3538 yCentre3 = (fblk3->yMax + fblk3->yMin) / 2.0;
3539 xCentre4 = (fblk4->xMax + fblk4->xMin) / 2.0;
3540 yCentre4 = (fblk4->yMax + fblk4->yMin) / 2.0;
3541
3542 // are blocks centrally aligned in x ?
3543 if (fabs(x: xCentre1 - xCentre3) <= deltaX && fabs(x: xCentre2 - xCentre4) <= deltaX) {
3544 correspondenceX++;
3545 }
3546
3547 // are blocks centrally aligned in y ?
3548 if (fabs(x: yCentre1 - yCentre2) <= deltaY && fabs(x: yCentre3 - yCentre4) <= deltaY) {
3549 correspondenceY++;
3550 }
3551
3552 // are blocks aligned to the left ?
3553 if (fabs(x: blk1->xMin - fblk3->xMin) <= deltaX && fabs(x: fblk2->xMin - fblk4->xMin) <= deltaX) {
3554 correspondenceX++;
3555 }
3556
3557 // are blocks aligned to the right ?
3558 if (fabs(x: blk1->xMax - fblk3->xMax) <= deltaX && fabs(x: fblk2->xMax - fblk4->xMax) <= deltaX) {
3559 correspondenceX++;
3560 }
3561
3562 // are blocks aligned to the top ?
3563 if (fabs(x: blk1->yMin - fblk2->yMin) <= deltaY && fabs(x: fblk3->yMin - fblk4->yMin) <= deltaY) {
3564 correspondenceY++;
3565 }
3566
3567 // are blocks aligned to the bottom ?
3568 if (fabs(x: blk1->yMax - fblk2->yMax) <= deltaY && fabs(x: fblk3->yMax - fblk4->yMax) <= deltaY) {
3569 correspondenceY++;
3570 }
3571
3572 // are blocks aligned in x and y ?
3573 if (correspondenceX > 0 && correspondenceY > 0) {
3574
3575 // find maximal tableId
3576 tableId = tableId < fblk4->tableId ? fblk4->tableId : tableId;
3577 tableId = tableId < fblk3->tableId ? fblk3->tableId : tableId;
3578 tableId = tableId < fblk2->tableId ? fblk2->tableId : tableId;
3579 tableId = tableId < blk1->tableId ? blk1->tableId : tableId;
3580
3581 // if the tableId is -1, then we found new table
3582 if (tableId < 0) {
3583 tableId = numTables;
3584 numTables++;
3585 }
3586
3587 blk1->tableId = tableId;
3588 fblk2->tableId = tableId;
3589 fblk3->tableId = tableId;
3590 fblk4->tableId = tableId;
3591 }
3592 }
3593 }
3594
3595 /* set extended bounding boxes of all table entries
3596 * so that they contain whole table
3597 * (we need to process whole table size when comparing it
3598 * with regular text blocks)
3599 */
3600 PDFRectangle *envelopes = new PDFRectangle[numTables];
3601 TextBlock **ending_blocks = new TextBlock *[numTables];
3602
3603 for (int i = 0; i < numTables; i++) {
3604 envelopes[i].x1 = DBL_MAX;
3605 envelopes[i].x2 = DBL_MIN;
3606 envelopes[i].y1 = DBL_MAX;
3607 envelopes[i].y2 = DBL_MIN;
3608 ending_blocks[i] = nullptr;
3609 }
3610
3611 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3612 if (blk1->tableId >= 0) {
3613 if (blk1->ExMin < envelopes[blk1->tableId].x1) {
3614 envelopes[blk1->tableId].x1 = blk1->ExMin;
3615 if (!blk1->page->primaryLR) {
3616 ending_blocks[blk1->tableId] = blk1;
3617 }
3618 }
3619
3620 if (blk1->ExMax > envelopes[blk1->tableId].x2) {
3621 envelopes[blk1->tableId].x2 = blk1->ExMax;
3622 if (blk1->page->primaryLR) {
3623 ending_blocks[blk1->tableId] = blk1;
3624 }
3625 }
3626
3627 envelopes[blk1->tableId].y1 = blk1->EyMin < envelopes[blk1->tableId].y1 ? blk1->EyMin : envelopes[blk1->tableId].y1;
3628 envelopes[blk1->tableId].y2 = blk1->EyMax > envelopes[blk1->tableId].y2 ? blk1->EyMax : envelopes[blk1->tableId].y2;
3629 }
3630 }
3631
3632 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3633 if (blk1->tableId >= 0 && ending_blocks[blk1->tableId] && blk1->xMin <= ending_blocks[blk1->tableId]->xMax && blk1->xMax >= ending_blocks[blk1->tableId]->xMin) {
3634 blk1->tableEnd = true;
3635 }
3636 }
3637
3638 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3639 if (blk1->tableId >= 0) {
3640 blk1->ExMin = envelopes[blk1->tableId].x1;
3641 blk1->ExMax = envelopes[blk1->tableId].x2;
3642 blk1->EyMin = envelopes[blk1->tableId].y1;
3643 blk1->EyMax = envelopes[blk1->tableId].y2;
3644 }
3645 }
3646 delete[] envelopes;
3647 delete[] ending_blocks;
3648
3649 /* set extended bounding boxes of all other blocks
3650 * so that they extend in x without hitting neighbours
3651 */
3652 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3653 if (!(blk1->tableId >= 0)) {
3654 double xMax = DBL_MAX;
3655 double xMin = DBL_MIN;
3656
3657 for (blk2 = blkList; blk2; blk2 = blk2->next) {
3658 if (blk2 == blk1) {
3659 continue;
3660 }
3661
3662 if (blk1->yMin <= blk2->yMax && blk1->yMax >= blk2->yMin) {
3663 if (blk2->xMin < xMax && blk2->xMin > blk1->xMax) {
3664 xMax = blk2->xMin;
3665 }
3666
3667 if (blk2->xMax > xMin && blk2->xMax < blk1->xMin) {
3668 xMin = blk2->xMax;
3669 }
3670 }
3671 }
3672
3673 for (blk2 = blkList; blk2; blk2 = blk2->next) {
3674 if (blk2 == blk1) {
3675 continue;
3676 }
3677
3678 if (blk2->xMax > blk1->ExMax && blk2->xMax <= xMax && blk2->yMin >= blk1->yMax) {
3679 blk1->ExMax = blk2->xMax;
3680 }
3681
3682 if (blk2->xMin < blk1->ExMin && blk2->xMin >= xMin && blk2->yMin >= blk1->yMax) {
3683 blk1->ExMin = blk2->xMin;
3684 }
3685 }
3686 }
3687 }
3688
3689 int i = -1;
3690 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3691 i++;
3692 sortPos = blk1->visitDepthFirst(blkList, pos1: i, sorted: blocks, sortPos, visited);
3693 }
3694 if (visited) {
3695 gfree(p: visited);
3696 }
3697
3698#if 0 // for debugging
3699 printf("*** blocks, after ro sort ***\n");
3700 for (i = 0; i < nBlocks; ++i) {
3701 blk = blocks[i];
3702 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
3703 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
3704 blk->priMin, blk->priMax);
3705 for (line = blk->lines; line; line = line->next) {
3706 printf(" line:\n");
3707 for (word0 = line->words; word0; word0 = word0->next) {
3708 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3709 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3710 word0->base, word0->fontSize, word0->spaceAfter);
3711 for (j = 0; j < word0->len; ++j) {
3712 fputc(word0->text[j] & 0xff, stdout);
3713 }
3714 printf("'\n");
3715 }
3716 }
3717 }
3718 printf("\n");
3719 fflush(stdout);
3720#endif
3721
3722 // build the flows
3723 //~ this needs to be adjusted for writing mode (vertical text)
3724 //~ this also needs to account for right-to-left column ordering
3725 while (flows) {
3726 flow = flows;
3727 flows = flows->next;
3728 delete flow;
3729 }
3730 flow = nullptr;
3731 flows = lastFlow = nullptr;
3732 // assume blocks are already in reading order,
3733 // and construct flows accordingly.
3734 for (i = 0; i < nBlocks; i++) {
3735 blk = blocks[i];
3736 blk->next = nullptr;
3737 if (flow) {
3738 blk1 = blocks[i - 1];
3739 blkSpace = maxBlockSpacing * blk1->lines->words->fontSize;
3740 if (blk1->secondaryDelta(blk) <= blkSpace && blk->isBelow(blk: blk1) && flow->blockFits(blk, prevBlk: blk1)) {
3741 flow->addBlock(blk);
3742 continue;
3743 }
3744 }
3745 flow = new TextFlow(this, blk);
3746 if (lastFlow) {
3747 lastFlow->next = flow;
3748 } else {
3749 flows = flow;
3750 }
3751 lastFlow = flow;
3752 }
3753
3754#if 0 // for debugging
3755 printf("*** flows ***\n");
3756 for (flow = flows; flow; flow = flow->next) {
3757 printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
3758 flow->xMin, flow->xMax, flow->yMin, flow->yMax,
3759 flow->priMin, flow->priMax);
3760 for (blk = flow->blocks; blk; blk = blk->next) {
3761 printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
3762 blk->rot, blk->ExMin, blk->ExMax, blk->EyMin, blk->EyMax,
3763 blk->priMin, blk->priMax);
3764 for (line = blk->lines; line; line = line->next) {
3765 printf(" line:\n");
3766 for (word0 = line->words; word0; word0 = word0->next) {
3767 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3768 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3769 word0->base, word0->fontSize, word0->spaceAfter);
3770 for (i = 0; i < word0->len; ++i) {
3771 fputc(word0->text[i] & 0xff, stdout);
3772 }
3773 printf("'\n");
3774 }
3775 }
3776 }
3777 }
3778 printf("\n");
3779#endif
3780}
3781
3782void TextPage::adjustRotation(TextLine *line, int start, int end, double *xMin, double *xMax, double *yMin, double *yMax)
3783{
3784 switch (line->rot) {
3785 case 0:
3786 *xMin = line->edge[start];
3787 *xMax = line->edge[end];
3788 *yMin = line->yMin;
3789 *yMax = line->yMax;
3790 break;
3791 case 1:
3792 *xMin = line->xMin;
3793 *xMax = line->xMax;
3794 *yMin = line->edge[start];
3795 *yMax = line->edge[end];
3796 break;
3797 case 2:
3798 *xMin = line->edge[end];
3799 *xMax = line->edge[start];
3800 *yMin =