1 | //======================================================================== |
2 | // |
3 | // TextOutputDev.cc |
4 | // |
5 | // Copyright 1997-2003 Glyph & Cog, LLC |
6 | // |
7 | //======================================================================== |
8 | |
9 | //======================================================================== |
10 | // |
11 | // Modified under the Poppler project - http://poppler.freedesktop.org |
12 | // |
13 | // All changes made under the Poppler project to this file are licensed |
14 | // under GPL version 2 or later |
15 | // |
16 | // Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com> |
17 | // Copyright (C) 2005 Nickolay V. Shmyrev <nshmyrev@yandex.ru> |
18 | // Copyright (C) 2006-2008, 2011-2013 Carlos Garcia Campos <carlosgc@gnome.org> |
19 | // Copyright (C) 2006, 2007, 2013 Ed Catmur <ed@catmur.co.uk> |
20 | // Copyright (C) 2006 Jeff Muizelaar <jeff@infidigm.net> |
21 | // Copyright (C) 2007, 2008, 2012, 2017 Adrian Johnson <ajohnson@redneon.com> |
22 | // Copyright (C) 2008 Koji Otani <sho@bbr.jp> |
23 | // Copyright (C) 2008, 2010-2012, 2014-2022, 2024 Albert Astals Cid <aacid@kde.org> |
24 | // Copyright (C) 2008 Pino Toscano <pino@kde.org> |
25 | // Copyright (C) 2008, 2010 Hib Eris <hib@hiberis.nl> |
26 | // Copyright (C) 2009 Ross Moore <ross@maths.mq.edu.au> |
27 | // Copyright (C) 2009 Kovid Goyal <kovid@kovidgoyal.net> |
28 | // Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com> |
29 | // Copyright (C) 2010, 2021 Marek Kasik <mkasik@redhat.com> |
30 | // Copyright (C) 2010, 2020 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp> |
31 | // Copyright (C) 2011 Sam Liao <phyomh@gmail.com> |
32 | // Copyright (C) 2012 Horst Prote <prote@fmi.uni-stuttgart.de> |
33 | // Copyright (C) 2012, 2013-2018 Jason Crain <jason@aquaticape.us> |
34 | // Copyright (C) 2012 Peter Breitenlohner <peb@mppmu.mpg.de> |
35 | // Copyright (C) 2013 José Aliste <jaliste@src.gnome.org> |
36 | // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de> |
37 | // Copyright (C) 2013 Ed Catmur <ed@catmur.co.uk> |
38 | // Copyright (C) 2016 Khaled Hosny <khaledhosny@eglug.org> |
39 | // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich |
40 | // Copyright (C) 2018 Sanchit Anand <sanxchit@gmail.com> |
41 | // Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de> |
42 | // Copyright (C) 2018-2022, 2024 Nelson Benítez León <nbenitezl@gmail.com> |
43 | // Copyright (C) 2019 Christian Persch <chpe@src.gnome.org> |
44 | // Copyright (C) 2019, 2022 Oliver Sander <oliver.sander@tu-dresden.de> |
45 | // Copyright (C) 2019 Dan Shea <dan.shea@logical-innovations.com> |
46 | // Copyright (C) 2021 Peter Williams <peter@newton.cx> |
47 | // Copyright (C) 2024 Adam Sampson <ats@offog.org> |
48 | // Copyright (C) 2024 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk> |
49 | // Copyright (C) 2024 Stefan Brüns <stefan.bruens@rwth-aachen.de> |
50 | // |
51 | // To see a description of the changes please see the Changelog file that |
52 | // came with your tarball or type make ChangeLog if you are building from git |
53 | // |
54 | //======================================================================== |
55 | |
56 | #include <config.h> |
57 | |
58 | #include <cstdio> |
59 | #include <cstdlib> |
60 | #include <cstddef> |
61 | #include <cmath> |
62 | #include <cfloat> |
63 | #include <cctype> |
64 | #include <algorithm> |
65 | #if defined(_WIN32) || defined(__CYGWIN__) |
66 | # include <fcntl.h> // for O_BINARY |
67 | # include <io.h> // for _setmode |
68 | #endif |
69 | #include "goo/gfile.h" |
70 | #include "goo/gmem.h" |
71 | #include "goo/GooString.h" |
72 | #include "poppler-config.h" |
73 | #include "Error.h" |
74 | #include "GlobalParams.h" |
75 | #include "UnicodeMap.h" |
76 | #include "UnicodeTypeTable.h" |
77 | #include "Link.h" |
78 | #include "TextOutputDev.h" |
79 | #include "Page.h" |
80 | #include "Annot.h" |
81 | #include "UTF.h" |
82 | |
83 | //------------------------------------------------------------------------ |
84 | // parameters |
85 | //------------------------------------------------------------------------ |
86 | |
// NOTE: everything in this section except minColSpacing1_default is a
// plain preprocessor constant, i.e. it is substituted textually
// wherever the name is used further down in this file.

// Each bucket in a text pool includes baselines within a range of
// this many points.
#define textPoolStep 4

// Inter-character space width which will cause addChar to start a new
// word.
#define minWordBreakSpace 0.1

// Negative inter-character space width, i.e., overlap, which will
// cause addChar to start a new word.
#define minDupBreakOverlap 0.2

// Max distance between baselines of two lines within a block, as a
// fraction of the font size.
#define maxLineSpacingDelta 1.5

// Max difference in primary font sizes on two lines in the same
// block.  Delta1 is used when examining new lines above and below the
// current block; delta2 is used when examining text that overlaps the
// current block; delta3 is used when examining text to the left and
// right of the current block.
#define maxBlockFontSizeDelta1 0.05
#define maxBlockFontSizeDelta2 0.6
#define maxBlockFontSizeDelta3 0.2

// Max difference in font sizes inside a word.
#define maxWordFontSizeDelta 0.05

// Maximum distance between baselines of two words on the same line,
// e.g., distance between subscript or superscript and the primary
// baseline, as a fraction of the font size.
#define maxIntraLineDelta 0.5

// Minimum inter-word spacing, as a fraction of the font size.  (Only
// used for raw ordering.)
#define minWordSpacing 0.15

// Maximum inter-word spacing, as a fraction of the font size.
#define maxWordSpacing 1.5

// Maximum horizontal spacing which will allow a word to be pulled
// into a block, as a fraction of the font size.
// This default value can be tweaked via API.
// Unlike its neighbours this is a real variable: the out-of-class
// definition of a TextOutputDev static data member.
double TextOutputDev::minColSpacing1_default = 0.7;

// Minimum spacing between columns, as a fraction of the font size.
#define minColSpacing2 1.0

// Maximum vertical spacing between blocks within a flow, as a
// multiple of the font size.
#define maxBlockSpacing 2.5

// Minimum spacing between characters within a word, as a fraction of
// the font size.
// NOTE(review): expands to a bare negative literal (no parentheses).
#define minCharSpacing -0.5

// Maximum spacing between characters within a word, as a fraction of
// the font size, when there is no obvious extra-wide character
// spacing.
#define maxCharSpacing 0.03

// When extra-wide character spacing is detected, the inter-character
// space threshold is set to the minimum inter-character space
// multiplied by this constant.
#define maxWideCharSpacingMul 1.3

// Upper limit on spacing between characters in a word.
#define maxWideCharSpacing 0.4

// Max difference in primary,secondary coordinates (as a fraction of
// the font size) allowed for duplicated text (fake boldface, drop
// shadows) which is to be discarded.
#define dupMaxPriDelta 0.1
#define dupMaxSecDelta 0.2

// Max width of underlines (in points).
#define maxUnderlineWidth 3

// Min distance between baseline and underline (in points).
//~ this should be font-size-dependent
// NOTE(review): expands to a bare negative literal (no parentheses).
#define minUnderlineGap -2

// Max distance between baseline and underline (in points).
//~ this should be font-size-dependent
#define maxUnderlineGap 4

// Max horizontal distance between edge of word and start of underline
// (in points).
//~ this should be font-size-dependent
#define underlineSlack 1

// Max distance between edge of text and edge of link border
#define hyperlinkSlack 2

// Max distance between characters when combining a base character and
// combining character
// (TextWord::addCombining scales combMaxMidDelta by a glyph advance and
// combMaxBaseDelta by a glyph/word height before comparing.)
#define combMaxMidDelta 0.3
#define combMaxBaseDelta 0.4

// Text is considered diagonal if abs(tan(angle)) > diagonalThreshold.
// (Or 1/tan(angle) for 90/270 degrees.)
#define diagonalThreshold 0.1

// How opaque a selection on a glyphless font should be. Since the font is
// glyphless and overlaid over text in image form, this must enable users
// to read the underlying image. Issue #157
#define glyphlessSelectionOpacity 0.4
194 | |
// Evaluates to true when x lies in the closed interval spanned by a and
// b, whichever order they are given in: x must be neither strictly
// above both bounds nor strictly below both bounds.
// Each argument may be evaluated more than once.
#define XBetweenAB(x, a, b) (!((x) > (a) && (x) > (b)) && !((x) < (a) && (x) < (b)))
198 | |
199 | namespace { |
200 | |
201 | inline bool isAscii7(Unicode uchar) |
202 | { |
203 | return uchar < 128; |
204 | } |
205 | |
206 | } |
207 | |
208 | static int reorderText(const Unicode *text, int len, const UnicodeMap *uMap, bool primaryLR, GooString *s, Unicode *u) |
209 | { |
210 | char lre[8], rle[8], popdf[8], buf[8]; |
211 | int lreLen = 0, rleLen = 0, popdfLen = 0, n; |
212 | int nCols, i, j, k; |
213 | |
214 | nCols = 0; |
215 | |
216 | if (s) { |
217 | lreLen = uMap->mapUnicode(u: 0x202a, buf: lre, bufSize: sizeof(lre)); |
218 | rleLen = uMap->mapUnicode(u: 0x202b, buf: rle, bufSize: sizeof(rle)); |
219 | popdfLen = uMap->mapUnicode(u: 0x202c, buf: popdf, bufSize: sizeof(popdf)); |
220 | } |
221 | |
222 | if (primaryLR) { |
223 | i = 0; |
224 | while (i < len) { |
225 | // output a left-to-right section |
226 | for (j = i; j < len && !unicodeTypeR(c: text[j]); ++j) { |
227 | ; |
228 | } |
229 | for (k = i; k < j; ++k) { |
230 | if (s) { |
231 | n = uMap->mapUnicode(u: text[k], buf, bufSize: sizeof(buf)); |
232 | s->append(str: buf, lengthA: n); |
233 | } |
234 | if (u) { |
235 | u[nCols] = text[k]; |
236 | } |
237 | ++nCols; |
238 | } |
239 | i = j; |
240 | // output a right-to-left section |
241 | for (j = i; j < len && !(unicodeTypeL(c: text[j]) || unicodeTypeNum(c: text[j])); ++j) { |
242 | ; |
243 | } |
244 | if (j > i) { |
245 | if (s) { |
246 | s->append(str: rle, lengthA: rleLen); |
247 | } |
248 | for (k = j - 1; k >= i; --k) { |
249 | if (s) { |
250 | n = uMap->mapUnicode(u: text[k], buf, bufSize: sizeof(buf)); |
251 | s->append(str: buf, lengthA: n); |
252 | } |
253 | if (u) { |
254 | u[nCols] = text[k]; |
255 | } |
256 | ++nCols; |
257 | } |
258 | if (s) { |
259 | s->append(str: popdf, lengthA: popdfLen); |
260 | } |
261 | i = j; |
262 | } |
263 | } |
264 | } else { |
265 | // Note: This code treats numeric characters (European and |
266 | // Arabic/Indic) as left-to-right, which isn't strictly correct |
267 | // (incurs extra LRE/POPDF pairs), but does produce correct |
268 | // visual formatting. |
269 | if (s) { |
270 | s->append(str: rle, lengthA: rleLen); |
271 | } |
272 | i = len - 1; |
273 | while (i >= 0) { |
274 | // output a right-to-left section |
275 | for (j = i; j >= 0 && !(unicodeTypeL(c: text[j]) || unicodeTypeNum(c: text[j])); --j) { |
276 | ; |
277 | } |
278 | for (k = i; k > j; --k) { |
279 | if (s) { |
280 | n = uMap->mapUnicode(u: text[k], buf, bufSize: sizeof(buf)); |
281 | s->append(str: buf, lengthA: n); |
282 | } |
283 | if (u) { |
284 | u[nCols] = text[k]; |
285 | } |
286 | ++nCols; |
287 | } |
288 | i = j; |
289 | // output a left-to-right section |
290 | for (j = i; j >= 0 && !unicodeTypeR(c: text[j]); --j) { |
291 | ; |
292 | } |
293 | if (j < i) { |
294 | if (s) { |
295 | s->append(str: lre, lengthA: lreLen); |
296 | } |
297 | for (k = j + 1; k <= i; ++k) { |
298 | if (s) { |
299 | n = uMap->mapUnicode(u: text[k], buf, bufSize: sizeof(buf)); |
300 | s->append(str: buf, lengthA: n); |
301 | } |
302 | if (u) { |
303 | u[nCols] = text[k]; |
304 | } |
305 | ++nCols; |
306 | } |
307 | if (s) { |
308 | s->append(str: popdf, lengthA: popdfLen); |
309 | } |
310 | i = j; |
311 | } |
312 | } |
313 | if (s) { |
314 | s->append(str: popdf, lengthA: popdfLen); |
315 | } |
316 | } |
317 | |
318 | return nCols; |
319 | } |
320 | |
321 | //------------------------------------------------------------------------ |
322 | // TextUnderline |
323 | //------------------------------------------------------------------------ |
324 | |
// A straight segment from (x0, y0) to (x1, y1).
class TextUnderline
{
public:
    // Stores the two end points; horiz is set when both end points have
    // exactly the same y coordinate.
    TextUnderline(double x0A, double y0A, double x1A, double y1A) : x0(x0A), y0(y0A), x1(x1A), y1(y1A), horiz(y0A == y1A) { }
    ~TextUnderline() = default;

    double x0, y0, x1, y1;
    bool horiz;
};
341 | |
342 | //------------------------------------------------------------------------ |
343 | // TextLink |
344 | //------------------------------------------------------------------------ |
345 | |
346 | class TextLink |
347 | { |
348 | public: |
349 | TextLink(int xMinA, int yMinA, int xMaxA, int yMaxA, AnnotLink *linkA) |
350 | { |
351 | xMin = xMinA; |
352 | yMin = yMinA; |
353 | xMax = xMaxA; |
354 | yMax = yMaxA; |
355 | link = linkA; |
356 | } |
357 | ~TextLink() { } |
358 | |
359 | int xMin, yMin, xMax, yMax; |
360 | AnnotLink *link; |
361 | }; |
362 | |
363 | //------------------------------------------------------------------------ |
364 | // TextFontInfo |
365 | //------------------------------------------------------------------------ |
366 | |
367 | TextFontInfo::TextFontInfo(const GfxState *state) |
368 | { |
369 | gfxFont = state->getFont(); |
370 | #ifdef TEXTOUT_WORD_LIST |
371 | fontName = (gfxFont && gfxFont->getName()) ? new GooString(*gfxFont->getName()) : nullptr; |
372 | flags = gfxFont ? gfxFont->getFlags() : 0; |
373 | #endif |
374 | } |
375 | |
376 | TextFontInfo::~TextFontInfo() |
377 | { |
378 | #ifdef TEXTOUT_WORD_LIST |
379 | if (fontName) { |
380 | delete fontName; |
381 | } |
382 | #endif |
383 | } |
384 | |
385 | bool TextFontInfo::matches(const GfxState *state) const |
386 | { |
387 | return state->getFont() == gfxFont; |
388 | } |
389 | |
390 | bool TextFontInfo::matches(const TextFontInfo *fontInfo) const |
391 | { |
392 | return gfxFont == fontInfo->gfxFont; |
393 | } |
394 | |
395 | bool TextFontInfo::matches(const Ref *ref) const |
396 | { |
397 | return gfxFont && (*(gfxFont->getID()) == *ref); |
398 | } |
399 | |
400 | double TextFontInfo::getAscent() const |
401 | { |
402 | return gfxFont ? gfxFont->getAscent() : 0.95; |
403 | } |
404 | |
405 | double TextFontInfo::getDescent() const |
406 | { |
407 | return gfxFont ? gfxFont->getDescent() : -0.35; |
408 | } |
409 | |
410 | int TextFontInfo::getWMode() const |
411 | { |
412 | return gfxFont ? gfxFont->getWMode() : 0; |
413 | } |
414 | |
415 | //------------------------------------------------------------------------ |
416 | // TextWord |
417 | //------------------------------------------------------------------------ |
418 | |
419 | TextWord::TextWord(const GfxState *state, int rotA, double fontSizeA) |
420 | { |
421 | rot = rotA; |
422 | fontSize = fontSizeA; |
423 | spaceAfter = false; |
424 | next = nullptr; |
425 | invisible = state->getRender() == 3; |
426 | |
427 | #ifdef TEXTOUT_WORD_LIST |
428 | GfxRGB rgb; |
429 | |
430 | if ((state->getRender() & 3) == 1) { |
431 | state->getStrokeRGB(rgb: &rgb); |
432 | } else { |
433 | state->getFillRGB(rgb: &rgb); |
434 | } |
435 | colorR = colToDbl(x: rgb.r); |
436 | colorG = colToDbl(x: rgb.g); |
437 | colorB = colToDbl(x: rgb.b); |
438 | #endif |
439 | |
440 | underlined = false; |
441 | link = nullptr; |
442 | } |
443 | |
444 | TextWord::~TextWord() { } |
445 | |
446 | void TextWord::addChar(const GfxState *state, TextFontInfo *fontA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA) |
447 | { |
448 | chars.push_back(x: CharInfo { .text: u, .charcode: c, .charPos: charPosA, .edge: 0.0, .font: fontA, .textMat: textMatA }); |
449 | charPosEnd = charPosA + charLen; |
450 | |
451 | if (len() == 1) { |
452 | setInitialBounds(fontA, x, y); |
453 | } |
454 | |
455 | if (wMode) { // vertical writing mode |
456 | // NB: the rotation value has been incremented by 1 (in |
457 | // TextPage::beginWord()) for vertical writing mode |
458 | switch (rot) { |
459 | case 0: |
460 | chars.back().edge = x - fontSize; |
461 | xMax = edgeEnd = x; |
462 | break; |
463 | case 1: |
464 | chars.back().edge = y - fontSize; |
465 | yMax = edgeEnd = y; |
466 | break; |
467 | case 2: |
468 | chars.back().edge = x + fontSize; |
469 | xMin = edgeEnd = x; |
470 | break; |
471 | case 3: |
472 | chars.back().edge = y + fontSize; |
473 | yMin = edgeEnd = y; |
474 | break; |
475 | } |
476 | } else { // horizontal writing mode |
477 | switch (rot) { |
478 | case 0: |
479 | chars.back().edge = x; |
480 | xMax = edgeEnd = x + dx; |
481 | break; |
482 | case 1: |
483 | chars.back().edge = y; |
484 | yMax = edgeEnd = y + dy; |
485 | break; |
486 | case 2: |
487 | chars.back().edge = x; |
488 | xMin = edgeEnd = x + dx; |
489 | break; |
490 | case 3: |
491 | chars.back().edge = y; |
492 | yMin = edgeEnd = y + dy; |
493 | break; |
494 | } |
495 | } |
496 | } |
497 | |
498 | void TextWord::setInitialBounds(TextFontInfo *fontA, double x, double y) |
499 | { |
500 | double ascent = fontA->getAscent() * fontSize; |
501 | double descent = fontA->getDescent() * fontSize; |
502 | wMode = fontA->getWMode(); |
503 | |
504 | if (wMode) { // vertical writing mode |
505 | // NB: the rotation value has been incremented by 1 (in |
506 | // TextPage::beginWord()) for vertical writing mode |
507 | switch (rot) { |
508 | case 0: |
509 | xMin = x - fontSize; |
510 | yMin = y - fontSize; |
511 | yMax = y; |
512 | base = y; |
513 | break; |
514 | case 1: |
515 | xMin = x; |
516 | yMin = y - fontSize; |
517 | xMax = x + fontSize; |
518 | base = x; |
519 | break; |
520 | case 2: |
521 | yMin = y; |
522 | xMax = x + fontSize; |
523 | yMax = y + fontSize; |
524 | base = y; |
525 | break; |
526 | case 3: |
527 | xMin = x - fontSize; |
528 | xMax = x; |
529 | yMax = y + fontSize; |
530 | base = x; |
531 | break; |
532 | } |
533 | } else { // horizontal writing mode |
534 | switch (rot) { |
535 | case 0: |
536 | xMin = x; |
537 | yMin = y - ascent; |
538 | yMax = y - descent; |
539 | if (yMin == yMax) { |
540 | // this is a sanity check for a case that shouldn't happen -- but |
541 | // if it does happen, we want to avoid dividing by zero later |
542 | yMin = y; |
543 | yMax = y + 1; |
544 | } |
545 | base = y; |
546 | break; |
547 | case 1: |
548 | xMin = x + descent; |
549 | yMin = y; |
550 | xMax = x + ascent; |
551 | if (xMin == xMax) { |
552 | // this is a sanity check for a case that shouldn't happen -- but |
553 | // if it does happen, we want to avoid dividing by zero later |
554 | xMin = x; |
555 | xMax = x + 1; |
556 | } |
557 | base = x; |
558 | break; |
559 | case 2: |
560 | yMin = y + descent; |
561 | xMax = x; |
562 | yMax = y + ascent; |
563 | if (yMin == yMax) { |
564 | // this is a sanity check for a case that shouldn't happen -- but |
565 | // if it does happen, we want to avoid dividing by zero later |
566 | yMin = y; |
567 | yMax = y + 1; |
568 | } |
569 | base = y; |
570 | break; |
571 | case 3: |
572 | xMin = x - ascent; |
573 | xMax = x - descent; |
574 | yMax = y; |
575 | if (xMin == xMax) { |
576 | // this is a sanity check for a case that shouldn't happen -- but |
577 | // if it does happen, we want to avoid dividing by zero later |
578 | xMin = x; |
579 | xMax = x + 1; |
580 | } |
581 | base = x; |
582 | break; |
583 | } |
584 | } |
585 | } |
586 | |
587 | struct CombiningTable |
588 | { |
589 | Unicode base; |
590 | Unicode comb; |
591 | }; |
592 | |
593 | static const struct CombiningTable combiningTable[] = { |
594 | { .base: 0x0060, .comb: 0x0300 }, // grave |
595 | { .base: 0x00a8, .comb: 0x0308 }, // dieresis |
596 | { .base: 0x00af, .comb: 0x0304 }, // macron |
597 | { .base: 0x00b4, .comb: 0x0301 }, // acute |
598 | { .base: 0x00b8, .comb: 0x0327 }, // cedilla |
599 | { .base: 0x02c6, .comb: 0x0302 }, // circumflex |
600 | { .base: 0x02c7, .comb: 0x030c }, // caron |
601 | { .base: 0x02d8, .comb: 0x0306 }, // breve |
602 | { .base: 0x02d9, .comb: 0x0307 }, // dotaccent |
603 | { .base: 0x02da, .comb: 0x030a }, // ring |
604 | { .base: 0x02dc, .comb: 0x0303 }, // tilde |
605 | { .base: 0x02dd, .comb: 0x030b } // hungarumlaut (double acute accent) |
606 | }; |
607 | |
608 | // returning combining versions of characters |
609 | static Unicode getCombiningChar(Unicode u) |
610 | { |
611 | for (const CombiningTable &combining : combiningTable) { |
612 | if (u == combining.base) { |
613 | return combining.comb; |
614 | } |
615 | } |
616 | return 0; |
617 | } |
618 | |
619 | bool TextWord::addCombining(const GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA) |
620 | { |
621 | if (chars.empty() || wMode != 0 || fontA->getWMode() != 0) { |
622 | return false; |
623 | } |
624 | |
625 | Unicode cCurrent = getCombiningChar(u); |
626 | if (cCurrent != 0 && unicodeTypeAlphaNum(c: chars.back().text)) { |
627 | // Current is a combining character, previous is base character |
628 | double maxScaledMidDelta = fabs(x: edgeEnd - chars.back().edge) * combMaxMidDelta; |
629 | double charMid, charBase, maxScaledBaseDelta; |
630 | |
631 | // Test if characters overlap |
632 | if (rot == 0 || rot == 2) { |
633 | charMid = x + (dx / 2); |
634 | charBase = y; |
635 | maxScaledBaseDelta = (yMax - yMin) * combMaxBaseDelta; |
636 | } else { |
637 | charMid = y + (dy / 2); |
638 | charBase = x; |
639 | maxScaledBaseDelta = (xMax - xMin) * combMaxBaseDelta; |
640 | } |
641 | |
642 | double edgeMid = (chars.back().edge + edgeEnd) / 2; |
643 | if (fabs(x: charMid - edgeMid) >= maxScaledMidDelta || fabs(x: charBase - base) >= maxScaledBaseDelta) { |
644 | return false; |
645 | } |
646 | |
647 | // Add character, but don't adjust edge / bounding box because |
648 | // combining character's positioning could be odd. |
649 | chars.emplace_back(args: CharInfo { .text: cCurrent, .charcode: c, .charPos: charPosA, .edge: edgeMid, .font: fontA, .textMat: textMatA }); |
650 | charPosEnd = charPosA + charLen; |
651 | |
652 | return true; |
653 | } |
654 | |
655 | Unicode cPrev = getCombiningChar(u: chars.back().text); |
656 | if (cPrev != 0 && unicodeTypeAlphaNum(c: u)) { |
657 | // Previous is a combining character, current is base character |
658 | double maxScaledBaseDelta = (fontA->getAscent() - fontA->getDescent()) * fontSizeA * combMaxBaseDelta; |
659 | double charMid, charBase, maxScaledMidDelta; |
660 | |
661 | // Test if characters overlap |
662 | if (rot == 0 || rot == 2) { |
663 | charMid = x + (dx / 2); |
664 | charBase = y; |
665 | maxScaledMidDelta = fabs(x: dx * combMaxMidDelta); |
666 | } else { |
667 | charMid = y + (dy / 2); |
668 | charBase = x; |
669 | maxScaledMidDelta = fabs(x: dy * combMaxMidDelta); |
670 | } |
671 | |
672 | double edgeMid = (chars.back().edge + edgeEnd) / 2; |
673 | if (fabs(x: charMid - edgeMid) >= maxScaledMidDelta || fabs(x: charBase - base) >= maxScaledBaseDelta) { |
674 | return false; |
675 | } |
676 | |
677 | fontSize = fontSizeA; |
678 | // move combining character to after base character |
679 | chars.emplace_back(args: CharInfo { .text: cPrev, .charcode: chars.back().charcode, .charPos: charPosA, .edge: edgeMid, .font: chars.back().font, .textMat: chars.back().textMat }); |
680 | |
681 | auto &lastChar = chars[chars.size() - 2]; |
682 | |
683 | charPosEnd = charPosA + charLen; |
684 | lastChar.text = u; |
685 | lastChar.charcode = c; |
686 | lastChar.font = fontA; |
687 | lastChar.textMat = textMatA; |
688 | |
689 | if (len() == 2) { |
690 | setInitialBounds(fontA, x, y); |
691 | } |
692 | |
693 | // Updated edges / bounding box because we changed the base |
694 | // character. |
695 | if (wMode) { |
696 | // FIXME unreachable, wMode == 0 |
697 | switch (rot) { |
698 | case 0: |
699 | lastChar.edge = x - fontSize; |
700 | xMax = edgeEnd = x; |
701 | break; |
702 | case 1: |
703 | lastChar.edge = y - fontSize; |
704 | yMax = edgeEnd = y; |
705 | break; |
706 | case 2: |
707 | lastChar.edge = x + fontSize; |
708 | xMin = edgeEnd = x; |
709 | break; |
710 | case 3: |
711 | lastChar.edge = y + fontSize; |
712 | yMin = edgeEnd = y; |
713 | break; |
714 | } |
715 | } else { |
716 | switch (rot) { |
717 | case 0: |
718 | lastChar.edge = x; |
719 | xMax = edgeEnd = x + dx; |
720 | break; |
721 | case 1: |
722 | lastChar.edge = y; |
723 | yMax = edgeEnd = y + dy; |
724 | break; |
725 | case 2: |
726 | lastChar.edge = x; |
727 | xMin = edgeEnd = x + dx; |
728 | break; |
729 | case 3: |
730 | lastChar.edge = y; |
731 | yMin = edgeEnd = y + dy; |
732 | break; |
733 | } |
734 | } |
735 | |
736 | chars.back().edge = (edgeEnd + lastChar.edge) / 2; |
737 | return true; |
738 | } |
739 | return false; |
740 | } |
741 | |
742 | void TextWord::merge(TextWord *word) |
743 | { |
744 | if (word->xMin < xMin) { |
745 | xMin = word->xMin; |
746 | } |
747 | if (word->yMin < yMin) { |
748 | yMin = word->yMin; |
749 | } |
750 | if (word->xMax > xMax) { |
751 | xMax = word->xMax; |
752 | } |
753 | if (word->yMax > yMax) { |
754 | yMax = word->yMax; |
755 | } |
756 | chars.insert(position: chars.end(), first: word->chars.begin(), last: word->chars.end()); |
757 | edgeEnd = word->edgeEnd; |
758 | charPosEnd = word->charPosEnd; |
759 | } |
760 | |
761 | inline int TextWord::primaryCmp(const TextWord *word) const |
762 | { |
763 | double cmp; |
764 | |
765 | cmp = 0; // make gcc happy |
766 | switch (rot) { |
767 | case 0: |
768 | cmp = xMin - word->xMin; |
769 | break; |
770 | case 1: |
771 | cmp = yMin - word->yMin; |
772 | break; |
773 | case 2: |
774 | cmp = word->xMax - xMax; |
775 | break; |
776 | case 3: |
777 | cmp = word->yMax - yMax; |
778 | break; |
779 | } |
780 | return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
781 | } |
782 | |
783 | double TextWord::primaryDelta(const TextWord *word) const |
784 | { |
785 | double delta; |
786 | |
787 | delta = 0; // make gcc happy |
788 | switch (rot) { |
789 | case 0: |
790 | delta = word->xMin - xMax; |
791 | break; |
792 | case 1: |
793 | delta = word->yMin - yMax; |
794 | break; |
795 | case 2: |
796 | delta = xMin - word->xMax; |
797 | break; |
798 | case 3: |
799 | delta = yMin - word->yMax; |
800 | break; |
801 | } |
802 | return delta; |
803 | } |
804 | |
805 | int TextWord::cmpYX(const void *p1, const void *p2) |
806 | { |
807 | TextWord *word1 = *(TextWord **)p1; |
808 | TextWord *word2 = *(TextWord **)p2; |
809 | double cmp; |
810 | |
811 | cmp = word1->yMin - word2->yMin; |
812 | if (cmp == 0) { |
813 | cmp = word1->xMin - word2->xMin; |
814 | } |
815 | return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
816 | } |
817 | |
818 | #ifdef TEXTOUT_WORD_LIST |
819 | |
820 | GooString *TextWord::getText() const |
821 | { |
822 | GooString *s; |
823 | const UnicodeMap *uMap; |
824 | char buf[8]; |
825 | |
826 | s = new GooString(); |
827 | if (!(uMap = globalParams->getTextEncoding())) { |
828 | return s; |
829 | } |
830 | for (size_t i = 0; i < len(); ++i) { |
831 | auto n = uMap->mapUnicode(u: chars[i].text, buf, bufSize: sizeof(buf)); |
832 | s->append(str: buf, lengthA: n); |
833 | } |
834 | return s; |
835 | } |
836 | |
837 | void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const |
838 | { |
839 | if (charIdx < 0) { |
840 | return; |
841 | } |
842 | size_t uCharIdx = charIdx; |
843 | if (uCharIdx >= len()) { |
844 | return; |
845 | } |
846 | auto startingEdge = chars[uCharIdx].edge; |
847 | auto endingEdge = (uCharIdx + 1 == len()) ? edgeEnd : chars[charIdx + 1].edge; |
848 | switch (rot) { |
849 | case 0: |
850 | *xMinA = startingEdge; |
851 | *xMaxA = endingEdge; |
852 | *yMinA = yMin; |
853 | *yMaxA = yMax; |
854 | break; |
855 | case 1: |
856 | *xMinA = xMin; |
857 | *xMaxA = xMax; |
858 | *yMinA = startingEdge; |
859 | *yMaxA = endingEdge; |
860 | break; |
861 | case 2: |
862 | *xMinA = endingEdge; |
863 | *xMaxA = startingEdge; |
864 | *yMinA = yMin; |
865 | *yMaxA = yMax; |
866 | break; |
867 | case 3: |
868 | *xMinA = xMin; |
869 | *xMaxA = xMax; |
870 | *yMinA = endingEdge; |
871 | *yMaxA = startingEdge; |
872 | break; |
873 | } |
874 | } |
875 | |
876 | #endif // TEXTOUT_WORD_LIST |
877 | |
878 | //------------------------------------------------------------------------ |
879 | // TextPool |
880 | //------------------------------------------------------------------------ |
881 | |
882 | TextPool::TextPool() |
883 | { |
884 | minBaseIdx = 0; |
885 | maxBaseIdx = -1; |
886 | pool = nullptr; |
887 | cursor = nullptr; |
888 | cursorBaseIdx = -1; |
889 | } |
890 | |
891 | TextPool::~TextPool() |
892 | { |
893 | int baseIdx; |
894 | TextWord *word, *word2; |
895 | |
896 | for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) { |
897 | for (word = pool[baseIdx - minBaseIdx]; word; word = word2) { |
898 | word2 = word->next; |
899 | delete word; |
900 | } |
901 | } |
902 | gfree(p: pool); |
903 | } |
904 | |
905 | int TextPool::getBaseIdx(double base) const |
906 | { |
907 | const double baseIdxDouble = base / textPoolStep; |
908 | if (std::isnan(x: baseIdxDouble) || baseIdxDouble < minBaseIdx) { |
909 | return minBaseIdx; |
910 | } |
911 | if (baseIdxDouble > maxBaseIdx) { |
912 | return maxBaseIdx; |
913 | } |
914 | return (int)baseIdxDouble; |
915 | } |
916 | |
917 | void TextPool::addWord(TextWord *word) |
918 | { |
919 | int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx; |
920 | TextWord *w0, *w1; |
921 | |
922 | // expand the array if needed |
923 | wordBaseIdx = (int)(word->base / textPoolStep); |
924 | if (unlikely(wordBaseIdx <= INT_MIN + 128 || wordBaseIdx >= INT_MAX - 128)) { |
925 | error(category: errSyntaxWarning, pos: -1, msg: "wordBaseIdx out of range" ); |
926 | delete word; |
927 | return; |
928 | } |
929 | if (minBaseIdx > maxBaseIdx) { |
930 | minBaseIdx = wordBaseIdx - 128; |
931 | maxBaseIdx = wordBaseIdx + 128; |
932 | pool = (TextWord **)gmallocn(count: maxBaseIdx - minBaseIdx + 1, size: sizeof(TextWord *)); |
933 | for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) { |
934 | pool[baseIdx - minBaseIdx] = nullptr; |
935 | } |
936 | } else if (wordBaseIdx < minBaseIdx) { |
937 | newMinBaseIdx = wordBaseIdx - 128; |
938 | TextWord **newPool = (TextWord **)gmallocn_checkoverflow(count: maxBaseIdx - newMinBaseIdx + 1, size: sizeof(TextWord *)); |
939 | if (unlikely(!newPool)) { |
940 | error(category: errSyntaxWarning, pos: -1, msg: "newPool would overflow" ); |
941 | delete word; |
942 | return; |
943 | } |
944 | for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) { |
945 | newPool[baseIdx - newMinBaseIdx] = nullptr; |
946 | } |
947 | memcpy(dest: &newPool[minBaseIdx - newMinBaseIdx], src: pool, n: (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *)); |
948 | gfree(p: pool); |
949 | pool = newPool; |
950 | minBaseIdx = newMinBaseIdx; |
951 | } else if (wordBaseIdx > maxBaseIdx) { |
952 | newMaxBaseIdx = wordBaseIdx + 128; |
953 | TextWord **reallocatedPool = (TextWord **)greallocn(p: pool, count: newMaxBaseIdx - minBaseIdx + 1, size: sizeof(TextWord *), checkoverflow: true /*checkoverflow*/, free_p: false /*free_pool*/); |
954 | if (!reallocatedPool) { |
955 | error(category: errSyntaxWarning, pos: -1, msg: "new pool size would overflow" ); |
956 | delete word; |
957 | return; |
958 | } |
959 | pool = reallocatedPool; |
960 | for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) { |
961 | pool[baseIdx - minBaseIdx] = nullptr; |
962 | } |
963 | maxBaseIdx = newMaxBaseIdx; |
964 | } |
965 | |
966 | // insert the new word |
967 | if (cursor && wordBaseIdx == cursorBaseIdx && word->primaryCmp(word: cursor) >= 0) { |
968 | w0 = cursor; |
969 | w1 = cursor->next; |
970 | } else { |
971 | w0 = nullptr; |
972 | w1 = pool[wordBaseIdx - minBaseIdx]; |
973 | } |
974 | for (; w1 && word->primaryCmp(word: w1) > 0; w0 = w1, w1 = w1->next) { |
975 | ; |
976 | } |
977 | word->next = w1; |
978 | if (w0) { |
979 | w0->next = word; |
980 | } else { |
981 | pool[wordBaseIdx - minBaseIdx] = word; |
982 | } |
983 | cursor = word; |
984 | cursorBaseIdx = wordBaseIdx; |
985 | } |
986 | |
987 | //------------------------------------------------------------------------ |
988 | // TextLine |
989 | //------------------------------------------------------------------------ |
990 | |
991 | TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) |
992 | { |
993 | blk = blkA; |
994 | rot = rotA; |
995 | base = baseA; |
996 | words = lastWord = nullptr; |
997 | text = nullptr; |
998 | edge = nullptr; |
999 | col = nullptr; |
1000 | len = 0; |
1001 | convertedLen = 0; |
1002 | hyphenated = false; |
1003 | next = nullptr; |
1004 | xMin = yMin = 0; |
1005 | xMax = yMax = -1; |
1006 | normalized = nullptr; |
1007 | normalized_len = 0; |
1008 | normalized_idx = nullptr; |
1009 | ascii_translation = nullptr; |
1010 | ascii_len = 0; |
1011 | ascii_idx = nullptr; |
1012 | } |
1013 | |
1014 | TextLine::~TextLine() |
1015 | { |
1016 | TextWord *word; |
1017 | |
1018 | while (words) { |
1019 | word = words; |
1020 | words = words->next; |
1021 | delete word; |
1022 | } |
1023 | gfree(p: text); |
1024 | gfree(p: edge); |
1025 | gfree(p: col); |
1026 | if (normalized) { |
1027 | gfree(p: normalized); |
1028 | gfree(p: normalized_idx); |
1029 | } |
1030 | if (ascii_translation) { |
1031 | gfree(p: ascii_translation); |
1032 | gfree(p: ascii_idx); |
1033 | } |
1034 | } |
1035 | |
1036 | void TextLine::addWord(TextWord *word) |
1037 | { |
1038 | if (lastWord) { |
1039 | lastWord->next = word; |
1040 | } else { |
1041 | words = word; |
1042 | } |
1043 | lastWord = word; |
1044 | |
1045 | if (xMin > xMax) { |
1046 | xMin = word->xMin; |
1047 | xMax = word->xMax; |
1048 | yMin = word->yMin; |
1049 | yMax = word->yMax; |
1050 | } else { |
1051 | if (word->xMin < xMin) { |
1052 | xMin = word->xMin; |
1053 | } |
1054 | if (word->xMax > xMax) { |
1055 | xMax = word->xMax; |
1056 | } |
1057 | if (word->yMin < yMin) { |
1058 | yMin = word->yMin; |
1059 | } |
1060 | if (word->yMax > yMax) { |
1061 | yMax = word->yMax; |
1062 | } |
1063 | } |
1064 | } |
1065 | |
1066 | double TextLine::primaryDelta(const TextLine *line) const |
1067 | { |
1068 | double delta; |
1069 | |
1070 | delta = 0; // make gcc happy |
1071 | switch (rot) { |
1072 | case 0: |
1073 | delta = line->xMin - xMax; |
1074 | break; |
1075 | case 1: |
1076 | delta = line->yMin - yMax; |
1077 | break; |
1078 | case 2: |
1079 | delta = xMin - line->xMax; |
1080 | break; |
1081 | case 3: |
1082 | delta = yMin - line->yMax; |
1083 | break; |
1084 | } |
1085 | return delta; |
1086 | } |
1087 | |
1088 | int TextLine::primaryCmp(const TextLine *line) const |
1089 | { |
1090 | double cmp; |
1091 | |
1092 | cmp = 0; // make gcc happy |
1093 | switch (rot) { |
1094 | case 0: |
1095 | cmp = xMin - line->xMin; |
1096 | break; |
1097 | case 1: |
1098 | cmp = yMin - line->yMin; |
1099 | break; |
1100 | case 2: |
1101 | cmp = line->xMax - xMax; |
1102 | break; |
1103 | case 3: |
1104 | cmp = line->yMax - yMax; |
1105 | break; |
1106 | } |
1107 | return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
1108 | } |
1109 | |
1110 | int TextLine::secondaryCmp(const TextLine *line) const |
1111 | { |
1112 | double cmp; |
1113 | |
1114 | cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base; |
1115 | return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
1116 | } |
1117 | |
1118 | int TextLine::cmpYX(const TextLine *line) const |
1119 | { |
1120 | int cmp; |
1121 | |
1122 | if ((cmp = secondaryCmp(line))) { |
1123 | return cmp; |
1124 | } |
1125 | return primaryCmp(line); |
1126 | } |
1127 | |
1128 | int TextLine::cmpXY(const void *p1, const void *p2) |
1129 | { |
1130 | TextLine *line1 = *(TextLine **)p1; |
1131 | TextLine *line2 = *(TextLine **)p2; |
1132 | int cmp; |
1133 | |
1134 | if ((cmp = line1->primaryCmp(line: line2))) { |
1135 | return cmp; |
1136 | } |
1137 | return line1->secondaryCmp(line: line2); |
1138 | } |
1139 | |
1140 | void TextLine::coalesce(const UnicodeMap *uMap) |
1141 | { |
1142 | double space, delta, minSpace; |
1143 | bool isUnicode; |
1144 | char buf[8]; |
1145 | |
1146 | if (words->next) { |
1147 | |
1148 | // compute the inter-word space threshold |
1149 | if (words->len() > 1 || words->next->len() > 1) { |
1150 | minSpace = 0; |
1151 | } else { |
1152 | minSpace = words->primaryDelta(word: words->next); |
1153 | for (auto word0 = words->next, word1 = word0->next; word1 && minSpace > 0; word0 = word1, word1 = word0->next) { |
1154 | if (word1->len() > 1) { |
1155 | minSpace = 0; |
1156 | } |
1157 | delta = word0->primaryDelta(word: word1); |
1158 | if (delta < minSpace) { |
1159 | minSpace = delta; |
1160 | } |
1161 | } |
1162 | } |
1163 | if (minSpace <= 0) { |
1164 | space = maxCharSpacing * words->fontSize; |
1165 | } else { |
1166 | space = maxWideCharSpacingMul * minSpace; |
1167 | if (space > maxWideCharSpacing * words->fontSize) { |
1168 | space = maxWideCharSpacing * words->fontSize; |
1169 | } |
1170 | } |
1171 | |
1172 | // merge words |
1173 | auto word0 = words; |
1174 | auto word1 = words->next; |
1175 | while (word1) { |
1176 | if (word0->primaryDelta(word: word1) >= space) { |
1177 | word0->spaceAfter = true; |
1178 | word0 = word1; |
1179 | word1 = word1->next; |
1180 | } else if (word0->chars.back().font == word1->chars.front().font // |
1181 | && word0->underlined == word1->underlined // |
1182 | && fabs(x: word0->fontSize - word1->fontSize) < maxWordFontSizeDelta * words->fontSize // |
1183 | && word1->chars.front().charPos == word0->charPosEnd) { |
1184 | word0->merge(word: word1); |
1185 | word0->next = word1->next; |
1186 | delete word1; |
1187 | word1 = word0->next; |
1188 | } else { |
1189 | word0 = word1; |
1190 | word1 = word1->next; |
1191 | } |
1192 | } |
1193 | } |
1194 | |
1195 | // build the line text |
1196 | isUnicode = uMap ? uMap->isUnicode() : false; |
1197 | len = 0; |
1198 | for (auto word1 = words; word1; word1 = word1->next) { |
1199 | len += word1->len(); |
1200 | if (word1->spaceAfter) { |
1201 | ++len; |
1202 | } |
1203 | } |
1204 | text = (Unicode *)gmallocn(count: len, size: sizeof(Unicode)); |
1205 | edge = (double *)gmallocn(count: len + 1, size: sizeof(double)); |
1206 | size_t i = 0; |
1207 | for (auto word1 = words; word1; word1 = word1->next) { |
1208 | for (size_t j = 0; j < word1->len(); ++j) { |
1209 | text[i] = word1->chars[j].text; |
1210 | edge[i] = word1->chars[j].edge; |
1211 | ++i; |
1212 | } |
1213 | edge[i] = word1->edgeEnd; |
1214 | if (word1->spaceAfter) { |
1215 | text[i] = (Unicode)0x0020; |
1216 | ++i; |
1217 | } |
1218 | } |
1219 | |
1220 | // compute convertedLen and set up the col array |
1221 | col = (int *)gmallocn(count: len + 1, size: sizeof(int)); |
1222 | convertedLen = 0; |
1223 | for (int ci = 0; ci < len; ++ci) { |
1224 | col[ci] = convertedLen; |
1225 | if (isUnicode) { |
1226 | ++convertedLen; |
1227 | } else if (uMap) { |
1228 | convertedLen += uMap->mapUnicode(u: text[ci], buf, bufSize: sizeof(buf)); |
1229 | } |
1230 | } |
1231 | col[len] = convertedLen; |
1232 | |
1233 | // check for hyphen at end of line |
1234 | //~ need to check for other chars used as hyphens |
1235 | hyphenated = text[len - 1] == (Unicode)'-'; |
1236 | } |
1237 | |
1238 | //------------------------------------------------------------------------ |
1239 | // TextLineFrag |
1240 | //------------------------------------------------------------------------ |
1241 | |
1242 | class TextLineFrag |
1243 | { |
1244 | public: |
1245 | TextLine *line; // the line object |
1246 | int start, len; // offset and length of this fragment |
1247 | // (in Unicode chars) |
1248 | double xMin, xMax; // bounding box coordinates |
1249 | double yMin, yMax; |
1250 | double base; // baseline virtual coordinate |
1251 | int col; // first column |
1252 | |
1253 | void init(TextLine *lineA, int startA, int lenA); |
1254 | void computeCoords(bool oneRot); |
1255 | |
1256 | static int cmpYXPrimaryRot(const void *p1, const void *p2); |
1257 | static int cmpYXLineRot(const void *p1, const void *p2); |
1258 | static int cmpXYLineRot(const void *p1, const void *p2); |
1259 | static int cmpXYColumnPrimaryRot(const void *p1, const void *p2); |
1260 | static int cmpXYColumnLineRot(const void *p1, const void *p2); |
1261 | }; |
1262 | |
1263 | void TextLineFrag::init(TextLine *lineA, int startA, int lenA) |
1264 | { |
1265 | line = lineA; |
1266 | start = startA; |
1267 | len = lenA; |
1268 | col = line->col[start]; |
1269 | } |
1270 | |
1271 | void TextLineFrag::computeCoords(bool oneRot) |
1272 | { |
1273 | TextBlock *blk; |
1274 | double d0, d1, d2, d3, d4; |
1275 | |
1276 | if (oneRot) { |
1277 | |
1278 | switch (line->rot) { |
1279 | case 0: |
1280 | xMin = line->edge[start]; |
1281 | xMax = line->edge[start + len]; |
1282 | yMin = line->yMin; |
1283 | yMax = line->yMax; |
1284 | break; |
1285 | case 1: |
1286 | xMin = line->xMin; |
1287 | xMax = line->xMax; |
1288 | yMin = line->edge[start]; |
1289 | yMax = line->edge[start + len]; |
1290 | break; |
1291 | case 2: |
1292 | xMin = line->edge[start + len]; |
1293 | xMax = line->edge[start]; |
1294 | yMin = line->yMin; |
1295 | yMax = line->yMax; |
1296 | break; |
1297 | case 3: |
1298 | xMin = line->xMin; |
1299 | xMax = line->xMax; |
1300 | yMin = line->edge[start + len]; |
1301 | yMax = line->edge[start]; |
1302 | break; |
1303 | } |
1304 | base = line->base; |
1305 | |
1306 | } else { |
1307 | |
1308 | if (line->rot == 0 && line->blk->page->primaryRot == 0) { |
1309 | |
1310 | xMin = line->edge[start]; |
1311 | xMax = line->edge[start + len]; |
1312 | yMin = line->yMin; |
1313 | yMax = line->yMax; |
1314 | base = line->base; |
1315 | |
1316 | } else { |
1317 | |
1318 | blk = line->blk; |
1319 | d0 = line->edge[start]; |
1320 | d1 = line->edge[start + len]; |
1321 | d2 = d3 = d4 = 0; // make gcc happy |
1322 | |
1323 | switch (line->rot) { |
1324 | case 0: |
1325 | d2 = line->yMin; |
1326 | d3 = line->yMax; |
1327 | d4 = line->base; |
1328 | d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin); |
1329 | d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin); |
1330 | d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin); |
1331 | d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin); |
1332 | d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin); |
1333 | break; |
1334 | case 1: |
1335 | d2 = line->xMax; |
1336 | d3 = line->xMin; |
1337 | d4 = line->base; |
1338 | d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin); |
1339 | d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin); |
1340 | d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin); |
1341 | d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin); |
1342 | d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin); |
1343 | break; |
1344 | case 2: |
1345 | d2 = line->yMax; |
1346 | d3 = line->yMin; |
1347 | d4 = line->base; |
1348 | d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin); |
1349 | d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin); |
1350 | d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin); |
1351 | d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin); |
1352 | d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin); |
1353 | break; |
1354 | case 3: |
1355 | d2 = line->xMin; |
1356 | d3 = line->xMax; |
1357 | d4 = line->base; |
1358 | d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin); |
1359 | d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin); |
1360 | d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin); |
1361 | d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin); |
1362 | d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin); |
1363 | break; |
1364 | } |
1365 | |
1366 | switch (line->blk->page->primaryRot) { |
1367 | case 0: |
1368 | xMin = blk->xMin + d0 * (blk->xMax - blk->xMin); |
1369 | xMax = blk->xMin + d1 * (blk->xMax - blk->xMin); |
1370 | yMin = blk->yMin + d2 * (blk->yMax - blk->yMin); |
1371 | yMax = blk->yMin + d3 * (blk->yMax - blk->yMin); |
1372 | base = blk->yMin + d4 * (blk->yMax - blk->yMin); |
1373 | break; |
1374 | case 1: |
1375 | xMin = blk->xMax - d3 * (blk->xMax - blk->xMin); |
1376 | xMax = blk->xMax - d2 * (blk->xMax - blk->xMin); |
1377 | yMin = blk->yMin + d0 * (blk->yMax - blk->yMin); |
1378 | yMax = blk->yMin + d1 * (blk->yMax - blk->yMin); |
1379 | base = blk->xMax - d4 * (blk->xMax - blk->xMin); |
1380 | break; |
1381 | case 2: |
1382 | xMin = blk->xMax - d1 * (blk->xMax - blk->xMin); |
1383 | xMax = blk->xMax - d0 * (blk->xMax - blk->xMin); |
1384 | yMin = blk->yMax - d3 * (blk->yMax - blk->yMin); |
1385 | yMax = blk->yMax - d2 * (blk->yMax - blk->yMin); |
1386 | base = blk->yMax - d4 * (blk->yMax - blk->yMin); |
1387 | break; |
1388 | case 3: |
1389 | xMin = blk->xMin + d2 * (blk->xMax - blk->xMin); |
1390 | xMax = blk->xMin + d3 * (blk->xMax - blk->xMin); |
1391 | yMin = blk->yMax - d1 * (blk->yMax - blk->yMin); |
1392 | yMax = blk->yMax - d0 * (blk->yMax - blk->yMin); |
1393 | base = blk->xMin + d4 * (blk->xMax - blk->xMin); |
1394 | break; |
1395 | } |
1396 | } |
1397 | } |
1398 | } |
1399 | |
1400 | int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2) |
1401 | { |
1402 | TextLineFrag *frag1 = (TextLineFrag *)p1; |
1403 | TextLineFrag *frag2 = (TextLineFrag *)p2; |
1404 | double cmp; |
1405 | |
1406 | cmp = 0; // make gcc happy |
1407 | switch (frag1->line->blk->page->primaryRot) { |
1408 | case 0: |
1409 | if (fabs(x: cmp = frag1->yMin - frag2->yMin) < 0.01) { |
1410 | cmp = frag1->xMin - frag2->xMin; |
1411 | } |
1412 | break; |
1413 | case 1: |
1414 | if (fabs(x: cmp = frag2->xMax - frag1->xMax) < 0.01) { |
1415 | cmp = frag1->yMin - frag2->yMin; |
1416 | } |
1417 | break; |
1418 | case 2: |
1419 | if (fabs(x: cmp = frag2->yMin - frag1->yMin) < 0.01) { |
1420 | cmp = frag2->xMax - frag1->xMax; |
1421 | } |
1422 | break; |
1423 | case 3: |
1424 | if (fabs(x: cmp = frag1->xMax - frag2->xMax) < 0.01) { |
1425 | cmp = frag2->yMax - frag1->yMax; |
1426 | } |
1427 | break; |
1428 | } |
1429 | return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
1430 | } |
1431 | |
1432 | int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2) |
1433 | { |
1434 | TextLineFrag *frag1 = (TextLineFrag *)p1; |
1435 | TextLineFrag *frag2 = (TextLineFrag *)p2; |
1436 | double cmp; |
1437 | |
1438 | cmp = 0; // make gcc happy |
1439 | switch (frag1->line->rot) { |
1440 | case 0: |
1441 | if ((cmp = frag1->yMin - frag2->yMin) == 0) { |
1442 | cmp = frag1->xMin - frag2->xMin; |
1443 | } |
1444 | break; |
1445 | case 1: |
1446 | if ((cmp = frag2->xMax - frag1->xMax) == 0) { |
1447 | cmp = frag1->yMin - frag2->yMin; |
1448 | } |
1449 | break; |
1450 | case 2: |
1451 | if ((cmp = frag2->yMin - frag1->yMin) == 0) { |
1452 | cmp = frag2->xMax - frag1->xMax; |
1453 | } |
1454 | break; |
1455 | case 3: |
1456 | if ((cmp = frag1->xMax - frag2->xMax) == 0) { |
1457 | cmp = frag2->yMax - frag1->yMax; |
1458 | } |
1459 | break; |
1460 | } |
1461 | return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
1462 | } |
1463 | |
1464 | int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2) |
1465 | { |
1466 | TextLineFrag *frag1 = (TextLineFrag *)p1; |
1467 | TextLineFrag *frag2 = (TextLineFrag *)p2; |
1468 | double cmp; |
1469 | |
1470 | cmp = 0; // make gcc happy |
1471 | switch (frag1->line->rot) { |
1472 | case 0: |
1473 | if ((cmp = frag1->xMin - frag2->xMin) == 0) { |
1474 | cmp = frag1->yMin - frag2->yMin; |
1475 | } |
1476 | break; |
1477 | case 1: |
1478 | if ((cmp = frag1->yMin - frag2->yMin) == 0) { |
1479 | cmp = frag2->xMax - frag1->xMax; |
1480 | } |
1481 | break; |
1482 | case 2: |
1483 | if ((cmp = frag2->xMax - frag1->xMax) == 0) { |
1484 | cmp = frag2->yMin - frag1->yMin; |
1485 | } |
1486 | break; |
1487 | case 3: |
1488 | if ((cmp = frag2->yMax - frag1->yMax) == 0) { |
1489 | cmp = frag1->xMax - frag2->xMax; |
1490 | } |
1491 | break; |
1492 | } |
1493 | return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
1494 | } |
1495 | |
1496 | int TextLineFrag::cmpXYColumnPrimaryRot(const void *p1, const void *p2) |
1497 | { |
1498 | TextLineFrag *frag1 = (TextLineFrag *)p1; |
1499 | TextLineFrag *frag2 = (TextLineFrag *)p2; |
1500 | double cmp; |
1501 | |
1502 | // if columns overlap, compare y values |
1503 | if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] - frag2->line->col[frag2->start]) && frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start])) { |
1504 | cmp = 0; // make gcc happy |
1505 | switch (frag1->line->blk->page->primaryRot) { |
1506 | case 0: |
1507 | cmp = frag1->yMin - frag2->yMin; |
1508 | break; |
1509 | case 1: |
1510 | cmp = frag2->xMax - frag1->xMax; |
1511 | break; |
1512 | case 2: |
1513 | cmp = frag2->yMin - frag1->yMin; |
1514 | break; |
1515 | case 3: |
1516 | cmp = frag1->xMax - frag2->xMax; |
1517 | break; |
1518 | } |
1519 | return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
1520 | } |
1521 | |
1522 | // otherwise, compare starting column |
1523 | return frag1->col - frag2->col; |
1524 | } |
1525 | |
1526 | int TextLineFrag::cmpXYColumnLineRot(const void *p1, const void *p2) |
1527 | { |
1528 | TextLineFrag *frag1 = (TextLineFrag *)p1; |
1529 | TextLineFrag *frag2 = (TextLineFrag *)p2; |
1530 | double cmp; |
1531 | |
1532 | // if columns overlap, compare y values |
1533 | if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] - frag2->line->col[frag2->start]) && frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start])) { |
1534 | cmp = 0; // make gcc happy |
1535 | switch (frag1->line->rot) { |
1536 | case 0: |
1537 | cmp = frag1->yMin - frag2->yMin; |
1538 | break; |
1539 | case 1: |
1540 | cmp = frag2->xMax - frag1->xMax; |
1541 | break; |
1542 | case 2: |
1543 | cmp = frag2->yMin - frag1->yMin; |
1544 | break; |
1545 | case 3: |
1546 | cmp = frag1->xMax - frag2->xMax; |
1547 | break; |
1548 | } |
1549 | return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
1550 | } |
1551 | |
1552 | // otherwise, compare starting column |
1553 | return frag1->col - frag2->col; |
1554 | } |
1555 | |
1556 | //------------------------------------------------------------------------ |
1557 | // TextBlock |
1558 | //------------------------------------------------------------------------ |
1559 | |
1560 | TextBlock::TextBlock(TextPage *pageA, int rotA) |
1561 | { |
1562 | page = pageA; |
1563 | rot = rotA; |
1564 | xMin = yMin = 0; |
1565 | xMax = yMax = -1; |
1566 | priMin = 0; |
1567 | priMax = page->pageWidth; |
1568 | pool = new TextPool(); |
1569 | lines = nullptr; |
1570 | curLine = nullptr; |
1571 | next = nullptr; |
1572 | stackNext = nullptr; |
1573 | tableId = -1; |
1574 | tableEnd = false; |
1575 | } |
1576 | |
1577 | TextBlock::~TextBlock() |
1578 | { |
1579 | TextLine *line; |
1580 | |
1581 | delete pool; |
1582 | while (lines) { |
1583 | line = lines; |
1584 | lines = lines->next; |
1585 | delete line; |
1586 | } |
1587 | } |
1588 | |
1589 | void TextBlock::addWord(TextWord *word) |
1590 | { |
1591 | pool->addWord(word); |
1592 | if (xMin > xMax) { |
1593 | xMin = word->xMin; |
1594 | xMax = word->xMax; |
1595 | yMin = word->yMin; |
1596 | yMax = word->yMax; |
1597 | } else { |
1598 | if (word->xMin < xMin) { |
1599 | xMin = word->xMin; |
1600 | } |
1601 | if (word->xMax > xMax) { |
1602 | xMax = word->xMax; |
1603 | } |
1604 | if (word->yMin < yMin) { |
1605 | yMin = word->yMin; |
1606 | } |
1607 | if (word->yMax > yMax) { |
1608 | yMax = word->yMax; |
1609 | } |
1610 | } |
1611 | } |
1612 | |
1613 | void TextBlock::coalesce(const UnicodeMap *uMap, double fixedPitch) |
1614 | { |
1615 | // discard duplicated text (fake boldface, drop shadows) |
1616 | for (int idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) { |
1617 | // Get the first LHS word from the pool |
1618 | TextWord *word0 = pool->getPool(baseIdx: idx0); |
1619 | |
1620 | while (word0) { |
1621 | double priDelta = dupMaxPriDelta * word0->fontSize; |
1622 | double secDelta = dupMaxSecDelta * word0->fontSize; |
1623 | double xDelta = ((rot == 0) || (rot == 2)) ? priDelta : secDelta; |
1624 | double yDelta = ((rot == 0) || (rot == 2)) ? secDelta : priDelta; |
1625 | |
1626 | int maxBaseIdx = pool->getBaseIdx(base: word0->base + secDelta); |
1627 | |
1628 | for (int idx1 = idx0; idx1 <= maxBaseIdx; idx1++) { |
1629 | TextWord *prevWord; |
1630 | /* In case the RHS word is from the same pool as the LHS word, |
1631 | * start the inner loop with the word following the LHS word. |
1632 | * Otherwise, start with the second word from the subsequent pools |
1633 | * - the first word is compared at the end. |
1634 | */ |
1635 | if (idx0 == idx1) { |
1636 | prevWord = word0; |
1637 | } else { |
1638 | prevWord = pool->getPool(baseIdx: idx1); |
1639 | if (!prevWord) { |
1640 | continue; |
1641 | } |
1642 | } |
1643 | TextWord *word1 = prevWord->next; |
1644 | |
1645 | auto equalText = [](const TextWord &w1, const TextWord &w2) -> bool { // |
1646 | return std::equal(first1: w1.chars.begin(), last1: w1.chars.end(), first2: w2.chars.begin(), last2: w2.chars.end(), // |
1647 | binary_pred: [](auto c1, auto c2) { return c1.text == c2.text; }); |
1648 | }; |
1649 | auto match = [&equalText, xDelta, yDelta](const TextWord &w1, const TextWord &w2) -> bool { |
1650 | if (!equalText(w1, w2)) { |
1651 | return false; |
1652 | } |
1653 | return fabs(x: w1.xMin - w2.xMin) < xDelta && fabs(x: w1.xMax - w2.xMax) < xDelta // |
1654 | && fabs(x: w1.yMin - w2.yMin) < yDelta && fabs(x: w1.yMax - w2.yMax) < yDelta; |
1655 | }; |
1656 | |
1657 | while (word1) { |
1658 | if (match(*word0, *word1)) { |
1659 | prevWord->next = word1->next; |
1660 | delete word1; |
1661 | word1 = prevWord->next; |
1662 | } else { |
1663 | prevWord = word1; |
1664 | word1 = word1->next; |
1665 | } |
1666 | } |
1667 | |
1668 | // Check the first word from each subsequent pool |
1669 | if (idx0 != idx1) { |
1670 | word1 = pool->getPool(baseIdx: idx1); |
1671 | } |
1672 | if (word1 && match(*word0, *word1)) { |
1673 | pool->setPool(baseIdx: idx1, p: word1->next); |
1674 | delete word1; |
1675 | } |
1676 | } |
1677 | |
1678 | word0 = word0->next; |
1679 | } |
1680 | } |
1681 | |
1682 | TextWord *word0, *word1; |
1683 | TextWord *bestWord0, *bestWord1, *lastWord; |
1684 | TextLine *line, *line0, *line1; |
1685 | TextLine **lineArray; |
1686 | int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx; |
1687 | int baseIdx, bestWordBaseIdx; |
1688 | double minBase, maxBase; |
1689 | double fontSize, wordSpacing, delta; |
1690 | bool overlap; |
1691 | int col1, col2; |
1692 | int i, j, k; |
1693 | |
1694 | // build the lines |
1695 | curLine = nullptr; |
1696 | poolMinBaseIdx = pool->minBaseIdx; |
1697 | charCount = 0; |
1698 | nLines = 0; |
1699 | while (true) { |
1700 | |
1701 | // find the first non-empty line in the pool |
1702 | for (; poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(baseIdx: poolMinBaseIdx); ++poolMinBaseIdx) { |
1703 | ; |
1704 | } |
1705 | if (poolMinBaseIdx > pool->maxBaseIdx) { |
1706 | break; |
1707 | } |
1708 | |
1709 | // look for the left-most word in the first four lines of the |
1710 | // pool -- this avoids starting with a superscript word |
1711 | startBaseIdx = poolMinBaseIdx; |
1712 | for (baseIdx = poolMinBaseIdx + 1; baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx; ++baseIdx) { |
1713 | if (!pool->getPool(baseIdx)) { |
1714 | continue; |
1715 | } |
1716 | if (pool->getPool(baseIdx)->primaryCmp(word: pool->getPool(baseIdx: startBaseIdx)) < 0) { |
1717 | startBaseIdx = baseIdx; |
1718 | } |
1719 | } |
1720 | |
1721 | // create a new line |
1722 | word0 = pool->getPool(baseIdx: startBaseIdx); |
1723 | pool->setPool(baseIdx: startBaseIdx, p: word0->next); |
1724 | word0->next = nullptr; |
1725 | line = new TextLine(this, word0->rot, word0->base); |
1726 | line->addWord(word: word0); |
1727 | lastWord = word0; |
1728 | |
1729 | // compute the search range |
1730 | fontSize = word0->fontSize; |
1731 | minBase = word0->base - maxIntraLineDelta * fontSize; |
1732 | maxBase = word0->base + maxIntraLineDelta * fontSize; |
1733 | minBaseIdx = pool->getBaseIdx(base: minBase); |
1734 | maxBaseIdx = pool->getBaseIdx(base: maxBase); |
1735 | wordSpacing = fixedPitch ? fixedPitch : maxWordSpacing * fontSize; |
1736 | |
1737 | // find the rest of the words in this line |
1738 | while (true) { |
1739 | |
1740 | // find the left-most word whose baseline is in the range for |
1741 | // this line |
1742 | bestWordBaseIdx = 0; |
1743 | bestWord0 = bestWord1 = nullptr; |
1744 | overlap = false; |
1745 | for (baseIdx = minBaseIdx; !overlap && baseIdx <= maxBaseIdx; ++baseIdx) { |
1746 | for (word0 = nullptr, word1 = pool->getPool(baseIdx); word1; word0 = word1, word1 = word1->next) { |
1747 | if (word1->base >= minBase && word1->base <= maxBase) { |
1748 | delta = lastWord->primaryDelta(word: word1); |
1749 | if (delta < minCharSpacing * fontSize) { |
1750 | overlap = true; |
1751 | break; |
1752 | } else { |
1753 | if (delta < wordSpacing && (!bestWord1 || word1->primaryCmp(word: bestWord1) < 0)) { |
1754 | bestWordBaseIdx = baseIdx; |
1755 | bestWord0 = word0; |
1756 | bestWord1 = word1; |
1757 | } |
1758 | break; |
1759 | } |
1760 | } |
1761 | } |
1762 | } |
1763 | if (overlap || !bestWord1) { |
1764 | break; |
1765 | } |
1766 | |
1767 | // remove it from the pool, and add it to the line |
1768 | if (bestWord0) { |
1769 | bestWord0->next = bestWord1->next; |
1770 | } else { |
1771 | pool->setPool(baseIdx: bestWordBaseIdx, p: bestWord1->next); |
1772 | } |
1773 | bestWord1->next = nullptr; |
1774 | line->addWord(word: bestWord1); |
1775 | lastWord = bestWord1; |
1776 | } |
1777 | |
1778 | // add the line |
1779 | if (curLine && line->cmpYX(line: curLine) > 0) { |
1780 | line0 = curLine; |
1781 | line1 = curLine->next; |
1782 | } else { |
1783 | line0 = nullptr; |
1784 | line1 = lines; |
1785 | } |
1786 | for (; line1 && line->cmpYX(line: line1) > 0; line0 = line1, line1 = line1->next) { |
1787 | ; |
1788 | } |
1789 | if (line0) { |
1790 | line0->next = line; |
1791 | } else { |
1792 | lines = line; |
1793 | } |
1794 | line->next = line1; |
1795 | curLine = line; |
1796 | line->coalesce(uMap); |
1797 | charCount += line->len; |
1798 | ++nLines; |
1799 | } |
1800 | |
1801 | // sort lines into xy order for column assignment |
1802 | lineArray = (TextLine **)gmallocn(count: nLines, size: sizeof(TextLine *)); |
1803 | for (line = lines, i = 0; line; line = line->next, ++i) { |
1804 | lineArray[i] = line; |
1805 | } |
1806 | qsort(base: lineArray, nmemb: nLines, size: sizeof(TextLine *), compar: &TextLine::cmpXY); |
1807 | |
1808 | // column assignment |
1809 | nColumns = 0; |
1810 | if (fixedPitch) { |
1811 | for (i = 0; i < nLines; ++i) { |
1812 | line0 = lineArray[i]; |
1813 | col1 = 0; // make gcc happy |
1814 | switch (rot) { |
1815 | case 0: |
1816 | col1 = (int)((line0->xMin - xMin) / fixedPitch + 0.5); |
1817 | break; |
1818 | case 1: |
1819 | col1 = (int)((line0->yMin - yMin) / fixedPitch + 0.5); |
1820 | break; |
1821 | case 2: |
1822 | col1 = (int)((xMax - line0->xMax) / fixedPitch + 0.5); |
1823 | break; |
1824 | case 3: |
1825 | col1 = (int)((yMax - line0->yMax) / fixedPitch + 0.5); |
1826 | break; |
1827 | } |
1828 | for (k = 0; k <= line0->len; ++k) { |
1829 | line0->col[k] += col1; |
1830 | } |
1831 | if (line0->col[line0->len] > nColumns) { |
1832 | nColumns = line0->col[line0->len]; |
1833 | } |
1834 | } |
1835 | } else { |
1836 | for (i = 0; i < nLines; ++i) { |
1837 | line0 = lineArray[i]; |
1838 | col1 = 0; |
1839 | for (j = 0; j < i; ++j) { |
1840 | line1 = lineArray[j]; |
1841 | if (line1->primaryDelta(line: line0) >= 0) { |
1842 | col2 = line1->col[line1->len] + 1; |
1843 | } else { |
1844 | k = 0; // make gcc happy |
1845 | switch (rot) { |
1846 | case 0: |
1847 | for (k = 0; k < line1->len && line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k) { |
1848 | ; |
1849 | } |
1850 | break; |
1851 | case 1: |
1852 | for (k = 0; k < line1->len && line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k) { |
1853 | ; |
1854 | } |
1855 | break; |
1856 | case 2: |
1857 | for (k = 0; k < line1->len && line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k) { |
1858 | ; |
1859 | } |
1860 | break; |
1861 | case 3: |
1862 | for (k = 0; k < line1->len && line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k) { |
1863 | ; |
1864 | } |
1865 | break; |
1866 | } |
1867 | col2 = line1->col[k]; |
1868 | } |
1869 | if (col2 > col1) { |
1870 | col1 = col2; |
1871 | } |
1872 | } |
1873 | for (k = 0; k <= line0->len; ++k) { |
1874 | line0->col[k] += col1; |
1875 | } |
1876 | if (line0->col[line0->len] > nColumns) { |
1877 | nColumns = line0->col[line0->len]; |
1878 | } |
1879 | } |
1880 | } |
1881 | gfree(p: lineArray); |
1882 | } |
1883 | |
1884 | void TextBlock::updatePriMinMax(const TextBlock *blk) |
1885 | { |
1886 | double newPriMin, newPriMax; |
1887 | bool gotPriMin, gotPriMax; |
1888 | |
1889 | gotPriMin = gotPriMax = false; |
1890 | newPriMin = newPriMax = 0; // make gcc happy |
1891 | switch (page->primaryRot) { |
1892 | case 0: |
1893 | case 2: |
1894 | if (blk->yMin < yMax && blk->yMax > yMin) { |
1895 | if (blk->xMin < xMin) { |
1896 | newPriMin = blk->xMax; |
1897 | gotPriMin = true; |
1898 | } |
1899 | if (blk->xMax > xMax) { |
1900 | newPriMax = blk->xMin; |
1901 | gotPriMax = true; |
1902 | } |
1903 | } |
1904 | break; |
1905 | case 1: |
1906 | case 3: |
1907 | if (blk->xMin < xMax && blk->xMax > xMin) { |
1908 | if (blk->yMin < yMin) { |
1909 | newPriMin = blk->yMax; |
1910 | gotPriMin = true; |
1911 | } |
1912 | if (blk->yMax > yMax) { |
1913 | newPriMax = blk->yMin; |
1914 | gotPriMax = true; |
1915 | } |
1916 | } |
1917 | break; |
1918 | } |
1919 | if (gotPriMin) { |
1920 | if (newPriMin > xMin) { |
1921 | newPriMin = xMin; |
1922 | } |
1923 | if (newPriMin > priMin) { |
1924 | priMin = newPriMin; |
1925 | } |
1926 | } |
1927 | if (gotPriMax) { |
1928 | if (newPriMax < xMax) { |
1929 | newPriMax = xMax; |
1930 | } |
1931 | if (newPriMax < priMax) { |
1932 | priMax = newPriMax; |
1933 | } |
1934 | } |
1935 | } |
1936 | |
1937 | int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2) |
1938 | { |
1939 | TextBlock *blk1 = *(TextBlock **)p1; |
1940 | TextBlock *blk2 = *(TextBlock **)p2; |
1941 | double cmp; |
1942 | |
1943 | cmp = 0; // make gcc happy |
1944 | switch (blk1->page->primaryRot) { |
1945 | case 0: |
1946 | if ((cmp = blk1->xMin - blk2->xMin) == 0) { |
1947 | cmp = blk1->yMin - blk2->yMin; |
1948 | } |
1949 | break; |
1950 | case 1: |
1951 | if ((cmp = blk1->yMin - blk2->yMin) == 0) { |
1952 | cmp = blk2->xMax - blk1->xMax; |
1953 | } |
1954 | break; |
1955 | case 2: |
1956 | if ((cmp = blk2->xMax - blk1->xMax) == 0) { |
1957 | cmp = blk2->yMin - blk1->yMin; |
1958 | } |
1959 | break; |
1960 | case 3: |
1961 | if ((cmp = blk2->yMax - blk1->yMax) == 0) { |
1962 | cmp = blk1->xMax - blk2->xMax; |
1963 | } |
1964 | break; |
1965 | } |
1966 | return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
1967 | } |
1968 | |
1969 | int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2) |
1970 | { |
1971 | TextBlock *blk1 = *(TextBlock **)p1; |
1972 | TextBlock *blk2 = *(TextBlock **)p2; |
1973 | double cmp; |
1974 | |
1975 | cmp = 0; // make gcc happy |
1976 | switch (blk1->page->primaryRot) { |
1977 | case 0: |
1978 | if ((cmp = blk1->yMin - blk2->yMin) == 0) { |
1979 | cmp = blk1->xMin - blk2->xMin; |
1980 | } |
1981 | break; |
1982 | case 1: |
1983 | if ((cmp = blk2->xMax - blk1->xMax) == 0) { |
1984 | cmp = blk1->yMin - blk2->yMin; |
1985 | } |
1986 | break; |
1987 | case 2: |
1988 | if ((cmp = blk2->yMin - blk1->yMin) == 0) { |
1989 | cmp = blk2->xMax - blk1->xMax; |
1990 | } |
1991 | break; |
1992 | case 3: |
1993 | if ((cmp = blk1->xMax - blk2->xMax) == 0) { |
1994 | cmp = blk2->yMax - blk1->yMax; |
1995 | } |
1996 | break; |
1997 | } |
1998 | return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
1999 | } |
2000 | |
2001 | int TextBlock::primaryCmp(const TextBlock *blk) const |
2002 | { |
2003 | double cmp; |
2004 | |
2005 | cmp = 0; // make gcc happy |
2006 | switch (rot) { |
2007 | case 0: |
2008 | cmp = xMin - blk->xMin; |
2009 | break; |
2010 | case 1: |
2011 | cmp = yMin - blk->yMin; |
2012 | break; |
2013 | case 2: |
2014 | cmp = blk->xMax - xMax; |
2015 | break; |
2016 | case 3: |
2017 | cmp = blk->yMax - yMax; |
2018 | break; |
2019 | } |
2020 | return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; |
2021 | } |
2022 | |
2023 | double TextBlock::secondaryDelta(const TextBlock *blk) const |
2024 | { |
2025 | double delta; |
2026 | |
2027 | delta = 0; // make gcc happy |
2028 | switch (rot) { |
2029 | case 0: |
2030 | delta = blk->yMin - yMax; |
2031 | break; |
2032 | case 1: |
2033 | delta = xMin - blk->xMax; |
2034 | break; |
2035 | case 2: |
2036 | delta = yMin - blk->yMax; |
2037 | break; |
2038 | case 3: |
2039 | delta = blk->xMin - xMax; |
2040 | break; |
2041 | } |
2042 | return delta; |
2043 | } |
2044 | |
2045 | bool TextBlock::isBelow(const TextBlock *blk) const |
2046 | { |
2047 | bool below; |
2048 | |
2049 | below = false; // make gcc happy |
2050 | switch (page->primaryRot) { |
2051 | case 0: |
2052 | below = xMin >= blk->priMin && xMax <= blk->priMax && yMin > blk->yMin; |
2053 | break; |
2054 | case 1: |
2055 | below = yMin >= blk->priMin && yMax <= blk->priMax && xMax < blk->xMax; |
2056 | break; |
2057 | case 2: |
2058 | below = xMin >= blk->priMin && xMax <= blk->priMax && yMax < blk->yMax; |
2059 | break; |
2060 | case 3: |
2061 | below = yMin >= blk->priMin && yMax <= blk->priMax && xMin > blk->xMin; |
2062 | break; |
2063 | } |
2064 | |
2065 | return below; |
2066 | } |
2067 | |
2068 | bool TextBlock::isBeforeByRule1(const TextBlock *blk1) |
2069 | { |
2070 | bool before = false; |
2071 | bool overlap = false; |
2072 | |
2073 | switch (this->page->primaryRot) { |
2074 | case 0: |
2075 | case 2: |
2076 | overlap = ((this->ExMin <= blk1->ExMin) && (blk1->ExMin <= this->ExMax)) || ((blk1->ExMin <= this->ExMin) && (this->ExMin <= blk1->ExMax)); |
2077 | break; |
2078 | case 1: |
2079 | case 3: |
2080 | overlap = ((this->EyMin <= blk1->EyMin) && (blk1->EyMin <= this->EyMax)) || ((blk1->EyMin <= this->EyMin) && (this->EyMin <= blk1->EyMax)); |
2081 | break; |
2082 | } |
2083 | switch (this->page->primaryRot) { |
2084 | case 0: |
2085 | before = overlap && this->EyMin < blk1->EyMin; |
2086 | break; |
2087 | case 1: |
2088 | before = overlap && this->ExMax > blk1->ExMax; |
2089 | break; |
2090 | case 2: |
2091 | before = overlap && this->EyMax > blk1->EyMax; |
2092 | break; |
2093 | case 3: |
2094 | before = overlap && this->ExMin < blk1->ExMin; |
2095 | break; |
2096 | } |
2097 | return before; |
2098 | } |
2099 | |
2100 | bool TextBlock::isBeforeByRule2(const TextBlock *blk1) |
2101 | { |
2102 | double cmp = 0; |
2103 | int rotLR = rot; |
2104 | |
2105 | if (!page->primaryLR) { |
2106 | rotLR = (rotLR + 2) % 4; |
2107 | } |
2108 | |
2109 | switch (rotLR) { |
2110 | case 0: |
2111 | cmp = ExMax - blk1->ExMin; |
2112 | break; |
2113 | case 1: |
2114 | cmp = EyMin - blk1->EyMax; |
2115 | break; |
2116 | case 2: |
2117 | cmp = blk1->ExMax - ExMin; |
2118 | break; |
2119 | case 3: |
2120 | cmp = blk1->EyMin - EyMax; |
2121 | break; |
2122 | } |
2123 | return cmp <= 0; |
2124 | } |
2125 | |
2126 | // Sort into reading order by performing a topological sort using the rules |
2127 | // given in "High Performance Document Layout Analysis", T.M. Breuel, 2003. |
2128 | // See http://pubs.iupr.org/#2003-breuel-sdiut |
2129 | // Topological sort is done by depth first search, see |
2130 | // http://en.wikipedia.org/wiki/Topological_sorting |
2131 | int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited, TextBlock **cache, int cacheSize) |
2132 | { |
2133 | int pos2; |
2134 | TextBlock *blk1, *blk2, *blk3; |
2135 | bool before; |
2136 | |
2137 | if (visited[pos1]) { |
2138 | return sortPos; |
2139 | } |
2140 | |
2141 | blk1 = this; |
2142 | |
2143 | #if 0 // for debugging |
2144 | printf("visited: %d %.2f..%.2f %.2f..%.2f\n" , |
2145 | sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax); |
2146 | #endif |
2147 | visited[pos1] = true; |
2148 | pos2 = -1; |
2149 | for (blk2 = blkList; blk2; blk2 = blk2->next) { |
2150 | pos2++; |
2151 | if (visited[pos2]) { |
2152 | // skip visited nodes |
2153 | continue; |
2154 | } |
2155 | before = false; |
2156 | |
2157 | // is blk2 before blk1? (for table entries) |
2158 | if (blk1->tableId >= 0 && blk1->tableId == blk2->tableId) { |
2159 | if (page->primaryLR) { |
2160 | if (blk2->xMax <= blk1->xMin && blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin) { |
2161 | before = true; |
2162 | } |
2163 | } else { |
2164 | if (blk2->xMin >= blk1->xMax && blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin) { |
2165 | before = true; |
2166 | } |
2167 | } |
2168 | |
2169 | if (blk2->yMax <= blk1->yMin) { |
2170 | before = true; |
2171 | } |
2172 | } else { |
2173 | if (blk2->isBeforeByRule1(blk1)) { |
2174 | // Rule (1) blk1 and blk2 overlap, and blk2 is above blk1. |
2175 | before = true; |
2176 | #if 0 // for debugging |
2177 | printf("rule1: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n" , |
2178 | blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax, |
2179 | blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax); |
2180 | #endif |
2181 | } else if (blk2->isBeforeByRule2(blk1)) { |
2182 | // Rule (2) blk2 left of blk1, and no intervening blk3 |
2183 | // such that blk1 is before blk3 by rule 1, |
2184 | // and blk3 is before blk2 by rule 1. |
2185 | before = true; |
2186 | for (int i = 0; i < cacheSize && cache[i]; ++i) { |
2187 | if (blk1->isBeforeByRule1(blk1: cache[i]) && cache[i]->isBeforeByRule1(blk1: blk2)) { |
2188 | before = false; |
2189 | std::rotate(first: cache, middle: cache + i, last: cache + i + 1); |
2190 | break; |
2191 | } |
2192 | } |
2193 | |
2194 | if (before) { |
2195 | for (blk3 = blkList; blk3; blk3 = blk3->next) { |
2196 | if (blk3 == blk2 || blk3 == blk1) { |
2197 | continue; |
2198 | } |
2199 | if (blk1->isBeforeByRule1(blk1: blk3) && blk3->isBeforeByRule1(blk1: blk2)) { |
2200 | before = false; |
2201 | std::copy_backward(first: cache, last: cache + cacheSize - 1, result: cache + cacheSize); |
2202 | cache[0] = blk3; |
2203 | break; |
2204 | } |
2205 | } |
2206 | } |
2207 | #if 0 // for debugging |
2208 | if (before) { |
2209 | printf("rule2: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n" , |
2210 | blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax, |
2211 | blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax); |
2212 | } |
2213 | #endif |
2214 | } |
2215 | } |
2216 | if (before) { |
2217 | // blk2 is before blk1, so it needs to be visited |
2218 | // before we can add blk1 to the sorted list. |
2219 | sortPos = blk2->visitDepthFirst(blkList, pos1: pos2, sorted, sortPos, visited, cache, cacheSize); |
2220 | } |
2221 | } |
2222 | #if 0 // for debugging |
2223 | printf("sorted: %d %.2f..%.2f %.2f..%.2f\n" , |
2224 | sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax); |
2225 | #endif |
2226 | sorted[sortPos++] = blk1; |
2227 | return sortPos; |
2228 | } |
2229 | |
2230 | int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited) |
2231 | { |
2232 | const int blockCacheSize = 4; |
2233 | TextBlock *blockCache[blockCacheSize]; |
2234 | std::fill(first: blockCache, last: blockCache + blockCacheSize, value: nullptr); |
2235 | return visitDepthFirst(blkList, pos1, sorted, sortPos, visited, cache: blockCache, cacheSize: blockCacheSize); |
2236 | } |
2237 | |
2238 | //------------------------------------------------------------------------ |
2239 | // TextFlow |
2240 | //------------------------------------------------------------------------ |
2241 | |
2242 | TextFlow::TextFlow(TextPage *pageA, TextBlock *blk) |
2243 | { |
2244 | page = pageA; |
2245 | xMin = blk->xMin; |
2246 | xMax = blk->xMax; |
2247 | yMin = blk->yMin; |
2248 | yMax = blk->yMax; |
2249 | priMin = blk->priMin; |
2250 | priMax = blk->priMax; |
2251 | blocks = lastBlk = blk; |
2252 | next = nullptr; |
2253 | } |
2254 | |
2255 | TextFlow::~TextFlow() |
2256 | { |
2257 | TextBlock *blk; |
2258 | |
2259 | while (blocks) { |
2260 | blk = blocks; |
2261 | blocks = blocks->next; |
2262 | delete blk; |
2263 | } |
2264 | } |
2265 | |
2266 | void TextFlow::addBlock(TextBlock *blk) |
2267 | { |
2268 | if (lastBlk) { |
2269 | lastBlk->next = blk; |
2270 | } else { |
2271 | blocks = blk; |
2272 | } |
2273 | lastBlk = blk; |
2274 | if (blk->xMin < xMin) { |
2275 | xMin = blk->xMin; |
2276 | } |
2277 | if (blk->xMax > xMax) { |
2278 | xMax = blk->xMax; |
2279 | } |
2280 | if (blk->yMin < yMin) { |
2281 | yMin = blk->yMin; |
2282 | } |
2283 | if (blk->yMax > yMax) { |
2284 | yMax = blk->yMax; |
2285 | } |
2286 | } |
2287 | |
2288 | bool TextFlow::blockFits(const TextBlock *blk, const TextBlock *prevBlk) const |
2289 | { |
2290 | bool fits; |
2291 | |
2292 | // lower blocks must use smaller fonts |
2293 | if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) { |
2294 | return false; |
2295 | } |
2296 | |
2297 | fits = false; // make gcc happy |
2298 | switch (page->primaryRot) { |
2299 | case 0: |
2300 | fits = blk->xMin >= priMin && blk->xMax <= priMax; |
2301 | break; |
2302 | case 1: |
2303 | fits = blk->yMin >= priMin && blk->yMax <= priMax; |
2304 | break; |
2305 | case 2: |
2306 | fits = blk->xMin >= priMin && blk->xMax <= priMax; |
2307 | break; |
2308 | case 3: |
2309 | fits = blk->yMin >= priMin && blk->yMax <= priMax; |
2310 | break; |
2311 | } |
2312 | return fits; |
2313 | } |
2314 | |
2315 | #ifdef TEXTOUT_WORD_LIST |
2316 | |
2317 | //------------------------------------------------------------------------ |
2318 | // TextWordList |
2319 | //------------------------------------------------------------------------ |
2320 | |
2321 | TextWordList::TextWordList(const TextPage *text, bool physLayout) |
2322 | { |
2323 | TextFlow *flow; |
2324 | TextBlock *blk; |
2325 | TextLine *line; |
2326 | TextWord *word; |
2327 | TextWord **wordArray; |
2328 | int nWords, i; |
2329 | |
2330 | if (text->rawOrder) { |
2331 | for (word = text->rawWords; word; word = word->next) { |
2332 | words.push_back(x: word); |
2333 | } |
2334 | |
2335 | } else if (physLayout) { |
2336 | // this is inefficient, but it's also the least useful of these |
2337 | // three cases |
2338 | nWords = 0; |
2339 | for (flow = text->flows; flow; flow = flow->next) { |
2340 | for (blk = flow->blocks; blk; blk = blk->next) { |
2341 | for (line = blk->lines; line; line = line->next) { |
2342 | for (word = line->words; word; word = word->next) { |
2343 | ++nWords; |
2344 | } |
2345 | } |
2346 | } |
2347 | } |
2348 | wordArray = (TextWord **)gmallocn(count: nWords, size: sizeof(TextWord *)); |
2349 | i = 0; |
2350 | for (flow = text->flows; flow; flow = flow->next) { |
2351 | for (blk = flow->blocks; blk; blk = blk->next) { |
2352 | for (line = blk->lines; line; line = line->next) { |
2353 | for (word = line->words; word; word = word->next) { |
2354 | wordArray[i++] = word; |
2355 | } |
2356 | } |
2357 | } |
2358 | } |
2359 | qsort(base: wordArray, nmemb: nWords, size: sizeof(TextWord *), compar: &TextWord::cmpYX); |
2360 | for (i = 0; i < nWords; ++i) { |
2361 | words.push_back(x: wordArray[i]); |
2362 | } |
2363 | gfree(p: wordArray); |
2364 | |
2365 | } else { |
2366 | for (flow = text->flows; flow; flow = flow->next) { |
2367 | for (blk = flow->blocks; blk; blk = blk->next) { |
2368 | for (line = blk->lines; line; line = line->next) { |
2369 | for (word = line->words; word; word = word->next) { |
2370 | words.push_back(x: word); |
2371 | } |
2372 | } |
2373 | } |
2374 | } |
2375 | } |
2376 | } |
2377 | |
2378 | TextWordList::~TextWordList() { } |
2379 | |
2380 | int TextWordList::getLength() const |
2381 | { |
2382 | return words.size(); |
2383 | } |
2384 | |
2385 | TextWord *TextWordList::get(int idx) |
2386 | { |
2387 | if (idx < 0 || idx >= (int)words.size()) { |
2388 | return nullptr; |
2389 | } |
2390 | return words[idx]; |
2391 | } |
2392 | |
2393 | #endif // TEXTOUT_WORD_LIST |
2394 | |
2395 | //------------------------------------------------------------------------ |
2396 | // TextPage |
2397 | //------------------------------------------------------------------------ |
2398 | |
2399 | TextPage::TextPage(bool rawOrderA, bool discardDiagA) |
2400 | { |
2401 | int rot; |
2402 | |
2403 | refCnt = 1; |
2404 | rawOrder = rawOrderA; |
2405 | discardDiag = discardDiagA; |
2406 | curWord = nullptr; |
2407 | charPos = 0; |
2408 | curFont = nullptr; |
2409 | curFontSize = 0; |
2410 | nest = 0; |
2411 | nTinyChars = 0; |
2412 | lastCharOverlap = false; |
2413 | if (!rawOrder) { |
2414 | for (rot = 0; rot < 4; ++rot) { |
2415 | pools[rot] = std::make_unique<TextPool>(); |
2416 | } |
2417 | } |
2418 | flows = nullptr; |
2419 | blocks = nullptr; |
2420 | rawWords = nullptr; |
2421 | rawLastWord = nullptr; |
2422 | lastFindXMin = lastFindYMin = 0; |
2423 | haveLastFind = false; |
2424 | mergeCombining = true; |
2425 | diagonal = false; |
2426 | } |
2427 | |
2428 | TextPage::~TextPage() |
2429 | { |
2430 | clear(); |
2431 | } |
2432 | |
2433 | void TextPage::incRefCnt() |
2434 | { |
2435 | refCnt++; |
2436 | } |
2437 | |
2438 | void TextPage::decRefCnt() |
2439 | { |
2440 | if (--refCnt == 0) { |
2441 | delete this; |
2442 | } |
2443 | } |
2444 | |
2445 | void TextPage::startPage(const GfxState *state) |
2446 | { |
2447 | clear(); |
2448 | if (state) { |
2449 | pageWidth = state->getPageWidth(); |
2450 | pageHeight = state->getPageHeight(); |
2451 | } else { |
2452 | pageWidth = pageHeight = 0; |
2453 | } |
2454 | } |
2455 | |
2456 | void TextPage::endPage() |
2457 | { |
2458 | if (curWord) { |
2459 | endWord(); |
2460 | } |
2461 | } |
2462 | |
2463 | void TextPage::clear() |
2464 | { |
2465 | int rot; |
2466 | TextFlow *flow; |
2467 | TextWord *word; |
2468 | |
2469 | if (curWord) { |
2470 | delete curWord; |
2471 | curWord = nullptr; |
2472 | } |
2473 | if (rawOrder) { |
2474 | while (rawWords) { |
2475 | word = rawWords; |
2476 | rawWords = rawWords->next; |
2477 | delete word; |
2478 | } |
2479 | } else { |
2480 | for (rot = 0; rot < 4; ++rot) { |
2481 | pools[rot] = std::make_unique<TextPool>(); |
2482 | } |
2483 | while (flows) { |
2484 | flow = flows; |
2485 | flows = flows->next; |
2486 | delete flow; |
2487 | } |
2488 | gfree(p: blocks); |
2489 | } |
2490 | fonts.clear(); |
2491 | underlines.clear(); |
2492 | links.clear(); |
2493 | |
2494 | diagonal = false; |
2495 | curWord = nullptr; |
2496 | charPos = 0; |
2497 | curFont = nullptr; |
2498 | curFontSize = 0; |
2499 | nest = 0; |
2500 | nTinyChars = 0; |
2501 | flows = nullptr; |
2502 | blocks = nullptr; |
2503 | rawWords = nullptr; |
2504 | rawLastWord = nullptr; |
2505 | } |
2506 | |
2507 | void TextPage::updateFont(const GfxState *state) |
2508 | { |
2509 | const double *fm; |
2510 | const char *name; |
2511 | int code, mCode, letterCode, anyCode; |
2512 | double w; |
2513 | |
2514 | // get the font info object |
2515 | curFont = nullptr; |
2516 | for (const std::unique_ptr<TextFontInfo> &f : fonts) { |
2517 | if (f->matches(state)) { |
2518 | curFont = f.get(); |
2519 | break; |
2520 | } |
2521 | } |
2522 | if (!curFont) { |
2523 | fonts.emplace_back(args: std::make_unique<TextFontInfo>(args&: state)); |
2524 | curFont = fonts.back().get(); |
2525 | } |
2526 | |
2527 | // adjust the font size |
2528 | GfxFont *const gfxFont = state->getFont().get(); |
2529 | curFontSize = state->getTransformedFontSize(); |
2530 | if (gfxFont && gfxFont->getType() == fontType3) { |
2531 | // This is a hack which makes it possible to deal with some Type 3 |
2532 | // fonts. The problem is that it's impossible to know what the |
2533 | // base coordinate system used in the font is without actually |
2534 | // rendering the font. This code tries to guess by looking at the |
2535 | // width of the character 'm' (which breaks if the font is a |
2536 | // subset that doesn't contain 'm'). |
2537 | mCode = letterCode = anyCode = -1; |
2538 | for (code = 0; code < 256; ++code) { |
2539 | name = ((Gfx8BitFont *)gfxFont)->getCharName(code); |
2540 | int nameLen = name ? strlen(s: name) : 0; |
2541 | bool nameOneChar = nameLen == 1 || (nameLen > 1 && name[1] == '\0'); |
2542 | if (nameOneChar && name[0] == 'm') { |
2543 | mCode = code; |
2544 | } |
2545 | if (letterCode < 0 && nameOneChar && ((name[0] >= 'A' && name[0] <= 'Z') || (name[0] >= 'a' && name[0] <= 'z'))) { |
2546 | letterCode = code; |
2547 | } |
2548 | if (anyCode < 0 && name && ((Gfx8BitFont *)gfxFont)->getWidth(c: code) > 0) { |
2549 | anyCode = code; |
2550 | } |
2551 | } |
2552 | if (mCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(c: mCode)) > 0) { |
2553 | // 0.6 is a generic average 'm' width -- yes, this is a hack |
2554 | curFontSize *= w / 0.6; |
2555 | } else if (letterCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(c: letterCode)) > 0) { |
2556 | // even more of a hack: 0.5 is a generic letter width |
2557 | curFontSize *= w / 0.5; |
2558 | } else if (anyCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(c: anyCode)) > 0) { |
2559 | // better than nothing: 0.5 is a generic character width |
2560 | curFontSize *= w / 0.5; |
2561 | } |
2562 | fm = gfxFont->getFontMatrix(); |
2563 | if (fm[0] != 0) { |
2564 | curFontSize *= fabs(x: fm[3] / fm[0]); |
2565 | } |
2566 | } |
2567 | } |
2568 | |
2569 | void TextPage::beginWord(const GfxState *state) |
2570 | { |
2571 | const double *fontm; |
2572 | double m[4], m2[4]; |
2573 | int rot; |
2574 | |
2575 | // This check is needed because Type 3 characters can contain |
2576 | // text-drawing operations (when TextPage is being used via |
2577 | // {X,Win}SplashOutputDev rather than TextOutputDev). |
2578 | if (curWord) { |
2579 | ++nest; |
2580 | return; |
2581 | } |
2582 | |
2583 | // compute the rotation |
2584 | state->getFontTransMat(m11: &m[0], m12: &m[1], m21: &m[2], m22: &m[3]); |
2585 | std::shared_ptr<GfxFont> gfxFont = state->getFont(); |
2586 | if (gfxFont && gfxFont->getType() == fontType3) { |
2587 | fontm = state->getFont()->getFontMatrix(); |
2588 | m2[0] = fontm[0] * m[0] + fontm[1] * m[2]; |
2589 | m2[1] = fontm[0] * m[1] + fontm[1] * m[3]; |
2590 | m2[2] = fontm[2] * m[0] + fontm[3] * m[2]; |
2591 | m2[3] = fontm[2] * m[1] + fontm[3] * m[3]; |
2592 | m[0] = m2[0]; |
2593 | m[1] = m2[1]; |
2594 | m[2] = m2[2]; |
2595 | m[3] = m2[3]; |
2596 | } |
2597 | if (fabs(x: m[0] * m[3]) > fabs(x: m[1] * m[2])) { |
2598 | rot = (m[0] > 0 || m[3] < 0) ? 0 : 2; |
2599 | } else { |
2600 | rot = (m[2] > 0) ? 1 : 3; |
2601 | } |
2602 | if (fabs(x: m[0]) >= fabs(x: m[1])) { |
2603 | diagonal = fabs(x: m[1]) > diagonalThreshold * fabs(x: m[0]); |
2604 | } else { |
2605 | diagonal = fabs(x: m[0]) > diagonalThreshold * fabs(x: m[1]); |
2606 | } |
2607 | |
2608 | // for vertical writing mode, the lines are effectively rotated 90 |
2609 | // degrees |
2610 | if (gfxFont && gfxFont->getWMode()) { |
2611 | rot = (rot + 1) & 3; |
2612 | } |
2613 | |
2614 | curWord = new TextWord(state, rot, curFontSize); |
2615 | } |
2616 | |
2617 | void TextPage::addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen) |
2618 | { |
2619 | double x1, y1, w1, h1, dx2, dy2, base, sp, delta; |
2620 | bool overlap; |
2621 | int i; |
2622 | int wMode; |
2623 | Matrix mat; |
2624 | |
2625 | // subtract char and word spacing from the dx,dy values |
2626 | sp = state->getCharSpace(); |
2627 | if (c == (CharCode)0x20) { |
2628 | sp += state->getWordSpace(); |
2629 | } |
2630 | state->textTransformDelta(x1: sp * state->getHorizScaling(), y1: 0, x2: &dx2, y2: &dy2); |
2631 | dx -= dx2; |
2632 | dy -= dy2; |
2633 | state->transformDelta(x1: dx, y1: dy, x2: &w1, y2: &h1); |
2634 | |
2635 | // throw away chars that aren't inside the page bounds |
2636 | // (and also do a sanity check on the character size) |
2637 | state->transform(x1: x, y1: y, x2: &x1, y2: &y1); |
2638 | if (x1 + w1 < 0 || x1 > pageWidth || y1 + h1 < 0 || y1 > pageHeight || std::isnan(x: x1) || std::isnan(x: y1) || std::isnan(x: w1) || std::isnan(x: h1)) { |
2639 | charPos += nBytes; |
2640 | return; |
2641 | } |
2642 | |
2643 | // check the tiny chars limit |
2644 | if (fabs(x: w1) < 3 && fabs(x: h1) < 3) { |
2645 | if (++nTinyChars > 50000) { |
2646 | charPos += nBytes; |
2647 | return; |
2648 | } |
2649 | } |
2650 | |
2651 | // break words at space character |
2652 | if (uLen == 1 && UnicodeIsWhitespace(ucs4: u[0])) { |
2653 | charPos += nBytes; |
2654 | endWord(); |
2655 | return; |
2656 | } else if (uLen == 1 && u[0] == (Unicode)0x0) { |
2657 | // ignore null characters |
2658 | charPos += nBytes; |
2659 | return; |
2660 | } |
2661 | |
2662 | state->getFontTransMat(m11: &mat.m[0], m12: &mat.m[1], m21: &mat.m[2], m22: &mat.m[3]); |
2663 | mat.m[0] *= state->getHorizScaling(); |
2664 | mat.m[1] *= state->getHorizScaling(); |
2665 | mat.m[4] = x1; |
2666 | mat.m[5] = y1; |
2667 | |
2668 | if (mergeCombining && curWord && uLen == 1 && curWord->addCombining(state, fontA: curFont, fontSizeA: curFontSize, x: x1, y: y1, dx: w1, dy: h1, charPosA: charPos, charLen: nBytes, c, u: u[0], textMatA: mat)) { |
2669 | charPos += nBytes; |
2670 | return; |
2671 | } |
2672 | |
2673 | // start a new word if: |
2674 | // (1) this character doesn't fall in the right place relative to |
2675 | // the end of the previous word (this places upper and lower |
2676 | // constraints on the position deltas along both the primary |
2677 | // and secondary axes), or |
2678 | // (2) this character overlaps the previous one (duplicated text), or |
2679 | // (3) the previous character was an overlap (we want each duplicated |
2680 | // character to be in a word by itself at this stage), |
2681 | // (4) the font size has changed |
2682 | // (5) the WMode changed |
2683 | if (curWord && curWord->len() > 0) { |
2684 | base = sp = delta = 0; // make gcc happy |
2685 | switch (curWord->rot) { |
2686 | case 0: |
2687 | base = y1; |
2688 | sp = x1 - curWord->xMax; |
2689 | delta = x1 - curWord->chars.back().edge; |
2690 | break; |
2691 | case 1: |
2692 | base = x1; |
2693 | sp = y1 - curWord->yMax; |
2694 | delta = y1 - curWord->chars.back().edge; |
2695 | break; |
2696 | case 2: |
2697 | base = y1; |
2698 | sp = curWord->xMin - x1; |
2699 | delta = curWord->chars.back().edge - x1; |
2700 | break; |
2701 | case 3: |
2702 | base = x1; |
2703 | sp = curWord->yMin - y1; |
2704 | delta = curWord->chars.back().edge - y1; |
2705 | break; |
2706 | } |
2707 | overlap = fabs(x: delta) < dupMaxPriDelta * curWord->fontSize && fabs(x: base - curWord->base) < dupMaxSecDelta * curWord->fontSize; |
2708 | wMode = curFont->getWMode(); |
2709 | if (overlap || lastCharOverlap || sp < -minDupBreakOverlap * curWord->fontSize || sp > minWordBreakSpace * curWord->fontSize || fabs(x: base - curWord->base) > 0.5 || curFontSize != curWord->fontSize || wMode != curWord->wMode) { |
2710 | endWord(); |
2711 | } |
2712 | lastCharOverlap = overlap; |
2713 | } else { |
2714 | lastCharOverlap = false; |
2715 | } |
2716 | |
2717 | if (uLen != 0) { |
2718 | // start a new word if needed |
2719 | if (!curWord) { |
2720 | beginWord(state); |
2721 | } |
2722 | |
2723 | // throw away diagonal chars |
2724 | if (discardDiag && diagonal) { |
2725 | charPos += nBytes; |
2726 | return; |
2727 | } |
2728 | |
2729 | // page rotation and/or transform matrices can cause text to be |
2730 | // drawn in reverse order -- in this case, swap the begin/end |
2731 | // coordinates and break text into individual chars |
2732 | if ((curWord->rot == 0 && w1 < 0) || (curWord->rot == 1 && h1 < 0) || (curWord->rot == 2 && w1 > 0) || (curWord->rot == 3 && h1 > 0)) { |
2733 | endWord(); |
2734 | beginWord(state); |
2735 | |
2736 | // throw away diagonal chars |
2737 | if (discardDiag && diagonal) { |
2738 | charPos += nBytes; |
2739 | return; |
2740 | } |
2741 | |
2742 | x1 += w1; |
2743 | y1 += h1; |
2744 | w1 = -w1; |
2745 | h1 = -h1; |
2746 | } |
2747 | |
2748 | // add the characters to the current word |
2749 | w1 /= uLen; |
2750 | h1 /= uLen; |
2751 | for (i = 0; i < uLen; ++i) { |
2752 | curWord->addChar(state, fontA: curFont, x: x1 + i * w1, y: y1 + i * h1, dx: w1, dy: h1, charPosA: charPos, charLen: nBytes, c, u: u[i], textMatA: mat); |
2753 | } |
2754 | } |
2755 | charPos += nBytes; |
2756 | } |
2757 | |
2758 | void TextPage::incCharCount(int nChars) |
2759 | { |
2760 | charPos += nChars; |
2761 | } |
2762 | |
2763 | void TextPage::endWord() |
2764 | { |
2765 | // This check is needed because Type 3 characters can contain |
2766 | // text-drawing operations (when TextPage is being used via |
2767 | // {X,Win}SplashOutputDev rather than TextOutputDev). |
2768 | if (nest > 0) { |
2769 | --nest; |
2770 | return; |
2771 | } |
2772 | |
2773 | if (curWord) { |
2774 | addWord(word: curWord); |
2775 | curWord = nullptr; |
2776 | } |
2777 | } |
2778 | |
2779 | void TextPage::addWord(TextWord *word) |
2780 | { |
2781 | // throw away zero-length words -- they don't have valid xMin/xMax |
2782 | // values, and they're useless anyway |
2783 | if (word->len() == 0) { |
2784 | delete word; |
2785 | return; |
2786 | } |
2787 | |
2788 | if (rawOrder) { |
2789 | if (rawLastWord) { |
2790 | rawLastWord->next = word; |
2791 | } else { |
2792 | rawWords = word; |
2793 | } |
2794 | rawLastWord = word; |
2795 | } else { |
2796 | pools[word->rot]->addWord(word); |
2797 | } |
2798 | } |
2799 | |
2800 | void TextPage::addUnderline(double x0, double y0, double x1, double y1) |
2801 | { |
2802 | underlines.emplace_back(args: std::make_unique<TextUnderline>(args&: x0, args&: y0, args&: x1, args&: y1)); |
2803 | } |
2804 | |
2805 | void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link) |
2806 | { |
2807 | links.emplace_back(args: std::make_unique<TextLink>(args&: xMin, args&: yMin, args&: xMax, args&: yMax, args&: link)); |
2808 | } |
2809 | |
2810 | void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML) |
2811 | { |
2812 | coalesce(physLayout, fixedPitch, doHTML, minColSpacing1: TextOutputDev::minColSpacing1_default); |
2813 | } |
2814 | |
2815 | void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML, double minColSpacing1) |
2816 | { |
2817 | TextWord *word0, *word1, *word2; |
2818 | TextLine *line; |
2819 | TextBlock *blkList, *blk, *lastBlk, *blk0, *blk1, *blk2; |
2820 | TextFlow *flow, *lastFlow; |
2821 | int rot, poolMinBaseIdx, baseIdx, startBaseIdx, endBaseIdx; |
2822 | double minBase, maxBase, newMinBase, newMaxBase; |
2823 | double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace; |
2824 | bool found; |
2825 | int count[4]; |
2826 | int lrCount; |
2827 | int col1, col2; |
2828 | int j, n; |
2829 | |
2830 | if (rawOrder) { |
2831 | primaryRot = 0; |
2832 | primaryLR = true; |
2833 | return; |
2834 | } |
2835 | |
2836 | const UnicodeMap *uMap = globalParams->getTextEncoding(); |
2837 | blkList = nullptr; |
2838 | lastBlk = nullptr; |
2839 | nBlocks = 0; |
2840 | primaryRot = 0; |
2841 | |
2842 | #if 0 // for debugging |
2843 | printf("*** initial words ***\n" ); |
2844 | for (rot = 0; rot < 4; ++rot) { |
2845 | pool = pools[rot]; |
2846 | for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) { |
2847 | for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) { |
2848 | printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d link=%p '" , |
2849 | word0->xMin, word0->xMax, word0->yMin, word0->yMax, |
2850 | word0->base, word0->fontSize, rot*90, word0->link); |
2851 | for (i = 0; i < word0->len; ++i) { |
2852 | fputc(word0->text[i] & 0xff, stdout); |
2853 | } |
2854 | printf("'\n" ); |
2855 | } |
2856 | } |
2857 | } |
2858 | printf("\n" ); |
2859 | #endif |
2860 | |
2861 | #if 0 //~ for debugging |
2862 | for (i = 0; i < underlines->getLength(); ++i) { |
2863 | underline = (TextUnderline *)underlines->get(i); |
2864 | printf("underline: x=%g..%g y=%g..%g horiz=%d\n" , |
2865 | underline->x0, underline->x1, underline->y0, underline->y1, |
2866 | underline->horiz); |
2867 | } |
2868 | #endif |
2869 | |
2870 | if (doHTML) { |
2871 | |
2872 | //----- handle underlining |
2873 | for (const std::unique_ptr<TextUnderline> &underline : underlines) { |
2874 | if (underline->horiz) { |
2875 | // rot = 0 |
2876 | if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) { |
2877 | startBaseIdx = pools[0]->getBaseIdx(base: underline->y0 + minUnderlineGap); |
2878 | endBaseIdx = pools[0]->getBaseIdx(base: underline->y0 + maxUnderlineGap); |
2879 | for (j = startBaseIdx; j <= endBaseIdx; ++j) { |
2880 | for (word0 = pools[0]->getPool(baseIdx: j); word0; word0 = word0->next) { |
2881 | //~ need to check the y value against the word baseline |
2882 | if (underline->x0 < word0->xMin + underlineSlack && word0->xMax - underlineSlack < underline->x1) { |
2883 | word0->underlined = true; |
2884 | } |
2885 | } |
2886 | } |
2887 | } |
2888 | |
2889 | // rot = 2 |
2890 | if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) { |
2891 | startBaseIdx = pools[2]->getBaseIdx(base: underline->y0 - maxUnderlineGap); |
2892 | endBaseIdx = pools[2]->getBaseIdx(base: underline->y0 - minUnderlineGap); |
2893 | for (j = startBaseIdx; j <= endBaseIdx; ++j) { |
2894 | for (word0 = pools[2]->getPool(baseIdx: j); word0; word0 = word0->next) { |
2895 | if (underline->x0 < word0->xMin + underlineSlack && word0->xMax - underlineSlack < underline->x1) { |
2896 | word0->underlined = true; |
2897 | } |
2898 | } |
2899 | } |
2900 | } |
2901 | } else { |
2902 | // rot = 1 |
2903 | if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) { |
2904 | startBaseIdx = pools[1]->getBaseIdx(base: underline->x0 - maxUnderlineGap); |
2905 | endBaseIdx = pools[1]->getBaseIdx(base: underline->x0 - minUnderlineGap); |
2906 | for (j = startBaseIdx; j <= endBaseIdx; ++j) { |
2907 | for (word0 = pools[1]->getPool(baseIdx: j); word0; word0 = word0->next) { |
2908 | if (underline->y0 < word0->yMin + underlineSlack && word0->yMax - underlineSlack < underline->y1) { |
2909 | word0->underlined = true; |
2910 | } |
2911 | } |
2912 | } |
2913 | } |
2914 | |
2915 | // rot = 3 |
2916 | if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) { |
2917 | startBaseIdx = pools[3]->getBaseIdx(base: underline->x0 + minUnderlineGap); |
2918 | endBaseIdx = pools[3]->getBaseIdx(base: underline->x0 + maxUnderlineGap); |
2919 | for (j = startBaseIdx; j <= endBaseIdx; ++j) { |
2920 | for (word0 = pools[3]->getPool(baseIdx: j); word0; word0 = word0->next) { |
2921 | if (underline->y0 < word0->yMin + underlineSlack && word0->yMax - underlineSlack < underline->y1) { |
2922 | word0->underlined = true; |
2923 | } |
2924 | } |
2925 | } |
2926 | } |
2927 | } |
2928 | } |
2929 | |
2930 | //----- handle links |
2931 | for (const std::unique_ptr<TextLink> &link : links) { |
2932 | // rot = 0 |
2933 | if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) { |
2934 | startBaseIdx = pools[0]->getBaseIdx(base: link->yMin); |
2935 | endBaseIdx = pools[0]->getBaseIdx(base: link->yMax); |
2936 | for (j = startBaseIdx; j <= endBaseIdx; ++j) { |
2937 | for (word0 = pools[0]->getPool(baseIdx: j); word0; word0 = word0->next) { |
2938 | if (link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax && link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax) { |
2939 | word0->link = link->link; |
2940 | } |
2941 | } |
2942 | } |
2943 | } |
2944 | |
2945 | // rot = 2 |
2946 | if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) { |
2947 | startBaseIdx = pools[2]->getBaseIdx(base: link->yMin); |
2948 | endBaseIdx = pools[2]->getBaseIdx(base: link->yMax); |
2949 | for (j = startBaseIdx; j <= endBaseIdx; ++j) { |
2950 | for (word0 = pools[2]->getPool(baseIdx: j); word0; word0 = word0->next) { |
2951 | if (link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax && link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax) { |
2952 | word0->link = link->link; |
2953 | } |
2954 | } |
2955 | } |
2956 | } |
2957 | |
2958 | // rot = 1 |
2959 | if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) { |
2960 | startBaseIdx = pools[1]->getBaseIdx(base: link->xMin); |
2961 | endBaseIdx = pools[1]->getBaseIdx(base: link->xMax); |
2962 | for (j = startBaseIdx; j <= endBaseIdx; ++j) { |
2963 | for (word0 = pools[1]->getPool(baseIdx: j); word0; word0 = word0->next) { |
2964 | if (link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax && link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax) { |
2965 | word0->link = link->link; |
2966 | } |
2967 | } |
2968 | } |
2969 | } |
2970 | |
2971 | // rot = 3 |
2972 | if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) { |
2973 | startBaseIdx = pools[3]->getBaseIdx(base: link->xMin); |
2974 | endBaseIdx = pools[3]->getBaseIdx(base: link->xMax); |
2975 | for (j = startBaseIdx; j <= endBaseIdx; ++j) { |
2976 | for (word0 = pools[3]->getPool(baseIdx: j); word0; word0 = word0->next) { |
2977 | if (link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax && link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax) { |
2978 | word0->link = link->link; |
2979 | } |
2980 | } |
2981 | } |
2982 | } |
2983 | } |
2984 | } |
2985 | |
2986 | //----- assemble the blocks |
2987 | |
2988 | //~ add an outer loop for writing mode (vertical text) |
2989 | |
2990 | // build blocks for each rotation value |
2991 | for (rot = 0; rot < 4; ++rot) { |
2992 | std::unique_ptr<TextPool> &pool = pools[rot]; |
2993 | poolMinBaseIdx = pool->minBaseIdx; |
2994 | count[rot] = 0; |
2995 | |
2996 | // add blocks until no more words are left |
2997 | while (true) { |
2998 | |
2999 | // find the first non-empty line in the pool |
3000 | for (; poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(baseIdx: poolMinBaseIdx); ++poolMinBaseIdx) { |
3001 | ; |
3002 | } |
3003 | if (poolMinBaseIdx > pool->maxBaseIdx) { |
3004 | break; |
3005 | } |
3006 | |
3007 | // look for the left-most word in the first four lines of the |
3008 | // pool -- this avoids starting with a superscript word |
3009 | startBaseIdx = poolMinBaseIdx; |
3010 | for (baseIdx = poolMinBaseIdx + 1; baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx; ++baseIdx) { |
3011 | if (!pool->getPool(baseIdx)) { |
3012 | continue; |
3013 | } |
3014 | if (pool->getPool(baseIdx)->primaryCmp(word: pool->getPool(baseIdx: startBaseIdx)) < 0) { |
3015 | startBaseIdx = baseIdx; |
3016 | } |
3017 | } |
3018 | |
3019 | // create a new block |
3020 | word0 = pool->getPool(baseIdx: startBaseIdx); |
3021 | pool->setPool(baseIdx: startBaseIdx, p: word0->next); |
3022 | word0->next = nullptr; |
3023 | blk = new TextBlock(this, rot); |
3024 | blk->addWord(word: word0); |
3025 | |
3026 | fontSize = word0->fontSize; |
3027 | minBase = maxBase = word0->base; |
3028 | colSpace1 = minColSpacing1 * fontSize; |
3029 | colSpace2 = minColSpacing2 * fontSize; |
3030 | lineSpace = maxLineSpacingDelta * fontSize; |
3031 | intraLineSpace = maxIntraLineDelta * fontSize; |
3032 | |
3033 | // add words to the block |
3034 | do { |
3035 | found = false; |
3036 | |
3037 | // look for words on the line above the current top edge of |
3038 | // the block |
3039 | newMinBase = minBase; |
3040 | for (baseIdx = pool->getBaseIdx(base: minBase); baseIdx >= pool->getBaseIdx(base: minBase - lineSpace); --baseIdx) { |
3041 | word0 = nullptr; |
3042 | word1 = pool->getPool(baseIdx); |
3043 | while (word1) { |
3044 | if (word1->base < minBase && word1->base >= minBase - lineSpace && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) |
3045 | && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta1 * fontSize) { |
3046 | word2 = word1; |
3047 | if (word0) { |
3048 | word0->next = word1->next; |
3049 | } else { |
3050 | pool->setPool(baseIdx, p: word1->next); |
3051 | } |
3052 | word1 = word1->next; |
3053 | word2->next = nullptr; |
3054 | blk->addWord(word: word2); |
3055 | found = true; |
3056 | newMinBase = word2->base; |
3057 | } else { |
3058 | word0 = word1; |
3059 | word1 = word1->next; |
3060 | } |
3061 | } |
3062 | } |
3063 | minBase = newMinBase; |
3064 | |
3065 | // look for words on the line below the current bottom edge of |
3066 | // the block |
3067 | newMaxBase = maxBase; |
3068 | for (baseIdx = pool->getBaseIdx(base: maxBase); baseIdx <= pool->getBaseIdx(base: maxBase + lineSpace); ++baseIdx) { |
3069 | word0 = nullptr; |
3070 | word1 = pool->getPool(baseIdx); |
3071 | while (word1) { |
3072 | if (word1->base > maxBase && word1->base <= maxBase + lineSpace && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) |
3073 | && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta1 * fontSize) { |
3074 | word2 = word1; |
3075 | if (word0) { |
3076 | word0->next = word1->next; |
3077 | } else { |
3078 | pool->setPool(baseIdx, p: word1->next); |
3079 | } |
3080 | word1 = word1->next; |
3081 | word2->next = nullptr; |
3082 | blk->addWord(word: word2); |
3083 | found = true; |
3084 | newMaxBase = word2->base; |
3085 | } else { |
3086 | word0 = word1; |
3087 | word1 = word1->next; |
3088 | } |
3089 | } |
3090 | } |
3091 | maxBase = newMaxBase; |
3092 | |
3093 | // look for words that are on lines already in the block, and |
3094 | // that overlap the block horizontally |
3095 | for (baseIdx = pool->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) { |
3096 | word0 = nullptr; |
3097 | word1 = pool->getPool(baseIdx); |
3098 | while (word1) { |
3099 | if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace |
3100 | && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax + colSpace1 && word1->xMax > blk->xMin - colSpace1) : (word1->yMin < blk->yMax + colSpace1 && word1->yMax > blk->yMin - colSpace1)) |
3101 | && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta2 * fontSize) { |
3102 | word2 = word1; |
3103 | if (word0) { |
3104 | word0->next = word1->next; |
3105 | } else { |
3106 | pool->setPool(baseIdx, p: word1->next); |
3107 | } |
3108 | word1 = word1->next; |
3109 | word2->next = nullptr; |
3110 | blk->addWord(word: word2); |
3111 | found = true; |
3112 | } else { |
3113 | word0 = word1; |
3114 | word1 = word1->next; |
3115 | } |
3116 | } |
3117 | } |
3118 | |
3119 | // only check for outlying words (the next two chunks of code) |
3120 | // if we didn't find anything else |
3121 | if (found) { |
3122 | continue; |
3123 | } |
3124 | |
3125 | // scan down the left side of the block, looking for words |
3126 | // that are near (but not overlapping) the block; if there are |
3127 | // three or fewer, add them to the block |
3128 | n = 0; |
3129 | for (baseIdx = pool->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) { |
3130 | word1 = pool->getPool(baseIdx); |
3131 | while (word1) { |
3132 | if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace |
3133 | && ((rot == 0 || rot == 2) ? (word1->xMax <= blk->xMin && word1->xMax > blk->xMin - colSpace2) : (word1->yMax <= blk->yMin && word1->yMax > blk->yMin - colSpace2)) |
3134 | && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) { |
3135 | ++n; |
3136 | break; |
3137 | } |
3138 | word1 = word1->next; |
3139 | } |
3140 | } |
3141 | if (n > 0 && n <= 3) { |
3142 | for (baseIdx = pool->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) { |
3143 | word0 = nullptr; |
3144 | word1 = pool->getPool(baseIdx); |
3145 | while (word1) { |
3146 | if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace |
3147 | && ((rot == 0 || rot == 2) ? (word1->xMax <= blk->xMin && word1->xMax > blk->xMin - colSpace2) : (word1->yMax <= blk->yMin && word1->yMax > blk->yMin - colSpace2)) |
3148 | && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) { |
3149 | word2 = word1; |
3150 | if (word0) { |
3151 | word0->next = word1->next; |
3152 | } else { |
3153 | pool->setPool(baseIdx, p: word1->next); |
3154 | } |
3155 | word1 = word1->next; |
3156 | word2->next = nullptr; |
3157 | blk->addWord(word: word2); |
3158 | if (word2->base < minBase) { |
3159 | minBase = word2->base; |
3160 | } else if (word2->base > maxBase) { |
3161 | maxBase = word2->base; |
3162 | } |
3163 | found = true; |
3164 | break; |
3165 | } else { |
3166 | word0 = word1; |
3167 | word1 = word1->next; |
3168 | } |
3169 | } |
3170 | } |
3171 | } |
3172 | |
3173 | // scan down the right side of the block, looking for words |
3174 | // that are near (but not overlapping) the block; if there are |
3175 | // three or fewer, add them to the block |
3176 | n = 0; |
3177 | for (baseIdx = pool->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) { |
3178 | word1 = pool->getPool(baseIdx); |
3179 | while (word1) { |
3180 | if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace |
3181 | && ((rot == 0 || rot == 2) ? (word1->xMin >= blk->xMax && word1->xMin < blk->xMax + colSpace2) : (word1->yMin >= blk->yMax && word1->yMin < blk->yMax + colSpace2)) |
3182 | && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) { |
3183 | ++n; |
3184 | break; |
3185 | } |
3186 | word1 = word1->next; |
3187 | } |
3188 | } |
3189 | if (n > 0 && n <= 3) { |
3190 | for (baseIdx = pool->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) { |
3191 | word0 = nullptr; |
3192 | word1 = pool->getPool(baseIdx); |
3193 | while (word1) { |
3194 | if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace |
3195 | && ((rot == 0 || rot == 2) ? (word1->xMin >= blk->xMax && word1->xMin < blk->xMax + colSpace2) : (word1->yMin >= blk->yMax && word1->yMin < blk->yMax + colSpace2)) |
3196 | && fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) { |
3197 | word2 = word1; |
3198 | if (word0) { |
3199 | word0->next = word1->next; |
3200 | } else { |
3201 | pool->setPool(baseIdx, p: word1->next); |
3202 | } |
3203 | word1 = word1->next; |
3204 | word2->next = nullptr; |
3205 | blk->addWord(word: word2); |
3206 | if (word2->base < minBase) { |
3207 | minBase = word2->base; |
3208 | } else if (word2->base > maxBase) { |
3209 | maxBase = word2->base; |
3210 | } |
3211 | found = true; |
3212 | break; |
3213 | } else { |
3214 | word0 = word1; |
3215 | word1 = word1->next; |
3216 | } |
3217 | } |
3218 | } |
3219 | } |
3220 | |
3221 | } while (found); |
3222 | |
3223 | //~ need to compute the primary writing mode (horiz/vert) in |
3224 | //~ addition to primary rotation |
3225 | |
3226 | // coalesce the block, and add it to the list |
3227 | blk->coalesce(uMap, fixedPitch); |
3228 | if (lastBlk) { |
3229 | lastBlk->next = blk; |
3230 | } else { |
3231 | blkList = blk; |
3232 | } |
3233 | lastBlk = blk; |
3234 | count[rot] += blk->charCount; |
3235 | ++nBlocks; |
3236 | } |
3237 | |
3238 | if (count[rot] > count[primaryRot]) { |
3239 | primaryRot = rot; |
3240 | } |
3241 | } |
3242 | |
3243 | #if 0 // for debugging |
3244 | printf("*** rotation ***\n" ); |
3245 | for (rot = 0; rot < 4; ++rot) { |
3246 | printf(" %d: %6d\n" , rot, count[rot]); |
3247 | } |
3248 | printf(" primary rot = %d\n" , primaryRot); |
3249 | printf("\n" ); |
3250 | #endif |
3251 | |
3252 | #if 0 // for debugging |
3253 | printf("*** blocks ***\n" ); |
3254 | for (blk = blkList; blk; blk = blk->next) { |
3255 | printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n" , |
3256 | blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax); |
3257 | for (line = blk->lines; line; line = line->next) { |
3258 | printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n" , |
3259 | line->xMin, line->xMax, line->yMin, line->yMax, line->base); |
3260 | for (word0 = line->words; word0; word0 = word0->next) { |
3261 | printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '" , |
3262 | word0->xMin, word0->xMax, word0->yMin, word0->yMax, |
3263 | word0->base, word0->fontSize, word0->spaceAfter); |
3264 | for (i = 0; i < word0->len; ++i) { |
3265 | fputc(word0->text[i] & 0xff, stdout); |
3266 | } |
3267 | printf("'\n" ); |
3268 | } |
3269 | } |
3270 | } |
3271 | printf("\n" ); |
3272 | #endif |
3273 | |
3274 | // determine the primary direction |
3275 | lrCount = 0; |
3276 | for (blk = blkList; blk; blk = blk->next) { |
3277 | for (line = blk->lines; line; line = line->next) { |
3278 | for (word0 = line->words; word0; word0 = word0->next) { |
3279 | for (size_t i = 0; i < word0->len(); ++i) { |
3280 | if (unicodeTypeL(c: word0->chars[i].text)) { |
3281 | ++lrCount; |
3282 | } else if (unicodeTypeR(c: word0->chars[i].text)) { |
3283 | --lrCount; |
3284 | } |
3285 | } |
3286 | } |
3287 | } |
3288 | } |
3289 | primaryLR = lrCount >= 0; |
3290 | |
3291 | #if 0 // for debugging |
3292 | printf("*** direction ***\n" ); |
3293 | printf("lrCount = %d\n" , lrCount); |
3294 | printf("primaryLR = %d\n" , primaryLR); |
3295 | #endif |
3296 | |
3297 | //----- column assignment |
3298 | |
3299 | // sort blocks into xy order for column assignment |
3300 | if (blocks) { |
3301 | gfree(p: blocks); |
3302 | } |
3303 | if (physLayout && fixedPitch) { |
3304 | |
3305 | blocks = (TextBlock **)gmallocn(count: nBlocks, size: sizeof(TextBlock *)); |
3306 | int i; |
3307 | for (blk = blkList, i = 0; blk; blk = blk->next, ++i) { |
3308 | blocks[i] = blk; |
3309 | col1 = 0; // make gcc happy |
3310 | switch (primaryRot) { |
3311 | case 0: |
3312 | col1 = (int)(blk->xMin / fixedPitch + 0.5); |
3313 | break; |
3314 | case 1: |
3315 | col1 = (int)(blk->yMin / fixedPitch + 0.5); |
3316 | break; |
3317 | case 2: |
3318 | col1 = (int)((pageWidth - blk->xMax) / fixedPitch + 0.5); |
3319 | break; |
3320 | case 3: |
3321 | col1 = (int)((pageHeight - blk->yMax) / fixedPitch + 0.5); |
3322 | break; |
3323 | } |
3324 | blk->col = col1; |
3325 | for (line = blk->lines; line; line = line->next) { |
3326 | for (j = 0; j <= line->len; ++j) { |
3327 | line->col[j] += col1; |
3328 | } |
3329 | } |
3330 | } |
3331 | |
3332 | } else { |
3333 | |
3334 | // sort blocks into xy order for column assignment |
3335 | blocks = (TextBlock **)gmallocn(count: nBlocks, size: sizeof(TextBlock *)); |
3336 | int i; |
3337 | for (blk = blkList, i = 0; blk; blk = blk->next, ++i) { |
3338 | blocks[i] = blk; |
3339 | } |
3340 | if (blocks) { |
3341 | qsort(base: blocks, nmemb: nBlocks, size: sizeof(TextBlock *), compar: &TextBlock::cmpXYPrimaryRot); |
3342 | } |
3343 | |
3344 | // column assignment |
3345 | for (i = 0; i < nBlocks; ++i) { |
3346 | blk0 = blocks[i]; |
3347 | col1 = 0; |
3348 | for (j = 0; j < i; ++j) { |
3349 | blk1 = blocks[j]; |
3350 | col2 = 0; // make gcc happy |
3351 | switch (primaryRot) { |
3352 | case 0: |
3353 | if (blk0->xMin > blk1->xMax) { |
3354 | col2 = blk1->col + blk1->nColumns + 3; |
3355 | } else if (blk1->xMax == blk1->xMin) { |
3356 | col2 = blk1->col; |
3357 | } else { |
3358 | col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) / (blk1->xMax - blk1->xMin)) * blk1->nColumns); |
3359 | } |
3360 | break; |
3361 | case 1: |
3362 | if (blk0->yMin > blk1->yMax) { |
3363 | col2 = blk1->col + blk1->nColumns + 3; |
3364 | } else if (blk1->yMax == blk1->yMin) { |
3365 | col2 = blk1->col; |
3366 | } else { |
3367 | col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) / (blk1->yMax - blk1->yMin)) * blk1->nColumns); |
3368 | } |
3369 | break; |
3370 | case 2: |
3371 | if (blk0->xMax < blk1->xMin) { |
3372 | col2 = blk1->col + blk1->nColumns + 3; |
3373 | } else if (blk1->xMin == blk1->xMax) { |
3374 | col2 = blk1->col; |
3375 | } else { |
3376 | col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) / (blk1->xMin - blk1->xMax)) * blk1->nColumns); |
3377 | } |
3378 | break; |
3379 | case 3: |
3380 | if (blk0->yMax < blk1->yMin) { |
3381 | col2 = blk1->col + blk1->nColumns + 3; |
3382 | } else if (blk1->yMin == blk1->yMax) { |
3383 | col2 = blk1->col; |
3384 | } else { |
3385 | col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) / (blk1->yMin - blk1->yMax)) * blk1->nColumns); |
3386 | } |
3387 | break; |
3388 | } |
3389 | if (col2 > col1) { |
3390 | col1 = col2; |
3391 | } |
3392 | } |
3393 | blk0->col = col1; |
3394 | for (line = blk0->lines; line; line = line->next) { |
3395 | for (j = 0; j <= line->len; ++j) { |
3396 | line->col[j] += col1; |
3397 | } |
3398 | } |
3399 | } |
3400 | } |
3401 | |
3402 | #if 0 // for debugging |
3403 | printf("*** blocks, after column assignment ***\n" ); |
3404 | for (blk = blkList; blk; blk = blk->next) { |
3405 | printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n" , |
3406 | blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col, |
3407 | blk->nColumns); |
3408 | for (line = blk->lines; line; line = line->next) { |
3409 | printf(" line: col[0]=%d\n" , line->col[0]); |
3410 | for (word0 = line->words; word0; word0 = word0->next) { |
3411 | printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '" , |
3412 | word0->xMin, word0->xMax, word0->yMin, word0->yMax, |
3413 | word0->base, word0->fontSize, word0->spaceAfter); |
3414 | for (i = 0; i < word0->len; ++i) { |
3415 | fputc(word0->text[i] & 0xff, stdout); |
3416 | } |
3417 | printf("'\n" ); |
3418 | } |
3419 | } |
3420 | } |
3421 | printf("\n" ); |
3422 | #endif |
3423 | |
3424 | //----- reading order sort |
3425 | |
3426 | // compute space on left and right sides of each block |
3427 | for (int i = 0; i < nBlocks; ++i) { |
3428 | blk0 = blocks[i]; |
3429 | for (j = 0; j < nBlocks; ++j) { |
3430 | blk1 = blocks[j]; |
3431 | if (blk1 != blk0) { |
3432 | blk0->updatePriMinMax(blk: blk1); |
3433 | } |
3434 | } |
3435 | } |
3436 | |
3437 | #if 0 // for debugging |
3438 | printf("PAGE\n" ); |
3439 | #endif |
3440 | |
3441 | int sortPos = 0; |
3442 | bool *visited = (bool *)gmallocn(count: nBlocks, size: sizeof(bool)); |
3443 | for (int i = 0; i < nBlocks; i++) { |
3444 | visited[i] = false; |
3445 | } |
3446 | |
3447 | double bxMin0, byMin0, bxMin1, byMin1; |
3448 | int numTables = 0; |
3449 | int tableId = -1; |
3450 | int correspondenceX, correspondenceY; |
3451 | double xCentre1, yCentre1, xCentre2, yCentre2; |
3452 | double xCentre3, yCentre3, xCentre4, yCentre4; |
3453 | double deltaX, deltaY; |
3454 | TextBlock *fblk2 = nullptr, *fblk3 = nullptr, *fblk4 = nullptr; |
3455 | |
3456 | for (blk1 = blkList; blk1; blk1 = blk1->next) { |
3457 | blk1->ExMin = blk1->xMin; |
3458 | blk1->ExMax = blk1->xMax; |
3459 | blk1->EyMin = blk1->yMin; |
3460 | blk1->EyMax = blk1->yMax; |
3461 | |
3462 | bxMin0 = DBL_MAX; |
3463 | byMin0 = DBL_MAX; |
3464 | bxMin1 = DBL_MAX; |
3465 | byMin1 = DBL_MAX; |
3466 | |
3467 | fblk2 = nullptr; |
3468 | fblk3 = nullptr; |
3469 | fblk4 = nullptr; |
3470 | |
3471 | /* find fblk2, fblk3 and fblk4 so that |
3472 | * fblk2 is on the right of blk1 and overlap with blk1 in y axis |
3473 | * fblk3 is under blk1 and overlap with blk1 in x axis |
3474 | * fblk4 is under blk1 and on the right of blk1 |
3475 | * and they are closest to blk1 |
3476 | */ |
3477 | for (blk2 = blkList; blk2; blk2 = blk2->next) { |
3478 | if (blk2 != blk1) { |
3479 | if (blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin && blk2->xMin > blk1->xMax && blk2->xMin < bxMin0) { |
3480 | bxMin0 = blk2->xMin; |
3481 | fblk2 = blk2; |
3482 | } else if (blk2->xMin <= blk1->xMax && blk2->xMax >= blk1->xMin && blk2->yMin > blk1->yMax && blk2->yMin < byMin0) { |
3483 | byMin0 = blk2->yMin; |
3484 | fblk3 = blk2; |
3485 | } else if (blk2->xMin > blk1->xMax && blk2->xMin < bxMin1 && blk2->yMin > blk1->yMax && blk2->yMin < byMin1) { |
3486 | bxMin1 = blk2->xMin; |
3487 | byMin1 = blk2->yMin; |
3488 | fblk4 = blk2; |
3489 | } |
3490 | } |
3491 | } |
3492 | |
3493 | /* fblk4 can not overlap with fblk3 in x and with fblk2 in y |
3494 | * fblk2 can not overlap with fblk3 in x and y |
3495 | * fblk4 has to overlap with fblk3 in y and with fblk2 in x |
3496 | */ |
3497 | if (fblk2 != nullptr && fblk3 != nullptr && fblk4 != nullptr) { |
3498 | if (((fblk3->xMin <= fblk4->xMax && fblk3->xMax >= fblk4->xMin) || (fblk2->yMin <= fblk4->yMax && fblk2->yMax >= fblk4->yMin) || (fblk2->xMin <= fblk3->xMax && fblk2->xMax >= fblk3->xMin) |
3499 | || (fblk2->yMin <= fblk3->yMax && fblk2->yMax >= fblk3->yMin)) |
3500 | || !(fblk4->xMin <= fblk2->xMax && fblk4->xMax >= fblk2->xMin && fblk4->yMin <= fblk3->yMax && fblk4->yMax >= fblk3->yMin)) { |
3501 | fblk2 = nullptr; |
3502 | fblk3 = nullptr; |
3503 | fblk4 = nullptr; |
3504 | } |
3505 | } |
3506 | |
3507 | // if we found any then look whether they form a table |
3508 | if (fblk2 != nullptr && fblk3 != nullptr && fblk4 != nullptr) { |
3509 | tableId = -1; |
3510 | correspondenceX = 0; |
3511 | correspondenceY = 0; |
3512 | deltaX = 0.0; |
3513 | deltaY = 0.0; |
3514 | |
3515 | if (blk1->lines && blk1->lines->words) { |
3516 | deltaX = blk1->lines->words->getFontSize(); |
3517 | } |
3518 | if (fblk2->lines && fblk2->lines->words) { |
3519 | deltaX = deltaX < fblk2->lines->words->getFontSize() ? deltaX : fblk2->lines->words->getFontSize(); |
3520 | } |
3521 | if (fblk3->lines && fblk3->lines->words) { |
3522 | deltaX = deltaX < fblk3->lines->words->getFontSize() ? deltaX : fblk3->lines->words->getFontSize(); |
3523 | } |
3524 | if (fblk4->lines && fblk4->lines->words) { |
3525 | deltaX = deltaX < fblk4->lines->words->getFontSize() ? deltaX : fblk4->lines->words->getFontSize(); |
3526 | } |
3527 | |
3528 | deltaY = deltaX; |
3529 | |
3530 | deltaX *= minColSpacing1; |
3531 | deltaY *= maxIntraLineDelta; |
3532 | |
3533 | xCentre1 = (blk1->xMax + blk1->xMin) / 2.0; |
3534 | yCentre1 = (blk1->yMax + blk1->yMin) / 2.0; |
3535 | xCentre2 = (fblk2->xMax + fblk2->xMin) / 2.0; |
3536 | yCentre2 = (fblk2->yMax + fblk2->yMin) / 2.0; |
3537 | xCentre3 = (fblk3->xMax + fblk3->xMin) / 2.0; |
3538 | yCentre3 = (fblk3->yMax + fblk3->yMin) / 2.0; |
3539 | xCentre4 = (fblk4->xMax + fblk4->xMin) / 2.0; |
3540 | yCentre4 = (fblk4->yMax + fblk4->yMin) / 2.0; |
3541 | |
3542 | // are blocks centrally aligned in x ? |
3543 | if (fabs(x: xCentre1 - xCentre3) <= deltaX && fabs(x: xCentre2 - xCentre4) <= deltaX) { |
3544 | correspondenceX++; |
3545 | } |
3546 | |
3547 | // are blocks centrally aligned in y ? |
3548 | if (fabs(x: yCentre1 - yCentre2) <= deltaY && fabs(x: yCentre3 - yCentre4) <= deltaY) { |
3549 | correspondenceY++; |
3550 | } |
3551 | |
3552 | // are blocks aligned to the left ? |
3553 | if (fabs(x: blk1->xMin - fblk3->xMin) <= deltaX && fabs(x: fblk2->xMin - fblk4->xMin) <= deltaX) { |
3554 | correspondenceX++; |
3555 | } |
3556 | |
3557 | // are blocks aligned to the right ? |
3558 | if (fabs(x: blk1->xMax - fblk3->xMax) <= deltaX && fabs(x: fblk2->xMax - fblk4->xMax) <= deltaX) { |
3559 | correspondenceX++; |
3560 | } |
3561 | |
3562 | // are blocks aligned to the top ? |
3563 | if (fabs(x: blk1->yMin - fblk2->yMin) <= deltaY && fabs(x: fblk3->yMin - fblk4->yMin) <= deltaY) { |
3564 | correspondenceY++; |
3565 | } |
3566 | |
3567 | // are blocks aligned to the bottom ? |
3568 | if (fabs(x: blk1->yMax - fblk2->yMax) <= deltaY && fabs(x: fblk3->yMax - fblk4->yMax) <= deltaY) { |
3569 | correspondenceY++; |
3570 | } |
3571 | |
3572 | // are blocks aligned in x and y ? |
3573 | if (correspondenceX > 0 && correspondenceY > 0) { |
3574 | |
3575 | // find maximal tableId |
3576 | tableId = tableId < fblk4->tableId ? fblk4->tableId : tableId; |
3577 | tableId = tableId < fblk3->tableId ? fblk3->tableId : tableId; |
3578 | tableId = tableId < fblk2->tableId ? fblk2->tableId : tableId; |
3579 | tableId = tableId < blk1->tableId ? blk1->tableId : tableId; |
3580 | |
3581 | // if the tableId is -1, then we found new table |
3582 | if (tableId < 0) { |
3583 | tableId = numTables; |
3584 | numTables++; |
3585 | } |
3586 | |
3587 | blk1->tableId = tableId; |
3588 | fblk2->tableId = tableId; |
3589 | fblk3->tableId = tableId; |
3590 | fblk4->tableId = tableId; |
3591 | } |
3592 | } |
3593 | } |
3594 | |
3595 | /* set extended bounding boxes of all table entries |
3596 | * so that they contain whole table |
3597 | * (we need to process whole table size when comparing it |
3598 | * with regular text blocks) |
3599 | */ |
3600 | PDFRectangle *envelopes = new PDFRectangle[numTables]; |
3601 | TextBlock **ending_blocks = new TextBlock *[numTables]; |
3602 | |
3603 | for (int i = 0; i < numTables; i++) { |
3604 | envelopes[i].x1 = DBL_MAX; |
3605 | envelopes[i].x2 = DBL_MIN; |
3606 | envelopes[i].y1 = DBL_MAX; |
3607 | envelopes[i].y2 = DBL_MIN; |
3608 | ending_blocks[i] = nullptr; |
3609 | } |
3610 | |
3611 | for (blk1 = blkList; blk1; blk1 = blk1->next) { |
3612 | if (blk1->tableId >= 0) { |
3613 | if (blk1->ExMin < envelopes[blk1->tableId].x1) { |
3614 | envelopes[blk1->tableId].x1 = blk1->ExMin; |
3615 | if (!blk1->page->primaryLR) { |
3616 | ending_blocks[blk1->tableId] = blk1; |
3617 | } |
3618 | } |
3619 | |
3620 | if (blk1->ExMax > envelopes[blk1->tableId].x2) { |
3621 | envelopes[blk1->tableId].x2 = blk1->ExMax; |
3622 | if (blk1->page->primaryLR) { |
3623 | ending_blocks[blk1->tableId] = blk1; |
3624 | } |
3625 | } |
3626 | |
3627 | envelopes[blk1->tableId].y1 = blk1->EyMin < envelopes[blk1->tableId].y1 ? blk1->EyMin : envelopes[blk1->tableId].y1; |
3628 | envelopes[blk1->tableId].y2 = blk1->EyMax > envelopes[blk1->tableId].y2 ? blk1->EyMax : envelopes[blk1->tableId].y2; |
3629 | } |
3630 | } |
3631 | |
3632 | for (blk1 = blkList; blk1; blk1 = blk1->next) { |
3633 | if (blk1->tableId >= 0 && ending_blocks[blk1->tableId] && blk1->xMin <= ending_blocks[blk1->tableId]->xMax && blk1->xMax >= ending_blocks[blk1->tableId]->xMin) { |
3634 | blk1->tableEnd = true; |
3635 | } |
3636 | } |
3637 | |
3638 | for (blk1 = blkList; blk1; blk1 = blk1->next) { |
3639 | if (blk1->tableId >= 0) { |
3640 | blk1->ExMin = envelopes[blk1->tableId].x1; |
3641 | blk1->ExMax = envelopes[blk1->tableId].x2; |
3642 | blk1->EyMin = envelopes[blk1->tableId].y1; |
3643 | blk1->EyMax = envelopes[blk1->tableId].y2; |
3644 | } |
3645 | } |
3646 | delete[] envelopes; |
3647 | delete[] ending_blocks; |
3648 | |
3649 | /* set extended bounding boxes of all other blocks |
3650 | * so that they extend in x without hitting neighbours |
3651 | */ |
3652 | for (blk1 = blkList; blk1; blk1 = blk1->next) { |
3653 | if (!(blk1->tableId >= 0)) { |
3654 | double xMax = DBL_MAX; |
3655 | double xMin = DBL_MIN; |
3656 | |
3657 | for (blk2 = blkList; blk2; blk2 = blk2->next) { |
3658 | if (blk2 == blk1) { |
3659 | continue; |
3660 | } |
3661 | |
3662 | if (blk1->yMin <= blk2->yMax && blk1->yMax >= blk2->yMin) { |
3663 | if (blk2->xMin < xMax && blk2->xMin > blk1->xMax) { |
3664 | xMax = blk2->xMin; |
3665 | } |
3666 | |
3667 | if (blk2->xMax > xMin && blk2->xMax < blk1->xMin) { |
3668 | xMin = blk2->xMax; |
3669 | } |
3670 | } |
3671 | } |
3672 | |
3673 | for (blk2 = blkList; blk2; blk2 = blk2->next) { |
3674 | if (blk2 == blk1) { |
3675 | continue; |
3676 | } |
3677 | |
3678 | if (blk2->xMax > blk1->ExMax && blk2->xMax <= xMax && blk2->yMin >= blk1->yMax) { |
3679 | blk1->ExMax = blk2->xMax; |
3680 | } |
3681 | |
3682 | if (blk2->xMin < blk1->ExMin && blk2->xMin >= xMin && blk2->yMin >= blk1->yMax) { |
3683 | blk1->ExMin = blk2->xMin; |
3684 | } |
3685 | } |
3686 | } |
3687 | } |
3688 | |
3689 | int i = -1; |
3690 | for (blk1 = blkList; blk1; blk1 = blk1->next) { |
3691 | i++; |
3692 | sortPos = blk1->visitDepthFirst(blkList, pos1: i, sorted: blocks, sortPos, visited); |
3693 | } |
3694 | if (visited) { |
3695 | gfree(p: visited); |
3696 | } |
3697 | |
3698 | #if 0 // for debugging |
3699 | printf("*** blocks, after ro sort ***\n" ); |
3700 | for (i = 0; i < nBlocks; ++i) { |
3701 | blk = blocks[i]; |
3702 | printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n" , |
3703 | blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, |
3704 | blk->priMin, blk->priMax); |
3705 | for (line = blk->lines; line; line = line->next) { |
3706 | printf(" line:\n" ); |
3707 | for (word0 = line->words; word0; word0 = word0->next) { |
3708 | printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '" , |
3709 | word0->xMin, word0->xMax, word0->yMin, word0->yMax, |
3710 | word0->base, word0->fontSize, word0->spaceAfter); |
3711 | for (j = 0; j < word0->len; ++j) { |
3712 | fputc(word0->text[j] & 0xff, stdout); |
3713 | } |
3714 | printf("'\n" ); |
3715 | } |
3716 | } |
3717 | } |
3718 | printf("\n" ); |
3719 | fflush(stdout); |
3720 | #endif |
3721 | |
3722 | // build the flows |
3723 | //~ this needs to be adjusted for writing mode (vertical text) |
3724 | //~ this also needs to account for right-to-left column ordering |
3725 | while (flows) { |
3726 | flow = flows; |
3727 | flows = flows->next; |
3728 | delete flow; |
3729 | } |
3730 | flow = nullptr; |
3731 | flows = lastFlow = nullptr; |
3732 | // assume blocks are already in reading order, |
3733 | // and construct flows accordingly. |
3734 | for (i = 0; i < nBlocks; i++) { |
3735 | blk = blocks[i]; |
3736 | blk->next = nullptr; |
3737 | if (flow) { |
3738 | blk1 = blocks[i - 1]; |
3739 | blkSpace = maxBlockSpacing * blk1->lines->words->fontSize; |
3740 | if (blk1->secondaryDelta(blk) <= blkSpace && blk->isBelow(blk: blk1) && flow->blockFits(blk, prevBlk: blk1)) { |
3741 | flow->addBlock(blk); |
3742 | continue; |
3743 | } |
3744 | } |
3745 | flow = new TextFlow(this, blk); |
3746 | if (lastFlow) { |
3747 | lastFlow->next = flow; |
3748 | } else { |
3749 | flows = flow; |
3750 | } |
3751 | lastFlow = flow; |
3752 | } |
3753 | |
3754 | #if 0 // for debugging |
3755 | printf("*** flows ***\n" ); |
3756 | for (flow = flows; flow; flow = flow->next) { |
3757 | printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n" , |
3758 | flow->xMin, flow->xMax, flow->yMin, flow->yMax, |
3759 | flow->priMin, flow->priMax); |
3760 | for (blk = flow->blocks; blk; blk = blk->next) { |
3761 | printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n" , |
3762 | blk->rot, blk->ExMin, blk->ExMax, blk->EyMin, blk->EyMax, |
3763 | blk->priMin, blk->priMax); |
3764 | for (line = blk->lines; line; line = line->next) { |
3765 | printf(" line:\n" ); |
3766 | for (word0 = line->words; word0; word0 = word0->next) { |
3767 | printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '" , |
3768 | word0->xMin, word0->xMax, word0->yMin, word0->yMax, |
3769 | word0->base, word0->fontSize, word0->spaceAfter); |
3770 | for (i = 0; i < word0->len; ++i) { |
3771 | fputc(word0->text[i] & 0xff, stdout); |
3772 | } |
3773 | printf("'\n" ); |
3774 | } |
3775 | } |
3776 | } |
3777 | } |
3778 | printf("\n" ); |
3779 | #endif |
3780 | } |
3781 | |
3782 | void TextPage::adjustRotation(TextLine *line, int start, int end, double *xMin, double *xMax, double *yMin, double *yMax) |
3783 | { |
3784 | switch (line->rot) { |
3785 | case 0: |
3786 | *xMin = line->edge[start]; |
3787 | *xMax = line->edge[end]; |
3788 | *yMin = line->yMin; |
3789 | *yMax = line->yMax; |
3790 | break; |
3791 | case 1: |
3792 | *xMin = line->xMin; |
3793 | *xMax = line->xMax; |
3794 | *yMin = line->edge[start]; |
3795 | *yMax = line->edge[end]; |
3796 | break; |
3797 | case 2: |
3798 | *xMin = line->edge[end]; |
3799 | *xMax = line->edge[start]; |
3800 | *yMin = line->yMin; |
3801 | *yMax = line->yMax; |
3802 | break; |
3803 | case 3: |
3804 | *xMin = line->xMin; |
3805 | *xMax = line->xMax; |
3806 | *yMin = line->edge[end]; |
3807 | *yMax = line->edge[start]; |
3808 | break; |
3809 | } |
3810 | } |
3811 | |
3812 | bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax) |
3813 | { |
3814 | return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, ignoreDiacritics: false, matchAcrossLines: false, backward, wholeWord, xMin, yMin, xMax, yMax, continueMatch: nullptr, ignoredHyphen: nullptr); |
3815 | } |
3816 | |
3817 | bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, |
3818 | double *yMax) |
3819 | { |
3820 | return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, ignoreDiacritics, matchAcrossLines: false, backward, wholeWord, xMin, yMin, xMax, yMax, continueMatch: nullptr, ignoredHyphen: nullptr); |
3821 | } |
3822 | |
// Searches the page for the Unicode string <s> (<len> code points).
//
// Start position: the previous hit if <startAtLast> is set and a previous
// hit exists; otherwise the top of the page if <startAtTop>, else the point
// (*xMin, *yMin).  Stop position: the previous hit if <stopAtLast> is set
// and a previous hit exists; otherwise the bottom of the page if
// <stopAtBottom>, else the point (*xMax, *yMax).
//
// <caseSensitive>:    if false, both the search string and each line are
//                     upper-cased before comparison.
// <ignoreDiacritics>: compare against the ASCII7 translation of each line;
//                     silently disabled if the normalized search string
//                     contains a non-ASCII7 character.
// <matchAcrossLines>: allow a match to continue on the following line
//                     (silently disabled for backward searches).
// <backward>:         search from the last block towards the first.
// <wholeWord>:        require a non-alphanumeric character (or line
//                     boundary) on both sides of the match.
// <continueMatch>:    optional; when the reported match continues on the
//                     next line, receives the rectangle of the part on that
//                     line, otherwise its x1 is set to the maximum double.
// <ignoredHyphen>:    optional; set together with <continueMatch> handling,
//                     true when an end-of-line hyphen was skipped.
//
// Returns true and stores the bounding box of the match in *xMin, *yMin,
// *xMax, *yMax (also remembering it as the "last find") when a match is
// found; returns false otherwise, including when <len> is zero or the page
// is in raw order.
bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool matchAcrossLines, bool backward, bool wholeWord, double *xMin,
                        double *yMin, double *xMax, double *yMax, PDFRectangle *continueMatch, bool *ignoredHyphen)
{
    TextBlock *blk;
    TextLine *line;
    Unicode *s2, *txt, *reordered;
    Unicode *p;
    TextLine *nextline;
    Unicode *nextline_txt;
    int nextline_len;
    bool nextlineAfterHyphen = false;
    int txtSize, m, i, j, k;
    double xStart, yStart, xStop, yStop;
    double xMin0, yMin0, xMax0, yMax0;
    double xMin1, yMin1, xMax1, yMax1;
    double xMin2, yMin2, xMax2, yMax2;
    bool found;

    // nothing to search for
    if (len == 0) {
        return false;
    }

    // searching is not supported on raw-order pages
    if (rawOrder) {
        return false;
    }

    if (matchAcrossLines && backward) {
        // matchAcrossLines is unimplemented for backward search
        matchAcrossLines = false;
    }

    // handle right-to-left text
    reordered = (Unicode *)gmallocn(count: len, size: sizeof(Unicode));
    reorderText(text: s, len, uMap: nullptr, primaryLR, s: nullptr, u: reordered);

    // normalize the search string (note: this updates len to the
    // normalized length)
    s2 = unicodeNormalizeNFKC(in: reordered, len, out_len: &len, indices: nullptr);

    // if search string is not pure ascii then don't
    // use ignoreDiacritics (as they won't match)
    if (!caseSensitive) {
        // convert the search string to uppercase
        // (no early exit here: every character still has to be upper-cased)
        for (i = 0; i < len; ++i) {
            s2[i] = unicodeToUpper(c: s2[i]);
            if (ignoreDiacritics && !isAscii7(uchar: s2[i])) {
                ignoreDiacritics = false;
            }
        }
    } else if (ignoreDiacritics) {
        for (i = 0; i < len; ++i) {
            if (!isAscii7(uchar: s2[i])) {
                ignoreDiacritics = false;
                break;
            }
        }
    }

    // txt is a scratch buffer that is only allocated (and grown on demand)
    // in the case-insensitive path; in the case-sensitive path it merely
    // points at a buffer cached on the line
    txt = nullptr;
    txtSize = 0;

    // work out the start and stop positions of the search
    xStart = yStart = xStop = yStop = 0;
    if (startAtLast && haveLastFind) {
        xStart = lastFindXMin;
        yStart = lastFindYMin;
    } else if (!startAtTop) {
        xStart = *xMin;
        yStart = *yMin;
    }
    if (stopAtLast && haveLastFind) {
        xStop = lastFindXMin;
        yStop = lastFindYMin;
    } else if (!stopAtBottom) {
        xStop = *xMax;
        yStop = *yMax;
    }

    // (xMin0, yMin0, xMax0, yMax0) is the best match found so far,
    // (xMin1, ...) the candidate currently being examined
    found = false;
    xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
    xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy

    // walk the blocks in search direction
    for (i = backward ? nBlocks - 1 : 0; backward ? i >= 0 : i < nBlocks; i += backward ? -1 : 1) {
        blk = blocks[i];

        // check: is the block above the top limit?
        // (this only works if the page's primary rotation is zero --
        // otherwise the blocks won't be sorted in the useful order)
        if (!startAtTop && primaryRot == 0 && (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
            continue;
        }

        // check: is the block below the bottom limit?
        // (this only works if the page's primary rotation is zero --
        // otherwise the blocks won't be sorted in the useful order)
        if (!stopAtBottom && primaryRot == 0 && (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
            break;
        }

        for (line = blk->lines; line; line = line->next) {

            // check: is the line above the top limit?
            // (this only works if the page's primary rotation is zero --
            // otherwise the lines won't be sorted in the useful order)
            if (!startAtTop && primaryRot == 0 && (backward ? line->yMin > yStart : line->yMin < yStart)) {
                continue;
            }

            // check: is the line below the bottom limit?
            // (this only works if the page's primary rotation is zero --
            // otherwise the lines won't be sorted in the useful order)
            if (!stopAtBottom && primaryRot == 0 && (backward ? line->yMin < yStop : line->yMin > yStop)) {
                continue;
            }

            // the NFKC-normalized form of the line is computed once and
            // cached on the line
            if (!line->normalized) {
                line->normalized = unicodeNormalizeNFKC(in: line->text, len: line->len, out_len: &line->normalized_len, indices: &line->normalized_idx, reverseRTL: true);
            }

            // locate the line that follows this one: the next line of the
            // block, or else the first line of the adjacent block in search
            // direction
            nextline = nullptr;
            nextline_txt = nullptr;
            nextline_len = 0;
            if (line->next) {
                nextline = line->next;
            } else {
                // set nextline to first line of next block
                int ind = i + (backward ? -1 : 1);
                if ((backward && ind >= 0) || (!backward && ind < nBlocks)) {
                    nextline = blocks[ind]->lines;
                }
            }

            if (matchAcrossLines && nextline && !nextline->normalized) {
                nextline->normalized = unicodeNormalizeNFKC(in: nextline->text, len: nextline->len, out_len: &nextline->normalized_len, indices: &nextline->normalized_idx, reverseRTL: true);
            }

            // m is the number of characters of this line that will be
            // compared (normalized length, or ASCII length below)
            m = line->normalized_len;

            if (ignoreDiacritics) {
                // the ASCII7 translation is likewise cached on the line
                if (!line->ascii_translation) {
                    unicodeToAscii7(in: line->normalized, len: line->normalized_len, ucs4_out: &line->ascii_translation, out_len: &line->ascii_len, in_idx: line->normalized_idx, indices: &line->ascii_idx);
                }
                if (line->ascii_len) {
                    m = line->ascii_len;
                } else {
                    // NOTE(review): this clears the flag for every remaining
                    // line of the search, not only for this one -- confirm
                    // that this is intended
                    ignoreDiacritics = false;
                }

                if (matchAcrossLines && nextline && !nextline->ascii_translation) {
                    unicodeToAscii7(in: nextline->normalized, len: nextline->normalized_len, ucs4_out: &nextline->ascii_translation, out_len: &nextline->ascii_len, in_idx: nextline->normalized_idx, indices: &nextline->ascii_idx);
                }
            }
            if (!caseSensitive) {
                // convert the line to uppercase, into the scratch buffer
                if (m > txtSize) {
                    txt = (Unicode *)greallocn(p: txt, count: m, size: sizeof(Unicode));
                    txtSize = m;
                }
                for (k = 0; k < m; ++k) {
                    if (ignoreDiacritics) {
                        txt[k] = unicodeToUpper(c: line->ascii_translation[k]);
                    } else {
                        txt[k] = unicodeToUpper(c: line->normalized[k]);
                    }
                }
                // the upper-cased copy of the next line is freshly allocated
                // here and released at the bottom of the line loop
                if (matchAcrossLines && nextline) {
                    nextline_len = ignoreDiacritics ? nextline->ascii_len : nextline->normalized_len;
                    nextline_txt = (Unicode *)gmallocn(count: nextline_len, size: sizeof(Unicode));
                    for (k = 0; k < nextline_len; ++k) {
                        nextline_txt[k] = ignoreDiacritics ? unicodeToUpper(c: nextline->ascii_translation[k]) : unicodeToUpper(c: nextline->normalized[k]);
                    }
                }
            } else {
                // case-sensitive: compare directly against the cached
                // buffers, no copies are made
                if (ignoreDiacritics) {
                    txt = line->ascii_translation;
                } else {
                    txt = line->normalized;
                }

                if (matchAcrossLines && nextline) {
                    nextline_len = ignoreDiacritics ? nextline->ascii_len : nextline->normalized_len;
                    nextline_txt = ignoreDiacritics ? nextline->ascii_translation : nextline->normalized;
                }
            }

            // search each position in this line
            // (when a next line is available, a match may start at any
            // position up to the last character of this line; otherwise it
            // has to fit entirely within the line)
            j = backward ? m - len : 0;
            p = txt + j;
            while (backward ? j >= 0 : j <= m - (nextline_txt ? 1 : len)) {
                bool wholeWordStartIsOk, wholeWordEndIsOk;
                if (wholeWord) {
                    wholeWordStartIsOk = j == 0 || !unicodeTypeAlphaNum(c: txt[j - 1]);
                    if (nextline_txt) {
                        wholeWordEndIsOk = true; // word end may be in next line, so we'll check it later
                    } else {
                        wholeWordEndIsOk = j + len == m || !unicodeTypeAlphaNum(c: txt[j + len]);
                    }
                }
                if (!wholeWord || (wholeWordStartIsOk && wholeWordEndIsOk)) {
                    int n = 0; // number of next-line characters matched
                    bool spaceConsumedByNewline = false;
                    bool found_it;

                    // compare the strings
                    for (k = 0; k < len; ++k) {
                        bool last_char_of_line = j + k == m - 1;
                        bool last_char_of_search_term = k == len - 1;
                        bool match_started = (bool)k;

                        // taken on a mismatch, or when the end of this line
                        // is reached with search characters still left and
                        // a next line to continue on
                        if (p[k] != s2[k] || (nextline_txt && last_char_of_line && !last_char_of_search_term)) {
                            // now check if the comparison failed at the end-of-line hyphen,
                            // and if so, keep on comparing at the next line
                            nextlineAfterHyphen = false;

                            if (s2[k] == p[k]) {
                                // the last character of the line matches:
                                // only continue on the next line if that
                                // character is a hyphen or the search string
                                // continues with whitespace
                                if (p[k] != (Unicode)'-' && !UnicodeIsWhitespace(ucs4: s2[k + 1])) {
                                    break;
                                }
                                k++;
                            } else if (!match_started || p[k] != (Unicode)'-' || !last_char_of_line || UnicodeIsWhitespace(ucs4: s2[k])) {
                                break;
                            } else {
                                // mismatch on a trailing hyphen after the
                                // match has started: skip the hyphen (k is
                                // not advanced) and go on with the next line
                                nextlineAfterHyphen = true;
                            }

                            // compare the rest of the search string with
                            // the beginning of the next line
                            for (; n < nextline_len && k < len; ++k, ++n) {
                                if (nextline_txt[n] != s2[k]) {
                                    // a whitespace in the search string may
                                    // stand for the line break itself (once):
                                    // n is set to -1 so that the loop
                                    // increment restarts it at 0
                                    if (!spaceConsumedByNewline && !n && UnicodeIsWhitespace(ucs4: s2[k])) {
                                        n = -1;
                                        spaceConsumedByNewline = true;
                                        continue;
                                    }
                                    break;
                                }
                            }
                            break;
                        }
                    }

                    // the whole search string was consumed
                    found_it = k == len;
                    if (found_it && nextline_txt && wholeWord) { // check word end for nextline case
                        if (n) { // Match ended at next line
                            wholeWordEndIsOk = n == nextline_len || !unicodeTypeAlphaNum(c: nextline_txt[n]);
                        } else { // Match ended on same line
                            wholeWordEndIsOk = j + len == m || !unicodeTypeAlphaNum(c: txt[j + len]);
                        }

                        if (!wholeWordEndIsOk) {
                            found_it = false;
                        }
                    }
                    // found it
                    if (found_it) {
                        bool nextLineMatch = (bool)n;
                        // the whitespace that stood for the line break was
                        // counted in k; undo that before k - n is used below
                        // as an offset into this line
                        if (spaceConsumedByNewline) {
                            k--;
                        }
                        // where s2 matches a subsequence of a compatibility equivalence
                        // decomposition, highlight the entire glyph, since we don't know
                        // the internal layout of subglyph components
                        int normStart, normAfterEnd;
                        if (ignoreDiacritics) {
                            normStart = line->ascii_idx[j];
                            if (nextline_txt) {
                                normAfterEnd = line->ascii_idx[j + k - n];
                            } else {
                                normAfterEnd = line->ascii_idx[j + len - 1] + 1;
                            }
                        } else {
                            normStart = line->normalized_idx[j];
                            if (nextline_txt) {
                                normAfterEnd = line->normalized_idx[j + k - n];
                            } else {
                                normAfterEnd = line->normalized_idx[j + len - 1] + 1;
                            }
                        }

                        // bounding box of the part of the match on this line
                        adjustRotation(line, start: normStart, end: normAfterEnd, xMin: &xMin1, xMax: &xMax1, yMin: &yMin1, yMax: &yMax1);

                        // accept the candidate only if it lies between the
                        // start and stop positions, and keep the one that
                        // comes first in search direction
                        if (backward) {
                            if ((startAtTop || yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) && (stopAtBottom || yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) {
                                if (!found || yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) {
                                    xMin0 = xMin1;
                                    xMax0 = xMax1;
                                    yMin0 = yMin1;
                                    yMax0 = yMax1;
                                    found = true;
                                }
                            }
                        } else {
                            if ((startAtTop || yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) && (stopAtBottom || yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) {
                                if (!found || yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
                                    xMin0 = xMin1;
                                    xMax0 = xMax1;
                                    yMin0 = yMin1;
                                    yMax0 = yMax1;
                                    found = true;
                                    if (nextLineMatch) { // set the out parameters
                                        if (ignoredHyphen) {
                                            *ignoredHyphen = nextlineAfterHyphen;
                                        }

                                        if (continueMatch) {
                                            // rectangle of the first n
                                            // characters of the next line
                                            adjustRotation(line: nextline, start: 0, end: n, xMin: &xMin2, xMax: &xMax2, yMin: &yMin2, yMax: &yMax2);
                                            continueMatch->x1 = xMin2;
                                            continueMatch->y1 = yMax2;
                                            continueMatch->x2 = xMax2;
                                            continueMatch->y2 = yMin2;
                                        }
                                    } else if (continueMatch && continueMatch->x1 != std::numeric_limits<double>::max()) {
                                        // the new best match does not continue
                                        // on the next line: reset the outputs
                                        // (x1 == max marks "no continuation")
                                        if (ignoredHyphen) {
                                            *ignoredHyphen = false;
                                        }

                                        continueMatch->x1 = std::numeric_limits<double>::max();
                                    }
                                }
                            }
                        }
                    }
                }
                // advance to the next candidate position
                if (backward) {
                    --j;
                    --p;
                } else {
                    ++j;
                    ++p;
                }
            }

            // free the next-line buffer only if it was allocated above,
            // i.e. if it is not one of the buffers cached on nextline
            if (nextline_txt && nextline_txt != nextline->ascii_translation && nextline_txt != nextline->normalized) {
                gfree(p: nextline_txt);
            }
        }
    }

    gfree(p: s2);
    gfree(p: reordered);
    // txt is owned by this function only in the case-insensitive path
    if (!caseSensitive) {
        gfree(p: txt);
    }

    if (found) {
        // report the best match and remember it for startAtLast/stopAtLast
        *xMin = xMin0;
        *xMax = xMax0;
        *yMin = yMin0;
        *yMax = yMax0;
        lastFindXMin = xMin0;
        lastFindYMin = yMin0;
        haveLastFind = true;
        return true;
    }

    return false;
}
4176 | |
4177 | GooString *TextPage::getText(double xMin, double yMin, double xMax, double yMax, EndOfLineKind textEOL) const |
4178 | { |
4179 | GooString *s; |
4180 | const UnicodeMap *uMap; |
4181 | TextBlock *blk; |
4182 | TextLine *line; |
4183 | TextLineFrag *frags; |
4184 | int nFrags, fragsSize; |
4185 | TextLineFrag *frag; |
4186 | char space[8], eol[16]; |
4187 | int spaceLen, eolLen; |
4188 | int lastRot; |
4189 | double x, y, delta; |
4190 | int col, idx0, idx1, i, j; |
4191 | bool multiLine, oneRot; |
4192 | |
4193 | s = new GooString(); |
4194 | |
4195 | // get the output encoding |
4196 | if (!(uMap = globalParams->getTextEncoding())) { |
4197 | return s; |
4198 | } |
4199 | |
4200 | if (rawOrder) { |
4201 | TextWord *word; |
4202 | char mbc[16]; |
4203 | int mbc_len; |
4204 | |
4205 | for (word = rawWords; word && word <= rawLastWord; word = word->next) { |
4206 | for (j = 0; j < word->getLength(); ++j) { |
4207 | double gXMin, gXMax, gYMin, gYMax; |
4208 | word->getCharBBox(charIdx: j, xMinA: &gXMin, yMinA: &gYMin, xMaxA: &gXMax, yMaxA: &gYMax); |
4209 | if (xMin <= gXMin && gXMax <= xMax && yMin <= gYMin && gYMax <= yMax) { |
4210 | mbc_len = uMap->mapUnicode(u: *(word->getChar(idx: j)), buf: mbc, bufSize: sizeof(mbc)); |
4211 | s->append(str: mbc, lengthA: mbc_len); |
4212 | } |
4213 | } |
4214 | } |
4215 | return s; |
4216 | } |
4217 | |
4218 | spaceLen = uMap->mapUnicode(u: 0x20, buf: space, bufSize: sizeof(space)); |
4219 | eolLen = 0; // make gcc happy |
4220 | switch (textEOL) { |
4221 | case eolUnix: |
4222 | eolLen = uMap->mapUnicode(u: 0x0a, buf: eol, bufSize: sizeof(eol)); |
4223 | break; |
4224 | case eolDOS: |
4225 | eolLen = uMap->mapUnicode(u: 0x0d, buf: eol, bufSize: sizeof(eol)); |
4226 | eolLen += uMap->mapUnicode(u: 0x0a, buf: eol + eolLen, bufSize: sizeof(eol) - eolLen); |
4227 | break; |
4228 | case eolMac: |
4229 | eolLen = uMap->mapUnicode(u: 0x0d, buf: eol, bufSize: sizeof(eol)); |
4230 | break; |
4231 | } |
4232 | |
4233 | //~ writing mode (horiz/vert) |
4234 | |
4235 | // collect the line fragments that are in the rectangle |
4236 | fragsSize = 256; |
4237 | frags = (TextLineFrag *)gmallocn(count: fragsSize, size: sizeof(TextLineFrag)); |
4238 | nFrags = 0; |
4239 | lastRot = -1; |
4240 | oneRot = true; |
4241 | for (i = 0; i < nBlocks; ++i) { |
4242 | blk = blocks[i]; |
4243 | if (xMin < blk->xMax && blk->xMin < xMax && yMin < blk->yMax && blk->yMin < yMax) { |
4244 | for (line = blk->lines; line; line = line->next) { |
4245 | if (xMin < line->xMax && line->xMin < xMax && yMin < line->yMax && line->yMin < yMax) { |
4246 | idx0 = idx1 = -1; |
4247 | switch (line->rot) { |
4248 | case 0: |
4249 | y = 0.5 * (line->yMin + line->yMax); |
4250 | if (yMin < y && y < yMax) { |
4251 | j = 0; |
4252 | while (j < line->len) { |
4253 | if (0.5 * (line->edge[j] + line->edge[j + 1]) > xMin) { |
4254 | idx0 = j; |
4255 | break; |
4256 | } |
4257 | ++j; |
4258 | } |
4259 | j = line->len - 1; |
4260 | while (j >= 0) { |
4261 | if (0.5 * (line->edge[j] + line->edge[j + 1]) < xMax) { |
4262 | idx1 = j; |
4263 | break; |
4264 | } |
4265 | --j; |
4266 | } |
4267 | } |
4268 | break; |
4269 | case 1: |
4270 | x = 0.5 * (line->xMin + line->xMax); |
4271 | if (xMin < x && x < xMax) { |
4272 | j = 0; |
4273 | while (j < line->len) { |
4274 | if (0.5 * (line->edge[j] + line->edge[j + 1]) > yMin) { |
4275 | idx0 = j; |
4276 | break; |
4277 | } |
4278 | ++j; |
4279 | } |
4280 | j = line->len - 1; |
4281 | while (j >= 0) { |
4282 | if (0.5 * (line->edge[j] + line->edge[j + 1]) < yMax) { |
4283 | idx1 = j; |
4284 | break; |
4285 | } |
4286 | --j; |
4287 | } |
4288 | } |
4289 | break; |
4290 | case 2: |
4291 | y = 0.5 * (line->yMin + line->yMax); |
4292 | if (yMin < y && y < yMax) { |
4293 | j = 0; |
4294 | while (j < line->len) { |
4295 | if (0.5 * (line->edge[j] + line->edge[j + 1]) < xMax) { |
4296 | idx0 = j; |
4297 | break; |
4298 | } |
4299 | ++j; |
4300 | } |
4301 | j = line->len - 1; |
4302 | while (j >= 0) { |
4303 | if (0.5 * (line->edge[j] + line->edge[j + 1]) > xMin) { |
4304 | idx1 = j; |
4305 | break; |
4306 | } |
4307 | --j; |
4308 | } |
4309 | } |
4310 | break; |
4311 | case 3: |
4312 | x = 0.5 * (line->xMin + line->xMax); |
4313 | if (xMin < x && x < xMax) { |
4314 | j = 0; |
4315 | while (j < line->len) { |
4316 | if (0.5 * (line->edge[j] + line->edge[j + 1]) < yMax) { |
4317 | idx0 = j; |
4318 | break; |
4319 | } |
4320 | ++j; |
4321 | } |
4322 | j = line->len - 1; |
4323 | while (j >= 0) { |
4324 | if (0.5 * (line->edge[j] + line->edge[j + 1]) > yMin) { |
4325 | idx1 = j; |
4326 | break; |
4327 | } |
4328 | --j; |
4329 | } |
4330 | } |
4331 | break; |
4332 | } |
4333 | if (idx0 >= 0 && idx1 >= 0) { |
4334 | if (nFrags == fragsSize) { |
4335 | fragsSize *= 2; |
4336 | frags = (TextLineFrag *)greallocn(p: frags, count: fragsSize, size: sizeof(TextLineFrag)); |
4337 | } |
4338 | frags[nFrags].init(lineA: line, startA: idx0, lenA: idx1 - idx0 + 1); |
4339 | ++nFrags; |
4340 | if (lastRot >= 0 && line->rot != lastRot) { |
4341 | oneRot = false; |
4342 | } |
4343 | lastRot = line->rot; |
4344 | } |
4345 | } |
4346 | } |
4347 | } |
4348 | } |
4349 | |
4350 | // sort the fragments and generate the string |
4351 | if (nFrags > 0) { |
4352 | |
4353 | for (i = 0; i < nFrags; ++i) { |
4354 | frags[i].computeCoords(oneRot); |
4355 | } |
4356 | assignColumns(frags, nFrags, rot: oneRot); |
4357 | |
4358 | // if all lines in the region have the same rotation, use it; |
4359 | // otherwise, use the page's primary rotation |
4360 | if (oneRot) { |
4361 | qsort(base: frags, nmemb: nFrags, size: sizeof(TextLineFrag), compar: &TextLineFrag::cmpYXLineRot); |
4362 | } else { |
4363 | qsort(base: frags, nmemb: nFrags, size: sizeof(TextLineFrag), compar: &TextLineFrag::cmpYXPrimaryRot); |
4364 | } |
4365 | i = 0; |
4366 | while (i < nFrags) { |
4367 | delta = maxIntraLineDelta * frags[i].line->words->fontSize; |
4368 | for (j = i + 1; j < nFrags && fabs(x: frags[j].base - frags[i].base) < delta; ++j) { |
4369 | ; |
4370 | } |
4371 | qsort(base: frags + i, nmemb: j - i, size: sizeof(TextLineFrag), compar: oneRot ? &TextLineFrag::cmpXYColumnLineRot : &TextLineFrag::cmpXYColumnPrimaryRot); |
4372 | i = j; |
4373 | } |
4374 | |
4375 | col = 0; |
4376 | multiLine = false; |
4377 | for (i = 0; i < nFrags; ++i) { |
4378 | frag = &frags[i]; |
4379 | |
4380 | // insert a return |
4381 | if (frag->col < col || (i > 0 && fabs(x: frag->base - frags[i - 1].base) > maxIntraLineDelta * frags[i - 1].line->words->fontSize)) { |
4382 | s->append(str: eol, lengthA: eolLen); |
4383 | col = 0; |
4384 | multiLine = true; |
4385 | } |
4386 | |
4387 | // column alignment |
4388 | for (; col < frag->col; ++col) { |
4389 | s->append(str: space, lengthA: spaceLen); |
4390 | } |
4391 | |
4392 | // get the fragment text |
4393 | col += dumpFragment(text: frag->line->text + frag->start, len: frag->len, uMap, s); |
4394 | } |
4395 | |
4396 | if (multiLine) { |
4397 | s->append(str: eol, lengthA: eolLen); |
4398 | } |
4399 | } |
4400 | |
4401 | gfree(p: frags); |
4402 | |
4403 | return s; |
4404 | } |
4405 | |
// Abstract interface with one callback per level of the text hierarchy
// (block, line, word).  Each callback receives the element together with
// the begin/end of the visited sub-range and the selection rectangle.
// Implementations in this file: TextSelectionDumper, TextSelectionSizer
// and TextSelectionPainter.
class TextSelectionVisitor
{
public:
    explicit TextSelectionVisitor(TextPage *page);
    virtual ~TextSelectionVisitor();
    // visitors are not copyable
    TextSelectionVisitor(const TextSelectionVisitor &) = delete;
    TextSelectionVisitor &operator=(const TextSelectionVisitor &) = delete;
    // <begin>/<end> delimit the lines of <block> being visited
    virtual void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) = 0;
    // <begin>/<end> delimit the words of <line> being visited;
    // <edge_begin>/<edge_end> are indices into the line's edge array
    virtual void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) = 0;
    // <begin>/<end> are character indices within <word>
    virtual void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) = 0;

protected:
    // the page passed to the constructor
    TextPage *page;
};
4420 | |
4421 | TextSelectionVisitor::TextSelectionVisitor(TextPage *p) : page(p) { } |
4422 | |
// Out-of-line defaulted destructor; the visitor does not delete the page
// pointer it holds.
TextSelectionVisitor::~TextSelectionVisitor() = default;
4424 | |
4425 | class TextSelectionDumper : public TextSelectionVisitor |
4426 | { |
4427 | public: |
4428 | explicit TextSelectionDumper(TextPage *page); |
4429 | ~TextSelectionDumper() override; |
4430 | |
4431 | void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) override {}; |
4432 | void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) override; |
4433 | void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) override; |
4434 | void endPage(); |
4435 | |
4436 | GooString *getText(); |
4437 | std::vector<TextWordSelection *> **takeWordList(int *nLines); |
4438 | |
4439 | private: |
4440 | void startLine(); |
4441 | void finishLine(); |
4442 | |
4443 | std::vector<TextWordSelection *> **lines; |
4444 | int nLines, linesSize; |
4445 | std::vector<TextWordSelection *> *words; |
4446 | int tableId; |
4447 | TextBlock *currentBlock; |
4448 | }; |
4449 | |
4450 | TextSelectionDumper::TextSelectionDumper(TextPage *p) : TextSelectionVisitor(p) |
4451 | { |
4452 | linesSize = 256; |
4453 | lines = (std::vector<TextWordSelection *> **)gmallocn(count: linesSize, size: sizeof(std::vector<TextWordSelection *> *)); |
4454 | nLines = 0; |
4455 | |
4456 | tableId = -1; |
4457 | currentBlock = nullptr; |
4458 | words = nullptr; |
4459 | } |
4460 | |
4461 | TextSelectionDumper::~TextSelectionDumper() |
4462 | { |
4463 | for (int i = 0; i < nLines; i++) { |
4464 | for (auto entry : *(lines[i])) { |
4465 | delete entry; |
4466 | } |
4467 | delete lines[i]; |
4468 | } |
4469 | gfree(p: lines); |
4470 | } |
4471 | |
4472 | void TextSelectionDumper::startLine() |
4473 | { |
4474 | finishLine(); |
4475 | words = new std::vector<TextWordSelection *>(); |
4476 | } |
4477 | |
4478 | void TextSelectionDumper::finishLine() |
4479 | { |
4480 | if (nLines == linesSize) { |
4481 | linesSize *= 2; |
4482 | lines = (std::vector<TextWordSelection *> **)grealloc(p: lines, size: linesSize * sizeof(std::vector<TextWordSelection *> *)); |
4483 | } |
4484 | |
4485 | if (words && words->size() > 0) { |
4486 | // Reverse word order for RTL text. Fixes #53 for glib backend (Evince) |
4487 | if (!page->primaryLR) { |
4488 | std::reverse(first: words->begin(), last: words->end()); |
4489 | } |
4490 | |
4491 | lines[nLines++] = words; |
4492 | } else if (words) { |
4493 | delete words; |
4494 | } |
4495 | words = nullptr; |
4496 | } |
4497 | |
4498 | void TextSelectionDumper::visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) |
4499 | { |
4500 | TextLineFrag frag; |
4501 | |
4502 | frag.init(lineA: line, startA: edge_begin, lenA: edge_end - edge_begin); |
4503 | |
4504 | if (tableId >= 0 && frag.line->blk->tableId < 0) { |
4505 | finishLine(); |
4506 | |
4507 | tableId = -1; |
4508 | currentBlock = nullptr; |
4509 | } |
4510 | |
4511 | if (frag.line->blk->tableId >= 0) { // a table |
4512 | if (tableId == -1) { |
4513 | tableId = frag.line->blk->tableId; |
4514 | currentBlock = frag.line->blk; |
4515 | } |
4516 | |
4517 | if (currentBlock == frag.line->blk) { // the same block |
4518 | startLine(); |
4519 | } else { // another block |
4520 | if (currentBlock->tableEnd) { // previous block ended its row |
4521 | startLine(); |
4522 | } |
4523 | currentBlock = frag.line->blk; |
4524 | } |
4525 | } else { // not a table |
4526 | startLine(); |
4527 | } |
4528 | } |
4529 | |
4530 | void TextSelectionDumper::visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) |
4531 | { |
4532 | words->push_back(x: new TextWordSelection(word, begin, end)); |
4533 | } |
4534 | |
// Finishes the line that is still being collected, so that it is stored
// (or discarded if empty) by finishLine().
void TextSelectionDumper::endPage()
{
    finishLine();
}
4539 | |
4540 | GooString *TextSelectionDumper::getText() |
4541 | { |
4542 | GooString *text; |
4543 | int i; |
4544 | const UnicodeMap *uMap; |
4545 | char space[8], eol[16]; |
4546 | int spaceLen, eolLen; |
4547 | |
4548 | text = new GooString(); |
4549 | |
4550 | if (!(uMap = globalParams->getTextEncoding())) { |
4551 | return text; |
4552 | } |
4553 | |
4554 | spaceLen = uMap->mapUnicode(u: 0x20, buf: space, bufSize: sizeof(space)); |
4555 | eolLen = uMap->mapUnicode(u: 0x0a, buf: eol, bufSize: sizeof(eol)); |
4556 | |
4557 | std::vector<Unicode> uText; |
4558 | for (i = 0; i < nLines; i++) { |
4559 | std::vector<TextWordSelection *> *lineWords = lines[i]; |
4560 | for (std::size_t j = 0; j < lineWords->size(); j++) { |
4561 | TextWordSelection *sel = (*lineWords)[j]; |
4562 | |
4563 | uText.resize(new_size: sel->end - sel->begin); |
4564 | std::transform(first: sel->word->chars.begin() + sel->begin, last: sel->word->chars.begin() + sel->end, result: uText.begin(), unary_op: [](auto &c) { return c.text; }); |
4565 | page->dumpFragment(text: uText.data(), len: uText.size(), uMap, s: text); |
4566 | |
4567 | if (j < lineWords->size() - 1 && sel->word->spaceAfter) { |
4568 | text->append(str: space, lengthA: spaceLen); |
4569 | } |
4570 | } |
4571 | if (i < nLines - 1) { |
4572 | text->append(str: eol, lengthA: eolLen); |
4573 | } |
4574 | } |
4575 | |
4576 | return text; |
4577 | } |
4578 | |
4579 | std::vector<TextWordSelection *> **TextSelectionDumper::takeWordList(int *nLinesOut) |
4580 | { |
4581 | std::vector<TextWordSelection *> **returnValue = lines; |
4582 | |
4583 | *nLinesOut = nLines; |
4584 | if (nLines == 0) { |
4585 | return nullptr; |
4586 | } |
4587 | |
4588 | nLines = 0; |
4589 | lines = nullptr; |
4590 | |
4591 | return returnValue; |
4592 | } |
4593 | |
4594 | class TextSelectionSizer : public TextSelectionVisitor |
4595 | { |
4596 | public: |
4597 | TextSelectionSizer(TextPage *page, double scale); |
4598 | ~TextSelectionSizer() override { delete list; } |
4599 | |
4600 | void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) override {}; |
4601 | void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) override; |
4602 | void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) override {}; |
4603 | |
4604 | std::vector<PDFRectangle *> *takeRegion() |
4605 | { |
4606 | auto aux = list; |
4607 | list = nullptr; |
4608 | return aux; |
4609 | } |
4610 | |
4611 | private: |
4612 | std::vector<PDFRectangle *> *list; |
4613 | double scale; |
4614 | }; |
4615 | |
4616 | TextSelectionSizer::TextSelectionSizer(TextPage *p, double s) : TextSelectionVisitor(p), scale(s) |
4617 | { |
4618 | list = new std::vector<PDFRectangle *>(); |
4619 | } |
4620 | |
4621 | void TextSelectionSizer::visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) |
4622 | { |
4623 | PDFRectangle *rect; |
4624 | double x1, y1, x2, y2, margin; |
4625 | |
4626 | switch (line->rot) { |
4627 | default: |
4628 | case 0: |
4629 | margin = (line->yMax - line->yMin) / 8; |
4630 | x1 = line->edge[edge_begin]; |
4631 | x2 = line->edge[edge_end]; |
4632 | y1 = line->yMin - margin; |
4633 | y2 = line->yMax + margin; |
4634 | break; |
4635 | case 1: |
4636 | margin = (line->xMax - line->xMin) / 8; |
4637 | x1 = line->xMin - margin; |
4638 | x2 = line->xMax + margin; |
4639 | y1 = line->edge[edge_begin]; |
4640 | y2 = line->edge[edge_end]; |
4641 | break; |
4642 | case 2: |
4643 | margin = (line->yMax - line->yMin) / 8; |
4644 | x1 = line->edge[edge_end]; |
4645 | x2 = line->edge[edge_begin]; |
4646 | y1 = line->yMin - margin; |
4647 | y2 = line->yMax + margin; |
4648 | break; |
4649 | case 3: |
4650 | margin = (line->xMax - line->xMin) / 8; |
4651 | x1 = line->xMin - margin; |
4652 | x2 = line->xMax + margin; |
4653 | y1 = line->edge[edge_end]; |
4654 | y2 = line->edge[edge_begin]; |
4655 | break; |
4656 | } |
4657 | |
4658 | rect = new PDFRectangle(floor(x: x1 * scale), floor(x: y1 * scale), ceil(x: x2 * scale), ceil(x: y2 * scale)); |
4659 | list->push_back(x: rect); |
4660 | } |
4661 | |
4662 | class TextSelectionPainter : public TextSelectionVisitor |
4663 | { |
4664 | public: |
4665 | TextSelectionPainter(TextPage *page, double scale, int rotation, OutputDev *out, const GfxColor *box_color, const GfxColor *glyph_color); |
4666 | ~TextSelectionPainter() override; |
4667 | |
4668 | void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) override {}; |
4669 | void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) override; |
4670 | void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) override; |
4671 | void endPage(); |
4672 | |
4673 | private: |
4674 | OutputDev *out; |
4675 | const GfxColor *glyph_color; |
4676 | GfxState *state; |
4677 | std::vector<TextWordSelection *> *selectionList; |
4678 | Matrix ctm, ictm; |
4679 | bool hasGlyphLessFont(); |
4680 | }; |
4681 | |
4682 | TextSelectionPainter::TextSelectionPainter(TextPage *p, double scale, int rotation, OutputDev *outA, const GfxColor *box_color, const GfxColor *glyph_colorA) : TextSelectionVisitor(p), out(outA), glyph_color(glyph_colorA) |
4683 | { |
4684 | PDFRectangle box(0, 0, p->pageWidth, p->pageHeight); |
4685 | |
4686 | selectionList = new std::vector<TextWordSelection *>(); |
4687 | state = new GfxState(72 * scale, 72 * scale, &box, rotation, false); |
4688 | |
4689 | state->getCTM(m: &ctm); |
4690 | ctm.invertTo(other: &ictm); |
4691 | |
4692 | out->startPage(pageNum: 0, state, xref: nullptr); |
4693 | out->setDefaultCTM(state->getCTM()); |
4694 | |
4695 | state->setFillColorSpace(new GfxDeviceRGBColorSpace()); |
4696 | state->setFillColor(box_color); |
4697 | out->updateFillColor(state); |
4698 | } |
4699 | |
4700 | TextSelectionPainter::~TextSelectionPainter() |
4701 | { |
4702 | for (auto entry : *selectionList) { |
4703 | delete entry; |
4704 | } |
4705 | delete selectionList; |
4706 | delete state; |
4707 | } |
4708 | |
4709 | void TextSelectionPainter::visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) |
4710 | { |
4711 | double x1, y1, x2, y2, margin; |
4712 | |
4713 | switch (line->rot) { |
4714 | default: |
4715 | case 0: |
4716 | margin = (line->yMax - line->yMin) / 8; |
4717 | x1 = line->edge[edge_begin]; |
4718 | x2 = line->edge[edge_end]; |
4719 | y1 = line->yMin - margin; |
4720 | y2 = line->yMax + margin; |
4721 | break; |
4722 | case 1: |
4723 | margin = (line->xMax - line->xMin) / 8; |
4724 | x1 = line->xMin - margin; |
4725 | x2 = line->xMax + margin; |
4726 | y1 = line->edge[edge_begin]; |
4727 | y2 = line->edge[edge_end]; |
4728 | break; |
4729 | case 2: |
4730 | margin = (line->yMax - line->yMin) / 8; |
4731 | x1 = line->edge[edge_end]; |
4732 | x2 = line->edge[edge_begin]; |
4733 | y1 = line->yMin - margin; |
4734 | y2 = line->yMax + margin; |
4735 | break; |
4736 | case 3: |
4737 | margin = (line->xMax - line->xMin) / 8; |
4738 | x1 = line->xMin - margin; |
4739 | x2 = line->xMax + margin; |
4740 | y1 = line->edge[edge_end]; |
4741 | y2 = line->edge[edge_begin]; |
4742 | break; |
4743 | } |
4744 | |
4745 | ctm.transform(x: x1, y: y1, tx: &x1, ty: &y1); |
4746 | ctm.transform(x: x2, y: y2, tx: &x2, ty: &y2); |
4747 | |
4748 | if (x1 < x2) { |
4749 | x1 = floor(x: x1); |
4750 | x2 = ceil(x: x2); |
4751 | } else { |
4752 | x1 = ceil(x: x1); |
4753 | x2 = floor(x: x2); |
4754 | } |
4755 | |
4756 | if (y1 < y2) { |
4757 | y1 = floor(x: y1); |
4758 | y2 = ceil(x: y2); |
4759 | } else { |
4760 | y1 = ceil(x: y1); |
4761 | y2 = floor(x: y2); |
4762 | } |
4763 | |
4764 | ictm.transform(x: x1, y: y1, tx: &x1, ty: &y1); |
4765 | ictm.transform(x: x2, y: y2, tx: &x2, ty: &y2); |
4766 | |
4767 | state->moveTo(x: x1, y: y1); |
4768 | state->lineTo(x: x2, y: y1); |
4769 | state->lineTo(x: x2, y: y2); |
4770 | state->lineTo(x: x1, y: y2); |
4771 | state->closePath(); |
4772 | } |
4773 | |
4774 | void TextSelectionPainter::visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) |
4775 | { |
4776 | selectionList->push_back(x: new TextWordSelection(word, begin, end)); |
4777 | } |
4778 | |
4779 | bool TextSelectionPainter::hasGlyphLessFont() |
4780 | { |
4781 | if (selectionList && selectionList->size()) { |
4782 | TextWordSelection *sel = (*selectionList)[0]; |
4783 | return sel->word->invisible; |
4784 | } |
4785 | |
4786 | return false; |
4787 | } |
4788 | |
4789 | void TextSelectionPainter::endPage() |
4790 | { |
4791 | /* Take a shortcut for glyphless fonts (eg. Tesseract scanned documents) |
4792 | * cause we just paint a transparent fill over existent text.Issue #157 */ |
4793 | if (hasGlyphLessFont()) { |
4794 | state->setFillOpacity(glyphlessSelectionOpacity); |
4795 | out->updateFillOpacity(state); |
4796 | out->fill(state); |
4797 | out->endPage(); |
4798 | return; |
4799 | } |
4800 | |
4801 | out->fill(state); |
4802 | |
4803 | out->saveState(state); |
4804 | out->clip(state); |
4805 | |
4806 | state->clearPath(); |
4807 | |
4808 | state->setFillColor(glyph_color); |
4809 | |
4810 | out->updateFillColor(state); |
4811 | |
4812 | GooString string; |
4813 | for (const TextWordSelection *sel : *selectionList) { |
4814 | int begin = sel->begin; |
4815 | |
4816 | while (begin < sel->end) { |
4817 | TextFontInfo *font = sel->word->chars[begin].font; |
4818 | const Matrix *mat = &sel->word->chars[begin].textMat; |
4819 | |
4820 | state->setTextMat(a: mat->m[0], b: mat->m[1], c: mat->m[2], d: mat->m[3], e: 0, f: 0); |
4821 | state->setFont(fontA: font->gfxFont, fontSizeA: 1); |
4822 | out->updateFont(state); |
4823 | |
4824 | int fEnd = begin + 1; |
4825 | while (fEnd < sel->end && font->matches(fontInfo: sel->word->chars[fEnd].font) // |
4826 | && mat->m[0] == sel->word->chars[fEnd].textMat.m[0] && mat->m[1] == sel->word->chars[fEnd].textMat.m[1] // |
4827 | && mat->m[2] == sel->word->chars[fEnd].textMat.m[2] && mat->m[3] == sel->word->chars[fEnd].textMat.m[3]) { |
4828 | fEnd++; |
4829 | } |
4830 | |
4831 | /* The only purpose of this string is to let the output device query |
4832 | * it's length. Might want to change this interface later. */ |
4833 | string.clear(); |
4834 | std::for_each(first: sel->word->chars.begin() + begin, last: sel->word->chars.begin() + fEnd, f: [&string](const auto c) { string.append(c.charcode); }); |
4835 | out->beginString(state, &string); |
4836 | |
4837 | for (int j = begin; j < fEnd; j++) { |
4838 | const auto &charJ = sel->word->chars[j]; |
4839 | if (j != begin && charJ.charPos == sel->word->chars[j - 1].charPos) { |
4840 | continue; |
4841 | } |
4842 | out->drawChar(state, charJ.textMat.m[4], charJ.textMat.m[5], 0, 0, 0, 0, charJ.charcode, 1, nullptr, 0); |
4843 | } |
4844 | out->endString(state); |
4845 | begin = fEnd; |
4846 | } |
4847 | } |
4848 | |
4849 | out->restoreState(state); |
4850 | out->endPage(); |
4851 | } |
4852 | |
4853 | void TextWord::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style) |
4854 | { |
4855 | double mid, s1, s2; |
4856 | |
4857 | if (rot == 0 || rot == 2) { |
4858 | s1 = selection->x1; |
4859 | s2 = selection->x2; |
4860 | } else { |
4861 | s1 = selection->y1; |
4862 | s2 = selection->y2; |
4863 | } |
4864 | |
4865 | size_t begin = len(); |
4866 | size_t end = 0; |
4867 | for (size_t i = 0; i < len(); i++) { |
4868 | if (i + 1 < len()) { |
4869 | mid = (chars[i].edge + chars[i + 1].edge) / 2; |
4870 | } else { |
4871 | mid = (chars[i].edge + edgeEnd) / 2; |
4872 | } |
4873 | if (XBetweenAB(mid, s1, s2)) { |
4874 | if (i < begin) { |
4875 | begin = i; |
4876 | } |
4877 | |
4878 | end = i + 1; |
4879 | } |
4880 | } |
4881 | |
4882 | /* Skip empty selection. */ |
4883 | if (end <= begin) { |
4884 | return; |
4885 | } |
4886 | |
4887 | visitor->visitWord(word: this, begin, end, selection); |
4888 | } |
4889 | |
4890 | void TextLine::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style) |
4891 | { |
4892 | TextWord *p, *begin, *end, *current; |
4893 | int i, edge_begin, edge_end; |
4894 | PDFRectangle child_selection; |
4895 | double s1, s2, pMin, pMax; |
4896 | |
4897 | if (rot == 0 || rot == 2) { |
4898 | s1 = selection->x1; |
4899 | s2 = selection->x2; |
4900 | } else { |
4901 | s1 = selection->y1; |
4902 | s2 = selection->y2; |
4903 | } |
4904 | |
4905 | begin = nullptr; |
4906 | end = nullptr; |
4907 | current = nullptr; |
4908 | for (p = words; p != nullptr; p = p->next) { |
4909 | if (rot == 0 || rot == 2) { |
4910 | pMin = p->xMin; |
4911 | pMax = p->xMax; |
4912 | } else { |
4913 | pMin = p->yMin; |
4914 | pMax = p->yMax; |
4915 | } |
4916 | |
4917 | if (blk->page->primaryLR) { |
4918 | if (((s1 < pMax) || (s2 < pMax)) && begin == nullptr) { |
4919 | begin = p; |
4920 | } |
4921 | |
4922 | if (((s1 > pMin) || (s2 > pMin)) && begin != nullptr) { |
4923 | end = p->next; |
4924 | current = p; |
4925 | } |
4926 | } else { |
4927 | if (((s1 > pMin) || (s2 > pMin)) && begin == nullptr) { |
4928 | begin = p; |
4929 | } |
4930 | |
4931 | if (((s1 < pMax) || (s2 < pMax)) && begin != nullptr) { |
4932 | end = p->next; |
4933 | current = p; |
4934 | } |
4935 | } |
4936 | } |
4937 | |
4938 | if (!current) { |
4939 | current = begin; |
4940 | } |
4941 | |
4942 | child_selection = *selection; |
4943 | if (style == selectionStyleWord) { |
4944 | if (rot == 0 || rot == 2) { |
4945 | child_selection.x1 = begin ? begin->xMin : xMin; |
4946 | if (end && end->xMax != -1) { |
4947 | child_selection.x2 = current->xMax; |
4948 | } else { |
4949 | child_selection.x2 = xMax; |
4950 | } |
4951 | } else { |
4952 | child_selection.y1 = begin ? begin->yMin : yMin; |
4953 | if (end && end->yMax != -1) { |
4954 | child_selection.y2 = current->yMax; |
4955 | } else { |
4956 | child_selection.y2 = yMax; |
4957 | } |
4958 | } |
4959 | } |
4960 | |
4961 | if (rot == 0 || rot == 2) { |
4962 | s1 = child_selection.x1; |
4963 | s2 = child_selection.x2; |
4964 | } else { |
4965 | s1 = child_selection.y1; |
4966 | s2 = child_selection.y2; |
4967 | } |
4968 | |
4969 | edge_begin = len; |
4970 | edge_end = 0; |
4971 | for (i = 0; i < len; i++) { |
4972 | double mid = (edge[i] + edge[i + 1]) / 2; |
4973 | if (XBetweenAB(mid, s1, s2)) { |
4974 | if (i < edge_begin) { |
4975 | edge_begin = i; |
4976 | } |
4977 | |
4978 | edge_end = i + 1; |
4979 | } |
4980 | } |
4981 | |
4982 | /* Skip empty selection. */ |
4983 | if (edge_end <= edge_begin) { |
4984 | return; |
4985 | } |
4986 | |
4987 | visitor->visitLine(line: this, begin, end, edge_begin, edge_end, selection: &child_selection); |
4988 | |
4989 | for (p = begin; p != end; p = p->next) { |
4990 | p->visitSelection(visitor, selection: &child_selection, style); |
4991 | } |
4992 | } |
4993 | |
4994 | void TextBlock::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style) |
4995 | { |
4996 | PDFRectangle child_selection; |
4997 | double x[2], y[2], d, best_d[2]; |
4998 | TextLine *p, *best_line[2]; |
4999 | int i, count = 0, best_count[2], start, stop; |
5000 | bool all[2]; |
5001 | |
5002 | x[0] = selection->x1; |
5003 | y[0] = selection->y1; |
5004 | x[1] = selection->x2; |
5005 | y[1] = selection->y2; |
5006 | |
5007 | for (i = 0; i < 2; i++) { |
5008 | // the first/last lines are often not nearest |
5009 | // the corners, so we have to force them to be |
5010 | // selected when the selection runs outside this |
5011 | // block. |
5012 | if (page->primaryLR) { |
5013 | all[i] = x[i] >= this->xMax && y[i] >= this->yMax; |
5014 | if (x[i] <= this->xMin && y[i] <= this->yMin) { |
5015 | best_line[i] = this->lines; |
5016 | best_count[i] = 1; |
5017 | } else { |
5018 | best_line[i] = nullptr; |
5019 | best_count[i] = 0; |
5020 | } |
5021 | } else { |
5022 | all[i] = x[i] <= this->xMin && y[i] >= this->yMax; |
5023 | if (x[i] >= this->xMax && y[i] <= this->yMin) { |
5024 | best_line[i] = this->lines; |
5025 | best_count[i] = 1; |
5026 | } else { |
5027 | best_line[i] = nullptr; |
5028 | best_count[i] = 0; |
5029 | } |
5030 | } |
5031 | best_d[i] = 0; |
5032 | } |
5033 | |
5034 | // find the nearest line to the selection points |
5035 | // using the manhattan distance. |
5036 | for (p = this->lines; p; p = p->next) { |
5037 | count++; |
5038 | for (i = 0; i < 2; i++) { |
5039 | d = fmax(x: p->xMin - x[i], y: 0.0) + fmax(x: x[i] - p->xMax, y: 0.0) + fmax(x: p->yMin - y[i], y: 0.0) + fmax(x: y[i] - p->yMax, y: 0.0); |
5040 | if (!best_line[i] || all[i] || d < best_d[i]) { |
5041 | best_line[i] = p; |
5042 | best_count[i] = count; |
5043 | best_d[i] = d; |
5044 | } |
5045 | } |
5046 | } |
5047 | // assert: best is always set. |
5048 | if (!best_line[0] || !best_line[1]) { |
5049 | return; |
5050 | } |
5051 | |
5052 | // Now decide which point was first. |
5053 | if (best_count[0] < best_count[1] || (best_count[0] == best_count[1] && y[0] < y[1])) { |
5054 | start = 0; |
5055 | stop = 1; |
5056 | } else { |
5057 | start = 1; |
5058 | stop = 0; |
5059 | } |
5060 | |
5061 | visitor->visitBlock(block: this, begin: best_line[start], end: best_line[stop], selection); |
5062 | |
5063 | for (p = best_line[start]; p; p = p->next) { |
5064 | if (page->primaryLR) { |
5065 | child_selection.x1 = p->xMin; |
5066 | child_selection.x2 = p->xMax; |
5067 | } else { |
5068 | child_selection.x1 = p->xMax; |
5069 | child_selection.x2 = p->xMin; |
5070 | } |
5071 | child_selection.y1 = p->yMin; |
5072 | child_selection.y2 = p->yMax; |
5073 | if (style == selectionStyleLine) { |
5074 | if (p == best_line[start]) { |
5075 | child_selection.x1 = 0; |
5076 | child_selection.y1 = 0; |
5077 | } |
5078 | if (p == best_line[stop]) { |
5079 | child_selection.x2 = page->pageWidth; |
5080 | child_selection.y2 = page->pageHeight; |
5081 | } |
5082 | } else { |
5083 | if (p == best_line[start]) { |
5084 | child_selection.x1 = fmax(x: p->xMin, y: fmin(x: p->xMax, y: x[start])); |
5085 | child_selection.y1 = fmax(x: p->yMin, y: fmin(x: p->yMax, y: y[start])); |
5086 | } |
5087 | if (p == best_line[stop]) { |
5088 | child_selection.x2 = fmax(x: p->xMin, y: fmin(x: p->xMax, y: x[stop])); |
5089 | child_selection.y2 = fmax(x: p->yMin, y: fmin(x: p->yMax, y: y[stop])); |
5090 | } |
5091 | } |
5092 | p->visitSelection(visitor, selection: &child_selection, style); |
5093 | if (p == best_line[stop]) { |
5094 | return; |
5095 | } |
5096 | } |
5097 | } |
5098 | |
5099 | void TextPage::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style) |
5100 | { |
5101 | PDFRectangle child_selection; |
5102 | double x[2], y[2], d, best_d[2]; |
5103 | double xMin, yMin, xMax, yMax; |
5104 | TextFlow *flow, *best_flow[2]; |
5105 | TextBlock *blk, *best_block[2]; |
5106 | int i, count = 0, best_count[2], start, stop; |
5107 | |
5108 | if (!flows) { |
5109 | return; |
5110 | } |
5111 | |
5112 | x[0] = selection->x1; |
5113 | y[0] = selection->y1; |
5114 | x[1] = selection->x2; |
5115 | y[1] = selection->y2; |
5116 | |
5117 | xMin = pageWidth; |
5118 | yMin = pageHeight; |
5119 | xMax = 0.0; |
5120 | yMax = 0.0; |
5121 | |
5122 | for (i = 0; i < 2; i++) { |
5123 | best_block[i] = nullptr; |
5124 | best_flow[i] = nullptr; |
5125 | best_count[i] = 0; |
5126 | best_d[i] = 0; |
5127 | } |
5128 | |
5129 | // find the nearest blocks to the selection points |
5130 | // using the manhattan distance. |
5131 | for (flow = flows; flow; flow = flow->next) { |
5132 | for (blk = flow->blocks; blk; blk = blk->next) { |
5133 | count++; |
5134 | // the first/last blocks in reading order are |
5135 | // often not the closest to the page corners; |
5136 | // track the corners, force those blocks to |
5137 | // be selected if the selection runs across |
5138 | // multiple pages. |
5139 | xMin = fmin(x: xMin, y: blk->xMin); |
5140 | yMin = fmin(x: yMin, y: blk->yMin); |
5141 | xMax = fmax(x: xMax, y: blk->xMax); |
5142 | yMax = fmax(x: yMax, y: blk->yMax); |
5143 | for (i = 0; i < 2; i++) { |
5144 | d = fmax(x: blk->xMin - x[i], y: 0.0) + fmax(x: x[i] - blk->xMax, y: 0.0) + fmax(x: blk->yMin - y[i], y: 0.0) + fmax(x: y[i] - blk->yMax, y: 0.0); |
5145 | if (!best_block[i] || d < best_d[i] || (!blk->next && !flow->next && x[i] >= fmin(x: xMax, y: pageWidth) && y[i] >= fmin(x: yMax, y: pageHeight))) { |
5146 | best_block[i] = blk; |
5147 | best_flow[i] = flow; |
5148 | best_count[i] = count; |
5149 | best_d[i] = d; |
5150 | } |
5151 | } |
5152 | } |
5153 | } |
5154 | for (i = 0; i < 2; i++) { |
5155 | if (primaryLR) { |
5156 | if (x[i] < xMin && y[i] < yMin) { |
5157 | best_block[i] = flows->blocks; |
5158 | best_flow[i] = flows; |
5159 | best_count[i] = 1; |
5160 | } |
5161 | } else { |
5162 | if (x[i] > xMax && y[i] < yMin) { |
5163 | best_block[i] = flows->blocks; |
5164 | best_flow[i] = flows; |
5165 | best_count[i] = 1; |
5166 | } |
5167 | } |
5168 | } |
5169 | // assert: best is always set. |
5170 | if (!best_block[0] || !best_block[1]) { |
5171 | return; |
5172 | } |
5173 | |
5174 | // Now decide which point was first. |
5175 | if (best_count[0] < best_count[1] || (best_count[0] == best_count[1] && y[0] < y[1])) { |
5176 | start = 0; |
5177 | stop = 1; |
5178 | } else { |
5179 | start = 1; |
5180 | stop = 0; |
5181 | } |
5182 | |
5183 | for (flow = best_flow[start]; flow; flow = flow->next) { |
5184 | if (flow == best_flow[start]) { |
5185 | blk = best_block[start]; |
5186 | } else { |
5187 | blk = flow->blocks; |
5188 | } |
5189 | for (; blk; blk = blk->next) { |
5190 | if (primaryLR) { |
5191 | child_selection.x1 = blk->xMin; |
5192 | child_selection.x2 = blk->xMax; |
5193 | } else { |
5194 | child_selection.x1 = blk->xMax; |
5195 | child_selection.x2 = blk->xMin; |
5196 | } |
5197 | child_selection.y1 = blk->yMin; |
5198 | child_selection.y2 = blk->yMax; |
5199 | if (blk == best_block[start]) { |
5200 | child_selection.x1 = fmax(x: blk->xMin, y: fmin(x: blk->xMax, y: x[start])); |
5201 | child_selection.y1 = fmax(x: blk->yMin, y: fmin(x: blk->yMax, y: y[start])); |
5202 | } |
5203 | if (blk == best_block[stop]) { |
5204 | child_selection.x2 = fmax(x: blk->xMin, y: fmin(x: blk->xMax, y: x[stop])); |
5205 | child_selection.y2 = fmax(x: blk->yMin, y: fmin(x: blk->yMax, y: y[stop])); |
5206 | blk->visitSelection(visitor, selection: &child_selection, style); |
5207 | return; |
5208 | } |
5209 | blk->visitSelection(visitor, selection: &child_selection, style); |
5210 | } |
5211 | } |
5212 | } |
5213 | |
5214 | void TextPage::drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color) |
5215 | { |
5216 | TextSelectionPainter painter(this, scale, rotation, out, box_color, glyph_color); |
5217 | |
5218 | visitSelection(visitor: &painter, selection, style); |
5219 | painter.endPage(); |
5220 | } |
5221 | |
5222 | std::vector<PDFRectangle *> *TextPage::getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale) |
5223 | { |
5224 | TextSelectionSizer sizer(this, scale); |
5225 | |
5226 | visitSelection(visitor: &sizer, selection, style); |
5227 | |
5228 | return sizer.takeRegion(); |
5229 | } |
5230 | |
5231 | GooString *TextPage::getSelectionText(const PDFRectangle *selection, SelectionStyle style) |
5232 | { |
5233 | TextSelectionDumper dumper(this); |
5234 | |
5235 | visitSelection(visitor: &dumper, selection, style); |
5236 | dumper.endPage(); |
5237 | |
5238 | return dumper.getText(); |
5239 | } |
5240 | |
5241 | std::vector<TextWordSelection *> **TextPage::getSelectionWords(const PDFRectangle *selection, SelectionStyle style, int *nLines) |
5242 | { |
5243 | TextSelectionDumper dumper(this); |
5244 | |
5245 | visitSelection(visitor: &dumper, selection, style); |
5246 | dumper.endPage(); |
5247 | |
5248 | return dumper.takeWordList(nLinesOut: nLines); |
5249 | } |
5250 | |
5251 | bool TextPage::findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const |
5252 | { |
5253 | TextBlock *blk; |
5254 | TextLine *line; |
5255 | TextWord *word; |
5256 | double xMin0, xMax0, yMin0, yMax0; |
5257 | double xMin1, xMax1, yMin1, yMax1; |
5258 | bool first; |
5259 | |
5260 | if (rawOrder) { |
5261 | return false; |
5262 | } |
5263 | |
5264 | //~ this doesn't correctly handle ranges split across multiple lines |
5265 | //~ (the highlighted region is the bounding box of all the parts of |
5266 | //~ the range) |
5267 | first = true; |
5268 | xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy |
5269 | xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy |
5270 | for (int i = 0; i < nBlocks; ++i) { |
5271 | blk = blocks[i]; |
5272 | for (line = blk->lines; line; line = line->next) { |
5273 | for (word = line->words; word; word = word->next) { |
5274 | if (pos < word->charPosEnd && pos + length > word->chars.front().charPos) { |
5275 | size_t j0, j1; |
5276 | for (j0 = 0; (j0 + 1) < word->len() && pos >= word->chars[j0 + 1].charPos; ++j0) { |
5277 | ; |
5278 | } |
5279 | for (j1 = word->len(); j1 > j0 && pos + length <= word->chars[j1].charPos; --j1) { |
5280 | ; |
5281 | } |
5282 | auto startingEdge = word->chars[j0].edge; |
5283 | auto endingEdge = (j1 + 1 == word->len()) ? word->edgeEnd : word->chars[j1 + 1].edge; |
5284 | switch (line->rot) { |
5285 | case 0: |
5286 | xMin1 = startingEdge; |
5287 | xMax1 = endingEdge; |
5288 | yMin1 = word->yMin; |
5289 | yMax1 = word->yMax; |
5290 | break; |
5291 | case 1: |
5292 | xMin1 = word->xMin; |
5293 | xMax1 = word->xMax; |
5294 | yMin1 = startingEdge; |
5295 | yMax1 = endingEdge; |
5296 | break; |
5297 | case 2: |
5298 | xMin1 = endingEdge; |
5299 | xMax1 = startingEdge; |
5300 | yMin1 = word->yMin; |
5301 | yMax1 = word->yMax; |
5302 | break; |
5303 | case 3: |
5304 | xMin1 = word->xMin; |
5305 | xMax1 = word->xMax; |
5306 | yMin1 = endingEdge; |
5307 | yMax1 = startingEdge; |
5308 | break; |
5309 | } |
5310 | if (first || xMin1 < xMin0) { |
5311 | xMin0 = xMin1; |
5312 | } |
5313 | if (first || xMax1 > xMax0) { |
5314 | xMax0 = xMax1; |
5315 | } |
5316 | if (first || yMin1 < yMin0) { |
5317 | yMin0 = yMin1; |
5318 | } |
5319 | if (first || yMax1 > yMax0) { |
5320 | yMax0 = yMax1; |
5321 | } |
5322 | first = false; |
5323 | } |
5324 | } |
5325 | } |
5326 | } |
5327 | if (!first) { |
5328 | *xMin = xMin0; |
5329 | *xMax = xMax0; |
5330 | *yMin = yMin0; |
5331 | *yMax = yMax0; |
5332 | return true; |
5333 | } |
5334 | return false; |
5335 | } |
5336 | |
5337 | void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, bool physLayout, EndOfLineKind textEOL, bool pageBreaks) |
5338 | { |
5339 | const UnicodeMap *uMap; |
5340 | TextFlow *flow; |
5341 | TextBlock *blk; |
5342 | TextLine *line; |
5343 | TextLineFrag *frags; |
5344 | TextWord *word; |
5345 | int nFrags, fragsSize; |
5346 | TextLineFrag *frag; |
5347 | char space[8], eol[16], eop[8]; |
5348 | int spaceLen, eolLen, eopLen; |
5349 | double delta; |
5350 | int col, i, j, d, n; |
5351 | |
5352 | // get the output encoding |
5353 | if (!(uMap = globalParams->getTextEncoding())) { |
5354 | return; |
5355 | } |
5356 | spaceLen = uMap->mapUnicode(u: 0x20, buf: space, bufSize: sizeof(space)); |
5357 | eolLen = 0; // make gcc happy |
5358 | switch (textEOL) { |
5359 | case eolUnix: |
5360 | eolLen = uMap->mapUnicode(u: 0x0a, buf: eol, bufSize: sizeof(eol)); |
5361 | break; |
5362 | case eolDOS: |
5363 | eolLen = uMap->mapUnicode(u: 0x0d, buf: eol, bufSize: sizeof(eol)); |
5364 | eolLen += uMap->mapUnicode(u: 0x0a, buf: eol + eolLen, bufSize: sizeof(eol) - eolLen); |
5365 | break; |
5366 | case eolMac: |
5367 | eolLen = uMap->mapUnicode(u: 0x0d, buf: eol, bufSize: sizeof(eol)); |
5368 | break; |
5369 | } |
5370 | eopLen = uMap->mapUnicode(u: 0x0c, buf: eop, bufSize: sizeof(eop)); |
5371 | |
5372 | //~ writing mode (horiz/vert) |
5373 | |
5374 | // output the page in raw (content stream) order |
5375 | if (rawOrder) { |
5376 | |
5377 | GooString s; |
5378 | std::vector<Unicode> uText; |
5379 | |
5380 | for (word = rawWords; word; word = word->next) { |
5381 | s.clear(); |
5382 | uText.resize(new_size: word->len()); |
5383 | std::transform(first: word->chars.begin(), last: word->chars.end(), result: uText.begin(), unary_op: [](auto &c) { return c.text; }); |
5384 | dumpFragment(text: uText.data(), len: uText.size(), uMap, s: &s); |
5385 | (*outputFunc)(outputStream, s.c_str(), s.getLength()); |
5386 | |
5387 | if (word->next && fabs(x: word->next->base - word->base) < maxIntraLineDelta * word->fontSize && word->next->xMin > word->xMax - minDupBreakOverlap * word->fontSize) { |
5388 | if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) { |
5389 | (*outputFunc)(outputStream, space, spaceLen); |
5390 | } |
5391 | } else { |
5392 | (*outputFunc)(outputStream, eol, eolLen); |
5393 | } |
5394 | } |
5395 | |
5396 | // output the page, maintaining the original physical layout |
5397 | } else if (physLayout) { |
5398 | |
5399 | // collect the line fragments for the page and sort them |
5400 | fragsSize = 256; |
5401 | frags = (TextLineFrag *)gmallocn(count: fragsSize, size: sizeof(TextLineFrag)); |
5402 | nFrags = 0; |
5403 | for (i = 0; i < nBlocks; ++i) { |
5404 | blk = blocks[i]; |
5405 | for (line = blk->lines; line; line = line->next) { |
5406 | if (nFrags == fragsSize) { |
5407 | fragsSize *= 2; |
5408 | frags = (TextLineFrag *)greallocn(p: frags, count: fragsSize, size: sizeof(TextLineFrag)); |
5409 | } |
5410 | frags[nFrags].init(lineA: line, startA: 0, lenA: line->len); |
5411 | frags[nFrags].computeCoords(oneRot: true); |
5412 | ++nFrags; |
5413 | } |
5414 | } |
5415 | qsort(base: frags, nmemb: nFrags, size: sizeof(TextLineFrag), compar: &TextLineFrag::cmpYXPrimaryRot); |
5416 | i = 0; |
5417 | while (i < nFrags) { |
5418 | delta = maxIntraLineDelta * frags[i].line->words->fontSize; |
5419 | for (j = i + 1; j < nFrags && fabs(x: frags[j].base - frags[i].base) < delta; ++j) { |
5420 | ; |
5421 | } |
5422 | qsort(base: frags + i, nmemb: j - i, size: sizeof(TextLineFrag), compar: &TextLineFrag::cmpXYColumnPrimaryRot); |
5423 | i = j; |
5424 | } |
5425 | |
5426 | #if 0 // for debugging |
5427 | printf("*** line fragments ***\n" ); |
5428 | for (i = 0; i < nFrags; ++i) { |
5429 | frag = &frags[i]; |
5430 | printf("frag: x=%.2f..%.2f y=%.2f..%.2f base=%.2f '" , |
5431 | frag->xMin, frag->xMax, frag->yMin, frag->yMax, frag->base); |
5432 | for (n = 0; n < frag->len; ++n) { |
5433 | fputc(frag->line->text[frag->start + n] & 0xff, stdout); |
5434 | } |
5435 | printf("'\n" ); |
5436 | } |
5437 | printf("\n" ); |
5438 | #endif |
5439 | |
5440 | GooString s; |
5441 | // generate output |
5442 | col = 0; |
5443 | for (i = 0; i < nFrags; ++i) { |
5444 | frag = &frags[i]; |
5445 | |
5446 | // column alignment |
5447 | for (; col < frag->col; ++col) { |
5448 | (*outputFunc)(outputStream, space, spaceLen); |
5449 | } |
5450 | |
5451 | // print the line |
5452 | s.clear(); |
5453 | col += dumpFragment(text: frag->line->text + frag->start, len: frag->len, uMap, s: &s); |
5454 | (*outputFunc)(outputStream, s.c_str(), s.getLength()); |
5455 | |
5456 | // print one or more returns if necessary |
5457 | if (i == nFrags - 1 || frags[i + 1].col < col || fabs(x: frags[i + 1].base - frag->base) > maxIntraLineDelta * frag->line->words->fontSize) { |
5458 | if (i < nFrags - 1) { |
5459 | d = (int)((frags[i + 1].base - frag->base) / frag->line->words->fontSize); |
5460 | if (d < 1) { |
5461 | d = 1; |
5462 | } else if (d > 5) { |
5463 | d = 5; |
5464 | } |
5465 | } else { |
5466 | d = 1; |
5467 | } |
5468 | for (; d > 0; --d) { |
5469 | (*outputFunc)(outputStream, eol, eolLen); |
5470 | } |
5471 | col = 0; |
5472 | } |
5473 | } |
5474 | |
5475 | gfree(p: frags); |
5476 | |
5477 | // output the page, "undoing" the layout |
5478 | } else { |
5479 | for (flow = flows; flow; flow = flow->next) { |
5480 | for (blk = flow->blocks; blk; blk = blk->next) { |
5481 | for (line = blk->lines; line; line = line->next) { |
5482 | n = line->len; |
5483 | if (line->hyphenated && (line->next || blk->next)) { |
5484 | --n; |
5485 | } |
5486 | GooString s; |
5487 | dumpFragment(text: line->text, len: n, uMap, s: &s); |
5488 | (*outputFunc)(outputStream, s.c_str(), s.getLength()); |
5489 | // output a newline when a hyphen is not suppressed |
5490 | if (n == line->len) { |
5491 | (*outputFunc)(outputStream, eol, eolLen); |
5492 | } |
5493 | } |
5494 | } |
5495 | (*outputFunc)(outputStream, eol, eolLen); |
5496 | } |
5497 | } |
5498 | |
5499 | // end of page |
5500 | if (pageBreaks) { |
5501 | (*outputFunc)(outputStream, eop, eopLen); |
5502 | } |
5503 | } |
5504 | |
5505 | void TextPage::setMergeCombining(bool merge) |
5506 | { |
5507 | mergeCombining = merge; |
5508 | } |
5509 | |
5510 | void TextPage::assignColumns(TextLineFrag *frags, int nFrags, bool oneRot) const |
5511 | { |
5512 | TextLineFrag *frag0, *frag1; |
5513 | int rot, col1, col2, i, j, k; |
5514 | |
5515 | // all text in the region has the same rotation -- recompute the |
5516 | // column numbers based only on the text in the region |
5517 | if (oneRot) { |
5518 | qsort(base: frags, nmemb: nFrags, size: sizeof(TextLineFrag), compar: &TextLineFrag::cmpXYLineRot); |
5519 | rot = frags[0].line->rot; |
5520 | for (i = 0; i < nFrags; ++i) { |
5521 | frag0 = &frags[i]; |
5522 | col1 = 0; |
5523 | for (j = 0; j < i; ++j) { |
5524 | frag1 = &frags[j]; |
5525 | col2 = 0; // make gcc happy |
5526 | switch (rot) { |
5527 | case 0: |
5528 | if (frag0->xMin >= frag1->xMax) { |
5529 | col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1; |
5530 | } else { |
5531 | for (k = frag1->start; k < frag1->start + frag1->len && frag0->xMin >= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k + 1]); ++k) { |
5532 | ; |
5533 | } |
5534 | col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start]; |
5535 | } |
5536 | break; |
5537 | case 1: |
5538 | if (frag0->yMin >= frag1->yMax) { |
5539 | col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1; |
5540 | } else { |
5541 | for (k = frag1->start; k < frag1->start + frag1->len && frag0->yMin >= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k + 1]); ++k) { |
5542 | ; |
5543 | } |
5544 | col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start]; |
5545 | } |
5546 | break; |
5547 | case 2: |
5548 | if (frag0->xMax <= frag1->xMin) { |
5549 | col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1; |
5550 | } else { |
5551 | for (k = frag1->start; k < frag1->start + frag1->len && frag0->xMax <= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k + 1]); ++k) { |
5552 | ; |
5553 | } |
5554 | col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start]; |
5555 | } |
5556 | break; |
5557 | case 3: |
5558 | if (frag0->yMax <= frag1->yMin) { |
5559 | col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1; |
5560 | } else { |
5561 | for (k = frag1->start; k < frag1->start + frag1->len && frag0->yMax <= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k + 1]); ++k) { |
5562 | ; |
5563 | } |
5564 | col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start]; |
5565 | } |
5566 | break; |
5567 | } |
5568 | if (col2 > col1) { |
5569 | col1 = col2; |
5570 | } |
5571 | } |
5572 | frag0->col = col1; |
5573 | } |
5574 | |
5575 | // the region includes text at different rotations -- use the |
5576 | // globally assigned column numbers, offset by the minimum column |
5577 | // number (i.e., shift everything over to column 0) |
5578 | } else { |
5579 | col1 = frags[0].col; |
5580 | for (i = 1; i < nFrags; ++i) { |
5581 | if (frags[i].col < col1) { |
5582 | col1 = frags[i].col; |
5583 | } |
5584 | } |
5585 | for (i = 0; i < nFrags; ++i) { |
5586 | frags[i].col -= col1; |
5587 | } |
5588 | } |
5589 | } |
5590 | |
5591 | int TextPage::dumpFragment(const Unicode *text, int len, const UnicodeMap *uMap, GooString *s) const |
5592 | { |
5593 | if (uMap->isUnicode()) { |
5594 | return reorderText(text, len, uMap, primaryLR, s, u: nullptr); |
5595 | } else { |
5596 | int nCols = 0; |
5597 | |
5598 | char buf[8]; |
5599 | int buflen = 0; |
5600 | |
5601 | for (int i = 0; i < len; ++i) { |
5602 | buflen = uMap->mapUnicode(u: text[i], buf, bufSize: sizeof(buf)); |
5603 | s->append(str: buf, lengthA: buflen); |
5604 | nCols += buflen; |
5605 | } |
5606 | |
5607 | return nCols; |
5608 | } |
5609 | } |
5610 | |
5611 | #ifdef TEXTOUT_WORD_LIST |
5612 | std::unique_ptr<TextWordList> TextPage::makeWordList(bool physLayout) |
5613 | { |
5614 | return std::make_unique<TextWordList>(args: this, args&: physLayout); |
5615 | } |
5616 | #endif |
5617 | |
5618 | //------------------------------------------------------------------------ |
5619 | // ActualText |
5620 | //------------------------------------------------------------------------ |
5621 | ActualText::ActualText(TextPage *out) |
5622 | { |
5623 | out->incRefCnt(); |
5624 | text = out; |
5625 | actualText = nullptr; |
5626 | actualTextNBytes = 0; |
5627 | } |
5628 | |
5629 | ActualText::~ActualText() |
5630 | { |
5631 | if (actualText) { |
5632 | delete actualText; |
5633 | } |
5634 | text->decRefCnt(); |
5635 | } |
5636 | |
5637 | void ActualText::addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen) |
5638 | { |
5639 | if (!actualText) { |
5640 | text->addChar(state, x, y, dx, dy, c, nBytes, u, uLen); |
5641 | return; |
5642 | } |
5643 | |
5644 | // Inside ActualText span. |
5645 | if (!actualTextNBytes) { |
5646 | actualTextX0 = x; |
5647 | actualTextY0 = y; |
5648 | } |
5649 | actualTextX1 = x + dx; |
5650 | actualTextY1 = y + dy; |
5651 | actualTextNBytes += nBytes; |
5652 | } |
5653 | |
5654 | void ActualText::begin(const GfxState *state, const GooString *t) |
5655 | { |
5656 | if (actualText) { |
5657 | delete actualText; |
5658 | } |
5659 | actualText = new GooString(t); |
5660 | actualTextNBytes = 0; |
5661 | } |
5662 | |
5663 | void ActualText::end(const GfxState *state) |
5664 | { |
5665 | // ActualText span closed. Output the span text and the |
5666 | // extents of all the glyphs inside the span |
5667 | |
5668 | if (actualTextNBytes) { |
5669 | // now that we have the position info for all of the text inside |
5670 | // the marked content span, we feed the "ActualText" back through |
5671 | // text->addChar() |
5672 | std::vector<Unicode> uni = TextStringToUCS4(textStr: actualText->toStr()); |
5673 | text->addChar(state, x: actualTextX0, y: actualTextY0, dx: actualTextX1 - actualTextX0, dy: actualTextY1 - actualTextY0, c: 0, nBytes: actualTextNBytes, u: uni.data(), uLen: uni.size()); |
5674 | } |
5675 | |
5676 | delete actualText; |
5677 | actualText = nullptr; |
5678 | actualTextNBytes = 0; |
5679 | } |
5680 | |
5681 | //------------------------------------------------------------------------ |
5682 | // TextOutputDev |
5683 | //------------------------------------------------------------------------ |
5684 | |
// Output callback used when TextOutputDev writes to a file: `stream` is
// the FILE * stored in outputStream, `text` the bytes to write and `len`
// their count.  Calls with a null stream, null text or a non-positive
// length are ignored.
static void TextOutputDev_outputToFile(void *stream, const char *text, int len)
{
    // fwrite() takes a size_t count, so a negative len would silently turn
    // into a huge value and read far past the end of the buffer; a null
    // stream is undefined behaviour.
    if (!stream || !text || len <= 0) {
        return;
    }
    fwrite(text, 1, static_cast<size_t>(len), static_cast<FILE *>(stream));
}
5689 | |
5690 | TextOutputDev::TextOutputDev(const char *fileName, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool append, bool discardDiagA) |
5691 | { |
5692 | text = nullptr; |
5693 | physLayout = physLayoutA; |
5694 | fixedPitch = physLayout ? fixedPitchA : 0; |
5695 | rawOrder = rawOrderA; |
5696 | discardDiag = discardDiagA; |
5697 | doHTML = false; |
5698 | textEOL = defaultEndOfLine(); |
5699 | textPageBreaks = true; |
5700 | ok = true; |
5701 | minColSpacing1 = minColSpacing1_default; |
5702 | |
5703 | // open file |
5704 | needClose = false; |
5705 | if (fileName) { |
5706 | if (!strcmp(s1: fileName, s2: "-" )) { |
5707 | outputStream = stdout; |
5708 | #if defined(_WIN32) || defined(__CYGWIN__) |
5709 | // keep DOS from munging the end-of-line characters |
5710 | _setmode(fileno(stdout), O_BINARY); |
5711 | #endif |
5712 | } else if ((outputStream = openFile(path: fileName, mode: append ? "ab" : "wb" ))) { |
5713 | needClose = true; |
5714 | } else { |
5715 | error(category: errIO, pos: -1, msg: "Couldn't open text file '{0:s}'" , fileName); |
5716 | ok = false; |
5717 | actualText = nullptr; |
5718 | return; |
5719 | } |
5720 | outputFunc = &TextOutputDev_outputToFile; |
5721 | } else { |
5722 | outputStream = nullptr; |
5723 | } |
5724 | |
5725 | // set up text object |
5726 | text = new TextPage(rawOrderA, discardDiagA); |
5727 | actualText = new ActualText(text); |
5728 | } |
5729 | |
5730 | TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool discardDiagA) |
5731 | { |
5732 | outputFunc = func; |
5733 | outputStream = stream; |
5734 | needClose = false; |
5735 | physLayout = physLayoutA; |
5736 | fixedPitch = physLayout ? fixedPitchA : 0; |
5737 | rawOrder = rawOrderA; |
5738 | discardDiag = discardDiagA; |
5739 | doHTML = false; |
5740 | text = new TextPage(rawOrderA, discardDiagA); |
5741 | actualText = new ActualText(text); |
5742 | textEOL = defaultEndOfLine(); |
5743 | textPageBreaks = true; |
5744 | ok = true; |
5745 | minColSpacing1 = minColSpacing1_default; |
5746 | } |
5747 | |
5748 | TextOutputDev::~TextOutputDev() |
5749 | { |
5750 | if (needClose) { |
5751 | fclose(stream: (FILE *)outputStream); |
5752 | } |
5753 | if (text) { |
5754 | text->decRefCnt(); |
5755 | } |
5756 | delete actualText; |
5757 | } |
5758 | |
5759 | void TextOutputDev::startPage(int pageNum, GfxState *state, XRef *xref) |
5760 | { |
5761 | text->startPage(state); |
5762 | } |
5763 | |
5764 | void TextOutputDev::endPage() |
5765 | { |
5766 | text->endPage(); |
5767 | text->coalesce(physLayout, fixedPitch, doHTML, minColSpacing1); |
5768 | if (outputStream) { |
5769 | text->dump(outputStream, outputFunc, physLayout, textEOL, pageBreaks: textPageBreaks); |
5770 | } |
5771 | } |
5772 | |
5773 | void TextOutputDev::restoreState(GfxState *state) |
5774 | { |
5775 | text->updateFont(state); |
5776 | } |
5777 | |
5778 | void TextOutputDev::updateFont(GfxState *state) |
5779 | { |
5780 | text->updateFont(state); |
5781 | } |
5782 | |
5783 | void TextOutputDev::beginString(GfxState *state, const GooString *s) { } |
5784 | |
5785 | void TextOutputDev::endString(GfxState *state) { } |
5786 | |
5787 | void TextOutputDev::drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode c, int nBytes, const Unicode *u, int uLen) |
5788 | { |
5789 | actualText->addChar(state, x, y, dx, dy, c, nBytes, u, uLen); |
5790 | } |
5791 | |
5792 | void TextOutputDev::incCharCount(int nChars) |
5793 | { |
5794 | text->incCharCount(nChars); |
5795 | } |
5796 | |
5797 | void TextOutputDev::beginActualText(GfxState *state, const GooString *t) |
5798 | { |
5799 | actualText->begin(state, t); |
5800 | } |
5801 | |
5802 | void TextOutputDev::endActualText(GfxState *state) |
5803 | { |
5804 | actualText->end(state); |
5805 | } |
5806 | |
5807 | void TextOutputDev::stroke(GfxState *state) |
5808 | { |
5809 | double x[2], y[2]; |
5810 | |
5811 | if (!doHTML) { |
5812 | return; |
5813 | } |
5814 | const GfxPath *path = state->getPath(); |
5815 | if (path->getNumSubpaths() != 1) { |
5816 | return; |
5817 | } |
5818 | const GfxSubpath *subpath = path->getSubpath(i: 0); |
5819 | if (subpath->getNumPoints() != 2) { |
5820 | return; |
5821 | } |
5822 | state->transform(x1: subpath->getX(i: 0), y1: subpath->getY(i: 0), x2: &x[0], y2: &y[0]); |
5823 | state->transform(x1: subpath->getX(i: 1), y1: subpath->getY(i: 1), x2: &x[1], y2: &y[1]); |
5824 | |
5825 | // look for a vertical or horizontal line |
5826 | if (x[0] == x[1] || y[0] == y[1]) { |
5827 | text->addUnderline(x0: x[0], y0: y[0], x1: x[1], y1: y[1]); |
5828 | } |
5829 | } |
5830 | |
5831 | void TextOutputDev::fill(GfxState *state) |
5832 | { |
5833 | double x[5], y[5]; |
5834 | double rx0, ry0, rx1, ry1, t; |
5835 | int i; |
5836 | |
5837 | if (!doHTML) { |
5838 | return; |
5839 | } |
5840 | const GfxPath *path = state->getPath(); |
5841 | if (path->getNumSubpaths() != 1) { |
5842 | return; |
5843 | } |
5844 | const GfxSubpath *subpath = path->getSubpath(i: 0); |
5845 | if (subpath->getNumPoints() != 5) { |
5846 | return; |
5847 | } |
5848 | for (i = 0; i < 5; ++i) { |
5849 | if (subpath->getCurve(i)) { |
5850 | return; |
5851 | } |
5852 | state->transform(x1: subpath->getX(i), y1: subpath->getY(i), x2: &x[i], y2: &y[i]); |
5853 | } |
5854 | |
5855 | // look for a rectangle |
5856 | if (x[0] == x[1] && y[1] == y[2] && x[2] == x[3] && y[3] == y[4] && x[0] == x[4] && y[0] == y[4]) { |
5857 | rx0 = x[0]; |
5858 | ry0 = y[0]; |
5859 | rx1 = x[2]; |
5860 | ry1 = y[1]; |
5861 | } else if (y[0] == y[1] && x[1] == x[2] && y[2] == y[3] && x[3] == x[4] && x[0] == x[4] && y[0] == y[4]) { |
5862 | rx0 = x[0]; |
5863 | ry0 = y[0]; |
5864 | rx1 = x[1]; |
5865 | ry1 = y[2]; |
5866 | } else { |
5867 | return; |
5868 | } |
5869 | if (rx1 < rx0) { |
5870 | t = rx0; |
5871 | rx0 = rx1; |
5872 | rx1 = t; |
5873 | } |
5874 | if (ry1 < ry0) { |
5875 | t = ry0; |
5876 | ry0 = ry1; |
5877 | ry1 = t; |
5878 | } |
5879 | |
5880 | // skinny horizontal rectangle |
5881 | if (ry1 - ry0 < rx1 - rx0) { |
5882 | if (ry1 - ry0 < maxUnderlineWidth) { |
5883 | ry0 = 0.5 * (ry0 + ry1); |
5884 | text->addUnderline(x0: rx0, y0: ry0, x1: rx1, y1: ry0); |
5885 | } |
5886 | |
5887 | // skinny vertical rectangle |
5888 | } else { |
5889 | if (rx1 - rx0 < maxUnderlineWidth) { |
5890 | rx0 = 0.5 * (rx0 + rx1); |
5891 | text->addUnderline(x0: rx0, y0: ry0, x1: rx0, y1: ry1); |
5892 | } |
5893 | } |
5894 | } |
5895 | |
5896 | void TextOutputDev::eoFill(GfxState *state) |
5897 | { |
5898 | if (!doHTML) { |
5899 | return; |
5900 | } |
5901 | fill(state); |
5902 | } |
5903 | |
5904 | void TextOutputDev::processLink(AnnotLink *link) |
5905 | { |
5906 | double x1, y1, x2, y2; |
5907 | int xMin, yMin, xMax, yMax, x, y; |
5908 | |
5909 | if (!doHTML) { |
5910 | return; |
5911 | } |
5912 | link->getRect(x1: &x1, y1: &y1, x2: &x2, y2: &y2); |
5913 | cvtUserToDev(ux: x1, uy: y1, dx: &x, dy: &y); |
5914 | xMin = xMax = x; |
5915 | yMin = yMax = y; |
5916 | cvtUserToDev(ux: x1, uy: y2, dx: &x, dy: &y); |
5917 | if (x < xMin) { |
5918 | xMin = x; |
5919 | } else if (x > xMax) { |
5920 | xMax = x; |
5921 | } |
5922 | if (y < yMin) { |
5923 | yMin = y; |
5924 | } else if (y > yMax) { |
5925 | yMax = y; |
5926 | } |
5927 | cvtUserToDev(ux: x2, uy: y1, dx: &x, dy: &y); |
5928 | if (x < xMin) { |
5929 | xMin = x; |
5930 | } else if (x > xMax) { |
5931 | xMax = x; |
5932 | } |
5933 | if (y < yMin) { |
5934 | yMin = y; |
5935 | } else if (y > yMax) { |
5936 | yMax = y; |
5937 | } |
5938 | cvtUserToDev(ux: x2, uy: y2, dx: &x, dy: &y); |
5939 | if (x < xMin) { |
5940 | xMin = x; |
5941 | } else if (x > xMax) { |
5942 | xMax = x; |
5943 | } |
5944 | if (y < yMin) { |
5945 | yMin = y; |
5946 | } else if (y > yMax) { |
5947 | yMax = y; |
5948 | } |
5949 | text->addLink(xMin, yMin, xMax, yMax, link); |
5950 | } |
5951 | |
5952 | bool TextOutputDev::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax) const |
5953 | { |
5954 | return text->findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, backward, wholeWord, xMin, yMin, xMax, yMax); |
5955 | } |
5956 | |
5957 | GooString *TextOutputDev::getText(double xMin, double yMin, double xMax, double yMax) const |
5958 | { |
5959 | return text->getText(xMin, yMin, xMax, yMax, textEOL); |
5960 | } |
5961 | |
5962 | void TextOutputDev::drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color) |
5963 | { |
5964 | text->drawSelection(out, scale, rotation, selection, style, glyph_color, box_color); |
5965 | } |
5966 | |
5967 | std::vector<PDFRectangle *> *TextOutputDev::getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale) |
5968 | { |
5969 | return text->getSelectionRegion(selection, style, scale); |
5970 | } |
5971 | |
5972 | GooString *TextOutputDev::getSelectionText(const PDFRectangle *selection, SelectionStyle style) |
5973 | { |
5974 | return text->getSelectionText(selection, style); |
5975 | } |
5976 | |
5977 | bool TextOutputDev::findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const |
5978 | { |
5979 | return text->findCharRange(pos, length, xMin, yMin, xMax, yMax); |
5980 | } |
5981 | |
5982 | void TextOutputDev::setMergeCombining(bool merge) |
5983 | { |
5984 | text->setMergeCombining(merge); |
5985 | } |
5986 | |
5987 | #ifdef TEXTOUT_WORD_LIST |
5988 | std::unique_ptr<TextWordList> TextOutputDev::makeWordList() |
5989 | { |
5990 | return text->makeWordList(physLayout); |
5991 | } |
5992 | #endif |
5993 | |
5994 | TextPage *TextOutputDev::takeText() |
5995 | { |
5996 | TextPage *ret; |
5997 | |
5998 | ret = text; |
5999 | text = new TextPage(rawOrder, discardDiag); |
6000 | delete actualText; |
6001 | actualText = new ActualText(text); |
6002 | return ret; |
6003 | } |
6004 | |
6005 | const TextFlow *TextOutputDev::getFlows() const |
6006 | { |
6007 | return text->getFlows(); |
6008 | } |
6009 | |