TextOutputDev.cc source code [poppler/poppler/TextOutputDev.cc]

1	//========================================================================
2	//
3	// TextOutputDev.cc
4	//
5	// Copyright 1997-2003 Glyph & Cog, LLC
6	//
7	//========================================================================
8
9	//========================================================================
10	//
11	// Modified under the Poppler project - http://poppler.freedesktop.org
12	//
13	// All changes made under the Poppler project to this file are licensed
14	// under GPL version 2 or later
15	//
16	// Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com>
17	// Copyright (C) 2005 Nickolay V. Shmyrev <nshmyrev@yandex.ru>
18	// Copyright (C) 2006-2008, 2011-2013 Carlos Garcia Campos <carlosgc@gnome.org>
19	// Copyright (C) 2006, 2007, 2013 Ed Catmur <ed@catmur.co.uk>
20	// Copyright (C) 2006 Jeff Muizelaar <jeff@infidigm.net>
21	// Copyright (C) 2007, 2008, 2012, 2017 Adrian Johnson <ajohnson@redneon.com>
22	// Copyright (C) 2008 Koji Otani <sho@bbr.jp>
23	// Copyright (C) 2008, 2010-2012, 2014-2022, 2024 Albert Astals Cid <aacid@kde.org>
24	// Copyright (C) 2008 Pino Toscano <pino@kde.org>
25	// Copyright (C) 2008, 2010 Hib Eris <hib@hiberis.nl>
26	// Copyright (C) 2009 Ross Moore <ross@maths.mq.edu.au>
27	// Copyright (C) 2009 Kovid Goyal <kovid@kovidgoyal.net>
28	// Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com>
29	// Copyright (C) 2010, 2021 Marek Kasik <mkasik@redhat.com>
30	// Copyright (C) 2010, 2020 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
31	// Copyright (C) 2011 Sam Liao <phyomh@gmail.com>
32	// Copyright (C) 2012 Horst Prote <prote@fmi.uni-stuttgart.de>
33	// Copyright (C) 2012, 2013-2018 Jason Crain <jason@aquaticape.us>
34	// Copyright (C) 2012 Peter Breitenlohner <peb@mppmu.mpg.de>
35	// Copyright (C) 2013 José Aliste <jaliste@src.gnome.org>
36	// Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
37	// Copyright (C) 2013 Ed Catmur <ed@catmur.co.uk>
38	// Copyright (C) 2016 Khaled Hosny <khaledhosny@eglug.org>
39	// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
40	// Copyright (C) 2018 Sanchit Anand <sanxchit@gmail.com>
41	// Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de>
42	// Copyright (C) 2018-2022, 2024 Nelson Benítez León <nbenitezl@gmail.com>
43	// Copyright (C) 2019 Christian Persch <chpe@src.gnome.org>
44	// Copyright (C) 2019, 2022 Oliver Sander <oliver.sander@tu-dresden.de>
45	// Copyright (C) 2019 Dan Shea <dan.shea@logical-innovations.com>
46	// Copyright (C) 2021 Peter Williams <peter@newton.cx>
47	// Copyright (C) 2024 Adam Sampson <ats@offog.org>
48	// Copyright (C) 2024 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk>
49	// Copyright (C) 2024 Stefan Brüns <stefan.bruens@rwth-aachen.de>
50	//
51	// To see a description of the changes please see the Changelog file that
52	// came with your tarball or type make ChangeLog if you are building from git
53	//
54	//========================================================================
55
56	#include <config.h>
57
58	#include <cstdio>
59	#include <cstdlib>
60	#include <cstddef>
61	#include <cmath>
62	#include <cfloat>
63	#include <cctype>
64	#include <algorithm>
65	#if defined(_WIN32) \|\| defined(__CYGWIN__)
66	# include <fcntl.h> // for O_BINARY
67	# include <io.h> // for _setmode
68	#endif
69	#include "goo/gfile.h"
70	#include "goo/gmem.h"
71	#include "goo/GooString.h"
72	#include "poppler-config.h"
73	#include "Error.h"
74	#include "GlobalParams.h"
75	#include "UnicodeMap.h"
76	#include "UnicodeTypeTable.h"
77	#include "Link.h"
78	#include "TextOutputDev.h"
79	#include "Page.h"
80	#include "Annot.h"
81	#include "UTF.h"
82
83	//------------------------------------------------------------------------
84	// parameters
85	//------------------------------------------------------------------------
86
87	// Each bucket in a text pool includes baselines within a range of
88	// this many points.
89	#define textPoolStep 4
90
91	// Inter-character space width which will cause addChar to start a new
92	// word.
93	#define minWordBreakSpace 0.1
94
95	// Negative inter-character space width, i.e., overlap, which will
96	// cause addChar to start a new word.
97	#define minDupBreakOverlap 0.2
98
99	// Max distance between baselines of two lines within a block, as a
100	// fraction of the font size.
101	#define maxLineSpacingDelta 1.5
102
103	// Max difference in primary font sizes on two lines in the same
104	// block. Delta1 is used when examining new lines above and below the
105	// current block; delta2 is used when examining text that overlaps the
106	// current block; delta3 is used when examining text to the left and
107	// right of the current block.
108	#define maxBlockFontSizeDelta1 0.05
109	#define maxBlockFontSizeDelta2 0.6
110	#define maxBlockFontSizeDelta3 0.2
111
112	// Max difference in font sizes inside a word.
113	#define maxWordFontSizeDelta 0.05
114
115	// Maximum distance between baselines of two words on the same line,
116	// e.g., distance between subscript or superscript and the primary
117	// baseline, as a fraction of the font size.
118	#define maxIntraLineDelta 0.5
119
120	// Minimum inter-word spacing, as a fraction of the font size. (Only
121	// used for raw ordering.)
122	#define minWordSpacing 0.15
123
124	// Maximum inter-word spacing, as a fraction of the font size.
125	#define maxWordSpacing 1.5
126
127	// Maximum horizontal spacing which will allow a word to be pulled
128	// into a block, as a fraction of the font size.
129	// This default value can be tweaked via API.
130	double TextOutputDev::minColSpacing1_default = `0.7`;
131
132	// Minimum spacing between columns, as a fraction of the font size.
133	#define minColSpacing2 1.0
134
135	// Maximum vertical spacing between blocks within a flow, as a
136	// multiple of the font size.
137	#define maxBlockSpacing 2.5
138
139	// Minimum spacing between characters within a word, as a fraction of
140	// the font size.
141	#define minCharSpacing -0.5
142
143	// Maximum spacing between characters within a word, as a fraction of
144	// the font size, when there is no obvious extra-wide character
145	// spacing.
146	#define maxCharSpacing 0.03
147
148	// When extra-wide character spacing is detected, the inter-character
149	// space threshold is set to the minimum inter-character space
150	// multiplied by this constant.
151	#define maxWideCharSpacingMul 1.3
152
153	// Upper limit on spacing between characters in a word.
154	#define maxWideCharSpacing 0.4
155
156	// Max difference in primary,secondary coordinates (as a fraction of
157	// the font size) allowed for duplicated text (fake boldface, drop
158	// shadows) which is to be discarded.
159	#define dupMaxPriDelta 0.1
160	#define dupMaxSecDelta 0.2
161
162	// Max width of underlines (in points).
163	#define maxUnderlineWidth 3
164
165	// Min distance between baseline and underline (in points).
166	//~ this should be font-size-dependent
167	#define minUnderlineGap -2
168
169	// Max distance between baseline and underline (in points).
170	//~ this should be font-size-dependent
171	#define maxUnderlineGap 4
172
173	// Max horizontal distance between edge of word and start of underline
174	// (in points).
175	//~ this should be font-size-dependent
176	#define underlineSlack 1
177
178	// Max distance between edge of text and edge of link border
179	#define hyperlinkSlack 2
180
181	// Max distance between characters when combining a base character and
182	// combining character
183	#define combMaxMidDelta 0.3
184	#define combMaxBaseDelta 0.4
185
186	// Text is considered diagonal if abs(tan(angle)) > diagonalThreshold.
187	// (Or 1/tan(angle) for 90/270 degrees.)
188	#define diagonalThreshold 0.1
189
190	// How opaque a selection on a glyphless font should be. Since the font is
191	// glyphless and overlaid over text in image form, this must enable users
192	// to read the underlying image. Issue #157
193	#define glyphlessSelectionOpacity 0.4
194
// Returns whether x is between a and b or equal to a or b.
// a and b don't need to be sorted.
// NB: this is a macro, so each argument is evaluated more than once;
// do not pass expressions with side effects.
#define XBetweenAB(x, a, b) (!(((x) > (a) && (x) > (b)) || ((x) < (a) && (x) < (b))))
198
namespace {

// Returns true if uchar is in the 7-bit ASCII range (code point < 128).
constexpr bool isAscii7(Unicode uchar)
{
    return uchar < 128;
}

}
207
208	static int reorderText(const Unicode text, int* len, const UnicodeMap uMap, bool* primaryLR, GooString s, Unicode u)
209	{
210	char lre[`8`], rle[`8`], popdf[`8`], buf[`8`];
211	int lreLen = `0`, rleLen = `0`, popdfLen = `0`, n;
212	int nCols, i, j, k;
213
214	nCols = `0`;
215
216	if (s) {
217	lreLen = uMap->mapUnicode(u: `0x202a`, buf: lre, bufSize: sizeof(lre));
218	rleLen = uMap->mapUnicode(u: `0x202b`, buf: rle, bufSize: sizeof(rle));
219	popdfLen = uMap->mapUnicode(u: `0x202c`, buf: popdf, bufSize: sizeof(popdf));
220	}
221
222	if (primaryLR) {
223	i = `0`;
224	while (i < len) {
225	// output a left-to-right section
226	for (j = i; j < len && !unicodeTypeR(c: text[j]); ++j) {
227	;
228	}
229	for (k = i; k < j; ++k) {
230	if (s) {
231	n = uMap->mapUnicode(u: text[k], buf, bufSize: sizeof(buf));
232	s->append(str: buf, lengthA: n);
233	}
234	if (u) {
235	u[nCols] = text[k];
236	}
237	++nCols;
238	}
239	i = j;
240	// output a right-to-left section
241	for (j = i; j < len && !(unicodeTypeL(c: text[j]) \|\| unicodeTypeNum(c: text[j])); ++j) {
242	;
243	}
244	if (j > i) {
245	if (s) {
246	s->append(str: rle, lengthA: rleLen);
247	}
248	for (k = j - `1`; k >= i; --k) {
249	if (s) {
250	n = uMap->mapUnicode(u: text[k], buf, bufSize: sizeof(buf));
251	s->append(str: buf, lengthA: n);
252	}
253	if (u) {
254	u[nCols] = text[k];
255	}
256	++nCols;
257	}
258	if (s) {
259	s->append(str: popdf, lengthA: popdfLen);
260	}
261	i = j;
262	}
263	}
264	} else {
265	// Note: This code treats numeric characters (European and
266	// Arabic/Indic) as left-to-right, which isn't strictly correct
267	// (incurs extra LRE/POPDF pairs), but does produce correct
268	// visual formatting.
269	if (s) {
270	s->append(str: rle, lengthA: rleLen);
271	}
272	i = len - `1`;
273	while (i >= `0`) {
274	// output a right-to-left section
275	for (j = i; j >= `0` && !(unicodeTypeL(c: text[j]) \|\| unicodeTypeNum(c: text[j])); --j) {
276	;
277	}
278	for (k = i; k > j; --k) {
279	if (s) {
280	n = uMap->mapUnicode(u: text[k], buf, bufSize: sizeof(buf));
281	s->append(str: buf, lengthA: n);
282	}
283	if (u) {
284	u[nCols] = text[k];
285	}
286	++nCols;
287	}
288	i = j;
289	// output a left-to-right section
290	for (j = i; j >= `0` && !unicodeTypeR(c: text[j]); --j) {
291	;
292	}
293	if (j < i) {
294	if (s) {
295	s->append(str: lre, lengthA: lreLen);
296	}
297	for (k = j + `1`; k <= i; ++k) {
298	if (s) {
299	n = uMap->mapUnicode(u: text[k], buf, bufSize: sizeof(buf));
300	s->append(str: buf, lengthA: n);
301	}
302	if (u) {
303	u[nCols] = text[k];
304	}
305	++nCols;
306	}
307	if (s) {
308	s->append(str: popdf, lengthA: popdfLen);
309	}
310	i = j;
311	}
312	}
313	if (s) {
314	s->append(str: popdf, lengthA: popdfLen);
315	}
316	}
317
318	return nCols;
319	}
320
321	//------------------------------------------------------------------------
322	// TextUnderline
323	//------------------------------------------------------------------------
324
// A line segment from (x0, y0) to (x1, y1) recorded as an underline
// candidate. horiz is true when both endpoints share the same y.
class TextUnderline
{
public:
    TextUnderline(double x0A, double y0A, double x1A, double y1A) : x0(x0A), y0(y0A), x1(x1A), y1(y1A), horiz(y0A == y1A) { }
    ~TextUnderline() = default;

    double x0, y0, x1, y1;
    bool horiz;
};
341
342	//------------------------------------------------------------------------
343	// TextLink
344	//------------------------------------------------------------------------
345
// Integer bounding box of a link annotation together with the
// annotation itself. The AnnotLink pointer is stored as-is; this class
// never deletes it.
class TextLink
{
public:
    TextLink(int xMinA, int yMinA, int xMaxA, int yMaxA, AnnotLink *linkA) : xMin(xMinA), yMin(yMinA), xMax(xMaxA), yMax(yMaxA), link(linkA) { }
    ~TextLink() = default;

    int xMin, yMin, xMax, yMax;
    AnnotLink *link;
};
362
363	//------------------------------------------------------------------------
364	// TextFontInfo
365	//------------------------------------------------------------------------
366
367	TextFontInfo::TextFontInfo(const GfxState *state)
368	{
369	gfxFont = state->getFont();
370	#ifdef TEXTOUT_WORD_LIST
371	fontName = (gfxFont && gfxFont ->getName()) ? new GooString (gfxFont ->getName()) : nullptr*;
372	flags = gfxFont ? gfxFont ->getFlags() : `0`;
373	#endif
374	}
375
376	TextFontInfo::~TextFontInfo()
377	{
378	#ifdef TEXTOUT_WORD_LIST
379	if (fontName) {
380	delete fontName;
381	}
382	#endif
383	}
384
385	bool TextFontInfo::matches(const GfxState state) const*
386	{
387	return state->getFont() == gfxFont;
388	}
389
390	bool TextFontInfo::matches(const TextFontInfo fontInfo) const*
391	{
392	return gfxFont == fontInfo->gfxFont;
393	}
394
395	bool TextFontInfo::matches(const Ref ref) const*
396	{
397	return gfxFont && ((gfxFont ->getID()) == ref);
398	}
399
400	double TextFontInfo::getAscent() const
401	{
402	return gfxFont ? gfxFont ->getAscent() : `0.95`;
403	}
404
405	double TextFontInfo::getDescent() const
406	{
407	return gfxFont ? gfxFont ->getDescent() : -`0.35`;
408	}
409
410	int TextFontInfo::getWMode() const
411	{
412	return gfxFont ? gfxFont ->getWMode() : `0`;
413	}
414
415	//------------------------------------------------------------------------
416	// TextWord
417	//------------------------------------------------------------------------
418
419	TextWord::TextWord(const GfxState state, int* rotA, double fontSizeA)
420	{
421	rot = rotA;
422	fontSize = fontSizeA;
423	spaceAfter = false;
424	next = nullptr;
425	invisible = state->getRender() == `3`;
426
427	#ifdef TEXTOUT_WORD_LIST
428	GfxRGB rgb;
429
430	if ((state->getRender() & `3`) == `1`) {
431	state->getStrokeRGB(rgb: &rgb);
432	} else {
433	state->getFillRGB(rgb: &rgb);
434	}
435	colorR = colToDbl(x: rgb.r);
436	colorG = colToDbl(x: rgb.g);
437	colorB = colToDbl(x: rgb.b);
438	#endif
439
440	underlined = false;
441	link = nullptr;
442	}
443
444	TextWord::~TextWord() { }
445
446	void TextWord::addChar(const GfxState state, TextFontInfo fontA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA)
447	{
448	chars.push_back(x: CharInfo { .text: u, .charcode: c, .charPos: charPosA, .edge: `0.0`, .font: fontA, .textMat: textMatA });
449	charPosEnd = charPosA + charLen;
450
451	if (len() == `1`) {
452	setInitialBounds(fontA, x, y);
453	}
454
455	if (wMode) { // vertical writing mode
456	// NB: the rotation value has been incremented by 1 (in
457	// TextPage::beginWord()) for vertical writing mode
458	switch (rot) {
459	case `0`:
460	chars.back().edge = x - fontSize;
461	xMax = edgeEnd = x;
462	break;
463	case `1`:
464	chars.back().edge = y - fontSize;
465	yMax = edgeEnd = y;
466	break;
467	case `2`:
468	chars.back().edge = x + fontSize;
469	xMin = edgeEnd = x;
470	break;
471	case `3`:
472	chars.back().edge = y + fontSize;
473	yMin = edgeEnd = y;
474	break;
475	}
476	} else { // horizontal writing mode
477	switch (rot) {
478	case `0`:
479	chars.back().edge = x;
480	xMax = edgeEnd = x + dx;
481	break;
482	case `1`:
483	chars.back().edge = y;
484	yMax = edgeEnd = y + dy;
485	break;
486	case `2`:
487	chars.back().edge = x;
488	xMin = edgeEnd = x + dx;
489	break;
490	case `3`:
491	chars.back().edge = y;
492	yMin = edgeEnd = y + dy;
493	break;
494	}
495	}
496	}
497
498	void TextWord::setInitialBounds(TextFontInfo fontA, double* x, double y)
499	{
500	double ascent = fontA->getAscent() * fontSize;
501	double descent = fontA->getDescent() * fontSize;
502	wMode = fontA->getWMode();
503
504	if (wMode) { // vertical writing mode
505	// NB: the rotation value has been incremented by 1 (in
506	// TextPage::beginWord()) for vertical writing mode
507	switch (rot) {
508	case `0`:
509	xMin = x - fontSize;
510	yMin = y - fontSize;
511	yMax = y;
512	base = y;
513	break;
514	case `1`:
515	xMin = x;
516	yMin = y - fontSize;
517	xMax = x + fontSize;
518	base = x;
519	break;
520	case `2`:
521	yMin = y;
522	xMax = x + fontSize;
523	yMax = y + fontSize;
524	base = y;
525	break;
526	case `3`:
527	xMin = x - fontSize;
528	xMax = x;
529	yMax = y + fontSize;
530	base = x;
531	break;
532	}
533	} else { // horizontal writing mode
534	switch (rot) {
535	case `0`:
536	xMin = x;
537	yMin = y - ascent;
538	yMax = y - descent;
539	if (yMin == yMax) {
540	// this is a sanity check for a case that shouldn't happen -- but
541	// if it does happen, we want to avoid dividing by zero later
542	yMin = y;
543	yMax = y + `1`;
544	}
545	base = y;
546	break;
547	case `1`:
548	xMin = x + descent;
549	yMin = y;
550	xMax = x + ascent;
551	if (xMin == xMax) {
552	// this is a sanity check for a case that shouldn't happen -- but
553	// if it does happen, we want to avoid dividing by zero later
554	xMin = x;
555	xMax = x + `1`;
556	}
557	base = x;
558	break;
559	case `2`:
560	yMin = y + descent;
561	xMax = x;
562	yMax = y + ascent;
563	if (yMin == yMax) {
564	// this is a sanity check for a case that shouldn't happen -- but
565	// if it does happen, we want to avoid dividing by zero later
566	yMin = y;
567	yMax = y + `1`;
568	}
569	base = y;
570	break;
571	case `3`:
572	xMin = x - ascent;
573	xMax = x - descent;
574	yMax = y;
575	if (xMin == xMax) {
576	// this is a sanity check for a case that shouldn't happen -- but
577	// if it does happen, we want to avoid dividing by zero later
578	xMin = x;
579	xMax = x + `1`;
580	}
581	base = x;
582	break;
583	}
584	}
585	}
586
// One row of the spacing-accent -> combining-accent mapping.
struct CombiningTable
{
    Unicode base; // spacing (stand-alone) form of the accent
    Unicode comb; // corresponding combining character
};

static const struct CombiningTable combiningTable[] = {
    { 0x0060, 0x0300 }, // grave
    { 0x00a8, 0x0308 }, // dieresis
    { 0x00af, 0x0304 }, // macron
    { 0x00b4, 0x0301 }, // acute
    { 0x00b8, 0x0327 }, // cedilla
    { 0x02c6, 0x0302 }, // circumflex
    { 0x02c7, 0x030c }, // caron
    { 0x02d8, 0x0306 }, // breve
    { 0x02d9, 0x0307 }, // dotaccent
    { 0x02da, 0x030a }, // ring
    { 0x02dc, 0x0303 }, // tilde
    { 0x02dd, 0x030b } // hungarumlaut (double acute accent)
};

// Returns the combining version of the spacing accent u, or 0 if u is
// not one of the accents listed in combiningTable.
static Unicode getCombiningChar(Unicode u)
{
    for (const CombiningTable &entry : combiningTable) {
        if (entry.base == u) {
            return entry.comb;
        }
    }
    return 0;
}
618
619	bool TextWord::addCombining(const GfxState state, TextFontInfo fontA, double fontSizeA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA)
620	{
621	if (chars.empty() \|\| wMode != `0` \|\| fontA->getWMode() != `0`) {
622	return false;
623	}
624
625	Unicode cCurrent = getCombiningChar(u);
626	if (cCurrent != `0` && unicodeTypeAlphaNum(c: chars.back().text)) {
627	// Current is a combining character, previous is base character
628	double maxScaledMidDelta = fabs(x: edgeEnd - chars.back().edge) * combMaxMidDelta;
629	double charMid, charBase, maxScaledBaseDelta;
630
631	// Test if characters overlap
632	if (rot == `0` \|\| rot == `2`) {
633	charMid = x + (dx / `2`);
634	charBase = y;
635	maxScaledBaseDelta = (yMax - yMin) * combMaxBaseDelta;
636	} else {
637	charMid = y + (dy / `2`);
638	charBase = x;
639	maxScaledBaseDelta = (xMax - xMin) * combMaxBaseDelta;
640	}
641
642	double edgeMid = (chars.back().edge + edgeEnd) / `2`;
643	if (fabs(x: charMid - edgeMid) >= maxScaledMidDelta \|\| fabs(x: charBase - base) >= maxScaledBaseDelta) {
644	return false;
645	}
646
647	// Add character, but don't adjust edge / bounding box because
648	// combining character's positioning could be odd.
649	chars.emplace_back(args: CharInfo { .text: cCurrent, .charcode: c, .charPos: charPosA, .edge: edgeMid, .font: fontA, .textMat: textMatA });
650	charPosEnd = charPosA + charLen;
651
652	return true;
653	}
654
655	Unicode cPrev = getCombiningChar(u: chars.back().text);
656	if (cPrev != `0` && unicodeTypeAlphaNum(c: u)) {
657	// Previous is a combining character, current is base character
658	double maxScaledBaseDelta = (fontA->getAscent() - fontA->getDescent()) * fontSizeA * combMaxBaseDelta;
659	double charMid, charBase, maxScaledMidDelta;
660
661	// Test if characters overlap
662	if (rot == `0` \|\| rot == `2`) {
663	charMid = x + (dx / `2`);
664	charBase = y;
665	maxScaledMidDelta = fabs(x: dx * combMaxMidDelta);
666	} else {
667	charMid = y + (dy / `2`);
668	charBase = x;
669	maxScaledMidDelta = fabs(x: dy * combMaxMidDelta);
670	}
671
672	double edgeMid = (chars.back().edge + edgeEnd) / `2`;
673	if (fabs(x: charMid - edgeMid) >= maxScaledMidDelta \|\| fabs(x: charBase - base) >= maxScaledBaseDelta) {
674	return false;
675	}
676
677	fontSize = fontSizeA;
678	// move combining character to after base character
679	chars.emplace_back(args: CharInfo { .text: cPrev, .charcode: chars.back().charcode, .charPos: charPosA, .edge: edgeMid, .font: chars.back().font, .textMat: chars.back().textMat });
680
681	auto &lastChar = chars [chars.size() - `2`];
682
683	charPosEnd = charPosA + charLen;
684	lastChar.text = u;
685	lastChar.charcode = c;
686	lastChar.font = fontA;
687	lastChar.textMat = textMatA;
688
689	if (len() == `2`) {
690	setInitialBounds(fontA, x, y);
691	}
692
693	// Updated edges / bounding box because we changed the base
694	// character.
695	if (wMode) {
696	// FIXME unreachable, wMode == 0
697	switch (rot) {
698	case `0`:
699	lastChar.edge = x - fontSize;
700	xMax = edgeEnd = x;
701	break;
702	case `1`:
703	lastChar.edge = y - fontSize;
704	yMax = edgeEnd = y;
705	break;
706	case `2`:
707	lastChar.edge = x + fontSize;
708	xMin = edgeEnd = x;
709	break;
710	case `3`:
711	lastChar.edge = y + fontSize;
712	yMin = edgeEnd = y;
713	break;
714	}
715	} else {
716	switch (rot) {
717	case `0`:
718	lastChar.edge = x;
719	xMax = edgeEnd = x + dx;
720	break;
721	case `1`:
722	lastChar.edge = y;
723	yMax = edgeEnd = y + dy;
724	break;
725	case `2`:
726	lastChar.edge = x;
727	xMin = edgeEnd = x + dx;
728	break;
729	case `3`:
730	lastChar.edge = y;
731	yMin = edgeEnd = y + dy;
732	break;
733	}
734	}
735
736	chars.back().edge = (edgeEnd + lastChar.edge) / `2`;
737	return true;
738	}
739	return false;
740	}
741
742	void TextWord::merge(TextWord *word)
743	{
744	if (word->xMin < xMin) {
745	xMin = word->xMin;
746	}
747	if (word->yMin < yMin) {
748	yMin = word->yMin;
749	}
750	if (word->xMax > xMax) {
751	xMax = word->xMax;
752	}
753	if (word->yMax > yMax) {
754	yMax = word->yMax;
755	}
756	chars.insert(position: chars.end(), first: word->chars.begin(), last: word->chars.end());
757	edgeEnd = word->edgeEnd;
758	charPosEnd = word->charPosEnd;
759	}
760
761	inline int TextWord::primaryCmp(const TextWord word) const*
762	{
763	double cmp;
764
765	cmp = `0`; // make gcc happy
766	switch (rot) {
767	case `0`:
768	cmp = xMin - word->xMin;
769	break;
770	case `1`:
771	cmp = yMin - word->yMin;
772	break;
773	case `2`:
774	cmp = word->xMax - xMax;
775	break;
776	case `3`:
777	cmp = word->yMax - yMax;
778	break;
779	}
780	return cmp < `0` ? -`1` : cmp > `0` ? `1` : `0`;
781	}
782
783	double TextWord::primaryDelta(const TextWord word) const*
784	{
785	double delta;
786
787	delta = `0`; // make gcc happy
788	switch (rot) {
789	case `0`:
790	delta = word->xMin - xMax;
791	break;
792	case `1`:
793	delta = word->yMin - yMax;
794	break;
795	case `2`:
796	delta = xMin - word->xMax;
797	break;
798	case `3`:
799	delta = yMin - word->yMax;
800	break;
801	}
802	return delta;
803	}
804
805	int TextWord::cmpYX(const void p1, const* void *p2)
806	{
807	TextWord word1 = (TextWord **)p1;
808	TextWord word2 = (TextWord **)p2;
809	double cmp;
810
811	cmp = word1->yMin - word2->yMin;
812	if (cmp == `0`) {
813	cmp = word1->xMin - word2->xMin;
814	}
815	return cmp < `0` ? -`1` : cmp > `0` ? `1` : `0`;
816	}
817
818	#ifdef TEXTOUT_WORD_LIST
819
820	GooString TextWord::getText() const*
821	{
822	GooString *s;
823	const UnicodeMap *uMap;
824	char buf[`8`];
825
826	s = new GooString ();
827	if (!(uMap = globalParams ->getTextEncoding())) {
828	return s;
829	}
830	for (size_t i = `0`; i < len(); ++i) {
831	auto n = uMap->mapUnicode(u: chars [i].text, buf, bufSize: sizeof(buf));
832	s->append(str: buf, lengthA: n);
833	}
834	return s;
835	}
836
837	void TextWord::getCharBBox(int charIdx, double xMinA, double* yMinA, double* xMaxA, double* yMaxA) const*
838	{
839	if (charIdx < `0`) {
840	return;
841	}
842	size_t uCharIdx = charIdx;
843	if (uCharIdx >= len()) {
844	return;
845	}
846	auto startingEdge = chars [uCharIdx].edge;
847	auto endingEdge = (uCharIdx + `1` == len()) ? edgeEnd : chars [charIdx + `1`].edge;
848	switch (rot) {
849	case `0`:
850	*xMinA = startingEdge;
851	*xMaxA = endingEdge;
852	*yMinA = yMin;
853	*yMaxA = yMax;
854	break;
855	case `1`:
856	*xMinA = xMin;
857	*xMaxA = xMax;
858	*yMinA = startingEdge;
859	*yMaxA = endingEdge;
860	break;
861	case `2`:
862	*xMinA = endingEdge;
863	*xMaxA = startingEdge;
864	*yMinA = yMin;
865	*yMaxA = yMax;
866	break;
867	case `3`:
868	*xMinA = xMin;
869	*xMaxA = xMax;
870	*yMinA = endingEdge;
871	*yMaxA = startingEdge;
872	break;
873	}
874	}
875
876	#endif // TEXTOUT_WORD_LIST
877
878	//------------------------------------------------------------------------
879	// TextPool
880	//------------------------------------------------------------------------
881
882	TextPool::TextPool()
883	{
884	minBaseIdx = `0`;
885	maxBaseIdx = -`1`;
886	pool = nullptr;
887	cursor = nullptr;
888	cursorBaseIdx = -`1`;
889	}
890
891	TextPool::~TextPool()
892	{
893	int baseIdx;
894	TextWord word, word2;
895
896	for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
897	for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
898	word2 = word->next;
899	delete word;
900	}
901	}
902	gfree(p: pool);
903	}
904
905	int TextPool::getBaseIdx(double base) const
906	{
907	const double baseIdxDouble = base / textPoolStep;
908	if (std::isnan(x: baseIdxDouble) \|\| baseIdxDouble < minBaseIdx) {
909	return minBaseIdx;
910	}
911	if (baseIdxDouble > maxBaseIdx) {
912	return maxBaseIdx;
913	}
914	return (int)baseIdxDouble;
915	}
916
917	void TextPool::addWord(TextWord *word)
918	{
919	int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
920	TextWord w0, w1;
921
922	// expand the array if needed
923	wordBaseIdx = (int)(word->base / textPoolStep);
924	if (unlikely(wordBaseIdx <= INT_MIN + `128` \|\| wordBaseIdx >= INT_MAX - `128`)) {
925	error(category: errSyntaxWarning, pos: -`1`, msg: "wordBaseIdx out of range");
926	delete word;
927	return;
928	}
929	if (minBaseIdx > maxBaseIdx) {
930	minBaseIdx = wordBaseIdx - `128`;
931	maxBaseIdx = wordBaseIdx + `128`;
932	pool = (TextWord )gmallocn(count: maxBaseIdx - minBaseIdx + `1`, size: sizeof*(TextWord ));
933	for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
934	pool[baseIdx - minBaseIdx] = nullptr;
935	}
936	} else if (wordBaseIdx < minBaseIdx) {
937	newMinBaseIdx = wordBaseIdx - `128`;
938	TextWord newPool = (TextWord )gmallocn_checkoverflow(count: maxBaseIdx - newMinBaseIdx + `1`, size: sizeof(TextWord *));
939	if (unlikely(!newPool)) {
940	error(category: errSyntaxWarning, pos: -`1`, msg: "newPool would overflow");
941	delete word;
942	return;
943	}
944	for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
945	newPool[baseIdx - newMinBaseIdx] = nullptr;
946	}
947	memcpy(dest: &newPool[minBaseIdx - newMinBaseIdx], src: pool, n: (maxBaseIdx - minBaseIdx + `1`) * sizeof(TextWord *));
948	gfree(p: pool);
949	pool = newPool;
950	minBaseIdx = newMinBaseIdx;
951	} else if (wordBaseIdx > maxBaseIdx) {
952	newMaxBaseIdx = wordBaseIdx + `128`;
953	TextWord reallocatedPool = (TextWord )greallocn(p: pool, count: newMaxBaseIdx - minBaseIdx + `1`, size: sizeof(TextWord ), checkoverflow: true* /checkoverflow/, free_p: false /free_pool/);
954	if (!reallocatedPool) {
955	error(category: errSyntaxWarning, pos: -`1`, msg: "new pool size would overflow");
956	delete word;
957	return;
958	}
959	pool = reallocatedPool;
960	for (baseIdx = maxBaseIdx + `1`; baseIdx <= newMaxBaseIdx; ++baseIdx) {
961	pool[baseIdx - minBaseIdx] = nullptr;
962	}
963	maxBaseIdx = newMaxBaseIdx;
964	}
965
966	// insert the new word
967	if (cursor && wordBaseIdx == cursorBaseIdx && word->primaryCmp(word: cursor) >= `0`) {
968	w0 = cursor;
969	w1 = cursor->next;
970	} else {
971	w0 = nullptr;
972	w1 = pool[wordBaseIdx - minBaseIdx];
973	}
974	for (; w1 && word->primaryCmp(word: w1) > `0`; w0 = w1, w1 = w1->next) {
975	;
976	}
977	word->next = w1;
978	if (w0) {
979	w0->next = word;
980	} else {
981	pool[wordBaseIdx - minBaseIdx] = word;
982	}
983	cursor = word;
984	cursorBaseIdx = wordBaseIdx;
985	}
986
987	//------------------------------------------------------------------------
988	// TextLine
989	//------------------------------------------------------------------------
990
991	TextLine::TextLine(TextBlock blkA, int* rotA, double baseA)
992	{
993	blk = blkA;
994	rot = rotA;
995	base = baseA;
996	words = lastWord = nullptr;
997	text = nullptr;
998	edge = nullptr;
999	col = nullptr;
1000	len = `0`;
1001	convertedLen = `0`;
1002	hyphenated = false;
1003	next = nullptr;
1004	xMin = yMin = `0`;
1005	xMax = yMax = -`1`;
1006	normalized = nullptr;
1007	normalized_len = `0`;
1008	normalized_idx = nullptr;
1009	ascii_translation = nullptr;
1010	ascii_len = `0`;
1011	ascii_idx = nullptr;
1012	}
1013
1014	TextLine::~TextLine()
1015	{
1016	TextWord *word;
1017
1018	while (words) {
1019	word = words;
1020	words = words->next;
1021	delete word;
1022	}
1023	gfree(p: text);
1024	gfree(p: edge);
1025	gfree(p: col);
1026	if (normalized) {
1027	gfree(p: normalized);
1028	gfree(p: normalized_idx);
1029	}
1030	if (ascii_translation) {
1031	gfree(p: ascii_translation);
1032	gfree(p: ascii_idx);
1033	}
1034	}
1035
1036	void TextLine::addWord(TextWord *word)
1037	{
1038	if (lastWord) {
1039	lastWord->next = word;
1040	} else {
1041	words = word;
1042	}
1043	lastWord = word;
1044
1045	if (xMin > xMax) {
1046	xMin = word->xMin;
1047	xMax = word->xMax;
1048	yMin = word->yMin;
1049	yMax = word->yMax;
1050	} else {
1051	if (word->xMin < xMin) {
1052	xMin = word->xMin;
1053	}
1054	if (word->xMax > xMax) {
1055	xMax = word->xMax;
1056	}
1057	if (word->yMin < yMin) {
1058	yMin = word->yMin;
1059	}
1060	if (word->yMax > yMax) {
1061	yMax = word->yMax;
1062	}
1063	}
1064	}
1065
1066	double TextLine::primaryDelta(const TextLine line) const*
1067	{
1068	double delta;
1069
1070	delta = `0`; // make gcc happy
1071	switch (rot) {
1072	case `0`:
1073	delta = line->xMin - xMax;
1074	break;
1075	case `1`:
1076	delta = line->yMin - yMax;
1077	break;
1078	case `2`:
1079	delta = xMin - line->xMax;
1080	break;
1081	case `3`:
1082	delta = yMin - line->yMax;
1083	break;
1084	}
1085	return delta;
1086	}
1087
1088	int TextLine::primaryCmp(const TextLine line) const*
1089	{
1090	double cmp;
1091
1092	cmp = `0`; // make gcc happy
1093	switch (rot) {
1094	case `0`:
1095	cmp = xMin - line->xMin;
1096	break;
1097	case `1`:
1098	cmp = yMin - line->yMin;
1099	break;
1100	case `2`:
1101	cmp = line->xMax - xMax;
1102	break;
1103	case `3`:
1104	cmp = line->yMax - yMax;
1105	break;
1106	}
1107	return cmp < `0` ? -`1` : cmp > `0` ? `1` : `0`;
1108	}
1109
1110	int TextLine::secondaryCmp(const TextLine line) const*
1111	{
1112	double cmp;
1113
1114	cmp = (rot == `0` \|\| rot == `3`) ? base - line->base : line->base - base;
1115	return cmp < `0` ? -`1` : cmp > `0` ? `1` : `0`;
1116	}
1117
1118	int TextLine::cmpYX(const TextLine line) const*
1119	{
1120	int cmp;
1121
1122	if ((cmp = secondaryCmp(line))) {
1123	return cmp;
1124	}
1125	return primaryCmp(line);
1126	}
1127
1128	int TextLine::cmpXY(const void p1, const* void *p2)
1129	{
1130	TextLine line1 = (TextLine **)p1;
1131	TextLine line2 = (TextLine **)p2;
1132	int cmp;
1133
1134	if ((cmp = line1->primaryCmp(line: line2))) {
1135	return cmp;
1136	}
1137	return line1->secondaryCmp(line: line2);
1138	}
1139
1140	void TextLine::coalesce(const UnicodeMap *uMap)
1141	{
1142	double space, delta, minSpace;
1143	bool isUnicode;
1144	char buf[`8`];
1145
1146	if (words->next) {
1147
1148	// compute the inter-word space threshold
1149	if (words->len() > `1` \|\| words->next->len() > `1`) {
1150	minSpace = `0`;
1151	} else {
1152	minSpace = words->primaryDelta(word: words->next);
1153	for (auto word0 = words->next, word1 = word0->next; word1 && minSpace > `0`; word0 = word1, word1 = word0->next) {
1154	if (word1->len() > `1`) {
1155	minSpace = `0`;
1156	}
1157	delta = word0->primaryDelta(word: word1);
1158	if (delta < minSpace) {
1159	minSpace = delta;
1160	}
1161	}
1162	}
1163	if (minSpace <= `0`) {
1164	space = maxCharSpacing * words->fontSize;
1165	} else {
1166	space = maxWideCharSpacingMul * minSpace;
1167	if (space > maxWideCharSpacing * words->fontSize) {
1168	space = maxWideCharSpacing * words->fontSize;
1169	}
1170	}
1171
1172	// merge words
1173	auto word0 = words;
1174	auto word1 = words->next;
1175	while (word1) {
1176	if (word0->primaryDelta(word: word1) >= space) {
1177	word0->spaceAfter = true;
1178	word0 = word1;
1179	word1 = word1->next;
1180	} else if (word0->chars.back().font == word1->chars.front().font //
1181	&& word0->underlined == word1->underlined //
1182	&& fabs(x: word0->fontSize - word1->fontSize) < maxWordFontSizeDelta * words->fontSize //
1183	&& word1->chars.front().charPos == word0->charPosEnd) {
1184	word0->merge(word: word1);
1185	word0->next = word1->next;
1186	delete word1;
1187	word1 = word0->next;
1188	} else {
1189	word0 = word1;
1190	word1 = word1->next;
1191	}
1192	}
1193	}
1194
1195	// build the line text
1196	isUnicode = uMap ? uMap->isUnicode() : false;
1197	len = `0`;
1198	for (auto word1 = words; word1; word1 = word1->next) {
1199	len += word1->len();
1200	if (word1->spaceAfter) {
1201	++len;
1202	}
1203	}
1204	text = (Unicode )gmallocn(count: len, size: sizeof*(Unicode));
1205	edge = (double )gmallocn(count: len + `1`, size: sizeof(double*));
1206	size_t i = `0`;
1207	for (auto word1 = words; word1; word1 = word1->next) {
1208	for (size_t j = `0`; j < word1->len(); ++j) {
1209	text[i] = word1->chars [j].text;
1210	edge[i] = word1->chars [j].edge;
1211	++i;
1212	}
1213	edge[i] = word1->edgeEnd;
1214	if (word1->spaceAfter) {
1215	text[i] = (Unicode)`0x0020`;
1216	++i;
1217	}
1218	}
1219
1220	// compute convertedLen and set up the col array
1221	col = (int )gmallocn(count: len + `1`, size: sizeof(int*));
1222	convertedLen = `0`;
1223	for (int ci = `0`; ci < len; ++ci) {
1224	col[ci] = convertedLen;
1225	if (isUnicode) {
1226	++convertedLen;
1227	} else if (uMap) {
1228	convertedLen += uMap->mapUnicode(u: text[ci], buf, bufSize: sizeof(buf));
1229	}
1230	}
1231	col[len] = convertedLen;
1232
1233	// check for hyphen at end of line
1234	//~ need to check for other chars used as hyphens
1235	hyphenated = text[len - `1`] == (Unicode)`'-'`;
1236	}
1237
1238	//------------------------------------------------------------------------
1239	// TextLineFrag
1240	//------------------------------------------------------------------------
1241
1242	class TextLineFrag
1243	{
1244	public:
1245	TextLine line; // the line object*
1246	int start, len; // offset and length of this fragment
1247	// (in Unicode chars)
1248	double xMin, xMax; // bounding box coordinates
1249	double yMin, yMax;
1250	double base; // baseline virtual coordinate
1251	int col; // first column
1252
1253	void init(TextLine lineA, int* startA, int lenA);
1254	void computeCoords(bool oneRot);
1255
1256	static int cmpYXPrimaryRot(const void p1, const* void *p2);
1257	static int cmpYXLineRot(const void p1, const* void *p2);
1258	static int cmpXYLineRot(const void p1, const* void *p2);
1259	static int cmpXYColumnPrimaryRot(const void p1, const* void *p2);
1260	static int cmpXYColumnLineRot(const void p1, const* void *p2);
1261	};
1262
1263	void TextLineFrag::init(TextLine lineA, int* startA, int lenA)
1264	{
1265	line = lineA;
1266	start = startA;
1267	len = lenA;
1268	col = line->col[start];
1269	}
1270
1271	void TextLineFrag::computeCoords(bool oneRot)
1272	{
1273	TextBlock *blk;
1274	double d0, d1, d2, d3, d4;
1275
1276	if (oneRot) {
1277
1278	switch (line->rot) {
1279	case `0`:
1280	xMin = line->edge[start];
1281	xMax = line->edge[start + len];
1282	yMin = line->yMin;
1283	yMax = line->yMax;
1284	break;
1285	case `1`:
1286	xMin = line->xMin;
1287	xMax = line->xMax;
1288	yMin = line->edge[start];
1289	yMax = line->edge[start + len];
1290	break;
1291	case `2`:
1292	xMin = line->edge[start + len];
1293	xMax = line->edge[start];
1294	yMin = line->yMin;
1295	yMax = line->yMax;
1296	break;
1297	case `3`:
1298	xMin = line->xMin;
1299	xMax = line->xMax;
1300	yMin = line->edge[start + len];
1301	yMax = line->edge[start];
1302	break;
1303	}
1304	base = line->base;
1305
1306	} else {
1307
1308	if (line->rot == `0` && line->blk->page->primaryRot == `0`) {
1309
1310	xMin = line->edge[start];
1311	xMax = line->edge[start + len];
1312	yMin = line->yMin;
1313	yMax = line->yMax;
1314	base = line->base;
1315
1316	} else {
1317
1318	blk = line->blk;
1319	d0 = line->edge[start];
1320	d1 = line->edge[start + len];
1321	d2 = d3 = d4 = `0`; // make gcc happy
1322
1323	switch (line->rot) {
1324	case `0`:
1325	d2 = line->yMin;
1326	d3 = line->yMax;
1327	d4 = line->base;
1328	d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
1329	d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
1330	d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
1331	d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
1332	d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
1333	break;
1334	case `1`:
1335	d2 = line->xMax;
1336	d3 = line->xMin;
1337	d4 = line->base;
1338	d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
1339	d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
1340	d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
1341	d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
1342	d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
1343	break;
1344	case `2`:
1345	d2 = line->yMax;
1346	d3 = line->yMin;
1347	d4 = line->base;
1348	d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
1349	d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
1350	d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
1351	d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
1352	d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
1353	break;
1354	case `3`:
1355	d2 = line->xMin;
1356	d3 = line->xMax;
1357	d4 = line->base;
1358	d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
1359	d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
1360	d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
1361	d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
1362	d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
1363	break;
1364	}
1365
1366	switch (line->blk->page->primaryRot) {
1367	case `0`:
1368	xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
1369	xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
1370	yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
1371	yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
1372	base = blk->yMin + d4 * (blk->yMax - blk->yMin);
1373	break;
1374	case `1`:
1375	xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
1376	xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
1377	yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
1378	yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
1379	base = blk->xMax - d4 * (blk->xMax - blk->xMin);
1380	break;
1381	case `2`:
1382	xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
1383	xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
1384	yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
1385	yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
1386	base = blk->yMax - d4 * (blk->yMax - blk->yMin);
1387	break;
1388	case `3`:
1389	xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
1390	xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
1391	yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
1392	yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
1393	base = blk->xMin + d4 * (blk->xMax - blk->xMin);
1394	break;
1395	}
1396	}
1397	}
1398	}
1399
1400	int TextLineFrag::cmpYXPrimaryRot(const void p1, const* void *p2)
1401	{
1402	TextLineFrag frag1 = (TextLineFrag )p1;
1403	TextLineFrag frag2 = (TextLineFrag )p2;
1404	double cmp;
1405
1406	cmp = `0`; // make gcc happy
1407	switch (frag1->line->blk->page->primaryRot) {
1408	case `0`:
1409	if (fabs(x: cmp = frag1->yMin - frag2->yMin) < `0.01`) {
1410	cmp = frag1->xMin - frag2->xMin;
1411	}
1412	break;
1413	case `1`:
1414	if (fabs(x: cmp = frag2->xMax - frag1->xMax) < `0.01`) {
1415	cmp = frag1->yMin - frag2->yMin;
1416	}
1417	break;
1418	case `2`:
1419	if (fabs(x: cmp = frag2->yMin - frag1->yMin) < `0.01`) {
1420	cmp = frag2->xMax - frag1->xMax;
1421	}
1422	break;
1423	case `3`:
1424	if (fabs(x: cmp = frag1->xMax - frag2->xMax) < `0.01`) {
1425	cmp = frag2->yMax - frag1->yMax;
1426	}
1427	break;
1428	}
1429	return cmp < `0` ? -`1` : cmp > `0` ? `1` : `0`;
1430	}
1431
1432	int TextLineFrag::cmpYXLineRot(const void p1, const* void *p2)
1433	{
1434	TextLineFrag frag1 = (TextLineFrag )p1;
1435	TextLineFrag frag2 = (TextLineFrag )p2;
1436	double cmp;
1437
1438	cmp = `0`; // make gcc happy
1439	switch (frag1->line->rot) {
1440	case `0`:
1441	if ((cmp = frag1->yMin - frag2->yMin) == `0`) {
1442	cmp = frag1->xMin - frag2->xMin;
1443	}
1444	break;
1445	case `1`:
1446	if ((cmp = frag2->xMax - frag1->xMax) == `0`) {
1447	cmp = frag1->yMin - frag2->yMin;
1448	}
1449	break;
1450	case `2`:
1451	if ((cmp = frag2->yMin - frag1->yMin) == `0`) {
1452	cmp = frag2->xMax - frag1->xMax;
1453	}
1454	break;
1455	case `3`:
1456	if ((cmp = frag1->xMax - frag2->xMax) == `0`) {
1457	cmp = frag2->yMax - frag1->yMax;
1458	}
1459	break;
1460	}
1461	return cmp < `0` ? -`1` : cmp > `0` ? `1` : `0`;
1462	}
1463
1464	int TextLineFrag::cmpXYLineRot(const void p1, const* void *p2)
1465	{
1466	TextLineFrag frag1 = (TextLineFrag )p1;
1467	TextLineFrag frag2 = (TextLineFrag )p2;
1468	double cmp;
1469
1470	cmp = `0`; // make gcc happy
1471	switch (frag1->line->rot) {
1472	case `0`:
1473	if ((cmp = frag1->xMin - frag2->xMin) == `0`) {
1474	cmp = frag1->yMin - frag2->yMin;
1475	}
1476	break;
1477	case `1`:
1478	if ((cmp = frag1->yMin - frag2->yMin) == `0`) {
1479	cmp = frag2->xMax - frag1->xMax;
1480	}
1481	break;
1482	case `2`:
1483	if ((cmp = frag2->xMax - frag1->xMax) == `0`) {
1484	cmp = frag2->yMin - frag1->yMin;
1485	}
1486	break;
1487	case `3`:
1488	if ((cmp = frag2->yMax - frag1->yMax) == `0`) {
1489	cmp = frag1->xMax - frag2->xMax;
1490	}
1491	break;
1492	}
1493	return cmp < `0` ? -`1` : cmp > `0` ? `1` : `0`;
1494	}
1495
1496	int TextLineFrag::cmpXYColumnPrimaryRot(const void p1, const* void *p2)
1497	{
1498	TextLineFrag frag1 = (TextLineFrag )p1;
1499	TextLineFrag frag2 = (TextLineFrag )p2;
1500	double cmp;
1501
1502	// if columns overlap, compare y values
1503	if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] - frag2->line->col[frag2->start]) && frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start])) {
1504	cmp = `0`; // make gcc happy
1505	switch (frag1->line->blk->page->primaryRot) {
1506	case `0`:
1507	cmp = frag1->yMin - frag2->yMin;
1508	break;
1509	case `1`:
1510	cmp = frag2->xMax - frag1->xMax;
1511	break;
1512	case `2`:
1513	cmp = frag2->yMin - frag1->yMin;
1514	break;
1515	case `3`:
1516	cmp = frag1->xMax - frag2->xMax;
1517	break;
1518	}
1519	return cmp < `0` ? -`1` : cmp > `0` ? `1` : `0`;
1520	}
1521
1522	// otherwise, compare starting column
1523	return frag1->col - frag2->col;
1524	}
1525
1526	int TextLineFrag::cmpXYColumnLineRot(const void p1, const* void *p2)
1527	{
1528	TextLineFrag frag1 = (TextLineFrag )p1;
1529	TextLineFrag frag2 = (TextLineFrag )p2;
1530	double cmp;
1531
1532	// if columns overlap, compare y values
1533	if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] - frag2->line->col[frag2->start]) && frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start])) {
1534	cmp = `0`; // make gcc happy
1535	switch (frag1->line->rot) {
1536	case `0`:
1537	cmp = frag1->yMin - frag2->yMin;
1538	break;
1539	case `1`:
1540	cmp = frag2->xMax - frag1->xMax;
1541	break;
1542	case `2`:
1543	cmp = frag2->yMin - frag1->yMin;
1544	break;
1545	case `3`:
1546	cmp = frag1->xMax - frag2->xMax;
1547	break;
1548	}
1549	return cmp < `0` ? -`1` : cmp > `0` ? `1` : `0`;
1550	}
1551
1552	// otherwise, compare starting column
1553	return frag1->col - frag2->col;
1554	}
1555
1556	//------------------------------------------------------------------------
1557	// TextBlock
1558	//------------------------------------------------------------------------
1559
1560	TextBlock::TextBlock(TextPage pageA, int* rotA)
1561	{
1562	page = pageA;
1563	rot = rotA;
1564	xMin = yMin = `0`;
1565	xMax = yMax = -`1`;
1566	priMin = `0`;
1567	priMax = page->pageWidth;
1568	pool = new TextPool ();
1569	lines = nullptr;
1570	curLine = nullptr;
1571	next = nullptr;
1572	stackNext = nullptr;
1573	tableId = -`1`;
1574	tableEnd = false;
1575	}
1576
1577	TextBlock::~TextBlock()
1578	{
1579	TextLine *line;
1580
1581	delete pool;
1582	while (lines) {
1583	line = lines;
1584	lines = lines->next;
1585	delete line;
1586	}
1587	}
1588
1589	void TextBlock::addWord(TextWord *word)
1590	{
1591	pool->addWord(word);
1592	if (xMin > xMax) {
1593	xMin = word->xMin;
1594	xMax = word->xMax;
1595	yMin = word->yMin;
1596	yMax = word->yMax;
1597	} else {
1598	if (word->xMin < xMin) {
1599	xMin = word->xMin;
1600	}
1601	if (word->xMax > xMax) {
1602	xMax = word->xMax;
1603	}
1604	if (word->yMin < yMin) {
1605	yMin = word->yMin;
1606	}
1607	if (word->yMax > yMax) {
1608	yMax = word->yMax;
1609	}
1610	}
1611	}
1612
1613	void TextBlock::coalesce(const UnicodeMap uMap, double* fixedPitch)
1614	{
1615	// discard duplicated text (fake boldface, drop shadows)
1616	for (int idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
1617	// Get the first LHS word from the pool
1618	TextWord *word0 = pool->getPool(baseIdx: idx0);
1619
1620	while (word0) {
1621	double priDelta = dupMaxPriDelta * word0->fontSize;
1622	double secDelta = dupMaxSecDelta * word0->fontSize;
1623	double xDelta = ((rot == `0`) \|\| (rot == `2`)) ? priDelta : secDelta;
1624	double yDelta = ((rot == `0`) \|\| (rot == `2`)) ? secDelta : priDelta;
1625
1626	int maxBaseIdx = pool->getBaseIdx(base: word0->base + secDelta);
1627
1628	for (int idx1 = idx0; idx1 <= maxBaseIdx; idx1++) {
1629	TextWord *prevWord;
1630	/ In case the RHS word is from the same pool as the LHS word,*
1631	* start the inner loop with the word following the LHS word.
1632	* Otherwise, start with the second word from the subsequent pools
1633	* - the first word is compared at the end.
1634	*/
1635	if (idx0 == idx1) {
1636	prevWord = word0;
1637	} else {
1638	prevWord = pool->getPool(baseIdx: idx1);
1639	if (!prevWord) {
1640	continue;
1641	}
1642	}
1643	TextWord *word1 = prevWord->next;
1644
1645	auto equalText = [](const TextWord &w1, const TextWord &w2) -> bool { //
1646	return std::equal(first1: w1.chars.begin(), last1: w1.chars.end(), first2: w2.chars.begin(), last2: w2.chars.end(), //
1647	binary_pred: [](auto c1, auto c2) { return c1.text == c2.text; });
1648	};
1649	auto match = [&equalText, xDelta, yDelta](const TextWord &w1, const TextWord &w2) -> bool {
1650	if (!equalText (w1, w2)) {
1651	return false;
1652	}
1653	return fabs(x: w1.xMin - w2.xMin) < xDelta && fabs(x: w1.xMax - w2.xMax) < xDelta //
1654	&& fabs(x: w1.yMin - w2.yMin) < yDelta && fabs(x: w1.yMax - w2.yMax) < yDelta;
1655	};
1656
1657	while (word1) {
1658	if (match (word0, word1)) {
1659	prevWord->next = word1->next;
1660	delete word1;
1661	word1 = prevWord->next;
1662	} else {
1663	prevWord = word1;
1664	word1 = word1->next;
1665	}
1666	}
1667
1668	// Check the first word from each subsequent pool
1669	if (idx0 != idx1) {
1670	word1 = pool->getPool(baseIdx: idx1);
1671	}
1672	if (word1 && match (word0, word1)) {
1673	pool->setPool(baseIdx: idx1, p: word1->next);
1674	delete word1;
1675	}
1676	}
1677
1678	word0 = word0->next;
1679	}
1680	}
1681
1682	TextWord word0, word1;
1683	TextWord bestWord0, bestWord1, *lastWord;
1684	TextLine line, line0, *line1;
1685	TextLine **lineArray;
1686	int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
1687	int baseIdx, bestWordBaseIdx;
1688	double minBase, maxBase;
1689	double fontSize, wordSpacing, delta;
1690	bool overlap;
1691	int col1, col2;
1692	int i, j, k;
1693
1694	// build the lines
1695	curLine = nullptr;
1696	poolMinBaseIdx = pool->minBaseIdx;
1697	charCount = `0`;
1698	nLines = `0`;
1699	while (true) {
1700
1701	// find the first non-empty line in the pool
1702	for (; poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(baseIdx: poolMinBaseIdx); ++poolMinBaseIdx) {
1703	;
1704	}
1705	if (poolMinBaseIdx > pool->maxBaseIdx) {
1706	break;
1707	}
1708
1709	// look for the left-most word in the first four lines of the
1710	// pool -- this avoids starting with a superscript word
1711	startBaseIdx = poolMinBaseIdx;
1712	for (baseIdx = poolMinBaseIdx + `1`; baseIdx < poolMinBaseIdx + `4` && baseIdx <= pool->maxBaseIdx; ++baseIdx) {
1713	if (!pool->getPool(baseIdx)) {
1714	continue;
1715	}
1716	if (pool->getPool(baseIdx)->primaryCmp(word: pool->getPool(baseIdx: startBaseIdx)) < `0`) {
1717	startBaseIdx = baseIdx;
1718	}
1719	}
1720
1721	// create a new line
1722	word0 = pool->getPool(baseIdx: startBaseIdx);
1723	pool->setPool(baseIdx: startBaseIdx, p: word0->next);
1724	word0->next = nullptr;
1725	line = new TextLine (this, word0->rot, word0->base);
1726	line->addWord(word: word0);
1727	lastWord = word0;
1728
1729	// compute the search range
1730	fontSize = word0->fontSize;
1731	minBase = word0->base - maxIntraLineDelta * fontSize;
1732	maxBase = word0->base + maxIntraLineDelta * fontSize;
1733	minBaseIdx = pool->getBaseIdx(base: minBase);
1734	maxBaseIdx = pool->getBaseIdx(base: maxBase);
1735	wordSpacing = fixedPitch ? fixedPitch : maxWordSpacing * fontSize;
1736
1737	// find the rest of the words in this line
1738	while (true) {
1739
1740	// find the left-most word whose baseline is in the range for
1741	// this line
1742	bestWordBaseIdx = `0`;
1743	bestWord0 = bestWord1 = nullptr;
1744	overlap = false;
1745	for (baseIdx = minBaseIdx; !overlap && baseIdx <= maxBaseIdx; ++baseIdx) {
1746	for (word0 = nullptr, word1 = pool->getPool(baseIdx); word1; word0 = word1, word1 = word1->next) {
1747	if (word1->base >= minBase && word1->base <= maxBase) {
1748	delta = lastWord->primaryDelta(word: word1);
1749	if (delta < minCharSpacing * fontSize) {
1750	overlap = true;
1751	break;
1752	} else {
1753	if (delta < wordSpacing && (!bestWord1 \|\| word1->primaryCmp(word: bestWord1) < `0`)) {
1754	bestWordBaseIdx = baseIdx;
1755	bestWord0 = word0;
1756	bestWord1 = word1;
1757	}
1758	break;
1759	}
1760	}
1761	}
1762	}
1763	if (overlap \|\| !bestWord1) {
1764	break;
1765	}
1766
1767	// remove it from the pool, and add it to the line
1768	if (bestWord0) {
1769	bestWord0->next = bestWord1->next;
1770	} else {
1771	pool->setPool(baseIdx: bestWordBaseIdx, p: bestWord1->next);
1772	}
1773	bestWord1->next = nullptr;
1774	line->addWord(word: bestWord1);
1775	lastWord = bestWord1;
1776	}
1777
1778	// add the line
1779	if (curLine && line->cmpYX(line: curLine) > `0`) {
1780	line0 = curLine;
1781	line1 = curLine->next;
1782	} else {
1783	line0 = nullptr;
1784	line1 = lines;
1785	}
1786	for (; line1 && line->cmpYX(line: line1) > `0`; line0 = line1, line1 = line1->next) {
1787	;
1788	}
1789	if (line0) {
1790	line0->next = line;
1791	} else {
1792	lines = line;
1793	}
1794	line->next = line1;
1795	curLine = line;
1796	line->coalesce(uMap);
1797	charCount += line->len;
1798	++nLines;
1799	}
1800
1801	// sort lines into xy order for column assignment
1802	lineArray = (TextLine )gmallocn(count: nLines, size: sizeof*(TextLine ));
1803	for (line = lines, i = `0`; line; line = line->next, ++i) {
1804	lineArray[i] = line;
1805	}
1806	qsort(base: lineArray, nmemb: nLines, size: sizeof(TextLine *), compar: &TextLine::cmpXY);
1807
1808	// column assignment
1809	nColumns = `0`;
1810	if (fixedPitch) {
1811	for (i = `0`; i < nLines; ++i) {
1812	line0 = lineArray[i];
1813	col1 = `0`; // make gcc happy
1814	switch (rot) {
1815	case `0`:
1816	col1 = (int)((line0->xMin - xMin) / fixedPitch + `0.5`);
1817	break;
1818	case `1`:
1819	col1 = (int)((line0->yMin - yMin) / fixedPitch + `0.5`);
1820	break;
1821	case `2`:
1822	col1 = (int)((xMax - line0->xMax) / fixedPitch + `0.5`);
1823	break;
1824	case `3`:
1825	col1 = (int)((yMax - line0->yMax) / fixedPitch + `0.5`);
1826	break;
1827	}
1828	for (k = `0`; k <= line0->len; ++k) {
1829	line0->col[k] += col1;
1830	}
1831	if (line0->col[line0->len] > nColumns) {
1832	nColumns = line0->col[line0->len];
1833	}
1834	}
1835	} else {
1836	for (i = `0`; i < nLines; ++i) {
1837	line0 = lineArray[i];
1838	col1 = `0`;
1839	for (j = `0`; j < i; ++j) {
1840	line1 = lineArray[j];
1841	if (line1->primaryDelta(line: line0) >= `0`) {
1842	col2 = line1->col[line1->len] + `1`;
1843	} else {
1844	k = `0`; // make gcc happy
1845	switch (rot) {
1846	case `0`:
1847	for (k = `0`; k < line1->len && line0->xMin >= `0.5` * (line1->edge[k] + line1->edge[k + `1`]); ++k) {
1848	;
1849	}
1850	break;
1851	case `1`:
1852	for (k = `0`; k < line1->len && line0->yMin >= `0.5` * (line1->edge[k] + line1->edge[k + `1`]); ++k) {
1853	;
1854	}
1855	break;
1856	case `2`:
1857	for (k = `0`; k < line1->len && line0->xMax <= `0.5` * (line1->edge[k] + line1->edge[k + `1`]); ++k) {
1858	;
1859	}
1860	break;
1861	case `3`:
1862	for (k = `0`; k < line1->len && line0->yMax <= `0.5` * (line1->edge[k] + line1->edge[k + `1`]); ++k) {
1863	;
1864	}
1865	break;
1866	}
1867	col2 = line1->col[k];
1868	}
1869	if (col2 > col1) {
1870	col1 = col2;
1871	}
1872	}
1873	for (k = `0`; k <= line0->len; ++k) {
1874	line0->col[k] += col1;
1875	}
1876	if (line0->col[line0->len] > nColumns) {
1877	nColumns = line0->col[line0->len];
1878	}
1879	}
1880	}
1881	gfree(p: lineArray);
1882	}
1883
1884	void TextBlock::updatePriMinMax(const TextBlock *blk)
1885	{
1886	double newPriMin, newPriMax;
1887	bool gotPriMin, gotPriMax;
1888
1889	gotPriMin = gotPriMax = false;
1890	newPriMin = newPriMax = `0`; // make gcc happy
1891	switch (page->primaryRot) {
1892	case `0`:
1893	case `2`:
1894	if (blk->yMin < yMax && blk->yMax > yMin) {
1895	if (blk->xMin < xMin) {
1896	newPriMin = blk->xMax;
1897	gotPriMin = true;
1898	}
1899	if (blk->xMax > xMax) {
1900	newPriMax = blk->xMin;
1901	gotPriMax = true;
1902	}
1903	}
1904	break;
1905	case `1`:
1906	case `3`:
1907	if (blk->xMin < xMax && blk->xMax > xMin) {
1908	if (blk->yMin < yMin) {
1909	newPriMin = blk->yMax;
1910	gotPriMin = true;
1911	}
1912	if (blk->yMax > yMax) {
1913	newPriMax = blk->yMin;
1914	gotPriMax = true;
1915	}
1916	}
1917	break;
1918	}
1919	if (gotPriMin) {
1920	if (newPriMin > xMin) {
1921	newPriMin = xMin;
1922	}
1923	if (newPriMin > priMin) {
1924	priMin = newPriMin;
1925	}
1926	}
1927	if (gotPriMax) {
1928	if (newPriMax < xMax) {
1929	newPriMax = xMax;
1930	}
1931	if (newPriMax < priMax) {
1932	priMax = newPriMax;
1933	}
1934	}
1935	}
1936
1937	int TextBlock::cmpXYPrimaryRot(const void p1, const* void *p2)
1938	{
1939	TextBlock blk1 = (TextBlock **)p1;
1940	TextBlock blk2 = (TextBlock **)p2;
1941	double cmp;
1942
1943	cmp = `0`; // make gcc happy
1944	switch (blk1->page->primaryRot) {
1945	case `0`:
1946	if ((cmp = blk1->xMin - blk2->xMin) == `0`) {
1947	cmp = blk1->yMin - blk2->yMin;
1948	}
1949	break;
1950	case `1`:
1951	if ((cmp = blk1->yMin - blk2->yMin) == `0`) {
1952	cmp = blk2->xMax - blk1->xMax;
1953	}
1954	break;
1955	case `2`:
1956	if ((cmp = blk2->xMax - blk1->xMax) == `0`) {
1957	cmp = blk2->yMin - blk1->yMin;
1958	}
1959	break;
1960	case `3`:
1961	if ((cmp = blk2->yMax - blk1->yMax) == `0`) {
1962	cmp = blk1->xMax - blk2->xMax;
1963	}
1964	break;
1965	}
1966	return cmp < `0` ? -`1` : cmp > `0` ? `1` : `0`;
1967	}
1968
1969	int TextBlock::cmpYXPrimaryRot(const void p1, const* void *p2)
1970	{
1971	TextBlock blk1 = (TextBlock **)p1;
1972	TextBlock blk2 = (TextBlock **)p2;
1973	double cmp;
1974
1975	cmp = `0`; // make gcc happy
1976	switch (blk1->page->primaryRot) {
1977	case `0`:
1978	if ((cmp = blk1->yMin - blk2->yMin) == `0`) {
1979	cmp = blk1->xMin - blk2->xMin;
1980	}
1981	break;
1982	case `1`:
1983	if ((cmp = blk2->xMax - blk1->xMax) == `0`) {
1984	cmp = blk1->yMin - blk2->yMin;
1985	}
1986	break;
1987	case `2`:
1988	if ((cmp = blk2->yMin - blk1->yMin) == `0`) {
1989	cmp = blk2->xMax - blk1->xMax;
1990	}
1991	break;
1992	case `3`:
1993	if ((cmp = blk1->xMax - blk2->xMax) == `0`) {
1994	cmp = blk2->yMax - blk1->yMax;
1995	}
1996	break;
1997	}
1998	return cmp < `0` ? -`1` : cmp > `0` ? `1` : `0`;
1999	}
2000
2001	int TextBlock::primaryCmp(const TextBlock blk) const*
2002	{
2003	double cmp;
2004
2005	cmp = `0`; // make gcc happy
2006	switch (rot) {
2007	case `0`:
2008	cmp = xMin - blk->xMin;
2009	break;
2010	case `1`:
2011	cmp = yMin - blk->yMin;
2012	break;
2013	case `2`:
2014	cmp = blk->xMax - xMax;
2015	break;
2016	case `3`:
2017	cmp = blk->yMax - yMax;
2018	break;
2019	}
2020	return cmp < `0` ? -`1` : cmp > `0` ? `1` : `0`;
2021	}
2022
2023	double TextBlock::secondaryDelta(const TextBlock blk) const*
2024	{
2025	double delta;
2026
2027	delta = `0`; // make gcc happy
2028	switch (rot) {
2029	case `0`:
2030	delta = blk->yMin - yMax;
2031	break;
2032	case `1`:
2033	delta = xMin - blk->xMax;
2034	break;
2035	case `2`:
2036	delta = yMin - blk->yMax;
2037	break;
2038	case `3`:
2039	delta = blk->xMin - xMax;
2040	break;
2041	}
2042	return delta;
2043	}
2044
2045	bool TextBlock::isBelow(const TextBlock blk) const*
2046	{
2047	bool below;
2048
2049	below = false; // make gcc happy
2050	switch (page->primaryRot) {
2051	case `0`:
2052	below = xMin >= blk->priMin && xMax <= blk->priMax && yMin > blk->yMin;
2053	break;
2054	case `1`:
2055	below = yMin >= blk->priMin && yMax <= blk->priMax && xMax < blk->xMax;
2056	break;
2057	case `2`:
2058	below = xMin >= blk->priMin && xMax <= blk->priMax && yMax < blk->yMax;
2059	break;
2060	case `3`:
2061	below = yMin >= blk->priMin && yMax <= blk->priMax && xMin > blk->xMin;
2062	break;
2063	}
2064
2065	return below;
2066	}
2067
2068	bool TextBlock::isBeforeByRule1(const TextBlock *blk1)
2069	{
2070	bool before = false;
2071	bool overlap = false;
2072
2073	switch (this->page->primaryRot) {
2074	case `0`:
2075	case `2`:
2076	overlap = ((this->ExMin <= blk1->ExMin) && (blk1->ExMin <= this->ExMax)) \|\| ((blk1->ExMin <= this->ExMin) && (this->ExMin <= blk1->ExMax));
2077	break;
2078	case `1`:
2079	case `3`:
2080	overlap = ((this->EyMin <= blk1->EyMin) && (blk1->EyMin <= this->EyMax)) \|\| ((blk1->EyMin <= this->EyMin) && (this->EyMin <= blk1->EyMax));
2081	break;
2082	}
2083	switch (this->page->primaryRot) {
2084	case `0`:
2085	before = overlap && this->EyMin < blk1->EyMin;
2086	break;
2087	case `1`:
2088	before = overlap && this->ExMax > blk1->ExMax;
2089	break;
2090	case `2`:
2091	before = overlap && this->EyMax > blk1->EyMax;
2092	break;
2093	case `3`:
2094	before = overlap && this->ExMin < blk1->ExMin;
2095	break;
2096	}
2097	return before;
2098	}
2099
2100	bool TextBlock::isBeforeByRule2(const TextBlock *blk1)
2101	{
2102	double cmp = `0`;
2103	int rotLR = rot;
2104
2105	if (!page->primaryLR) {
2106	rotLR = (rotLR + `2`) % `4`;
2107	}
2108
2109	switch (rotLR) {
2110	case `0`:
2111	cmp = ExMax - blk1->ExMin;
2112	break;
2113	case `1`:
2114	cmp = EyMin - blk1->EyMax;
2115	break;
2116	case `2`:
2117	cmp = blk1->ExMax - ExMin;
2118	break;
2119	case `3`:
2120	cmp = blk1->EyMin - EyMax;
2121	break;
2122	}
2123	return cmp <= `0`;
2124	}
2125
2126	// Sort into reading order by performing a topological sort using the rules
2127	// given in "High Performance Document Layout Analysis", T.M. Breuel, 2003.
2128	// See http://pubs.iupr.org/#2003-breuel-sdiut
2129	// Topological sort is done by depth first search, see
2130	// http://en.wikipedia.org/wiki/Topological_sorting
2131	int TextBlock::visitDepthFirst(TextBlock blkList, int* pos1, TextBlock *sorted, int* sortPos, bool visited, TextBlock cache, int* cacheSize)
2132	{
2133	int pos2;
2134	TextBlock blk1, blk2, *blk3;
2135	bool before;
2136
2137	if (visited[pos1]) {
2138	return sortPos;
2139	}
2140
2141	blk1 = this;
2142
2143	#if 0 // for debugging
2144	printf("visited: %d %.2f..%.2f %.2f..%.2f\n",
2145	sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2146	#endif
2147	visited[pos1] = true;
2148	pos2 = -`1`;
2149	for (blk2 = blkList; blk2; blk2 = blk2->next) {
2150	pos2++;
2151	if (visited[pos2]) {
2152	// skip visited nodes
2153	continue;
2154	}
2155	before = false;
2156
2157	// is blk2 before blk1? (for table entries)
2158	if (blk1->tableId >= `0` && blk1->tableId == blk2->tableId) {
2159	if (page->primaryLR) {
2160	if (blk2->xMax <= blk1->xMin && blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin) {
2161	before = true;
2162	}
2163	} else {
2164	if (blk2->xMin >= blk1->xMax && blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin) {
2165	before = true;
2166	}
2167	}
2168
2169	if (blk2->yMax <= blk1->yMin) {
2170	before = true;
2171	}
2172	} else {
2173	if (blk2->isBeforeByRule1(blk1)) {
2174	// Rule (1) blk1 and blk2 overlap, and blk2 is above blk1.
2175	before = true;
2176	#if 0 // for debugging
2177	printf("rule1: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
2178	blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax,
2179	blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2180	#endif
2181	} else if (blk2->isBeforeByRule2(blk1)) {
2182	// Rule (2) blk2 left of blk1, and no intervening blk3
2183	// such that blk1 is before blk3 by rule 1,
2184	// and blk3 is before blk2 by rule 1.
2185	before = true;
2186	for (int i = `0`; i < cacheSize && cache[i]; ++i) {
2187	if (blk1->isBeforeByRule1(blk1: cache[i]) && cache[i]->isBeforeByRule1(blk1: blk2)) {
2188	before = false;
2189	std::rotate(first: cache, middle: cache + i, last: cache + i + `1`);
2190	break;
2191	}
2192	}
2193
2194	if (before) {
2195	for (blk3 = blkList; blk3; blk3 = blk3->next) {
2196	if (blk3 == blk2 \|\| blk3 == blk1) {
2197	continue;
2198	}
2199	if (blk1->isBeforeByRule1(blk1: blk3) && blk3->isBeforeByRule1(blk1: blk2)) {
2200	before = false;
2201	std::copy_backward(first: cache, last: cache + cacheSize - `1`, result: cache + cacheSize);
2202	cache[`0`] = blk3;
2203	break;
2204	}
2205	}
2206	}
2207	#if 0 // for debugging
2208	if (before) {
2209	printf("rule2: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
2210	blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax,
2211	blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax);
2212	}
2213	#endif
2214	}
2215	}
2216	if (before) {
2217	// blk2 is before blk1, so it needs to be visited
2218	// before we can add blk1 to the sorted list.
2219	sortPos = blk2->visitDepthFirst(blkList, pos1: pos2, sorted, sortPos, visited, cache, cacheSize);
2220	}
2221	}
2222	#if 0 // for debugging
2223	printf("sorted: %d %.2f..%.2f %.2f..%.2f\n",
2224	sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2225	#endif
2226	sorted[sortPos++] = blk1;
2227	return sortPos;
2228	}
2229
2230	int TextBlock::visitDepthFirst(TextBlock blkList, int* pos1, TextBlock *sorted, int* sortPos, bool *visited)
2231	{
2232	const int blockCacheSize = `4`;
2233	TextBlock *blockCache[blockCacheSize];
2234	std::fill(first: blockCache, last: blockCache + blockCacheSize, value: nullptr);
2235	return visitDepthFirst(blkList, pos1, sorted, sortPos, visited, cache: blockCache, cacheSize: blockCacheSize);
2236	}
2237
2238	//------------------------------------------------------------------------
2239	// TextFlow
2240	//------------------------------------------------------------------------
2241
2242	TextFlow::TextFlow(TextPage pageA, TextBlock blk)
2243	{
2244	page = pageA;
2245	xMin = blk->xMin;
2246	xMax = blk->xMax;
2247	yMin = blk->yMin;
2248	yMax = blk->yMax;
2249	priMin = blk->priMin;
2250	priMax = blk->priMax;
2251	blocks = lastBlk = blk;
2252	next = nullptr;
2253	}
2254
2255	TextFlow::~TextFlow()
2256	{
2257	TextBlock *blk;
2258
2259	while (blocks) {
2260	blk = blocks;
2261	blocks = blocks->next;
2262	delete blk;
2263	}
2264	}
2265
2266	void TextFlow::addBlock(TextBlock *blk)
2267	{
2268	if (lastBlk) {
2269	lastBlk->next = blk;
2270	} else {
2271	blocks = blk;
2272	}
2273	lastBlk = blk;
2274	if (blk->xMin < xMin) {
2275	xMin = blk->xMin;
2276	}
2277	if (blk->xMax > xMax) {
2278	xMax = blk->xMax;
2279	}
2280	if (blk->yMin < yMin) {
2281	yMin = blk->yMin;
2282	}
2283	if (blk->yMax > yMax) {
2284	yMax = blk->yMax;
2285	}
2286	}
2287
2288	bool TextFlow::blockFits(const TextBlock blk, const* TextBlock prevBlk) const*
2289	{
2290	bool fits;
2291
2292	// lower blocks must use smaller fonts
2293	if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
2294	return false;
2295	}
2296
2297	fits = false; // make gcc happy
2298	switch (page->primaryRot) {
2299	case `0`:
2300	fits = blk->xMin >= priMin && blk->xMax <= priMax;
2301	break;
2302	case `1`:
2303	fits = blk->yMin >= priMin && blk->yMax <= priMax;
2304	break;
2305	case `2`:
2306	fits = blk->xMin >= priMin && blk->xMax <= priMax;
2307	break;
2308	case `3`:
2309	fits = blk->yMin >= priMin && blk->yMax <= priMax;
2310	break;
2311	}
2312	return fits;
2313	}
2314
2315	#ifdef TEXTOUT_WORD_LIST
2316
2317	//------------------------------------------------------------------------
2318	// TextWordList
2319	//------------------------------------------------------------------------
2320
2321	TextWordList::TextWordList(const TextPage text, bool* physLayout)
2322	{
2323	TextFlow *flow;
2324	TextBlock *blk;
2325	TextLine *line;
2326	TextWord *word;
2327	TextWord **wordArray;
2328	int nWords, i;
2329
2330	if (text->rawOrder) {
2331	for (word = text->rawWords; word; word = word->next) {
2332	words.push_back(x: word);
2333	}
2334
2335	} else if (physLayout) {
2336	// this is inefficient, but it's also the least useful of these
2337	// three cases
2338	nWords = `0`;
2339	for (flow = text->flows; flow; flow = flow->next) {
2340	for (blk = flow->blocks; blk; blk = blk->next) {
2341	for (line = blk->lines; line; line = line->next) {
2342	for (word = line->words; word; word = word->next) {
2343	++nWords;
2344	}
2345	}
2346	}
2347	}
2348	wordArray = (TextWord )gmallocn(count: nWords, size: sizeof*(TextWord ));
2349	i = `0`;
2350	for (flow = text->flows; flow; flow = flow->next) {
2351	for (blk = flow->blocks; blk; blk = blk->next) {
2352	for (line = blk->lines; line; line = line->next) {
2353	for (word = line->words; word; word = word->next) {
2354	wordArray[i++] = word;
2355	}
2356	}
2357	}
2358	}
2359	qsort(base: wordArray, nmemb: nWords, size: sizeof(TextWord *), compar: &TextWord::cmpYX);
2360	for (i = `0`; i < nWords; ++i) {
2361	words.push_back(x: wordArray[i]);
2362	}
2363	gfree(p: wordArray);
2364
2365	} else {
2366	for (flow = text->flows; flow; flow = flow->next) {
2367	for (blk = flow->blocks; blk; blk = blk->next) {
2368	for (line = blk->lines; line; line = line->next) {
2369	for (word = line->words; word; word = word->next) {
2370	words.push_back(x: word);
2371	}
2372	}
2373	}
2374	}
2375	}
2376	}
2377
2378	TextWordList::~TextWordList() { }
2379
2380	int TextWordList::getLength() const
2381	{
2382	return words.size();
2383	}
2384
2385	TextWord TextWordList::get(int* idx)
2386	{
2387	if (idx < `0` \|\| idx >= (int)words.size()) {
2388	return nullptr;
2389	}
2390	return words [idx];
2391	}
2392
2393	#endif // TEXTOUT_WORD_LIST
2394
2395	//------------------------------------------------------------------------
2396	// TextPage
2397	//------------------------------------------------------------------------
2398
2399	TextPage::TextPage(bool rawOrderA, bool discardDiagA)
2400	{
2401	int rot;
2402
2403	refCnt = `1`;
2404	rawOrder = rawOrderA;
2405	discardDiag = discardDiagA;
2406	curWord = nullptr;
2407	charPos = `0`;
2408	curFont = nullptr;
2409	curFontSize = `0`;
2410	nest = `0`;
2411	nTinyChars = `0`;
2412	lastCharOverlap = false;
2413	if (!rawOrder) {
2414	for (rot = `0`; rot < `4`; ++rot) {
2415	pools[rot] = std::make_unique<TextPool>();
2416	}
2417	}
2418	flows = nullptr;
2419	blocks = nullptr;
2420	rawWords = nullptr;
2421	rawLastWord = nullptr;
2422	lastFindXMin = lastFindYMin = `0`;
2423	haveLastFind = false;
2424	mergeCombining = true;
2425	diagonal = false;
2426	}
2427
2428	TextPage::~TextPage()
2429	{
2430	clear();
2431	}
2432
2433	void TextPage::incRefCnt()
2434	{
2435	refCnt++;
2436	}
2437
2438	void TextPage::decRefCnt()
2439	{
2440	if (--refCnt == `0`) {
2441	delete this;
2442	}
2443	}
2444
2445	void TextPage::startPage(const GfxState *state)
2446	{
2447	clear();
2448	if (state) {
2449	pageWidth = state->getPageWidth();
2450	pageHeight = state->getPageHeight();
2451	} else {
2452	pageWidth = pageHeight = `0`;
2453	}
2454	}
2455
2456	void TextPage::endPage()
2457	{
2458	if (curWord) {
2459	endWord();
2460	}
2461	}
2462
2463	void TextPage::clear()
2464	{
2465	int rot;
2466	TextFlow *flow;
2467	TextWord *word;
2468
2469	if (curWord) {
2470	delete curWord;
2471	curWord = nullptr;
2472	}
2473	if (rawOrder) {
2474	while (rawWords) {
2475	word = rawWords;
2476	rawWords = rawWords->next;
2477	delete word;
2478	}
2479	} else {
2480	for (rot = `0`; rot < `4`; ++rot) {
2481	pools[rot] = std::make_unique<TextPool>();
2482	}
2483	while (flows) {
2484	flow = flows;
2485	flows = flows->next;
2486	delete flow;
2487	}
2488	gfree(p: blocks);
2489	}
2490	fonts.clear();
2491	underlines.clear();
2492	links.clear();
2493
2494	diagonal = false;
2495	curWord = nullptr;
2496	charPos = `0`;
2497	curFont = nullptr;
2498	curFontSize = `0`;
2499	nest = `0`;
2500	nTinyChars = `0`;
2501	flows = nullptr;
2502	blocks = nullptr;
2503	rawWords = nullptr;
2504	rawLastWord = nullptr;
2505	}
2506
2507	void TextPage::updateFont(const GfxState *state)
2508	{
2509	const double *fm;
2510	const char *name;
2511	int code, mCode, letterCode, anyCode;
2512	double w;
2513
2514	// get the font info object
2515	curFont = nullptr;
2516	for (const std::unique_ptr<TextFontInfo> &f : fonts) {
2517	if (f ->matches(state)) {
2518	curFont = f.get();
2519	break;
2520	}
2521	}
2522	if (!curFont) {
2523	fonts.emplace_back(args: std::make_unique<TextFontInfo>(args&: state));
2524	curFont = fonts.back().get();
2525	}
2526
2527	// adjust the font size
2528	GfxFont *const gfxFont = state->getFont().get();
2529	curFontSize = state->getTransformedFontSize();
2530	if (gfxFont && gfxFont->getType() == fontType3) {
2531	// This is a hack which makes it possible to deal with some Type 3
2532	// fonts. The problem is that it's impossible to know what the
2533	// base coordinate system used in the font is without actually
2534	// rendering the font. This code tries to guess by looking at the
2535	// width of the character 'm' (which breaks if the font is a
2536	// subset that doesn't contain 'm').
2537	mCode = letterCode = anyCode = -`1`;
2538	for (code = `0`; code < `256`; ++code) {
2539	name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
2540	int nameLen = name ? strlen(s: name) : `0`;
2541	bool nameOneChar = nameLen == `1` \|\| (nameLen > `1` && name[`1`] == `'\0'`);
2542	if (nameOneChar && name[`0`] == `'m'`) {
2543	mCode = code;
2544	}
2545	if (letterCode < `0` && nameOneChar && ((name[`0`] >= `'A'` && name[`0`] <= `'Z'`) \|\| (name[`0`] >= `'a'` && name[`0`] <= `'z'`))) {
2546	letterCode = code;
2547	}
2548	if (anyCode < `0` && name && ((Gfx8BitFont *)gfxFont)->getWidth(c: code) > `0`) {
2549	anyCode = code;
2550	}
2551	}
2552	if (mCode >= `0` && (w = ((Gfx8BitFont *)gfxFont)->getWidth(c: mCode)) > `0`) {
2553	// 0.6 is a generic average 'm' width -- yes, this is a hack
2554	curFontSize *= w / `0.6`;
2555	} else if (letterCode >= `0` && (w = ((Gfx8BitFont *)gfxFont)->getWidth(c: letterCode)) > `0`) {
2556	// even more of a hack: 0.5 is a generic letter width
2557	curFontSize *= w / `0.5`;
2558	} else if (anyCode >= `0` && (w = ((Gfx8BitFont *)gfxFont)->getWidth(c: anyCode)) > `0`) {
2559	// better than nothing: 0.5 is a generic character width
2560	curFontSize *= w / `0.5`;
2561	}
2562	fm = gfxFont->getFontMatrix();
2563	if (fm[`0`] != `0`) {
2564	curFontSize *= fabs(x: fm[`3`] / fm[`0`]);
2565	}
2566	}
2567	}
2568
2569	void TextPage::beginWord(const GfxState *state)
2570	{
2571	const double *fontm;
2572	double m[`4`], m2[`4`];
2573	int rot;
2574
2575	// This check is needed because Type 3 characters can contain
2576	// text-drawing operations (when TextPage is being used via
2577	// {X,Win}SplashOutputDev rather than TextOutputDev).
2578	if (curWord) {
2579	++nest;
2580	return;
2581	}
2582
2583	// compute the rotation
2584	state->getFontTransMat(m11: &m[`0`], m12: &m[`1`], m21: &m[`2`], m22: &m[`3`]);
2585	std::shared_ptr<GfxFont> gfxFont = state->getFont();
2586	if (gfxFont && gfxFont ->getType() == fontType3) {
2587	fontm = state->getFont()->getFontMatrix();
2588	m2[`0`] = fontm[`0`] * m[`0`] + fontm[`1`] * m[`2`];
2589	m2[`1`] = fontm[`0`] * m[`1`] + fontm[`1`] * m[`3`];
2590	m2[`2`] = fontm[`2`] * m[`0`] + fontm[`3`] * m[`2`];
2591	m2[`3`] = fontm[`2`] * m[`1`] + fontm[`3`] * m[`3`];
2592	m[`0`] = m2[`0`];
2593	m[`1`] = m2[`1`];
2594	m[`2`] = m2[`2`];
2595	m[`3`] = m2[`3`];
2596	}
2597	if (fabs(x: m[`0`] * m[`3`]) > fabs(x: m[`1`] * m[`2`])) {
2598	rot = (m[`0`] > `0` \|\| m[`3`] < `0`) ? `0` : `2`;
2599	} else {
2600	rot = (m[`2`] > `0`) ? `1` : `3`;
2601	}
2602	if (fabs(x: m[`0`]) >= fabs(x: m[`1`])) {
2603	diagonal = fabs(x: m[`1`]) > diagonalThreshold * fabs(x: m[`0`]);
2604	} else {
2605	diagonal = fabs(x: m[`0`]) > diagonalThreshold * fabs(x: m[`1`]);
2606	}
2607
2608	// for vertical writing mode, the lines are effectively rotated 90
2609	// degrees
2610	if (gfxFont && gfxFont ->getWMode()) {
2611	rot = (rot + `1`) & `3`;
2612	}
2613
2614	curWord = new TextWord (state, rot, curFontSize);
2615	}
2616
2617	void TextPage::addChar(const GfxState state, double* x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode u, int* uLen)
2618	{
2619	double x1, y1, w1, h1, dx2, dy2, base, sp, delta;
2620	bool overlap;
2621	int i;
2622	int wMode;
2623	Matrix mat;
2624
2625	// subtract char and word spacing from the dx,dy values
2626	sp = state->getCharSpace();
2627	if (c == (CharCode)`0x20`) {
2628	sp += state->getWordSpace();
2629	}
2630	state->textTransformDelta(x1: sp * state->getHorizScaling(), y1: `0`, x2: &dx2, y2: &dy2);
2631	dx -= dx2;
2632	dy -= dy2;
2633	state->transformDelta(x1: dx, y1: dy, x2: &w1, y2: &h1);
2634
2635	// throw away chars that aren't inside the page bounds
2636	// (and also do a sanity check on the character size)
2637	state->transform(x1: x, y1: y, x2: &x1, y2: &y1);
2638	if (x1 + w1 < `0` \|\| x1 > pageWidth \|\| y1 + h1 < `0` \|\| y1 > pageHeight \|\| std::isnan(x: x1) \|\| std::isnan(x: y1) \|\| std::isnan(x: w1) \|\| std::isnan(x: h1)) {
2639	charPos += nBytes;
2640	return;
2641	}
2642
2643	// check the tiny chars limit
2644	if (fabs(x: w1) < `3` && fabs(x: h1) < `3`) {
2645	if (++nTinyChars > `50000`) {
2646	charPos += nBytes;
2647	return;
2648	}
2649	}
2650
2651	// break words at space character
2652	if (uLen == `1` && UnicodeIsWhitespace(ucs4: u[`0`])) {
2653	charPos += nBytes;
2654	endWord();
2655	return;
2656	} else if (uLen == `1` && u[`0`] == (Unicode)`0x0`) {
2657	// ignore null characters
2658	charPos += nBytes;
2659	return;
2660	}
2661
2662	state->getFontTransMat(m11: &mat.m[`0`], m12: &mat.m[`1`], m21: &mat.m[`2`], m22: &mat.m[`3`]);
2663	mat.m[`0`] *= state->getHorizScaling();
2664	mat.m[`1`] *= state->getHorizScaling();
2665	mat.m[`4`] = x1;
2666	mat.m[`5`] = y1;
2667
2668	if (mergeCombining && curWord && uLen == `1` && curWord->addCombining(state, fontA: curFont, fontSizeA: curFontSize, x: x1, y: y1, dx: w1, dy: h1, charPosA: charPos, charLen: nBytes, c, u: u[`0`], textMatA: mat)) {
2669	charPos += nBytes;
2670	return;
2671	}
2672
2673	// start a new word if:
2674	// (1) this character doesn't fall in the right place relative to
2675	// the end of the previous word (this places upper and lower
2676	// constraints on the position deltas along both the primary
2677	// and secondary axes), or
2678	// (2) this character overlaps the previous one (duplicated text), or
2679	// (3) the previous character was an overlap (we want each duplicated
2680	// character to be in a word by itself at this stage),
2681	// (4) the font size has changed
2682	// (5) the WMode changed
2683	if (curWord && curWord->len() > `0`) {
2684	base = sp = delta = `0`; // make gcc happy
2685	switch (curWord->rot) {
2686	case `0`:
2687	base = y1;
2688	sp = x1 - curWord->xMax;
2689	delta = x1 - curWord->chars.back().edge;
2690	break;
2691	case `1`:
2692	base = x1;
2693	sp = y1 - curWord->yMax;
2694	delta = y1 - curWord->chars.back().edge;
2695	break;
2696	case `2`:
2697	base = y1;
2698	sp = curWord->xMin - x1;
2699	delta = curWord->chars.back().edge - x1;
2700	break;
2701	case `3`:
2702	base = x1;
2703	sp = curWord->yMin - y1;
2704	delta = curWord->chars.back().edge - y1;
2705	break;
2706	}
2707	overlap = fabs(x: delta) < dupMaxPriDelta * curWord->fontSize && fabs(x: base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
2708	wMode = curFont->getWMode();
2709	if (overlap \|\| lastCharOverlap \|\| sp < -minDupBreakOverlap * curWord->fontSize \|\| sp > minWordBreakSpace * curWord->fontSize \|\| fabs(x: base - curWord->base) > `0.5` \|\| curFontSize != curWord->fontSize \|\| wMode != curWord->wMode) {
2710	endWord();
2711	}
2712	lastCharOverlap = overlap;
2713	} else {
2714	lastCharOverlap = false;
2715	}
2716
2717	if (uLen != `0`) {
2718	// start a new word if needed
2719	if (!curWord) {
2720	beginWord(state);
2721	}
2722
2723	// throw away diagonal chars
2724	if (discardDiag && diagonal) {
2725	charPos += nBytes;
2726	return;
2727	}
2728
2729	// page rotation and/or transform matrices can cause text to be
2730	// drawn in reverse order -- in this case, swap the begin/end
2731	// coordinates and break text into individual chars
2732	if ((curWord->rot == `0` && w1 < `0`) \|\| (curWord->rot == `1` && h1 < `0`) \|\| (curWord->rot == `2` && w1 > `0`) \|\| (curWord->rot == `3` && h1 > `0`)) {
2733	endWord();
2734	beginWord(state);
2735
2736	// throw away diagonal chars
2737	if (discardDiag && diagonal) {
2738	charPos += nBytes;
2739	return;
2740	}
2741
2742	x1 += w1;
2743	y1 += h1;
2744	w1 = -w1;
2745	h1 = -h1;
2746	}
2747
2748	// add the characters to the current word
2749	w1 /= uLen;
2750	h1 /= uLen;
2751	for (i = `0`; i < uLen; ++i) {
2752	curWord->addChar(state, fontA: curFont, x: x1 + i * w1, y: y1 + i * h1, dx: w1, dy: h1, charPosA: charPos, charLen: nBytes, c, u: u[i], textMatA: mat);
2753	}
2754	}
2755	charPos += nBytes;
2756	}
2757
2758	void TextPage::incCharCount(int nChars)
2759	{
2760	charPos += nChars;
2761	}
2762
2763	void TextPage::endWord()
2764	{
2765	// This check is needed because Type 3 characters can contain
2766	// text-drawing operations (when TextPage is being used via
2767	// {X,Win}SplashOutputDev rather than TextOutputDev).
2768	if (nest > `0`) {
2769	--nest;
2770	return;
2771	}
2772
2773	if (curWord) {
2774	addWord(word: curWord);
2775	curWord = nullptr;
2776	}
2777	}
2778
2779	void TextPage::addWord(TextWord *word)
2780	{
2781	// throw away zero-length words -- they don't have valid xMin/xMax
2782	// values, and they're useless anyway
2783	if (word->len() == `0`) {
2784	delete word;
2785	return;
2786	}
2787
2788	if (rawOrder) {
2789	if (rawLastWord) {
2790	rawLastWord->next = word;
2791	} else {
2792	rawWords = word;
2793	}
2794	rawLastWord = word;
2795	} else {
2796	pools[word->rot]->addWord(word);
2797	}
2798	}
2799
2800	void TextPage::addUnderline(double x0, double y0, double x1, double y1)
2801	{
2802	underlines.emplace_back(args: std::make_unique<TextUnderline>(args&: x0, args&: y0, args&: x1, args&: y1));
2803	}
2804
2805	void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link)
2806	{
2807	links.emplace_back(args: std::make_unique<TextLink>(args&: xMin, args&: yMin, args&: xMax, args&: yMax, args&: link));
2808	}
2809
2810	void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML)
2811	{
2812	coalesce(physLayout, fixedPitch, doHTML, minColSpacing1: TextOutputDev::minColSpacing1_default);
2813	}
2814
2815	void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML, double minColSpacing1)
2816	{
2817	TextWord word0, word1, *word2;
2818	TextLine *line;
2819	TextBlock blkList, blk, lastBlk, blk0, blk1, blk2;
2820	TextFlow flow, lastFlow;
2821	int rot, poolMinBaseIdx, baseIdx, startBaseIdx, endBaseIdx;
2822	double minBase, maxBase, newMinBase, newMaxBase;
2823	double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace;
2824	bool found;
2825	int count[`4`];
2826	int lrCount;
2827	int col1, col2;
2828	int j, n;
2829
2830	if (rawOrder) {
2831	primaryRot = `0`;
2832	primaryLR = true;
2833	return;
2834	}
2835
2836	const UnicodeMap *uMap = globalParams ->getTextEncoding();
2837	blkList = nullptr;
2838	lastBlk = nullptr;
2839	nBlocks = `0`;
2840	primaryRot = `0`;
2841
2842	#if 0 // for debugging
2843	printf("* initial words *\n");
2844	for (rot = `0`; rot < `4`; ++rot) {
2845	pool = pools[rot];
2846	for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {
2847	for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {
2848	printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d link=%p '",
2849	word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2850	word0->base, word0->fontSize, rot*`90`, word0->link);
2851	for (i = `0`; i < word0->len; ++i) {
2852	fputc(word0->text[i] & `0xff`, stdout);
2853	}
2854	printf("'\n");
2855	}
2856	}
2857	}
2858	printf("\n");
2859	#endif
2860
2861	#if 0 //~ for debugging
2862	for (i = `0`; i < underlines->getLength(); ++i) {
2863	underline = (TextUnderline *)underlines->get(i);
2864	printf("underline: x=%g..%g y=%g..%g horiz=%d\n",
2865	underline->x0, underline->x1, underline->y0, underline->y1,
2866	underline->horiz);
2867	}
2868	#endif
2869
2870	if (doHTML) {
2871
2872	//----- handle underlining
2873	for (const std::unique_ptr<TextUnderline> &underline : underlines) {
2874	if (underline ->horiz) {
2875	// rot = 0
2876	if (pools[`0`]->minBaseIdx <= pools[`0`]->maxBaseIdx) {
2877	startBaseIdx = pools[`0`]->getBaseIdx(base: underline ->y0 + minUnderlineGap);
2878	endBaseIdx = pools[`0`]->getBaseIdx(base: underline ->y0 + maxUnderlineGap);
2879	for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2880	for (word0 = pools[`0`]->getPool(baseIdx: j); word0; word0 = word0->next) {
2881	//~ need to check the y value against the word baseline
2882	if (underline ->x0 < word0->xMin + underlineSlack && word0->xMax - underlineSlack < underline ->x1) {
2883	word0->underlined = true;
2884	}
2885	}
2886	}
2887	}
2888
2889	// rot = 2
2890	if (pools[`2`]->minBaseIdx <= pools[`2`]->maxBaseIdx) {
2891	startBaseIdx = pools[`2`]->getBaseIdx(base: underline ->y0 - maxUnderlineGap);
2892	endBaseIdx = pools[`2`]->getBaseIdx(base: underline ->y0 - minUnderlineGap);
2893	for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2894	for (word0 = pools[`2`]->getPool(baseIdx: j); word0; word0 = word0->next) {
2895	if (underline ->x0 < word0->xMin + underlineSlack && word0->xMax - underlineSlack < underline ->x1) {
2896	word0->underlined = true;
2897	}
2898	}
2899	}
2900	}
2901	} else {
2902	// rot = 1
2903	if (pools[`1`]->minBaseIdx <= pools[`1`]->maxBaseIdx) {
2904	startBaseIdx = pools[`1`]->getBaseIdx(base: underline ->x0 - maxUnderlineGap);
2905	endBaseIdx = pools[`1`]->getBaseIdx(base: underline ->x0 - minUnderlineGap);
2906	for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2907	for (word0 = pools[`1`]->getPool(baseIdx: j); word0; word0 = word0->next) {
2908	if (underline ->y0 < word0->yMin + underlineSlack && word0->yMax - underlineSlack < underline ->y1) {
2909	word0->underlined = true;
2910	}
2911	}
2912	}
2913	}
2914
2915	// rot = 3
2916	if (pools[`3`]->minBaseIdx <= pools[`3`]->maxBaseIdx) {
2917	startBaseIdx = pools[`3`]->getBaseIdx(base: underline ->x0 + minUnderlineGap);
2918	endBaseIdx = pools[`3`]->getBaseIdx(base: underline ->x0 + maxUnderlineGap);
2919	for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2920	for (word0 = pools[`3`]->getPool(baseIdx: j); word0; word0 = word0->next) {
2921	if (underline ->y0 < word0->yMin + underlineSlack && word0->yMax - underlineSlack < underline ->y1) {
2922	word0->underlined = true;
2923	}
2924	}
2925	}
2926	}
2927	}
2928	}
2929
2930	//----- handle links
2931	for (const std::unique_ptr<TextLink> &link : links) {
2932	// rot = 0
2933	if (pools[`0`]->minBaseIdx <= pools[`0`]->maxBaseIdx) {
2934	startBaseIdx = pools[`0`]->getBaseIdx(base: link ->yMin);
2935	endBaseIdx = pools[`0`]->getBaseIdx(base: link ->yMax);
2936	for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2937	for (word0 = pools[`0`]->getPool(baseIdx: j); word0; word0 = word0->next) {
2938	if (link ->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link ->xMax && link ->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link ->yMax) {
2939	word0->link = link ->link;
2940	}
2941	}
2942	}
2943	}
2944
2945	// rot = 2
2946	if (pools[`2`]->minBaseIdx <= pools[`2`]->maxBaseIdx) {
2947	startBaseIdx = pools[`2`]->getBaseIdx(base: link ->yMin);
2948	endBaseIdx = pools[`2`]->getBaseIdx(base: link ->yMax);
2949	for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2950	for (word0 = pools[`2`]->getPool(baseIdx: j); word0; word0 = word0->next) {
2951	if (link ->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link ->xMax && link ->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link ->yMax) {
2952	word0->link = link ->link;
2953	}
2954	}
2955	}
2956	}
2957
2958	// rot = 1
2959	if (pools[`1`]->minBaseIdx <= pools[`1`]->maxBaseIdx) {
2960	startBaseIdx = pools[`1`]->getBaseIdx(base: link ->xMin);
2961	endBaseIdx = pools[`1`]->getBaseIdx(base: link ->xMax);
2962	for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2963	for (word0 = pools[`1`]->getPool(baseIdx: j); word0; word0 = word0->next) {
2964	if (link ->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link ->yMax && link ->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link ->xMax) {
2965	word0->link = link ->link;
2966	}
2967	}
2968	}
2969	}
2970
2971	// rot = 3
2972	if (pools[`3`]->minBaseIdx <= pools[`3`]->maxBaseIdx) {
2973	startBaseIdx = pools[`3`]->getBaseIdx(base: link ->xMin);
2974	endBaseIdx = pools[`3`]->getBaseIdx(base: link ->xMax);
2975	for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2976	for (word0 = pools[`3`]->getPool(baseIdx: j); word0; word0 = word0->next) {
2977	if (link ->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link ->yMax && link ->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link ->xMax) {
2978	word0->link = link ->link;
2979	}
2980	}
2981	}
2982	}
2983	}
2984	}
2985
2986	//----- assemble the blocks
2987
2988	//~ add an outer loop for writing mode (vertical text)
2989
2990	// build blocks for each rotation value
2991	for (rot = `0`; rot < `4`; ++rot) {
2992	std::unique_ptr<TextPool> &pool = pools[rot];
2993	poolMinBaseIdx = pool ->minBaseIdx;
2994	count[rot] = `0`;
2995
2996	// add blocks until no more words are left
2997	while (true) {
2998
2999	// find the first non-empty line in the pool
3000	for (; poolMinBaseIdx <= pool ->maxBaseIdx && !pool ->getPool(baseIdx: poolMinBaseIdx); ++poolMinBaseIdx) {
3001	;
3002	}
3003	if (poolMinBaseIdx > pool ->maxBaseIdx) {
3004	break;
3005	}
3006
3007	// look for the left-most word in the first four lines of the
3008	// pool -- this avoids starting with a superscript word
3009	startBaseIdx = poolMinBaseIdx;
3010	for (baseIdx = poolMinBaseIdx + `1`; baseIdx < poolMinBaseIdx + `4` && baseIdx <= pool ->maxBaseIdx; ++baseIdx) {
3011	if (!pool ->getPool(baseIdx)) {
3012	continue;
3013	}
3014	if (pool ->getPool(baseIdx)->primaryCmp(word: pool ->getPool(baseIdx: startBaseIdx)) < `0`) {
3015	startBaseIdx = baseIdx;
3016	}
3017	}
3018
3019	// create a new block
3020	word0 = pool ->getPool(baseIdx: startBaseIdx);
3021	pool ->setPool(baseIdx: startBaseIdx, p: word0->next);
3022	word0->next = nullptr;
3023	blk = new TextBlock (this, rot);
3024	blk->addWord(word: word0);
3025
3026	fontSize = word0->fontSize;
3027	minBase = maxBase = word0->base;
3028	colSpace1 = minColSpacing1 * fontSize;
3029	colSpace2 = minColSpacing2 * fontSize;
3030	lineSpace = maxLineSpacingDelta * fontSize;
3031	intraLineSpace = maxIntraLineDelta * fontSize;
3032
3033	// add words to the block
3034	do {
3035	found = false;
3036
3037	// look for words on the line above the current top edge of
3038	// the block
3039	newMinBase = minBase;
3040	for (baseIdx = pool ->getBaseIdx(base: minBase); baseIdx >= pool ->getBaseIdx(base: minBase - lineSpace); --baseIdx) {
3041	word0 = nullptr;
3042	word1 = pool ->getPool(baseIdx);
3043	while (word1) {
3044	if (word1->base < minBase && word1->base >= minBase - lineSpace && ((rot == `0` \|\| rot == `2`) ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) : (word1->yMin < blk->yMax && word1->yMax > blk->yMin))
3045	&& fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta1 * fontSize) {
3046	word2 = word1;
3047	if (word0) {
3048	word0->next = word1->next;
3049	} else {
3050	pool ->setPool(baseIdx, p: word1->next);
3051	}
3052	word1 = word1->next;
3053	word2->next = nullptr;
3054	blk->addWord(word: word2);
3055	found = true;
3056	newMinBase = word2->base;
3057	} else {
3058	word0 = word1;
3059	word1 = word1->next;
3060	}
3061	}
3062	}
3063	minBase = newMinBase;
3064
3065	// look for words on the line below the current bottom edge of
3066	// the block
3067	newMaxBase = maxBase;
3068	for (baseIdx = pool ->getBaseIdx(base: maxBase); baseIdx <= pool ->getBaseIdx(base: maxBase + lineSpace); ++baseIdx) {
3069	word0 = nullptr;
3070	word1 = pool ->getPool(baseIdx);
3071	while (word1) {
3072	if (word1->base > maxBase && word1->base <= maxBase + lineSpace && ((rot == `0` \|\| rot == `2`) ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) : (word1->yMin < blk->yMax && word1->yMax > blk->yMin))
3073	&& fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta1 * fontSize) {
3074	word2 = word1;
3075	if (word0) {
3076	word0->next = word1->next;
3077	} else {
3078	pool ->setPool(baseIdx, p: word1->next);
3079	}
3080	word1 = word1->next;
3081	word2->next = nullptr;
3082	blk->addWord(word: word2);
3083	found = true;
3084	newMaxBase = word2->base;
3085	} else {
3086	word0 = word1;
3087	word1 = word1->next;
3088	}
3089	}
3090	}
3091	maxBase = newMaxBase;
3092
3093	// look for words that are on lines already in the block, and
3094	// that overlap the block horizontally
3095	for (baseIdx = pool ->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool ->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) {
3096	word0 = nullptr;
3097	word1 = pool ->getPool(baseIdx);
3098	while (word1) {
3099	if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3100	&& ((rot == `0` \|\| rot == `2`) ? (word1->xMin < blk->xMax + colSpace1 && word1->xMax > blk->xMin - colSpace1) : (word1->yMin < blk->yMax + colSpace1 && word1->yMax > blk->yMin - colSpace1))
3101	&& fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta2 * fontSize) {
3102	word2 = word1;
3103	if (word0) {
3104	word0->next = word1->next;
3105	} else {
3106	pool ->setPool(baseIdx, p: word1->next);
3107	}
3108	word1 = word1->next;
3109	word2->next = nullptr;
3110	blk->addWord(word: word2);
3111	found = true;
3112	} else {
3113	word0 = word1;
3114	word1 = word1->next;
3115	}
3116	}
3117	}
3118
3119	// only check for outlying words (the next two chunks of code)
3120	// if we didn't find anything else
3121	if (found) {
3122	continue;
3123	}
3124
3125	// scan down the left side of the block, looking for words
3126	// that are near (but not overlapping) the block; if there are
3127	// three or fewer, add them to the block
3128	n = `0`;
3129	for (baseIdx = pool ->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool ->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) {
3130	word1 = pool ->getPool(baseIdx);
3131	while (word1) {
3132	if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3133	&& ((rot == `0` \|\| rot == `2`) ? (word1->xMax <= blk->xMin && word1->xMax > blk->xMin - colSpace2) : (word1->yMax <= blk->yMin && word1->yMax > blk->yMin - colSpace2))
3134	&& fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3135	++n;
3136	break;
3137	}
3138	word1 = word1->next;
3139	}
3140	}
3141	if (n > `0` && n <= `3`) {
3142	for (baseIdx = pool ->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool ->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) {
3143	word0 = nullptr;
3144	word1 = pool ->getPool(baseIdx);
3145	while (word1) {
3146	if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3147	&& ((rot == `0` \|\| rot == `2`) ? (word1->xMax <= blk->xMin && word1->xMax > blk->xMin - colSpace2) : (word1->yMax <= blk->yMin && word1->yMax > blk->yMin - colSpace2))
3148	&& fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3149	word2 = word1;
3150	if (word0) {
3151	word0->next = word1->next;
3152	} else {
3153	pool ->setPool(baseIdx, p: word1->next);
3154	}
3155	word1 = word1->next;
3156	word2->next = nullptr;
3157	blk->addWord(word: word2);
3158	if (word2->base < minBase) {
3159	minBase = word2->base;
3160	} else if (word2->base > maxBase) {
3161	maxBase = word2->base;
3162	}
3163	found = true;
3164	break;
3165	} else {
3166	word0 = word1;
3167	word1 = word1->next;
3168	}
3169	}
3170	}
3171	}
3172
3173	// scan down the right side of the block, looking for words
3174	// that are near (but not overlapping) the block; if there are
3175	// three or fewer, add them to the block
3176	n = `0`;
3177	for (baseIdx = pool ->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool ->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) {
3178	word1 = pool ->getPool(baseIdx);
3179	while (word1) {
3180	if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3181	&& ((rot == `0` \|\| rot == `2`) ? (word1->xMin >= blk->xMax && word1->xMin < blk->xMax + colSpace2) : (word1->yMin >= blk->yMax && word1->yMin < blk->yMax + colSpace2))
3182	&& fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3183	++n;
3184	break;
3185	}
3186	word1 = word1->next;
3187	}
3188	}
3189	if (n > `0` && n <= `3`) {
3190	for (baseIdx = pool ->getBaseIdx(base: minBase - intraLineSpace); baseIdx <= pool ->getBaseIdx(base: maxBase + intraLineSpace); ++baseIdx) {
3191	word0 = nullptr;
3192	word1 = pool ->getPool(baseIdx);
3193	while (word1) {
3194	if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3195	&& ((rot == `0` \|\| rot == `2`) ? (word1->xMin >= blk->xMax && word1->xMin < blk->xMax + colSpace2) : (word1->yMin >= blk->yMax && word1->yMin < blk->yMax + colSpace2))
3196	&& fabs(x: word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3197	word2 = word1;
3198	if (word0) {
3199	word0->next = word1->next;
3200	} else {
3201	pool ->setPool(baseIdx, p: word1->next);
3202	}
3203	word1 = word1->next;
3204	word2->next = nullptr;
3205	blk->addWord(word: word2);
3206	if (word2->base < minBase) {
3207	minBase = word2->base;
3208	} else if (word2->base > maxBase) {
3209	maxBase = word2->base;
3210	}
3211	found = true;
3212	break;
3213	} else {
3214	word0 = word1;
3215	word1 = word1->next;
3216	}
3217	}
3218	}
3219	}
3220
3221	} while (found);
3222
3223	//~ need to compute the primary writing mode (horiz/vert) in
3224	//~ addition to primary rotation
3225
3226	// coalesce the block, and add it to the list
3227	blk->coalesce(uMap, fixedPitch);
3228	if (lastBlk) {
3229	lastBlk->next = blk;
3230	} else {
3231	blkList = blk;
3232	}
3233	lastBlk = blk;
3234	count[rot] += blk->charCount;
3235	++nBlocks;
3236	}
3237
3238	if (count[rot] > count[primaryRot]) {
3239	primaryRot = rot;
3240	}
3241	}
3242
3243	#if 0 // for debugging
3244	printf("* rotation *\n");
3245	for (rot = `0`; rot < `4`; ++rot) {
3246	printf(" %d: %6d\n", rot, count[rot]);
3247	}
3248	printf(" primary rot = %d\n", primaryRot);
3249	printf("\n");
3250	#endif
3251
3252	#if 0 // for debugging
3253	printf("* blocks *\n");
3254	for (blk = blkList; blk; blk = blk->next) {
3255	printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
3256	blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax);
3257	for (line = blk->lines; line; line = line->next) {
3258	printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
3259	line->xMin, line->xMax, line->yMin, line->yMax, line->base);
3260	for (word0 = line->words; word0; word0 = word0->next) {
3261	printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3262	word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3263	word0->base, word0->fontSize, word0->spaceAfter);
3264	for (i = `0`; i < word0->len; ++i) {
3265	fputc(word0->text[i] & `0xff`, stdout);
3266	}
3267	printf("'\n");
3268	}
3269	}
3270	}
3271	printf("\n");
3272	#endif
3273
3274	// determine the primary direction
3275	lrCount = `0`;
3276	for (blk = blkList; blk; blk = blk->next) {
3277	for (line = blk->lines; line; line = line->next) {
3278	for (word0 = line->words; word0; word0 = word0->next) {
3279	for (size_t i = `0`; i < word0->len(); ++i) {
3280	if (unicodeTypeL(c: word0->chars [i].text)) {
3281	++lrCount;
3282	} else if (unicodeTypeR(c: word0->chars [i].text)) {
3283	--lrCount;
3284	}
3285	}
3286	}
3287	}
3288	}
3289	primaryLR = lrCount >= `0`;
3290
3291	#if 0 // for debugging
3292	printf("* direction *\n");
3293	printf("lrCount = %d\n", lrCount);
3294	printf("primaryLR = %d\n", primaryLR);
3295	#endif
3296
3297	//----- column assignment
3298
3299	// sort blocks into xy order for column assignment
3300	if (blocks) {
3301	gfree(p: blocks);
3302	}
3303	if (physLayout && fixedPitch) {
3304
3305	blocks = (TextBlock )gmallocn(count: nBlocks, size: sizeof*(TextBlock ));
3306	int i;
3307	for (blk = blkList, i = `0`; blk; blk = blk->next, ++i) {
3308	blocks[i] = blk;
3309	col1 = `0`; // make gcc happy
3310	switch (primaryRot) {
3311	case `0`:
3312	col1 = (int)(blk->xMin / fixedPitch + `0.5`);
3313	break;
3314	case `1`:
3315	col1 = (int)(blk->yMin / fixedPitch + `0.5`);
3316	break;
3317	case `2`:
3318	col1 = (int)((pageWidth - blk->xMax) / fixedPitch + `0.5`);
3319	break;
3320	case `3`:
3321	col1 = (int)((pageHeight - blk->yMax) / fixedPitch + `0.5`);
3322	break;
3323	}
3324	blk->col = col1;
3325	for (line = blk->lines; line; line = line->next) {
3326	for (j = `0`; j <= line->len; ++j) {
3327	line->col[j] += col1;
3328	}
3329	}
3330	}
3331
3332	} else {
3333
3334	// sort blocks into xy order for column assignment
3335	blocks = (TextBlock )gmallocn(count: nBlocks, size: sizeof*(TextBlock ));
3336	int i;
3337	for (blk = blkList, i = `0`; blk; blk = blk->next, ++i) {
3338	blocks[i] = blk;
3339	}
3340	if (blocks) {
3341	qsort(base: blocks, nmemb: nBlocks, size: sizeof(TextBlock *), compar: &TextBlock::cmpXYPrimaryRot);
3342	}
3343
3344	// column assignment
3345	for (i = `0`; i < nBlocks; ++i) {
3346	blk0 = blocks[i];
3347	col1 = `0`;
3348	for (j = `0`; j < i; ++j) {
3349	blk1 = blocks[j];
3350	col2 = `0`; // make gcc happy
3351	switch (primaryRot) {
3352	case `0`:
3353	if (blk0->xMin > blk1->xMax) {
3354	col2 = blk1->col + blk1->nColumns + `3`;
3355	} else if (blk1->xMax == blk1->xMin) {
3356	col2 = blk1->col;
3357	} else {
3358	col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) / (blk1->xMax - blk1->xMin)) * blk1->nColumns);
3359	}
3360	break;
3361	case `1`:
3362	if (blk0->yMin > blk1->yMax) {
3363	col2 = blk1->col + blk1->nColumns + `3`;
3364	} else if (blk1->yMax == blk1->yMin) {
3365	col2 = blk1->col;
3366	} else {
3367	col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) / (blk1->yMax - blk1->yMin)) * blk1->nColumns);
3368	}
3369	break;
3370	case `2`:
3371	if (blk0->xMax < blk1->xMin) {
3372	col2 = blk1->col + blk1->nColumns + `3`;
3373	} else if (blk1->xMin == blk1->xMax) {
3374	col2 = blk1->col;
3375	} else {
3376	col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) / (blk1->xMin - blk1->xMax)) * blk1->nColumns);
3377	}
3378	break;
3379	case `3`:
3380	if (blk0->yMax < blk1->yMin) {
3381	col2 = blk1->col + blk1->nColumns + `3`;
3382	} else if (blk1->yMin == blk1->yMax) {
3383	col2 = blk1->col;
3384	} else {
3385	col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) / (blk1->yMin - blk1->yMax)) * blk1->nColumns);
3386	}
3387	break;
3388	}
3389	if (col2 > col1) {
3390	col1 = col2;
3391	}
3392	}
3393	blk0->col = col1;
3394	for (line = blk0->lines; line; line = line->next) {
3395	for (j = `0`; j <= line->len; ++j) {
3396	line->col[j] += col1;
3397	}
3398	}
3399	}
3400	}
3401
3402	#if 0 // for debugging
3403	printf("* blocks, after column assignment *\n");
3404	for (blk = blkList; blk; blk = blk->next) {
3405	printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
3406	blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
3407	blk->nColumns);
3408	for (line = blk->lines; line; line = line->next) {
3409	printf(" line: col[0]=%d\n", line->col[`0`]);
3410	for (word0 = line->words; word0; word0 = word0->next) {
3411	printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3412	word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3413	word0->base, word0->fontSize, word0->spaceAfter);
3414	for (i = `0`; i < word0->len; ++i) {
3415	fputc(word0->text[i] & `0xff`, stdout);
3416	}
3417	printf("'\n");
3418	}
3419	}
3420	}
3421	printf("\n");
3422	#endif
3423
3424	//----- reading order sort
3425
3426	// compute space on left and right sides of each block
3427	for (int i = `0`; i < nBlocks; ++i) {
3428	blk0 = blocks[i];
3429	for (j = `0`; j < nBlocks; ++j) {
3430	blk1 = blocks[j];
3431	if (blk1 != blk0) {
3432	blk0->updatePriMinMax(blk: blk1);
3433	}
3434	}
3435	}
3436
3437	#if 0 // for debugging
3438	printf("PAGE\n");
3439	#endif
3440
3441	int sortPos = `0`;
3442	bool visited = (bool* )gmallocn(count: nBlocks, size: sizeof(bool*));
3443	for (int i = `0`; i < nBlocks; i++) {
3444	visited[i] = false;
3445	}
3446
3447	double bxMin0, byMin0, bxMin1, byMin1;
3448	int numTables = `0`;
3449	int tableId = -`1`;
3450	int correspondenceX, correspondenceY;
3451	double xCentre1, yCentre1, xCentre2, yCentre2;
3452	double xCentre3, yCentre3, xCentre4, yCentre4;
3453	double deltaX, deltaY;
3454	TextBlock fblk2 = nullptr, fblk3 = nullptr, fblk4 = nullptr*;
3455
3456	for (blk1 = blkList; blk1; blk1 = blk1->next) {
3457	blk1->ExMin = blk1->xMin;
3458	blk1->ExMax = blk1->xMax;
3459	blk1->EyMin = blk1->yMin;
3460	blk1->EyMax = blk1->yMax;
3461
3462	bxMin0 = DBL_MAX;
3463	byMin0 = DBL_MAX;
3464	bxMin1 = DBL_MAX;
3465	byMin1 = DBL_MAX;
3466
3467	fblk2 = nullptr;
3468	fblk3 = nullptr;
3469	fblk4 = nullptr;
3470
3471	/ find fblk2, fblk3 and fblk4 so that*
3472	* fblk2 is on the right of blk1 and overlap with blk1 in y axis
3473	* fblk3 is under blk1 and overlap with blk1 in x axis
3474	* fblk4 is under blk1 and on the right of blk1
3475	* and they are closest to blk1
3476	*/
3477	for (blk2 = blkList; blk2; blk2 = blk2->next) {
3478	if (blk2 != blk1) {
3479	if (blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin && blk2->xMin > blk1->xMax && blk2->xMin < bxMin0) {
3480	bxMin0 = blk2->xMin;
3481	fblk2 = blk2;
3482	} else if (blk2->xMin <= blk1->xMax && blk2->xMax >= blk1->xMin && blk2->yMin > blk1->yMax && blk2->yMin < byMin0) {
3483	byMin0 = blk2->yMin;
3484	fblk3 = blk2;
3485	} else if (blk2->xMin > blk1->xMax && blk2->xMin < bxMin1 && blk2->yMin > blk1->yMax && blk2->yMin < byMin1) {
3486	bxMin1 = blk2->xMin;
3487	byMin1 = blk2->yMin;
3488	fblk4 = blk2;
3489	}
3490	}
3491	}
3492
3493	/ fblk4 can not overlap with fblk3 in x and with fblk2 in y*
3494	* fblk2 can not overlap with fblk3 in x and y
3495	* fblk4 has to overlap with fblk3 in y and with fblk2 in x
3496	*/
3497	if (fblk2 != nullptr && fblk3 != nullptr && fblk4 != nullptr) {
3498	if (((fblk3->xMin <= fblk4->xMax && fblk3->xMax >= fblk4->xMin) \|\| (fblk2->yMin <= fblk4->yMax && fblk2->yMax >= fblk4->yMin) \|\| (fblk2->xMin <= fblk3->xMax && fblk2->xMax >= fblk3->xMin)
3499	\|\| (fblk2->yMin <= fblk3->yMax && fblk2->yMax >= fblk3->yMin))
3500	\|\| !(fblk4->xMin <= fblk2->xMax && fblk4->xMax >= fblk2->xMin && fblk4->yMin <= fblk3->yMax && fblk4->yMax >= fblk3->yMin)) {
3501	fblk2 = nullptr;
3502	fblk3 = nullptr;
3503	fblk4 = nullptr;
3504	}
3505	}
3506
3507	// if we found any then look whether they form a table
3508	if (fblk2 != nullptr && fblk3 != nullptr && fblk4 != nullptr) {
3509	tableId = -`1`;
3510	correspondenceX = `0`;
3511	correspondenceY = `0`;
3512	deltaX = `0.0`;
3513	deltaY = `0.0`;
3514
3515	if (blk1->lines && blk1->lines->words) {
3516	deltaX = blk1->lines->words->getFontSize();
3517	}
3518	if (fblk2->lines && fblk2->lines->words) {
3519	deltaX = deltaX < fblk2->lines->words->getFontSize() ? deltaX : fblk2->lines->words->getFontSize();
3520	}
3521	if (fblk3->lines && fblk3->lines->words) {
3522	deltaX = deltaX < fblk3->lines->words->getFontSize() ? deltaX : fblk3->lines->words->getFontSize();
3523	}
3524	if (fblk4->lines && fblk4->lines->words) {
3525	deltaX = deltaX < fblk4->lines->words->getFontSize() ? deltaX : fblk4->lines->words->getFontSize();
3526	}
3527
3528	deltaY = deltaX;
3529
3530	deltaX *= minColSpacing1;
3531	deltaY *= maxIntraLineDelta;
3532
3533	xCentre1 = (blk1->xMax + blk1->xMin) / `2.0`;
3534	yCentre1 = (blk1->yMax + blk1->yMin) / `2.0`;
3535	xCentre2 = (fblk2->xMax + fblk2->xMin) / `2.0`;
3536	yCentre2 = (fblk2->yMax + fblk2->yMin) / `2.0`;
3537	xCentre3 = (fblk3->xMax + fblk3->xMin) / `2.0`;
3538	yCentre3 = (fblk3->yMax + fblk3->yMin) / `2.0`;
3539	xCentre4 = (fblk4->xMax + fblk4->xMin) / `2.0`;
3540	yCentre4 = (fblk4->yMax + fblk4->yMin) / `2.0`;
3541
3542	// are blocks centrally aligned in x ?
3543	if (fabs(x: xCentre1 - xCentre3) <= deltaX && fabs(x: xCentre2 - xCentre4) <= deltaX) {
3544	correspondenceX++;
3545	}
3546
3547	// are blocks centrally aligned in y ?
3548	if (fabs(x: yCentre1 - yCentre2) <= deltaY && fabs(x: yCentre3 - yCentre4) <= deltaY) {
3549	correspondenceY++;
3550	}
3551
3552	// are blocks aligned to the left ?
3553	if (fabs(x: blk1->xMin - fblk3->xMin) <= deltaX && fabs(x: fblk2->xMin - fblk4->xMin) <= deltaX) {
3554	correspondenceX++;
3555	}
3556
3557	// are blocks aligned to the right ?
3558	if (fabs(x: blk1->xMax - fblk3->xMax) <= deltaX && fabs(x: fblk2->xMax - fblk4->xMax) <= deltaX) {
3559	correspondenceX++;
3560	}
3561
3562	// are blocks aligned to the top ?
3563	if (fabs(x: blk1->yMin - fblk2->yMin) <= deltaY && fabs(x: fblk3->yMin - fblk4->yMin) <= deltaY) {
3564	correspondenceY++;
3565	}
3566
3567	// are blocks aligned to the bottom ?
3568	if (fabs(x: blk1->yMax - fblk2->yMax) <= deltaY && fabs(x: fblk3->yMax - fblk4->yMax) <= deltaY) {
3569	correspondenceY++;
3570	}
3571
3572	// are blocks aligned in x and y ?
3573	if (correspondenceX > `0` && correspondenceY > `0`) {
3574
3575	// find maximal tableId
3576	tableId = tableId < fblk4->tableId ? fblk4->tableId : tableId;
3577	tableId = tableId < fblk3->tableId ? fblk3->tableId : tableId;
3578	tableId = tableId < fblk2->tableId ? fblk2->tableId : tableId;
3579	tableId = tableId < blk1->tableId ? blk1->tableId : tableId;
3580
3581	// if the tableId is -1, then we found new table
3582	if (tableId < `0`) {
3583	tableId = numTables;
3584	numTables++;
3585	}
3586
3587	blk1->tableId = tableId;
3588	fblk2->tableId = tableId;
3589	fblk3->tableId = tableId;
3590	fblk4->tableId = tableId;
3591	}
3592	}
3593	}
3594
3595	/ set extended bounding boxes of all table entries*
3596	* so that they contain whole table
3597	* (we need to process whole table size when comparing it
3598	* with regular text blocks)
3599	*/
3600	PDFRectangle envelopes = new* PDFRectangle[numTables];
3601	TextBlock ending_blocks = new** TextBlock *[numTables];
3602
3603	for (int i = `0`; i < numTables; i++) {
3604	envelopes[i].x1 = DBL_MAX;
3605	envelopes[i].x2 = DBL_MIN;
3606	envelopes[i].y1 = DBL_MAX;
3607	envelopes[i].y2 = DBL_MIN;
3608	ending_blocks[i] = nullptr;
3609	}
3610
3611	for (blk1 = blkList; blk1; blk1 = blk1->next) {
3612	if (blk1->tableId >= `0`) {
3613	if (blk1->ExMin < envelopes[blk1->tableId].x1) {
3614	envelopes[blk1->tableId].x1 = blk1->ExMin;
3615	if (!blk1->page->primaryLR) {
3616	ending_blocks[blk1->tableId] = blk1;
3617	}
3618	}
3619
3620	if (blk1->ExMax > envelopes[blk1->tableId].x2) {
3621	envelopes[blk1->tableId].x2 = blk1->ExMax;
3622	if (blk1->page->primaryLR) {
3623	ending_blocks[blk1->tableId] = blk1;
3624	}
3625	}
3626
3627	envelopes[blk1->tableId].y1 = blk1->EyMin < envelopes[blk1->tableId].y1 ? blk1->EyMin : envelopes[blk1->tableId].y1;
3628	envelopes[blk1->tableId].y2 = blk1->EyMax > envelopes[blk1->tableId].y2 ? blk1->EyMax : envelopes[blk1->tableId].y2;
3629	}
3630	}
3631
3632	for (blk1 = blkList; blk1; blk1 = blk1->next) {
3633	if (blk1->tableId >= `0` && ending_blocks[blk1->tableId] && blk1->xMin <= ending_blocks[blk1->tableId]->xMax && blk1->xMax >= ending_blocks[blk1->tableId]->xMin) {
3634	blk1->tableEnd = true;
3635	}
3636	}
3637
3638	for (blk1 = blkList; blk1; blk1 = blk1->next) {
3639	if (blk1->tableId >= `0`) {
3640	blk1->ExMin = envelopes[blk1->tableId].x1;
3641	blk1->ExMax = envelopes[blk1->tableId].x2;
3642	blk1->EyMin = envelopes[blk1->tableId].y1;
3643	blk1->EyMax = envelopes[blk1->tableId].y2;
3644	}
3645	}
3646	delete[] envelopes;
3647	delete[] ending_blocks;
3648
3649	/ set extended bounding boxes of all other blocks*
3650	* so that they extend in x without hitting neighbours
3651	*/
3652	for (blk1 = blkList; blk1; blk1 = blk1->next) {
3653	if (!(blk1->tableId >= `0`)) {
3654	double xMax = DBL_MAX;
3655	double xMin = DBL_MIN;
3656
3657	for (blk2 = blkList; blk2; blk2 = blk2->next) {
3658	if (blk2 == blk1) {
3659	continue;
3660	}
3661
3662	if (blk1->yMin <= blk2->yMax && blk1->yMax >= blk2->yMin) {
3663	if (blk2->xMin < xMax && blk2->xMin > blk1->xMax) {
3664	xMax = blk2->xMin;
3665	}
3666
3667	if (blk2->xMax > xMin && blk2->xMax < blk1->xMin) {
3668	xMin = blk2->xMax;
3669	}
3670	}
3671	}
3672
3673	for (blk2 = blkList; blk2; blk2 = blk2->next) {
3674	if (blk2 == blk1) {
3675	continue;
3676	}
3677
3678	if (blk2->xMax > blk1->ExMax && blk2->xMax <= xMax && blk2->yMin >= blk1->yMax) {
3679	blk1->ExMax = blk2->xMax;
3680	}
3681
3682	if (blk2->xMin < blk1->ExMin && blk2->xMin >= xMin && blk2->yMin >= blk1->yMax) {
3683	blk1->ExMin = blk2->xMin;
3684	}
3685	}
3686	}
3687	}
3688
3689	int i = -`1`;
3690	for (blk1 = blkList; blk1; blk1 = blk1->next) {
3691	i++;
3692	sortPos = blk1->visitDepthFirst(blkList, pos1: i, sorted: blocks, sortPos, visited);
3693	}
3694	if (visited) {
3695	gfree(p: visited);
3696	}
3697
3698	#if 0 // for debugging
3699	printf("* blocks, after ro sort *\n");
3700	for (i = `0`; i < nBlocks; ++i) {
3701	blk = blocks[i];
3702	printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
3703	blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
3704	blk->priMin, blk->priMax);
3705	for (line = blk->lines; line; line = line->next) {
3706	printf(" line:\n");
3707	for (word0 = line->words; word0; word0 = word0->next) {
3708	printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3709	word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3710	word0->base, word0->fontSize, word0->spaceAfter);
3711	for (j = `0`; j < word0->len; ++j) {
3712	fputc(word0->text[j] & `0xff`, stdout);
3713	}
3714	printf("'\n");
3715	}
3716	}
3717	}
3718	printf("\n");
3719	fflush(stdout);
3720	#endif
3721
3722	// build the flows
3723	//~ this needs to be adjusted for writing mode (vertical text)
3724	//~ this also needs to account for right-to-left column ordering
3725	while (flows) {
3726	flow = flows;
3727	flows = flows->next;
3728	delete flow;
3729	}
3730	flow = nullptr;
3731	flows = lastFlow = nullptr;
3732	// assume blocks are already in reading order,
3733	// and construct flows accordingly.
3734	for (i = `0`; i < nBlocks; i++) {
3735	blk = blocks[i];
3736	blk->next = nullptr;
3737	if (flow) {
3738	blk1 = blocks[i - `1`];
3739	blkSpace = maxBlockSpacing * blk1->lines->words->fontSize;
3740	if (blk1->secondaryDelta(blk) <= blkSpace && blk->isBelow(blk: blk1) && flow->blockFits(blk, prevBlk: blk1)) {
3741	flow->addBlock(blk);
3742	continue;
3743	}
3744	}
3745	flow = new TextFlow (this, blk);
3746	if (lastFlow) {
3747	lastFlow->next = flow;
3748	} else {
3749	flows = flow;
3750	}
3751	lastFlow = flow;
3752	}
3753
3754	#if 0 // for debugging
3755	printf("* flows *\n");
3756	for (flow = flows; flow; flow = flow->next) {
3757	printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
3758	flow->xMin, flow->xMax, flow->yMin, flow->yMax,
3759	flow->priMin, flow->priMax);
3760	for (blk = flow->blocks; blk; blk = blk->next) {
3761	printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
3762	blk->rot, blk->ExMin, blk->ExMax, blk->EyMin, blk->EyMax,
3763	blk->priMin, blk->priMax);
3764	for (line = blk->lines; line; line = line->next) {
3765	printf(" line:\n");
3766	for (word0 = line->words; word0; word0 = word0->next) {
3767	printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3768	word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3769	word0->base, word0->fontSize, word0->spaceAfter);
3770	for (i = `0`; i < word0->len; ++i) {
3771	fputc(word0->text[i] & `0xff`, stdout);
3772	}
3773	printf("'\n");
3774	}
3775	}
3776	}
3777	}
3778	printf("\n");
3779	#endif
3780	}
3781
3782	void TextPage::adjustRotation(TextLine line, int* start, int end, double xMin, double* xMax, double* yMin, double* *yMax)
3783	{
3784	switch (line->rot) {
3785	case `0`:
3786	*xMin = line->edge[start];
3787	*xMax = line->edge[end];
3788	*yMin = line->yMin;
3789	*yMax = line->yMax;
3790	break;
3791	case `1`:
3792	*xMin = line->xMin;
3793	*xMax = line->xMax;
3794	*yMin = line->edge[start];
3795	*yMax = line->edge[end];
3796	break;
3797	case `2`:
3798	*xMin = line->edge[end];
3799	*xMax = line->edge[start];
3800	*yMin = line->yMin;
3801	*yMax = line->yMax;
3802	break;
3803	case `3`:
3804	*xMin = line->xMin;
3805	*xMax = line->xMax;
3806	*yMin = line->edge[end];
3807	*yMax = line->edge[start];
3808	break;
3809	}
3810	}
3811
3812	bool TextPage::findText(const Unicode s, int* len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double xMin, double* yMin, double* xMax, double* *yMax)
3813	{
3814	return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, ignoreDiacritics: false, matchAcrossLines: false, backward, wholeWord, xMin, yMin, xMax, yMax, continueMatch: nullptr, ignoredHyphen: nullptr);
3815	}
3816
3817	bool TextPage::findText(const Unicode s, int* len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool backward, bool wholeWord, double xMin, double* yMin, double* *xMax,
3818	double *yMax)
3819	{
3820	return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, ignoreDiacritics, matchAcrossLines: false, backward, wholeWord, xMin, yMin, xMax, yMax, continueMatch: nullptr, ignoredHyphen: nullptr);
3821	}
3822
3823	bool TextPage::findText(const Unicode s, int* len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool matchAcrossLines, bool backward, bool wholeWord, double *xMin,
3824	double yMin, double* xMax, double* yMax, PDFRectangle continueMatch, bool *ignoredHyphen)
3825	{
3826	TextBlock *blk;
3827	TextLine *line;
3828	Unicode s2, txt, *reordered;
3829	Unicode *p;
3830	TextLine *nextline;
3831	Unicode *nextline_txt;
3832	int nextline_len;
3833	bool nextlineAfterHyphen = false;
3834	int txtSize, m, i, j, k;
3835	double xStart, yStart, xStop, yStop;
3836	double xMin0, yMin0, xMax0, yMax0;
3837	double xMin1, yMin1, xMax1, yMax1;
3838	double xMin2, yMin2, xMax2, yMax2;
3839	bool found;
3840
3841	if (len == `0`) {
3842	return false;
3843	}
3844
3845	if (rawOrder) {
3846	return false;
3847	}
3848
3849	if (matchAcrossLines && backward) {
3850	// matchAcrossLines is unimplemented for backward search
3851	matchAcrossLines = false;
3852	}
3853
3854	// handle right-to-left text
3855	reordered = (Unicode )gmallocn(count: len, size: sizeof*(Unicode));
3856	reorderText(text: s, len, uMap: nullptr, primaryLR, s: nullptr, u: reordered);
3857
3858	// normalize the search string
3859	s2 = unicodeNormalizeNFKC(in: reordered, len, out_len: &len, indices: nullptr);
3860
3861	// if search string is not pure ascii then don't
3862	// use ignoreDiacritics (as they won't match)
3863	if (!caseSensitive) {
3864	// convert the search string to uppercase
3865	for (i = `0`; i < len; ++i) {
3866	s2[i] = unicodeToUpper(c: s2[i]);
3867	if (ignoreDiacritics && !isAscii7(uchar: s2[i])) {
3868	ignoreDiacritics = false;
3869	}
3870	}
3871	} else if (ignoreDiacritics) {
3872	for (i = `0`; i < len; ++i) {
3873	if (!isAscii7(uchar: s2[i])) {
3874	ignoreDiacritics = false;
3875	break;
3876	}
3877	}
3878	}
3879
3880	txt = nullptr;
3881	txtSize = `0`;
3882
3883	xStart = yStart = xStop = yStop = `0`;
3884	if (startAtLast && haveLastFind) {
3885	xStart = lastFindXMin;
3886	yStart = lastFindYMin;
3887	} else if (!startAtTop) {
3888	xStart = *xMin;
3889	yStart = *yMin;
3890	}
3891	if (stopAtLast && haveLastFind) {
3892	xStop = lastFindXMin;
3893	yStop = lastFindYMin;
3894	} else if (!stopAtBottom) {
3895	xStop = *xMax;
3896	yStop = *yMax;
3897	}
3898
3899	found = false;
3900	xMin0 = xMax0 = yMin0 = yMax0 = `0`; // make gcc happy
3901	xMin1 = xMax1 = yMin1 = yMax1 = `0`; // make gcc happy
3902
3903	for (i = backward ? nBlocks - `1` : `0`; backward ? i >= `0` : i < nBlocks; i += backward ? -`1` : `1`) {
3904	blk = blocks[i];
3905
3906	// check: is the block above the top limit?
3907	// (this only works if the page's primary rotation is zero --
3908	// otherwise the blocks won't be sorted in the useful order)
3909	if (!startAtTop && primaryRot == `0` && (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
3910	continue;
3911	}
3912
3913	// check: is the block below the bottom limit?
3914	// (this only works if the page's primary rotation is zero --
3915	// otherwise the blocks won't be sorted in the useful order)
3916	if (!stopAtBottom && primaryRot == `0` && (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
3917	break;
3918	}
3919
3920	for (line = blk->lines; line; line = line->next) {
3921
3922	// check: is the line above the top limit?
3923	// (this only works if the page's primary rotation is zero --
3924	// otherwise the lines won't be sorted in the useful order)
3925	if (!startAtTop && primaryRot == `0` && (backward ? line->yMin > yStart : line->yMin < yStart)) {
3926	continue;
3927	}
3928
3929	// check: is the line below the bottom limit?
3930	// (this only works if the page's primary rotation is zero --
3931	// otherwise the lines won't be sorted in the useful order)
3932	if (!stopAtBottom && primaryRot == `0` && (backward ? line->yMin < yStop : line->yMin > yStop)) {
3933	continue;
3934	}
3935
3936	if (!line->normalized) {
3937	line->normalized = unicodeNormalizeNFKC(in: line->text, len: line->len, out_len: &line->normalized_len, indices: &line->normalized_idx, reverseRTL: true);
3938	}
3939
3940	nextline = nullptr;
3941	nextline_txt = nullptr;
3942	nextline_len = `0`;
3943	if (line->next) {
3944	nextline = line->next;
3945	} else {
3946	// set nextline to first line of next block
3947	int ind = i + (backward ? -`1` : `1`);
3948	if ((backward && ind >= `0`) \|\| (!backward && ind < nBlocks)) {
3949	nextline = blocks[ind]->lines;
3950	}
3951	}
3952
3953	if (matchAcrossLines && nextline && !nextline->normalized) {
3954	nextline->normalized = unicodeNormalizeNFKC(in: nextline->text, len: nextline->len, out_len: &nextline->normalized_len, indices: &nextline->normalized_idx, reverseRTL: true);
3955	}
3956
3957	// convert the line to uppercase
3958	m = line->normalized_len;
3959
3960	if (ignoreDiacritics) {
3961	if (!line->ascii_translation) {
3962	unicodeToAscii7(in: line->normalized, len: line->normalized_len, ucs4_out: &line->ascii_translation, out_len: &line->ascii_len, in_idx: line->normalized_idx, indices: &line->ascii_idx);
3963	}
3964	if (line->ascii_len) {
3965	m = line->ascii_len;
3966	} else {
3967	ignoreDiacritics = false;
3968	}
3969
3970	if (matchAcrossLines && nextline && !nextline->ascii_translation) {
3971	unicodeToAscii7(in: nextline->normalized, len: nextline->normalized_len, ucs4_out: &nextline->ascii_translation, out_len: &nextline->ascii_len, in_idx: nextline->normalized_idx, indices: &nextline->ascii_idx);
3972	}
3973	}
3974	if (!caseSensitive) {
3975	if (m > txtSize) {
3976	txt = (Unicode )greallocn(p: txt, count: m, size: sizeof*(Unicode));
3977	txtSize = m;
3978	}
3979	for (k = `0`; k < m; ++k) {
3980	if (ignoreDiacritics) {
3981	txt[k] = unicodeToUpper(c: line->ascii_translation[k]);
3982	} else {
3983	txt[k] = unicodeToUpper(c: line->normalized[k]);
3984	}
3985	}
3986	if (matchAcrossLines && nextline) {
3987	nextline_len = ignoreDiacritics ? nextline->ascii_len : nextline->normalized_len;
3988	nextline_txt = (Unicode )gmallocn(count: nextline_len, size: sizeof*(Unicode));
3989	for (k = `0`; k < nextline_len; ++k) {
3990	nextline_txt[k] = ignoreDiacritics ? unicodeToUpper(c: nextline->ascii_translation[k]) : unicodeToUpper(c: nextline->normalized[k]);
3991	}
3992	}
3993	} else {
3994	if (ignoreDiacritics) {
3995	txt = line->ascii_translation;
3996	} else {
3997	txt = line->normalized;
3998	}
3999
4000	if (matchAcrossLines && nextline) {
4001	nextline_len = ignoreDiacritics ? nextline->ascii_len : nextline->normalized_len;
4002	nextline_txt = ignoreDiacritics ? nextline->ascii_translation : nextline->normalized;
4003	}
4004	}
4005
4006	// search each position in this line
4007	j = backward ? m - len : `0`;
4008	p = txt + j;
4009	while (backward ? j >= `0` : j <= m - (nextline_txt ? `1` : len)) {
4010	bool wholeWordStartIsOk, wholeWordEndIsOk;
4011	if (wholeWord) {
4012	wholeWordStartIsOk = j == `0` \|\| !unicodeTypeAlphaNum(c: txt[j - `1`]);
4013	if (nextline_txt) {
4014	wholeWordEndIsOk = true; // word end may be in next line, so we'll check it later
4015	} else {
4016	wholeWordEndIsOk = j + len == m \|\| !unicodeTypeAlphaNum(c: txt[j + len]);
4017	}
4018	}
4019	if (!wholeWord \|\| (wholeWordStartIsOk && wholeWordEndIsOk)) {
4020	int n = `0`;
4021	bool spaceConsumedByNewline = false;
4022	bool found_it;
4023
4024	// compare the strings
4025	for (k = `0`; k < len; ++k) {
4026	bool last_char_of_line = j + k == m - `1`;
4027	bool last_char_of_search_term = k == len - `1`;
4028	bool match_started = (bool)k;
4029
4030	if (p[k] != s2[k] \|\| (nextline_txt && last_char_of_line && !last_char_of_search_term)) {
4031	// now check if the comparison failed at the end-of-line hyphen,
4032	// and if so, keep on comparing at the next line
4033	nextlineAfterHyphen = false;
4034
4035	if (s2[k] == p[k]) {
4036	if (p[k] != (Unicode)`'-'` && !UnicodeIsWhitespace(ucs4: s2[k + `1`])) {
4037	break;
4038	}
4039	k++;
4040	} else if (!match_started \|\| p[k] != (Unicode)`'-'` \|\| !last_char_of_line \|\| UnicodeIsWhitespace(ucs4: s2[k])) {
4041	break;
4042	} else {
4043	nextlineAfterHyphen = true;
4044	}
4045
4046	for (; n < nextline_len && k < len; ++k, ++n) {
4047	if (nextline_txt[n] != s2[k]) {
4048	if (!spaceConsumedByNewline && !n && UnicodeIsWhitespace(ucs4: s2[k])) {
4049	n = -`1`;
4050	spaceConsumedByNewline = true;
4051	continue;
4052	}
4053	break;
4054	}
4055	}
4056	break;
4057	}
4058	}
4059
4060	found_it = k == len;
4061	if (found_it && nextline_txt && wholeWord) { // check word end for nextline case
4062	if (n) { // Match ended at next line
4063	wholeWordEndIsOk = n == nextline_len \|\| !unicodeTypeAlphaNum(c: nextline_txt[n]);
4064	} else { // Match ended on same line
4065	wholeWordEndIsOk = j + len == m \|\| !unicodeTypeAlphaNum(c: txt[j + len]);
4066	}
4067
4068	if (!wholeWordEndIsOk) {
4069	found_it = false;
4070	}
4071	}
4072	// found it
4073	if (found_it) {
4074	bool nextLineMatch = (bool)n;
4075	if (spaceConsumedByNewline) {
4076	k--;
4077	}
4078	// where s2 matches a subsequence of a compatibility equivalence
4079	// decomposition, highlight the entire glyph, since we don't know
4080	// the internal layout of subglyph components
4081	int normStart, normAfterEnd;
4082	if (ignoreDiacritics) {
4083	normStart = line->ascii_idx[j];
4084	if (nextline_txt) {
4085	normAfterEnd = line->ascii_idx[j + k - n];
4086	} else {
4087	normAfterEnd = line->ascii_idx[j + len - `1`] + `1`;
4088	}
4089	} else {
4090	normStart = line->normalized_idx[j];
4091	if (nextline_txt) {
4092	normAfterEnd = line->normalized_idx[j + k - n];
4093	} else {
4094	normAfterEnd = line->normalized_idx[j + len - `1`] + `1`;
4095	}
4096	}
4097
4098	adjustRotation(line, start: normStart, end: normAfterEnd, xMin: &xMin1, xMax: &xMax1, yMin: &yMin1, yMax: &yMax1);
4099
4100	if (backward) {
4101	if ((startAtTop \|\| yMin1 < yStart \|\| (yMin1 == yStart && xMin1 < xStart)) && (stopAtBottom \|\| yMin1 > yStop \|\| (yMin1 == yStop && xMin1 > xStop))) {
4102	if (!found \|\| yMin1 > yMin0 \|\| (yMin1 == yMin0 && xMin1 > xMin0)) {
4103	xMin0 = xMin1;
4104	xMax0 = xMax1;
4105	yMin0 = yMin1;
4106	yMax0 = yMax1;
4107	found = true;
4108	}
4109	}
4110	} else {
4111	if ((startAtTop \|\| yMin1 > yStart \|\| (yMin1 == yStart && xMin1 > xStart)) && (stopAtBottom \|\| yMin1 < yStop \|\| (yMin1 == yStop && xMin1 < xStop))) {
4112	if (!found \|\| yMin1 < yMin0 \|\| (yMin1 == yMin0 && xMin1 < xMin0)) {
4113	xMin0 = xMin1;
4114	xMax0 = xMax1;
4115	yMin0 = yMin1;
4116	yMax0 = yMax1;
4117	found = true;
4118	if (nextLineMatch) { // set the out parameters
4119	if (ignoredHyphen) {
4120	*ignoredHyphen = nextlineAfterHyphen;
4121	}
4122
4123	if (continueMatch) {
4124	adjustRotation(line: nextline, start: `0`, end: n, xMin: &xMin2, xMax: &xMax2, yMin: &yMin2, yMax: &yMax2);
4125	continueMatch->x1 = xMin2;
4126	continueMatch->y1 = yMax2;
4127	continueMatch->x2 = xMax2;
4128	continueMatch->y2 = yMin2;
4129	}
4130	} else if (continueMatch && continueMatch->x1 != std::numeric_limits<double>::max()) {
4131	if (ignoredHyphen) {
4132	ignoredHyphen = false*;
4133	}
4134
4135	continueMatch->x1 = std::numeric_limits<double>::max();
4136	}
4137	}
4138	}
4139	}
4140	}
4141	}
4142	if (backward) {
4143	--j;
4144	--p;
4145	} else {
4146	++j;
4147	++p;
4148	}
4149	}
4150
4151	if (nextline_txt && nextline_txt != nextline->ascii_translation && nextline_txt != nextline->normalized) {
4152	gfree(p: nextline_txt);
4153	}
4154	}
4155	}
4156
4157	gfree(p: s2);
4158	gfree(p: reordered);
4159	if (!caseSensitive) {
4160	gfree(p: txt);
4161	}
4162
4163	if (found) {
4164	*xMin = xMin0;
4165	*xMax = xMax0;
4166	*yMin = yMin0;
4167	*yMax = yMax0;
4168	lastFindXMin = xMin0;
4169	lastFindYMin = yMin0;
4170	haveLastFind = true;
4171	return true;
4172	}
4173
4174	return false;
4175	}
4176
4177	GooString TextPage::getText(double* xMin, double yMin, double xMax, double yMax, EndOfLineKind textEOL) const
4178	{
4179	GooString *s;
4180	const UnicodeMap *uMap;
4181	TextBlock *blk;
4182	TextLine *line;
4183	TextLineFrag *frags;
4184	int nFrags, fragsSize;
4185	TextLineFrag *frag;
4186	char space[`8`], eol[`16`];
4187	int spaceLen, eolLen;
4188	int lastRot;
4189	double x, y, delta;
4190	int col, idx0, idx1, i, j;
4191	bool multiLine, oneRot;
4192
4193	s = new GooString ();
4194
4195	// get the output encoding
4196	if (!(uMap = globalParams ->getTextEncoding())) {
4197	return s;
4198	}
4199
4200	if (rawOrder) {
4201	TextWord *word;
4202	char mbc[`16`];
4203	int mbc_len;
4204
4205	for (word = rawWords; word && word <= rawLastWord; word = word->next) {
4206	for (j = `0`; j < word->getLength(); ++j) {
4207	double gXMin, gXMax, gYMin, gYMax;
4208	word->getCharBBox(charIdx: j, xMinA: &gXMin, yMinA: &gYMin, xMaxA: &gXMax, yMaxA: &gYMax);
4209	if (xMin <= gXMin && gXMax <= xMax && yMin <= gYMin && gYMax <= yMax) {
4210	mbc_len = uMap->mapUnicode(u: (word->getChar(idx: j)), buf: mbc, bufSize: sizeof*(mbc));
4211	s->append(str: mbc, lengthA: mbc_len);
4212	}
4213	}
4214	}
4215	return s;
4216	}
4217
4218	spaceLen = uMap->mapUnicode(u: `0x20`, buf: space, bufSize: sizeof(space));
4219	eolLen = `0`; // make gcc happy
4220	switch (textEOL) {
4221	case eolUnix:
4222	eolLen = uMap->mapUnicode(u: `0x0a`, buf: eol, bufSize: sizeof(eol));
4223	break;
4224	case eolDOS:
4225	eolLen = uMap->mapUnicode(u: `0x0d`, buf: eol, bufSize: sizeof(eol));
4226	eolLen += uMap->mapUnicode(u: `0x0a`, buf: eol + eolLen, bufSize: sizeof(eol) - eolLen);
4227	break;
4228	case eolMac:
4229	eolLen = uMap->mapUnicode(u: `0x0d`, buf: eol, bufSize: sizeof(eol));
4230	break;
4231	}
4232
4233	//~ writing mode (horiz/vert)
4234
4235	// collect the line fragments that are in the rectangle
4236	fragsSize = `256`;
4237	frags = (TextLineFrag )gmallocn(count: fragsSize, size: sizeof*(TextLineFrag));
4238	nFrags = `0`;
4239	lastRot = -`1`;
4240	oneRot = true;
4241	for (i = `0`; i < nBlocks; ++i) {
4242	blk = blocks[i];
4243	if (xMin < blk->xMax && blk->xMin < xMax && yMin < blk->yMax && blk->yMin < yMax) {
4244	for (line = blk->lines; line; line = line->next) {
4245	if (xMin < line->xMax && line->xMin < xMax && yMin < line->yMax && line->yMin < yMax) {
4246	idx0 = idx1 = -`1`;
4247	switch (line->rot) {
4248	case `0`:
4249	y = `0.5` * (line->yMin + line->yMax);
4250	if (yMin < y && y < yMax) {
4251	j = `0`;
4252	while (j < line->len) {
4253	if (`0.5` * (line->edge[j] + line->edge[j + `1`]) > xMin) {
4254	idx0 = j;
4255	break;
4256	}
4257	++j;
4258	}
4259	j = line->len - `1`;
4260	while (j >= `0`) {
4261	if (`0.5` * (line->edge[j] + line->edge[j + `1`]) < xMax) {
4262	idx1 = j;
4263	break;
4264	}
4265	--j;
4266	}
4267	}
4268	break;
4269	case `1`:
4270	x = `0.5` * (line->xMin + line->xMax);
4271	if (xMin < x && x < xMax) {
4272	j = `0`;
4273	while (j < line->len) {
4274	if (`0.5` * (line->edge[j] + line->edge[j + `1`]) > yMin) {
4275	idx0 = j;
4276	break;
4277	}
4278	++j;
4279	}
4280	j = line->len - `1`;
4281	while (j >= `0`) {
4282	if (`0.5` * (line->edge[j] + line->edge[j + `1`]) < yMax) {
4283	idx1 = j;
4284	break;
4285	}
4286	--j;
4287	}
4288	}
4289	break;
4290	case `2`:
4291	y = `0.5` * (line->yMin + line->yMax);
4292	if (yMin < y && y < yMax) {
4293	j = `0`;
4294	while (j < line->len) {
4295	if (`0.5` * (line->edge[j] + line->edge[j + `1`]) < xMax) {
4296	idx0 = j;
4297	break;
4298	}
4299	++j;
4300	}
4301	j = line->len - `1`;
4302	while (j >= `0`) {
4303	if (`0.5` * (line->edge[j] + line->edge[j + `1`]) > xMin) {
4304	idx1 = j;
4305	break;
4306	}
4307	--j;
4308	}
4309	}
4310	break;
4311	case `3`:
4312	x = `0.5` * (line->xMin + line->xMax);
4313	if (xMin < x && x < xMax) {
4314	j = `0`;
4315	while (j < line->len) {
4316	if (`0.5` * (line->edge[j] + line->edge[j + `1`]) < yMax) {
4317	idx0 = j;
4318	break;
4319	}
4320	++j;
4321	}
4322	j = line->len - `1`;
4323	while (j >= `0`) {
4324	if (`0.5` * (line->edge[j] + line->edge[j + `1`]) > yMin) {
4325	idx1 = j;
4326	break;
4327	}
4328	--j;
4329	}
4330	}
4331	break;
4332	}
4333	if (idx0 >= `0` && idx1 >= `0`) {
4334	if (nFrags == fragsSize) {
4335	fragsSize *= `2`;
4336	frags = (TextLineFrag )greallocn(p: frags, count: fragsSize, size: sizeof*(TextLineFrag));
4337	}
4338	frags[nFrags].init(lineA: line, startA: idx0, lenA: idx1 - idx0 + `1`);
4339	++nFrags;
4340	if (lastRot >= `0` && line->rot != lastRot) {
4341	oneRot = false;
4342	}
4343	lastRot = line->rot;
4344	}
4345	}
4346	}
4347	}
4348	}
4349
4350	// sort the fragments and generate the string
4351	if (nFrags > `0`) {
4352
4353	for (i = `0`; i < nFrags; ++i) {
4354	frags[i].computeCoords(oneRot);
4355	}
4356	assignColumns(frags, nFrags, rot: oneRot);
4357
4358	// if all lines in the region have the same rotation, use it;
4359	// otherwise, use the page's primary rotation
4360	if (oneRot) {
4361	qsort(base: frags, nmemb: nFrags, size: sizeof(TextLineFrag), compar: &TextLineFrag::cmpYXLineRot);
4362	} else {
4363	qsort(base: frags, nmemb: nFrags, size: sizeof(TextLineFrag), compar: &TextLineFrag::cmpYXPrimaryRot);
4364	}
4365	i = `0`;
4366	while (i < nFrags) {
4367	delta = maxIntraLineDelta * frags[i].line->words->fontSize;
4368	for (j = i + `1`; j < nFrags && fabs(x: frags[j].base - frags[i].base) < delta; ++j) {
4369	;
4370	}
4371	qsort(base: frags + i, nmemb: j - i, size: sizeof(TextLineFrag), compar: oneRot ? &TextLineFrag::cmpXYColumnLineRot : &TextLineFrag::cmpXYColumnPrimaryRot);
4372	i = j;
4373	}
4374
4375	col = `0`;
4376	multiLine = false;
4377	for (i = `0`; i < nFrags; ++i) {
4378	frag = &frags[i];
4379
4380	// insert a return
4381	if (frag->col < col \|\| (i > `0` && fabs(x: frag->base - frags[i - `1`].base) > maxIntraLineDelta * frags[i - `1`].line->words->fontSize)) {
4382	s->append(str: eol, lengthA: eolLen);
4383	col = `0`;
4384	multiLine = true;
4385	}
4386
4387	// column alignment
4388	for (; col < frag->col; ++col) {
4389	s->append(str: space, lengthA: spaceLen);
4390	}
4391
4392	// get the fragment text
4393	col += dumpFragment(text: frag->line->text + frag->start, len: frag->len, uMap, s);
4394	}
4395
4396	if (multiLine) {
4397	s->append(str: eol, lengthA: eolLen);
4398	}
4399	}
4400
4401	gfree(p: frags);
4402
4403	return s;
4404	}
4405
4406	class TextSelectionVisitor
4407	{
4408	public:
4409	explicit TextSelectionVisitor(TextPage *page);
4410	virtual ~TextSelectionVisitor();
4411	TextSelectionVisitor(const TextSelectionVisitor &) = delete;
4412	TextSelectionVisitor &operator=(const TextSelectionVisitor &) = delete;
4413	virtual void visitBlock(TextBlock block, TextLine begin, TextLine end, const* PDFRectangle *selection) = `0`;
4414	virtual void visitLine(TextLine line, TextWord begin, TextWord end, int* edge_begin, int edge_end, const PDFRectangle *selection) = `0`;
4415	virtual void visitWord(TextWord word, int* begin, int end, const PDFRectangle *selection) = `0`;
4416
4417	protected:
4418	TextPage *page;
4419	};
4420
4421	TextSelectionVisitor::TextSelectionVisitor(TextPage *p) : page(p) { }
4422
4423	TextSelectionVisitor::~TextSelectionVisitor() = default;
4424
4425	class TextSelectionDumper : public TextSelectionVisitor
4426	{
4427	public:
4428	explicit TextSelectionDumper(TextPage *page);
4429	~TextSelectionDumper() override;
4430
4431	void visitBlock(TextBlock block, TextLine begin, TextLine end, const* PDFRectangle *selection) override {};
4432	void visitLine(TextLine line, TextWord begin, TextWord end, int* edge_begin, int edge_end, const PDFRectangle *selection) override;
4433	void visitWord(TextWord word, int* begin, int end, const PDFRectangle *selection) override;
4434	void endPage();
4435
4436	GooString *getText();
4437	std::vector<TextWordSelection > takeWordList(int* *nLines);
4438
4439	private:
4440	void startLine();
4441	void finishLine();
4442
4443	std::vector<TextWordSelection > *lines;
4444	int nLines, linesSize;
4445	std::vector<TextWordSelection > words;
4446	int tableId;
4447	TextBlock *currentBlock;
4448	};
4449
4450	TextSelectionDumper::TextSelectionDumper(TextPage *p) : TextSelectionVisitor (p)
4451	{
4452	linesSize = `256`;
4453	lines = (std::vector<TextWordSelection > )gmallocn(count: linesSize, size: sizeof(std::vector<TextWordSelection > *));
4454	nLines = `0`;
4455
4456	tableId = -`1`;
4457	currentBlock = nullptr;
4458	words = nullptr;
4459	}
4460
4461	TextSelectionDumper::~TextSelectionDumper()
4462	{
4463	for (int i = `0`; i < nLines; i++) {
4464	for (auto entry : *(lines[i])) {
4465	delete entry;
4466	}
4467	delete lines[i];
4468	}
4469	gfree(p: lines);
4470	}
4471
4472	void TextSelectionDumper::startLine()
4473	{
4474	finishLine();
4475	words = new std::vector<TextWordSelection *>();
4476	}
4477
4478	void TextSelectionDumper::finishLine()
4479	{
4480	if (nLines == linesSize) {
4481	linesSize *= `2`;
4482	lines = (std::vector<TextWordSelection > )grealloc(p: lines, size: linesSize sizeof(std::vector<TextWordSelection > ));
4483	}
4484
4485	if (words && words->size() > `0`) {
4486	// Reverse word order for RTL text. Fixes #53 for glib backend (Evince)
4487	if (!page->primaryLR) {
4488	std::reverse(first: words->begin(), last: words->end());
4489	}
4490
4491	lines[nLines++] = words;
4492	} else if (words) {
4493	delete words;
4494	}
4495	words = nullptr;
4496	}
4497
4498	void TextSelectionDumper::visitLine(TextLine line, TextWord begin, TextWord end, int* edge_begin, int edge_end, const PDFRectangle *selection)
4499	{
4500	TextLineFrag frag;
4501
4502	frag.init(lineA: line, startA: edge_begin, lenA: edge_end - edge_begin);
4503
4504	if (tableId >= `0` && frag.line->blk->tableId < `0`) {
4505	finishLine();
4506
4507	tableId = -`1`;
4508	currentBlock = nullptr;
4509	}
4510
4511	if (frag.line->blk->tableId >= `0`) { // a table
4512	if (tableId == -`1`) {
4513	tableId = frag.line->blk->tableId;
4514	currentBlock = frag.line->blk;
4515	}
4516
4517	if (currentBlock == frag.line->blk) { // the same block
4518	startLine();
4519	} else { // another block
4520	if (currentBlock->tableEnd) { // previous block ended its row
4521	startLine();
4522	}
4523	currentBlock = frag.line->blk;
4524	}
4525	} else { // not a table
4526	startLine();
4527	}
4528	}
4529
4530	void TextSelectionDumper::visitWord(TextWord word, int* begin, int end, const PDFRectangle *selection)
4531	{
4532	words->push_back(x: new TextWordSelection (word, begin, end));
4533	}
4534
4535	void TextSelectionDumper::endPage()
4536	{
4537	finishLine();
4538	}
4539
4540	GooString *TextSelectionDumper::getText()
4541	{
4542	GooString *text;
4543	int i;
4544	const UnicodeMap *uMap;
4545	char space[`8`], eol[`16`];
4546	int spaceLen, eolLen;
4547
4548	text = new GooString ();
4549
4550	if (!(uMap = globalParams ->getTextEncoding())) {
4551	return text;
4552	}
4553
4554	spaceLen = uMap->mapUnicode(u: `0x20`, buf: space, bufSize: sizeof(space));
4555	eolLen = uMap->mapUnicode(u: `0x0a`, buf: eol, bufSize: sizeof(eol));
4556
4557	std::vector<Unicode> uText;
4558	for (i = `0`; i < nLines; i++) {
4559	std::vector<TextWordSelection > lineWords = lines[i];
4560	for (std::size_t j = `0`; j < lineWords->size(); j++) {
4561	TextWordSelection sel = (lineWords)[j];
4562
4563	uText.resize(new_size: sel->end - sel->begin);
4564	std::transform(first: sel->word->chars.begin() + sel->begin, last: sel->word->chars.begin() + sel->end, result: uText.begin(), unary_op: [](auto &c) { return c.text; });
4565	page->dumpFragment(text: uText.data(), len: uText.size(), uMap, s: text);
4566
4567	if (j < lineWords->size() - `1` && sel->word->spaceAfter) {
4568	text->append(str: space, lengthA: spaceLen);
4569	}
4570	}
4571	if (i < nLines - `1`) {
4572	text->append(str: eol, lengthA: eolLen);
4573	}
4574	}
4575
4576	return text;
4577	}
4578
4579	std::vector<TextWordSelection > TextSelectionDumper::takeWordList(int* *nLinesOut)
4580	{
4581	std::vector<TextWordSelection > *returnValue = lines;
4582
4583	*nLinesOut = nLines;
4584	if (nLines == `0`) {
4585	return nullptr;
4586	}
4587
4588	nLines = `0`;
4589	lines = nullptr;
4590
4591	return returnValue;
4592	}
4593
4594	class TextSelectionSizer : public TextSelectionVisitor
4595	{
4596	public:
4597	TextSelectionSizer(TextPage page, double* scale);
4598	~TextSelectionSizer() override { delete list; }
4599
4600	void visitBlock(TextBlock block, TextLine begin, TextLine end, const* PDFRectangle *selection) override {};
4601	void visitLine(TextLine line, TextWord begin, TextWord end, int* edge_begin, int edge_end, const PDFRectangle *selection) override;
4602	void visitWord(TextWord word, int* begin, int end, const PDFRectangle *selection) override {};
4603
4604	std::vector<PDFRectangle > takeRegion()
4605	{
4606	auto aux = list;
4607	list = nullptr;
4608	return aux;
4609	}
4610
4611	private:
4612	std::vector<PDFRectangle > list;
4613	double scale;
4614	};
4615
4616	TextSelectionSizer::TextSelectionSizer(TextPage p, double* s) : TextSelectionVisitor (p), scale(s)
4617	{
4618	list = new std::vector<PDFRectangle *>();
4619	}
4620
4621	void TextSelectionSizer::visitLine(TextLine line, TextWord begin, TextWord end, int* edge_begin, int edge_end, const PDFRectangle *selection)
4622	{
4623	PDFRectangle *rect;
4624	double x1, y1, x2, y2, margin;
4625
4626	switch (line->rot) {
4627	default:
4628	case `0`:
4629	margin = (line->yMax - line->yMin) / `8`;
4630	x1 = line->edge[edge_begin];
4631	x2 = line->edge[edge_end];
4632	y1 = line->yMin - margin;
4633	y2 = line->yMax + margin;
4634	break;
4635	case `1`:
4636	margin = (line->xMax - line->xMin) / `8`;
4637	x1 = line->xMin - margin;
4638	x2 = line->xMax + margin;
4639	y1 = line->edge[edge_begin];
4640	y2 = line->edge[edge_end];
4641	break;
4642	case `2`:
4643	margin = (line->yMax - line->yMin) / `8`;
4644	x1 = line->edge[edge_end];
4645	x2 = line->edge[edge_begin];
4646	y1 = line->yMin - margin;
4647	y2 = line->yMax + margin;
4648	break;
4649	case `3`:
4650	margin = (line->xMax - line->xMin) / `8`;
4651	x1 = line->xMin - margin;
4652	x2 = line->xMax + margin;
4653	y1 = line->edge[edge_end];
4654	y2 = line->edge[edge_begin];
4655	break;
4656	}
4657
4658	rect = new PDFRectangle (floor(x: x1 * scale), floor(x: y1 * scale), ceil(x: x2 * scale), ceil(x: y2 * scale));
4659	list->push_back(x: rect);
4660	}
4661
4662	class TextSelectionPainter : public TextSelectionVisitor
4663	{
4664	public:
4665	TextSelectionPainter(TextPage page, double* scale, int rotation, OutputDev out, const* GfxColor box_color, const* GfxColor *glyph_color);
4666	~TextSelectionPainter() override;
4667
4668	void visitBlock(TextBlock block, TextLine begin, TextLine end, const* PDFRectangle *selection) override {};
4669	void visitLine(TextLine line, TextWord begin, TextWord end, int* edge_begin, int edge_end, const PDFRectangle *selection) override;
4670	void visitWord(TextWord word, int* begin, int end, const PDFRectangle *selection) override;
4671	void endPage();
4672
4673	private:
4674	OutputDev *out;
4675	const GfxColor *glyph_color;
4676	GfxState *state;
4677	std::vector<TextWordSelection > selectionList;
4678	Matrix ctm, ictm;
4679	bool hasGlyphLessFont();
4680	};
4681
4682	TextSelectionPainter::TextSelectionPainter(TextPage p, double* scale, int rotation, OutputDev outA, const* GfxColor box_color, const* GfxColor *glyph_colorA) : TextSelectionVisitor (p), out(outA), glyph_color(glyph_colorA)
4683	{
4684	PDFRectangle box(`0`, `0`, p->pageWidth, p->pageHeight);
4685
4686	selectionList = new std::vector<TextWordSelection *>();
4687	state = new GfxState (`72` * scale, `72` * scale, &box, rotation, false);
4688
4689	state->getCTM(m: &ctm);
4690	ctm.invertTo(other: &ictm);
4691
4692	out->startPage(pageNum: `0`, state, xref: nullptr);
4693	out->setDefaultCTM(state->getCTM());
4694
4695	state->setFillColorSpace(new GfxDeviceRGBColorSpace ());
4696	state->setFillColor(box_color);
4697	out->updateFillColor(state);
4698	}
4699
4700	TextSelectionPainter::~TextSelectionPainter()
4701	{
4702	for (auto entry : *selectionList) {
4703	delete entry;
4704	}
4705	delete selectionList;
4706	delete state;
4707	}
4708
4709	void TextSelectionPainter::visitLine(TextLine line, TextWord begin, TextWord end, int* edge_begin, int edge_end, const PDFRectangle *selection)
4710	{
4711	double x1, y1, x2, y2, margin;
4712
4713	switch (line->rot) {
4714	default:
4715	case `0`:
4716	margin = (line->yMax - line->yMin) / `8`;
4717	x1 = line->edge[edge_begin];
4718	x2 = line->edge[edge_end];
4719	y1 = line->yMin - margin;
4720	y2 = line->yMax + margin;
4721	break;
4722	case `1`:
4723	margin = (line->xMax - line->xMin) / `8`;
4724	x1 = line->xMin - margin;
4725	x2 = line->xMax + margin;
4726	y1 = line->edge[edge_begin];
4727	y2 = line->edge[edge_end];
4728	break;
4729	case `2`:
4730	margin = (line->yMax - line->yMin) / `8`;
4731	x1 = line->edge[edge_end];
4732	x2 = line->edge[edge_begin];
4733	y1 = line->yMin - margin;
4734	y2 = line->yMax + margin;
4735	break;
4736	case `3`:
4737	margin = (line->xMax - line->xMin) / `8`;
4738	x1 = line->xMin - margin;
4739	x2 = line->xMax + margin;
4740	y1 = line->edge[edge_end];
4741	y2 = line->edge[edge_begin];
4742	break;
4743	}
4744
4745	ctm.transform(x: x1, y: y1, tx: &x1, ty: &y1);
4746	ctm.transform(x: x2, y: y2, tx: &x2, ty: &y2);
4747
4748	if (x1 < x2) {
4749	x1 = floor(x: x1);
4750	x2 = ceil(x: x2);
4751	} else {
4752	x1 = ceil(x: x1);
4753	x2 = floor(x: x2);
4754	}
4755
4756	if (y1 < y2) {
4757	y1 = floor(x: y1);
4758	y2 = ceil(x: y2);
4759	} else {
4760	y1 = ceil(x: y1);
4761	y2 = floor(x: y2);
4762	}
4763
4764	ictm.transform(x: x1, y: y1, tx: &x1, ty: &y1);
4765	ictm.transform(x: x2, y: y2, tx: &x2, ty: &y2);
4766
4767	state->moveTo(x: x1, y: y1);
4768	state->lineTo(x: x2, y: y1);
4769	state->lineTo(x: x2, y: y2);
4770	state->lineTo(x: x1, y: y2);
4771	state->closePath();
4772	}
4773
4774	void TextSelectionPainter::visitWord(TextWord word, int* begin, int end, const PDFRectangle *selection)
4775	{
4776	selectionList->push_back(x: new TextWordSelection (word, begin, end));
4777	}
4778
4779	bool TextSelectionPainter::hasGlyphLessFont()
4780	{
4781	if (selectionList && selectionList->size()) {
4782	TextWordSelection sel = (selectionList)[`0`];
4783	return sel->word->invisible;
4784	}
4785
4786	return false;
4787	}
4788
4789	void TextSelectionPainter::endPage()
4790	{
4791	/ Take a shortcut for glyphless fonts (eg. Tesseract scanned documents)*
4792	* cause we just paint a transparent fill over existent text.Issue #157 */
4793	if (hasGlyphLessFont()) {
4794	state->setFillOpacity(glyphlessSelectionOpacity);
4795	out->updateFillOpacity(state);
4796	out->fill(state);
4797	out->endPage();
4798	return;
4799	}
4800
4801	out->fill(state);
4802
4803	out->saveState(state);
4804	out->clip(state);
4805
4806	state->clearPath();
4807
4808	state->setFillColor(glyph_color);
4809
4810	out->updateFillColor(state);
4811
4812	GooString string;
4813	for (const TextWordSelection sel : selectionList) {
4814	int begin = sel->begin;
4815
4816	while (begin < sel->end) {
4817	TextFontInfo *font = sel->word->chars [begin].font;
4818	const Matrix *mat = &sel->word->chars [begin].textMat;
4819
4820	state->setTextMat(a: mat->m[`0`], b: mat->m[`1`], c: mat->m[`2`], d: mat->m[`3`], e: `0`, f: `0`);
4821	state->setFont(fontA: font->gfxFont, fontSizeA: `1`);
4822	out->updateFont(state);
4823
4824	int fEnd = begin + `1`;
4825	while (fEnd < sel->end && font->matches(fontInfo: sel->word->chars [fEnd].font) //
4826	&& mat->m[`0`] == sel->word->chars [fEnd].textMat.m[`0`] && mat->m[`1`] == sel->word->chars [fEnd].textMat.m[`1`] //
4827	&& mat->m[`2`] == sel->word->chars [fEnd].textMat.m[`2`] && mat->m[`3`] == sel->word->chars [fEnd].textMat.m[`3`]) {
4828	fEnd++;
4829	}
4830
4831	/ The only purpose of this string is to let the output device query*
4832	* it's length. Might want to change this interface later. */
4833	string.clear();
4834	std::for_each(first: sel->word->chars.begin() + begin, last: sel->word->chars.begin() + fEnd, f: [&string](const auto c) { string.append(c.charcode); });
4835	out->beginString(state, &string);
4836
4837	for (int j = begin; j < fEnd; j++) {
4838	const auto &charJ = sel->word->chars [j];
4839	if (j != begin && charJ.charPos == sel->word->chars [j - `1`].charPos) {
4840	continue;
4841	}
4842	out->drawChar(state, charJ.textMat.m[`4`], charJ.textMat.m[`5`], `0`, `0`, `0`, `0`, charJ.charcode, `1`, nullptr, `0`);
4843	}
4844	out->endString(state);
4845	begin = fEnd;
4846	}
4847	}
4848
4849	out->restoreState(state);
4850	out->endPage();
4851	}
4852
4853	void TextWord::visitSelection(TextSelectionVisitor visitor, const* PDFRectangle *selection, SelectionStyle style)
4854	{
4855	double mid, s1, s2;
4856
4857	if (rot == `0` \|\| rot == `2`) {
4858	s1 = selection->x1;
4859	s2 = selection->x2;
4860	} else {
4861	s1 = selection->y1;
4862	s2 = selection->y2;
4863	}
4864
4865	size_t begin = len();
4866	size_t end = `0`;
4867	for (size_t i = `0`; i < len(); i++) {
4868	if (i + `1` < len()) {
4869	mid = (chars [i].edge + chars [i + `1`].edge) / `2`;
4870	} else {
4871	mid = (chars [i].edge + edgeEnd) / `2`;
4872	}
4873	if (XBetweenAB(mid, s1, s2)) {
4874	if (i < begin) {
4875	begin = i;
4876	}
4877
4878	end = i + `1`;
4879	}
4880	}
4881
4882	/ Skip empty selection. /
4883	if (end <= begin) {
4884	return;
4885	}
4886
4887	visitor->visitWord(word: this, begin, end, selection);
4888	}
4889
4890	void TextLine::visitSelection(TextSelectionVisitor visitor, const* PDFRectangle *selection, SelectionStyle style)
4891	{
4892	TextWord p, begin, end, current;
4893	int i, edge_begin, edge_end;
4894	PDFRectangle child_selection;
4895	double s1, s2, pMin, pMax;
4896
4897	if (rot == `0` \|\| rot == `2`) {
4898	s1 = selection->x1;
4899	s2 = selection->x2;
4900	} else {
4901	s1 = selection->y1;
4902	s2 = selection->y2;
4903	}
4904
4905	begin = nullptr;
4906	end = nullptr;
4907	current = nullptr;
4908	for (p = words; p != nullptr; p = p->next) {
4909	if (rot == `0` \|\| rot == `2`) {
4910	pMin = p->xMin;
4911	pMax = p->xMax;
4912	} else {
4913	pMin = p->yMin;
4914	pMax = p->yMax;
4915	}
4916
4917	if (blk->page->primaryLR) {
4918	if (((s1 < pMax) \|\| (s2 < pMax)) && begin == nullptr) {
4919	begin = p;
4920	}
4921
4922	if (((s1 > pMin) \|\| (s2 > pMin)) && begin != nullptr) {
4923	end = p->next;
4924	current = p;
4925	}
4926	} else {
4927	if (((s1 > pMin) \|\| (s2 > pMin)) && begin == nullptr) {
4928	begin = p;
4929	}
4930
4931	if (((s1 < pMax) \|\| (s2 < pMax)) && begin != nullptr) {
4932	end = p->next;
4933	current = p;
4934	}
4935	}
4936	}
4937
4938	if (!current) {
4939	current = begin;
4940	}
4941
4942	child_selection = *selection;
4943	if (style == selectionStyleWord) {
4944	if (rot == `0` \|\| rot == `2`) {
4945	child_selection.x1 = begin ? begin->xMin : xMin;
4946	if (end && end->xMax != -`1`) {
4947	child_selection.x2 = current->xMax;
4948	} else {
4949	child_selection.x2 = xMax;
4950	}
4951	} else {
4952	child_selection.y1 = begin ? begin->yMin : yMin;
4953	if (end && end->yMax != -`1`) {
4954	child_selection.y2 = current->yMax;
4955	} else {
4956	child_selection.y2 = yMax;
4957	}
4958	}
4959	}
4960
4961	if (rot == `0` \|\| rot == `2`) {
4962	s1 = child_selection.x1;
4963	s2 = child_selection.x2;
4964	} else {
4965	s1 = child_selection.y1;
4966	s2 = child_selection.y2;
4967	}
4968
4969	edge_begin = len;
4970	edge_end = `0`;
4971	for (i = `0`; i < len; i++) {
4972	double mid = (edge[i] + edge[i + `1`]) / `2`;
4973	if (XBetweenAB(mid, s1, s2)) {
4974	if (i < edge_begin) {
4975	edge_begin = i;
4976	}
4977
4978	edge_end = i + `1`;
4979	}
4980	}
4981
4982	/ Skip empty selection. /
4983	if (edge_end <= edge_begin) {
4984	return;
4985	}
4986
4987	visitor->visitLine(line: this, begin, end, edge_begin, edge_end, selection: &child_selection);
4988
4989	for (p = begin; p != end; p = p->next) {
4990	p->visitSelection(visitor, selection: &child_selection, style);
4991	}
4992	}
4993
4994	void TextBlock::visitSelection(TextSelectionVisitor visitor, const* PDFRectangle *selection, SelectionStyle style)
4995	{
4996	PDFRectangle child_selection;
4997	double x[`2`], y[`2`], d, best_d[`2`];
4998	TextLine p, best_line[`2`];
4999	int i, count = `0`, best_count[`2`], start, stop;
5000	bool all[`2`];
5001
5002	x[`0`] = selection->x1;
5003	y[`0`] = selection->y1;
5004	x[`1`] = selection->x2;
5005	y[`1`] = selection->y2;
5006
5007	for (i = `0`; i < `2`; i++) {
5008	// the first/last lines are often not nearest
5009	// the corners, so we have to force them to be
5010	// selected when the selection runs outside this
5011	// block.
5012	if (page->primaryLR) {
5013	all[i] = x[i] >= this->xMax && y[i] >= this->yMax;
5014	if (x[i] <= this->xMin && y[i] <= this->yMin) {
5015	best_line[i] = this->lines;
5016	best_count[i] = `1`;
5017	} else {
5018	best_line[i] = nullptr;
5019	best_count[i] = `0`;
5020	}
5021	} else {
5022	all[i] = x[i] <= this->xMin && y[i] >= this->yMax;
5023	if (x[i] >= this->xMax && y[i] <= this->yMin) {
5024	best_line[i] = this->lines;
5025	best_count[i] = `1`;
5026	} else {
5027	best_line[i] = nullptr;
5028	best_count[i] = `0`;
5029	}
5030	}
5031	best_d[i] = `0`;
5032	}
5033
5034	// find the nearest line to the selection points
5035	// using the manhattan distance.
5036	for (p = this->lines; p; p = p->next) {
5037	count++;
5038	for (i = `0`; i < `2`; i++) {
5039	d = fmax(x: p->xMin - x[i], y: `0.0`) + fmax(x: x[i] - p->xMax, y: `0.0`) + fmax(x: p->yMin - y[i], y: `0.0`) + fmax(x: y[i] - p->yMax, y: `0.0`);
5040	if (!best_line[i] \|\| all[i] \|\| d < best_d[i]) {
5041	best_line[i] = p;
5042	best_count[i] = count;
5043	best_d[i] = d;
5044	}
5045	}
5046	}
5047	// assert: best is always set.
5048	if (!best_line[`0`] \|\| !best_line[`1`]) {
5049	return;
5050	}
5051
5052	// Now decide which point was first.
5053	if (best_count[`0`] < best_count[`1`] \|\| (best_count[`0`] == best_count[`1`] && y[`0`] < y[`1`])) {
5054	start = `0`;
5055	stop = `1`;
5056	} else {
5057	start = `1`;
5058	stop = `0`;
5059	}
5060
5061	visitor->visitBlock(block: this, begin: best_line[start], end: best_line[stop], selection);
5062
5063	for (p = best_line[start]; p; p = p->next) {
5064	if (page->primaryLR) {
5065	child_selection.x1 = p->xMin;
5066	child_selection.x2 = p->xMax;
5067	} else {
5068	child_selection.x1 = p->xMax;
5069	child_selection.x2 = p->xMin;
5070	}
5071	child_selection.y1 = p->yMin;
5072	child_selection.y2 = p->yMax;
5073	if (style == selectionStyleLine) {
5074	if (p == best_line[start]) {
5075	child_selection.x1 = `0`;
5076	child_selection.y1 = `0`;
5077	}
5078	if (p == best_line[stop]) {
5079	child_selection.x2 = page->pageWidth;
5080	child_selection.y2 = page->pageHeight;
5081	}
5082	} else {
5083	if (p == best_line[start]) {
5084	child_selection.x1 = fmax(x: p->xMin, y: fmin(x: p->xMax, y: x[start]));
5085	child_selection.y1 = fmax(x: p->yMin, y: fmin(x: p->yMax, y: y[start]));
5086	}
5087	if (p == best_line[stop]) {
5088	child_selection.x2 = fmax(x: p->xMin, y: fmin(x: p->xMax, y: x[stop]));
5089	child_selection.y2 = fmax(x: p->yMin, y: fmin(x: p->yMax, y: y[stop]));
5090	}
5091	}
5092	p->visitSelection(visitor, selection: &child_selection, style);
5093	if (p == best_line[stop]) {
5094	return;
5095	}
5096	}
5097	}
5098
5099	void TextPage::visitSelection(TextSelectionVisitor visitor, const* PDFRectangle *selection, SelectionStyle style)
5100	{
5101	PDFRectangle child_selection;
5102	double x[`2`], y[`2`], d, best_d[`2`];
5103	double xMin, yMin, xMax, yMax;
5104	TextFlow flow, best_flow[`2`];
5105	TextBlock blk, best_block[`2`];
5106	int i, count = `0`, best_count[`2`], start, stop;
5107
5108	if (!flows) {
5109	return;
5110	}
5111
5112	x[`0`] = selection->x1;
5113	y[`0`] = selection->y1;
5114	x[`1`] = selection->x2;
5115	y[`1`] = selection->y2;
5116
5117	xMin = pageWidth;
5118	yMin = pageHeight;
5119	xMax = `0.0`;
5120	yMax = `0.0`;
5121
5122	for (i = `0`; i < `2`; i++) {
5123	best_block[i] = nullptr;
5124	best_flow[i] = nullptr;
5125	best_count[i] = `0`;
5126	best_d[i] = `0`;
5127	}
5128
5129	// find the nearest blocks to the selection points
5130	// using the manhattan distance.
5131	for (flow = flows; flow; flow = flow->next) {
5132	for (blk = flow->blocks; blk; blk = blk->next) {
5133	count++;
5134	// the first/last blocks in reading order are
5135	// often not the closest to the page corners;
5136	// track the corners, force those blocks to
5137	// be selected if the selection runs across
5138	// multiple pages.
5139	xMin = fmin(x: xMin, y: blk->xMin);
5140	yMin = fmin(x: yMin, y: blk->yMin);
5141	xMax = fmax(x: xMax, y: blk->xMax);
5142	yMax = fmax(x: yMax, y: blk->yMax);
5143	for (i = `0`; i < `2`; i++) {
5144	d = fmax(x: blk->xMin - x[i], y: `0.0`) + fmax(x: x[i] - blk->xMax, y: `0.0`) + fmax(x: blk->yMin - y[i], y: `0.0`) + fmax(x: y[i] - blk->yMax, y: `0.0`);
5145	if (!best_block[i] \|\| d < best_d[i] \|\| (!blk->next && !flow->next && x[i] >= fmin(x: xMax, y: pageWidth) && y[i] >= fmin(x: yMax, y: pageHeight))) {
5146	best_block[i] = blk;
5147	best_flow[i] = flow;
5148	best_count[i] = count;
5149	best_d[i] = d;
5150	}
5151	}
5152	}
5153	}
5154	for (i = `0`; i < `2`; i++) {
5155	if (primaryLR) {
5156	if (x[i] < xMin && y[i] < yMin) {
5157	best_block[i] = flows->blocks;
5158	best_flow[i] = flows;
5159	best_count[i] = `1`;
5160	}
5161	} else {
5162	if (x[i] > xMax && y[i] < yMin) {
5163	best_block[i] = flows->blocks;
5164	best_flow[i] = flows;
5165	best_count[i] = `1`;
5166	}
5167	}
5168	}
5169	// assert: best is always set.
5170	if (!best_block[`0`] \|\| !best_block[`1`]) {
5171	return;
5172	}
5173
5174	// Now decide which point was first.
5175	if (best_count[`0`] < best_count[`1`] \|\| (best_count[`0`] == best_count[`1`] && y[`0`] < y[`1`])) {
5176	start = `0`;
5177	stop = `1`;
5178	} else {
5179	start = `1`;
5180	stop = `0`;
5181	}
5182
5183	for (flow = best_flow[start]; flow; flow = flow->next) {
5184	if (flow == best_flow[start]) {
5185	blk = best_block[start];
5186	} else {
5187	blk = flow->blocks;
5188	}
5189	for (; blk; blk = blk->next) {
5190	if (primaryLR) {
5191	child_selection.x1 = blk->xMin;
5192	child_selection.x2 = blk->xMax;
5193	} else {
5194	child_selection.x1 = blk->xMax;
5195	child_selection.x2 = blk->xMin;
5196	}
5197	child_selection.y1 = blk->yMin;
5198	child_selection.y2 = blk->yMax;
5199	if (blk == best_block[start]) {
5200	child_selection.x1 = fmax(x: blk->xMin, y: fmin(x: blk->xMax, y: x[start]));
5201	child_selection.y1 = fmax(x: blk->yMin, y: fmin(x: blk->yMax, y: y[start]));
5202	}
5203	if (blk == best_block[stop]) {
5204	child_selection.x2 = fmax(x: blk->xMin, y: fmin(x: blk->xMax, y: x[stop]));
5205	child_selection.y2 = fmax(x: blk->yMin, y: fmin(x: blk->yMax, y: y[stop]));
5206	blk->visitSelection(visitor, selection: &child_selection, style);
5207	return;
5208	}
5209	blk->visitSelection(visitor, selection: &child_selection, style);
5210	}
5211	}
5212	}
5213
5214	void TextPage::drawSelection(OutputDev out, double* scale, int rotation, const PDFRectangle selection, SelectionStyle style, const* GfxColor glyph_color, const* GfxColor *box_color)
5215	{
5216	TextSelectionPainter painter(this, scale, rotation, out, box_color, glyph_color);
5217
5218	visitSelection(visitor: &painter, selection, style);
5219	painter.endPage();
5220	}
5221
5222	std::vector<PDFRectangle > TextPage::getSelectionRegion(const PDFRectangle selection, SelectionStyle style, double* scale)
5223	{
5224	TextSelectionSizer sizer(this, scale);
5225
5226	visitSelection(visitor: &sizer, selection, style);
5227
5228	return sizer.takeRegion();
5229	}
5230
5231	GooString TextPage::getSelectionText(const* PDFRectangle *selection, SelectionStyle style)
5232	{
5233	TextSelectionDumper dumper(this);
5234
5235	visitSelection(visitor: &dumper, selection, style);
5236	dumper.endPage();
5237
5238	return dumper.getText();
5239	}
5240
5241	std::vector<TextWordSelection > TextPage::getSelectionWords(const* PDFRectangle selection, SelectionStyle style, int* *nLines)
5242	{
5243	TextSelectionDumper dumper(this);
5244
5245	visitSelection(visitor: &dumper, selection, style);
5246	dumper.endPage();
5247
5248	return dumper.takeWordList(nLinesOut: nLines);
5249	}
5250
5251	bool TextPage::findCharRange(int pos, int length, double xMin, double* yMin, double* xMax, double* yMax) const*
5252	{
5253	TextBlock *blk;
5254	TextLine *line;
5255	TextWord *word;
5256	double xMin0, xMax0, yMin0, yMax0;
5257	double xMin1, xMax1, yMin1, yMax1;
5258	bool first;
5259
5260	if (rawOrder) {
5261	return false;
5262	}
5263
5264	//~ this doesn't correctly handle ranges split across multiple lines
5265	//~ (the highlighted region is the bounding box of all the parts of
5266	//~ the range)
5267	first = true;
5268	xMin0 = xMax0 = yMin0 = yMax0 = `0`; // make gcc happy
5269	xMin1 = xMax1 = yMin1 = yMax1 = `0`; // make gcc happy
5270	for (int i = `0`; i < nBlocks; ++i) {
5271	blk = blocks[i];
5272	for (line = blk->lines; line; line = line->next) {
5273	for (word = line->words; word; word = word->next) {
5274	if (pos < word->charPosEnd && pos + length > word->chars.front().charPos) {
5275	size_t j0, j1;
5276	for (j0 = `0`; (j0 + `1`) < word->len() && pos >= word->chars [j0 + `1`].charPos; ++j0) {
5277	;
5278	}
5279	for (j1 = word->len(); j1 > j0 && pos + length <= word->chars [j1].charPos; --j1) {
5280	;
5281	}
5282	auto startingEdge = word->chars [j0].edge;
5283	auto endingEdge = (j1 + `1` == word->len()) ? word->edgeEnd : word->chars [j1 + `1`].edge;
5284	switch (line->rot) {
5285	case `0`:
5286	xMin1 = startingEdge;
5287	xMax1 = endingEdge;
5288	yMin1 = word->yMin;
5289	yMax1 = word->yMax;
5290	break;
5291	case `1`:
5292	xMin1 = word->xMin;
5293	xMax1 = word->xMax;
5294	yMin1 = startingEdge;
5295	yMax1 = endingEdge;
5296	break;
5297	case `2`:
5298	xMin1 = endingEdge;
5299	xMax1 = startingEdge;
5300	yMin1 = word->yMin;
5301	yMax1 = word->yMax;
5302	break;
5303	case `3`:
5304	xMin1 = word->xMin;
5305	xMax1 = word->xMax;
5306	yMin1 = endingEdge;
5307	yMax1 = startingEdge;
5308	break;
5309	}
5310	if (first \|\| xMin1 < xMin0) {
5311	xMin0 = xMin1;
5312	}
5313	if (first \|\| xMax1 > xMax0) {
5314	xMax0 = xMax1;
5315	}
5316	if (first \|\| yMin1 < yMin0) {
5317	yMin0 = yMin1;
5318	}
5319	if (first \|\| yMax1 > yMax0) {
5320	yMax0 = yMax1;
5321	}
5322	first = false;
5323	}
5324	}
5325	}
5326	}
5327	if (!first) {
5328	*xMin = xMin0;
5329	*xMax = xMax0;
5330	*yMin = yMin0;
5331	*yMax = yMax0;
5332	return true;
5333	}
5334	return false;
5335	}
5336
5337	void TextPage::dump(void outputStream, TextOutputFunc outputFunc, bool* physLayout, EndOfLineKind textEOL, bool pageBreaks)
5338	{
5339	const UnicodeMap *uMap;
5340	TextFlow *flow;
5341	TextBlock *blk;
5342	TextLine *line;
5343	TextLineFrag *frags;
5344	TextWord *word;
5345	int nFrags, fragsSize;
5346	TextLineFrag *frag;
5347	char space[`8`], eol[`16`], eop[`8`];
5348	int spaceLen, eolLen, eopLen;
5349	double delta;
5350	int col, i, j, d, n;
5351
5352	// get the output encoding
5353	if (!(uMap = globalParams ->getTextEncoding())) {
5354	return;
5355	}
5356	spaceLen = uMap->mapUnicode(u: `0x20`, buf: space, bufSize: sizeof(space));
5357	eolLen = `0`; // make gcc happy
5358	switch (textEOL) {
5359	case eolUnix:
5360	eolLen = uMap->mapUnicode(u: `0x0a`, buf: eol, bufSize: sizeof(eol));
5361	break;
5362	case eolDOS:
5363	eolLen = uMap->mapUnicode(u: `0x0d`, buf: eol, bufSize: sizeof(eol));
5364	eolLen += uMap->mapUnicode(u: `0x0a`, buf: eol + eolLen, bufSize: sizeof(eol) - eolLen);
5365	break;
5366	case eolMac:
5367	eolLen = uMap->mapUnicode(u: `0x0d`, buf: eol, bufSize: sizeof(eol));
5368	break;
5369	}
5370	eopLen = uMap->mapUnicode(u: `0x0c`, buf: eop, bufSize: sizeof(eop));
5371
5372	//~ writing mode (horiz/vert)
5373
5374	// output the page in raw (content stream) order
5375	if (rawOrder) {
5376
5377	GooString s;
5378	std::vector<Unicode> uText;
5379
5380	for (word = rawWords; word; word = word->next) {
5381	s.clear();
5382	uText.resize(new_size: word->len());
5383	std::transform(first: word->chars.begin(), last: word->chars.end(), result: uText.begin(), unary_op: [](auto &c) { return c.text; });
5384	dumpFragment(text: uText.data(), len: uText.size(), uMap, s: &s);
5385	(*outputFunc)(outputStream, s.c_str(), s.getLength());
5386
5387	if (word->next && fabs(x: word->next->base - word->base) < maxIntraLineDelta * word->fontSize && word->next->xMin > word->xMax - minDupBreakOverlap * word->fontSize) {
5388	if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) {
5389	(*outputFunc)(outputStream, space, spaceLen);
5390	}
5391	} else {
5392	(*outputFunc)(outputStream, eol, eolLen);
5393	}
5394	}
5395
5396	// output the page, maintaining the original physical layout
5397	} else if (physLayout) {
5398
5399	// collect the line fragments for the page and sort them
5400	fragsSize = `256`;
5401	frags = (TextLineFrag )gmallocn(count: fragsSize, size: sizeof*(TextLineFrag));
5402	nFrags = `0`;
5403	for (i = `0`; i < nBlocks; ++i) {
5404	blk = blocks[i];
5405	for (line = blk->lines; line; line = line->next) {
5406	if (nFrags == fragsSize) {
5407	fragsSize *= `2`;
5408	frags = (TextLineFrag )greallocn(p: frags, count: fragsSize, size: sizeof*(TextLineFrag));
5409	}
5410	frags[nFrags].init(lineA: line, startA: `0`, lenA: line->len);
5411	frags[nFrags].computeCoords(oneRot: true);
5412	++nFrags;
5413	}
5414	}
5415	qsort(base: frags, nmemb: nFrags, size: sizeof(TextLineFrag), compar: &TextLineFrag::cmpYXPrimaryRot);
5416	i = `0`;
5417	while (i < nFrags) {
5418	delta = maxIntraLineDelta * frags[i].line->words->fontSize;
5419	for (j = i + `1`; j < nFrags && fabs(x: frags[j].base - frags[i].base) < delta; ++j) {
5420	;
5421	}
5422	qsort(base: frags + i, nmemb: j - i, size: sizeof(TextLineFrag), compar: &TextLineFrag::cmpXYColumnPrimaryRot);
5423	i = j;
5424	}
5425
5426	#if 0 // for debugging
5427	printf("* line fragments *\n");
5428	for (i = `0`; i < nFrags; ++i) {
5429	frag = &frags[i];
5430	printf("frag: x=%.2f..%.2f y=%.2f..%.2f base=%.2f '",
5431	frag->xMin, frag->xMax, frag->yMin, frag->yMax, frag->base);
5432	for (n = `0`; n < frag->len; ++n) {
5433	fputc(frag->line->text[frag->start + n] & `0xff`, stdout);
5434	}
5435	printf("'\n");
5436	}
5437	printf("\n");
5438	#endif
5439
5440	GooString s;
5441	// generate output
5442	col = `0`;
5443	for (i = `0`; i < nFrags; ++i) {
5444	frag = &frags[i];
5445
5446	// column alignment
5447	for (; col < frag->col; ++col) {
5448	(*outputFunc)(outputStream, space, spaceLen);
5449	}
5450
5451	// print the line
5452	s.clear();
5453	col += dumpFragment(text: frag->line->text + frag->start, len: frag->len, uMap, s: &s);
5454	(*outputFunc)(outputStream, s.c_str(), s.getLength());
5455
5456	// print one or more returns if necessary
5457	if (i == nFrags - `1` \|\| frags[i + `1`].col < col \|\| fabs(x: frags[i + `1`].base - frag->base) > maxIntraLineDelta * frag->line->words->fontSize) {
5458	if (i < nFrags - `1`) {
5459	d = (int)((frags[i + `1`].base - frag->base) / frag->line->words->fontSize);
5460	if (d < `1`) {
5461	d = `1`;
5462	} else if (d > `5`) {
5463	d = `5`;
5464	}
5465	} else {
5466	d = `1`;
5467	}
5468	for (; d > `0`; --d) {
5469	(*outputFunc)(outputStream, eol, eolLen);
5470	}
5471	col = `0`;
5472	}
5473	}
5474
5475	gfree(p: frags);
5476
5477	// output the page, "undoing" the layout
5478	} else {
5479	for (flow = flows; flow; flow = flow->next) {
5480	for (blk = flow->blocks; blk; blk = blk->next) {
5481	for (line = blk->lines; line; line = line->next) {
5482	n = line->len;
5483	if (line->hyphenated && (line->next \|\| blk->next)) {
5484	--n;
5485	}
5486	GooString s;
5487	dumpFragment(text: line->text, len: n, uMap, s: &s);
5488	(*outputFunc)(outputStream, s.c_str(), s.getLength());
5489	// output a newline when a hyphen is not suppressed
5490	if (n == line->len) {
5491	(*outputFunc)(outputStream, eol, eolLen);
5492	}
5493	}
5494	}
5495	(*outputFunc)(outputStream, eol, eolLen);
5496	}
5497	}
5498
5499	// end of page
5500	if (pageBreaks) {
5501	(*outputFunc)(outputStream, eop, eopLen);
5502	}
5503	}
5504
5505	void TextPage::setMergeCombining(bool merge)
5506	{
5507	mergeCombining = merge;
5508	}
5509
5510	void TextPage::assignColumns(TextLineFrag frags, int* nFrags, bool oneRot) const
5511	{
5512	TextLineFrag frag0, frag1;
5513	int rot, col1, col2, i, j, k;
5514
5515	// all text in the region has the same rotation -- recompute the
5516	// column numbers based only on the text in the region
5517	if (oneRot) {
5518	qsort(base: frags, nmemb: nFrags, size: sizeof(TextLineFrag), compar: &TextLineFrag::cmpXYLineRot);
5519	rot = frags[`0`].line->rot;
5520	for (i = `0`; i < nFrags; ++i) {
5521	frag0 = &frags[i];
5522	col1 = `0`;
5523	for (j = `0`; j < i; ++j) {
5524	frag1 = &frags[j];
5525	col2 = `0`; // make gcc happy
5526	switch (rot) {
5527	case `0`:
5528	if (frag0->xMin >= frag1->xMax) {
5529	col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + `1`;
5530	} else {
5531	for (k = frag1->start; k < frag1->start + frag1->len && frag0->xMin >= `0.5` * (frag1->line->edge[k] + frag1->line->edge[k + `1`]); ++k) {
5532	;
5533	}
5534	col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start];
5535	}
5536	break;
5537	case `1`:
5538	if (frag0->yMin >= frag1->yMax) {
5539	col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + `1`;
5540	} else {
5541	for (k = frag1->start; k < frag1->start + frag1->len && frag0->yMin >= `0.5` * (frag1->line->edge[k] + frag1->line->edge[k + `1`]); ++k) {
5542	;
5543	}
5544	col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start];
5545	}
5546	break;
5547	case `2`:
5548	if (frag0->xMax <= frag1->xMin) {
5549	col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + `1`;
5550	} else {
5551	for (k = frag1->start; k < frag1->start + frag1->len && frag0->xMax <= `0.5` * (frag1->line->edge[k] + frag1->line->edge[k + `1`]); ++k) {
5552	;
5553	}
5554	col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start];
5555	}
5556	break;
5557	case `3`:
5558	if (frag0->yMax <= frag1->yMin) {
5559	col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + `1`;
5560	} else {
5561	for (k = frag1->start; k < frag1->start + frag1->len && frag0->yMax <= `0.5` * (frag1->line->edge[k] + frag1->line->edge[k + `1`]); ++k) {
5562	;
5563	}
5564	col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start];
5565	}
5566	break;
5567	}
5568	if (col2 > col1) {
5569	col1 = col2;
5570	}
5571	}
5572	frag0->col = col1;
5573	}
5574
5575	// the region includes text at different rotations -- use the
5576	// globally assigned column numbers, offset by the minimum column
5577	// number (i.e., shift everything over to column 0)
5578	} else {
5579	col1 = frags[`0`].col;
5580	for (i = `1`; i < nFrags; ++i) {
5581	if (frags[i].col < col1) {
5582	col1 = frags[i].col;
5583	}
5584	}
5585	for (i = `0`; i < nFrags; ++i) {
5586	frags[i].col -= col1;
5587	}
5588	}
5589	}
5590
5591	int TextPage::dumpFragment(const Unicode text, int* len, const UnicodeMap uMap, GooString s) const
5592	{
5593	if (uMap->isUnicode()) {
5594	return reorderText(text, len, uMap, primaryLR, s, u: nullptr);
5595	} else {
5596	int nCols = `0`;
5597
5598	char buf[`8`];
5599	int buflen = `0`;
5600
5601	for (int i = `0`; i < len; ++i) {
5602	buflen = uMap->mapUnicode(u: text[i], buf, bufSize: sizeof(buf));
5603	s->append(str: buf, lengthA: buflen);
5604	nCols += buflen;
5605	}
5606
5607	return nCols;
5608	}
5609	}
5610
#ifdef TEXTOUT_WORD_LIST
// Build a TextWordList for this page; <physLayout> is passed through
// to the TextWordList constructor.
std::unique_ptr<TextWordList> TextPage::makeWordList(bool physLayout)
{
    auto wordList = std::make_unique<TextWordList>(this, physLayout);
    return wordList;
}
#endif
5617
5618	//------------------------------------------------------------------------
5619	// ActualText
5620	//------------------------------------------------------------------------
5621	ActualText::ActualText(TextPage *out)
5622	{
5623	out->incRefCnt();
5624	text = out;
5625	actualText = nullptr;
5626	actualTextNBytes = `0`;
5627	}
5628
5629	ActualText::~ActualText()
5630	{
5631	if (actualText) {
5632	delete actualText;
5633	}
5634	text->decRefCnt();
5635	}
5636
5637	void ActualText::addChar(const GfxState state, double* x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode u, int* uLen)
5638	{
5639	if (!actualText) {
5640	text->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
5641	return;
5642	}
5643
5644	// Inside ActualText span.
5645	if (!actualTextNBytes) {
5646	actualTextX0 = x;
5647	actualTextY0 = y;
5648	}
5649	actualTextX1 = x + dx;
5650	actualTextY1 = y + dy;
5651	actualTextNBytes += nBytes;
5652	}
5653
5654	void ActualText::begin(const GfxState state, const* GooString *t)
5655	{
5656	if (actualText) {
5657	delete actualText;
5658	}
5659	actualText = new GooString (t);
5660	actualTextNBytes = `0`;
5661	}
5662
5663	void ActualText::end(const GfxState *state)
5664	{
5665	// ActualText span closed. Output the span text and the
5666	// extents of all the glyphs inside the span
5667
5668	if (actualTextNBytes) {
5669	// now that we have the position info for all of the text inside
5670	// the marked content span, we feed the "ActualText" back through
5671	// text->addChar()
5672	std::vector<Unicode> uni = TextStringToUCS4(textStr: actualText->toStr());
5673	text->addChar(state, x: actualTextX0, y: actualTextY0, dx: actualTextX1 - actualTextX0, dy: actualTextY1 - actualTextY0, c: `0`, nBytes: actualTextNBytes, u: uni.data(), uLen: uni.size());
5674	}
5675
5676	delete actualText;
5677	actualText = nullptr;
5678	actualTextNBytes = `0`;
5679	}
5680
5681	//------------------------------------------------------------------------
5682	// TextOutputDev
5683	//------------------------------------------------------------------------
5684
// Output callback used when writing to a file: <stream> is the FILE *
// that was stored as the opaque output stream, and <len> bytes starting
// at <text> are written to it.
//
// Non-positive lengths write nothing.  Without this check a negative
// <len> would be converted to a huge unsigned element count for
// fwrite().
static void TextOutputDev_outputToFile(void *stream, const char *text, int len)
{
    if (len <= 0) {
        return;
    }
    fwrite(text, 1, static_cast<size_t>(len), static_cast<FILE *>(stream));
}
5689
5690	TextOutputDev::TextOutputDev(const char fileName, bool* physLayoutA, double fixedPitchA, bool rawOrderA, bool append, bool discardDiagA)
5691	{
5692	text = nullptr;
5693	physLayout = physLayoutA;
5694	fixedPitch = physLayout ? fixedPitchA : `0`;
5695	rawOrder = rawOrderA;
5696	discardDiag = discardDiagA;
5697	doHTML = false;
5698	textEOL = defaultEndOfLine();
5699	textPageBreaks = true;
5700	ok = true;
5701	minColSpacing1 = minColSpacing1_default;
5702
5703	// open file
5704	needClose = false;
5705	if (fileName) {
5706	if (!strcmp(s1: fileName, s2: "-")) {
5707	outputStream = stdout;
5708	#if defined(_WIN32) \|\| defined(__CYGWIN__)
5709	// keep DOS from munging the end-of-line characters
5710	_setmode(fileno(stdout), O_BINARY);
5711	#endif
5712	} else if ((outputStream = openFile(path: fileName, mode: append ? "ab" : "wb"))) {
5713	needClose = true;
5714	} else {
5715	error(category: errIO, pos: -`1`, msg: "Couldn't open text file '{0:s}'", fileName);
5716	ok = false;
5717	actualText = nullptr;
5718	return;
5719	}
5720	outputFunc = &TextOutputDev_outputToFile;
5721	} else {
5722	outputStream = nullptr;
5723	}
5724
5725	// set up text object
5726	text = new TextPage (rawOrderA, discardDiagA);
5727	actualText = new ActualText (text);
5728	}
5729
5730	TextOutputDev::TextOutputDev(TextOutputFunc func, void stream, bool* physLayoutA, double fixedPitchA, bool rawOrderA, bool discardDiagA)
5731	{
5732	outputFunc = func;
5733	outputStream = stream;
5734	needClose = false;
5735	physLayout = physLayoutA;
5736	fixedPitch = physLayout ? fixedPitchA : `0`;
5737	rawOrder = rawOrderA;
5738	discardDiag = discardDiagA;
5739	doHTML = false;
5740	text = new TextPage (rawOrderA, discardDiagA);
5741	actualText = new ActualText (text);
5742	textEOL = defaultEndOfLine();
5743	textPageBreaks = true;
5744	ok = true;
5745	minColSpacing1 = minColSpacing1_default;
5746	}
5747
5748	TextOutputDev::~TextOutputDev()
5749	{
5750	if (needClose) {
5751	fclose(stream: (FILE *)outputStream);
5752	}
5753	if (text) {
5754	text->decRefCnt();
5755	}
5756	delete actualText;
5757	}
5758
5759	void TextOutputDev::startPage(int pageNum, GfxState state, XRef xref)
5760	{
5761	text->startPage(state);
5762	}
5763
5764	void TextOutputDev::endPage()
5765	{
5766	text->endPage();
5767	text->coalesce(physLayout, fixedPitch, doHTML, minColSpacing1);
5768	if (outputStream) {
5769	text->dump(outputStream, outputFunc, physLayout, textEOL, pageBreaks: textPageBreaks);
5770	}
5771	}
5772
5773	void TextOutputDev::restoreState(GfxState *state)
5774	{
5775	text->updateFont(state);
5776	}
5777
5778	void TextOutputDev::updateFont(GfxState *state)
5779	{
5780	text->updateFont(state);
5781	}
5782
5783	void TextOutputDev::beginString(GfxState state, const* GooString *s) { }
5784
5785	void TextOutputDev::endString(GfxState *state) { }
5786
5787	void TextOutputDev::drawChar(GfxState state, double* x, double y, double dx, double dy, double originX, double originY, CharCode c, int nBytes, const Unicode u, int* uLen)
5788	{
5789	actualText->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
5790	}
5791
5792	void TextOutputDev::incCharCount(int nChars)
5793	{
5794	text->incCharCount(nChars);
5795	}
5796
5797	void TextOutputDev::beginActualText(GfxState state, const* GooString *t)
5798	{
5799	actualText->begin(state, t);
5800	}
5801
5802	void TextOutputDev::endActualText(GfxState *state)
5803	{
5804	actualText->end(state);
5805	}
5806
5807	void TextOutputDev::stroke(GfxState *state)
5808	{
5809	double x[`2`], y[`2`];
5810
5811	if (!doHTML) {
5812	return;
5813	}
5814	const GfxPath *path = state->getPath();
5815	if (path->getNumSubpaths() != `1`) {
5816	return;
5817	}
5818	const GfxSubpath *subpath = path->getSubpath(i: `0`);
5819	if (subpath->getNumPoints() != `2`) {
5820	return;
5821	}
5822	state->transform(x1: subpath->getX(i: `0`), y1: subpath->getY(i: `0`), x2: &x[`0`], y2: &y[`0`]);
5823	state->transform(x1: subpath->getX(i: `1`), y1: subpath->getY(i: `1`), x2: &x[`1`], y2: &y[`1`]);
5824
5825	// look for a vertical or horizontal line
5826	if (x[`0`] == x[`1`] \|\| y[`0`] == y[`1`]) {
5827	text->addUnderline(x0: x[`0`], y0: y[`0`], x1: x[`1`], y1: y[`1`]);
5828	}
5829	}
5830
5831	void TextOutputDev::fill(GfxState *state)
5832	{
5833	double x[`5`], y[`5`];
5834	double rx0, ry0, rx1, ry1, t;
5835	int i;
5836
5837	if (!doHTML) {
5838	return;
5839	}
5840	const GfxPath *path = state->getPath();
5841	if (path->getNumSubpaths() != `1`) {
5842	return;
5843	}
5844	const GfxSubpath *subpath = path->getSubpath(i: `0`);
5845	if (subpath->getNumPoints() != `5`) {
5846	return;
5847	}
5848	for (i = `0`; i < `5`; ++i) {
5849	if (subpath->getCurve(i)) {
5850	return;
5851	}
5852	state->transform(x1: subpath->getX(i), y1: subpath->getY(i), x2: &x[i], y2: &y[i]);
5853	}
5854
5855	// look for a rectangle
5856	if (x[`0`] == x[`1`] && y[`1`] == y[`2`] && x[`2`] == x[`3`] && y[`3`] == y[`4`] && x[`0`] == x[`4`] && y[`0`] == y[`4`]) {
5857	rx0 = x[`0`];
5858	ry0 = y[`0`];
5859	rx1 = x[`2`];
5860	ry1 = y[`1`];
5861	} else if (y[`0`] == y[`1`] && x[`1`] == x[`2`] && y[`2`] == y[`3`] && x[`3`] == x[`4`] && x[`0`] == x[`4`] && y[`0`] == y[`4`]) {
5862	rx0 = x[`0`];
5863	ry0 = y[`0`];
5864	rx1 = x[`1`];
5865	ry1 = y[`2`];
5866	} else {
5867	return;
5868	}
5869	if (rx1 < rx0) {
5870	t = rx0;
5871	rx0 = rx1;
5872	rx1 = t;
5873	}
5874	if (ry1 < ry0) {
5875	t = ry0;
5876	ry0 = ry1;
5877	ry1 = t;
5878	}
5879
5880	// skinny horizontal rectangle
5881	if (ry1 - ry0 < rx1 - rx0) {
5882	if (ry1 - ry0 < maxUnderlineWidth) {
5883	ry0 = `0.5` * (ry0 + ry1);
5884	text->addUnderline(x0: rx0, y0: ry0, x1: rx1, y1: ry0);
5885	}
5886
5887	// skinny vertical rectangle
5888	} else {
5889	if (rx1 - rx0 < maxUnderlineWidth) {
5890	rx0 = `0.5` * (rx0 + rx1);
5891	text->addUnderline(x0: rx0, y0: ry0, x1: rx0, y1: ry1);
5892	}
5893	}
5894	}
5895
5896	void TextOutputDev::eoFill(GfxState *state)
5897	{
5898	if (!doHTML) {
5899	return;
5900	}
5901	fill(state);
5902	}
5903
5904	void TextOutputDev::processLink(AnnotLink *link)
5905	{
5906	double x1, y1, x2, y2;
5907	int xMin, yMin, xMax, yMax, x, y;
5908
5909	if (!doHTML) {
5910	return;
5911	}
5912	link->getRect(x1: &x1, y1: &y1, x2: &x2, y2: &y2);
5913	cvtUserToDev(ux: x1, uy: y1, dx: &x, dy: &y);
5914	xMin = xMax = x;
5915	yMin = yMax = y;
5916	cvtUserToDev(ux: x1, uy: y2, dx: &x, dy: &y);
5917	if (x < xMin) {
5918	xMin = x;
5919	} else if (x > xMax) {
5920	xMax = x;
5921	}
5922	if (y < yMin) {
5923	yMin = y;
5924	} else if (y > yMax) {
5925	yMax = y;
5926	}
5927	cvtUserToDev(ux: x2, uy: y1, dx: &x, dy: &y);
5928	if (x < xMin) {
5929	xMin = x;
5930	} else if (x > xMax) {
5931	xMax = x;
5932	}
5933	if (y < yMin) {
5934	yMin = y;
5935	} else if (y > yMax) {
5936	yMax = y;
5937	}
5938	cvtUserToDev(ux: x2, uy: y2, dx: &x, dy: &y);
5939	if (x < xMin) {
5940	xMin = x;
5941	} else if (x > xMax) {
5942	xMax = x;
5943	}
5944	if (y < yMin) {
5945	yMin = y;
5946	} else if (y > yMax) {
5947	yMax = y;
5948	}
5949	text->addLink(xMin, yMin, xMax, yMax, link);
5950	}
5951
5952	bool TextOutputDev::findText(const Unicode s, int* len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double xMin, double* yMin, double* xMax, double* yMax) const*
5953	{
5954	return text->findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, backward, wholeWord, xMin, yMin, xMax, yMax);
5955	}
5956
5957	GooString TextOutputDev::getText(double* xMin, double yMin, double xMax, double yMax) const
5958	{
5959	return text->getText(xMin, yMin, xMax, yMax, textEOL);
5960	}
5961
5962	void TextOutputDev::drawSelection(OutputDev out, double* scale, int rotation, const PDFRectangle selection, SelectionStyle style, const* GfxColor glyph_color, const* GfxColor *box_color)
5963	{
5964	text->drawSelection(out, scale, rotation, selection, style, glyph_color, box_color);
5965	}
5966
5967	std::vector<PDFRectangle > TextOutputDev::getSelectionRegion(const PDFRectangle selection, SelectionStyle style, double* scale)
5968	{
5969	return text->getSelectionRegion(selection, style, scale);
5970	}
5971
5972	GooString TextOutputDev::getSelectionText(const* PDFRectangle *selection, SelectionStyle style)
5973	{
5974	return text->getSelectionText(selection, style);
5975	}
5976
5977	bool TextOutputDev::findCharRange(int pos, int length, double xMin, double* yMin, double* xMax, double* yMax) const*
5978	{
5979	return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
5980	}
5981
5982	void TextOutputDev::setMergeCombining(bool merge)
5983	{
5984	text->setMergeCombining(merge);
5985	}
5986
#ifdef TEXTOUT_WORD_LIST
// Build a word list from the text object using this device's
// physLayout setting.
std::unique_ptr<TextWordList> TextOutputDev::makeWordList()
{
    TextPage *const page = text;
    return page->makeWordList(physLayout);
}
#endif
5993
5994	TextPage *TextOutputDev::takeText()
5995	{
5996	TextPage *ret;
5997
5998	ret = text;
5999	text = new TextPage (rawOrder, discardDiag);
6000	delete actualText;
6001	actualText = new ActualText (text);
6002	return ret;
6003	}
6004
6005	const TextFlow TextOutputDev::getFlows() const*
6006	{
6007	return text->getFlows();
6008	}
6009

source code of poppler/poppler/TextOutputDev.cc