1 | /* |
2 | * Copyright (C) 2009-2010, Pino Toscano <pino@kde.org> |
3 | * Copyright (C) 2017-2020, Albert Astals Cid <aacid@kde.org> |
4 | * Copyright (C) 2017, Jason Alan Palmer <jalanpalmer@gmail.com> |
5 | * Copyright (C) 2018, 2020, Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp> |
6 | * Copyright (C) 2018, 2020, Adam Reichold <adam.reichold@t-online.de> |
7 | * Copyright (C) 2018, Zsombor Hollay-Horvath <hollay.horvath@gmail.com> |
8 | * Copyright (C) 2018, Aleksey Nikolaev <nae202@gmail.com> |
9 | * Copyright (C) 2020, Jiri Jakes <freedesktop@jirijakes.eu> |
10 | * |
11 | * This program is free software; you can redistribute it and/or modify |
12 | * it under the terms of the GNU General Public License as published by |
13 | * the Free Software Foundation; either version 2, or (at your option) |
14 | * any later version. |
15 | * |
16 | * This program is distributed in the hope that it will be useful, |
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
19 | * GNU General Public License for more details. |
20 | * |
21 | * You should have received a copy of the GNU General Public License |
22 | * along with this program; if not, write to the Free Software |
23 | * Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA. |
24 | */ |
25 | |
26 | /** |
27 | \file poppler-page.h |
28 | */ |
29 | #include "poppler-page.h" |
30 | #include "poppler-page-transition.h" |
31 | |
32 | #include "poppler-document-private.h" |
33 | #include "poppler-page-private.h" |
34 | #include "poppler-private.h" |
35 | #include "poppler-font-private.h" |
36 | #include "poppler-font.h" |
37 | |
38 | #include "TextOutputDev.h" |
39 | |
40 | #include <algorithm> |
41 | #include <memory> |
42 | #include <utility> |
43 | |
44 | using namespace poppler; |
45 | |
46 | page_private::page_private(document_private *_doc, int _index) : doc(_doc), page(doc->doc->getCatalog()->getPage(i: _index + 1)), index(_index), transition(nullptr), font_info_cache_initialized(false) { } |
47 | |
48 | page_private::~page_private() |
49 | { |
50 | delete transition; |
51 | } |
52 | |
53 | void page_private::init_font_info_cache() |
54 | { |
55 | if (font_info_cache_initialized) { |
56 | return; |
57 | } |
58 | |
59 | poppler::font_iterator it(index, doc); |
60 | |
61 | if (it.has_next()) { |
62 | font_info_cache = it.next(); |
63 | } |
64 | |
65 | font_info_cache_initialized = true; |
66 | return; |
67 | } |
68 | |
69 | /** |
70 | \class poppler::page poppler-page.h "poppler/cpp/poppler-page.h" |
71 | |
72 | A page in a PDF %document. |
73 | */ |
74 | |
75 | /** |
76 | \enum poppler::page::orientation_enum |
77 | |
78 | The possible orientation of a page. |
79 | */ |
80 | |
81 | /** |
82 | \enum poppler::page::search_direction_enum |
83 | |
84 | The direction/action to follow when performing a text search. |
85 | */ |
86 | |
87 | /** |
88 | \enum poppler::page::text_layout_enum |
89 | |
90 | A layout of the text of a page. |
91 | */ |
92 | |
93 | page::page(document_private *doc, int index) : d(new page_private(doc, index)) { } |
94 | |
95 | /** |
96 | Destructor. |
97 | */ |
98 | page::~page() |
99 | { |
100 | delete d; |
101 | } |
102 | |
103 | /** |
104 | \returns the orientation of the page |
105 | */ |
106 | page::orientation_enum page::orientation() const |
107 | { |
108 | const int rotation = d->page->getRotate(); |
109 | switch (rotation) { |
110 | case 90: |
111 | return landscape; |
112 | break; |
113 | case 180: |
114 | return upside_down; |
115 | break; |
116 | case 270: |
117 | return seascape; |
118 | break; |
119 | default: |
120 | return portrait; |
121 | } |
122 | } |
123 | |
124 | /** |
125 | The eventual duration the page can be hinted to be shown in a presentation. |
126 | |
127 | If this value is positive (usually different than -1) then a PDF viewer, when |
128 | showing the page in a presentation, should show the page for at most for this |
129 | number of seconds, and then switch to the next page (if any). Note this is |
130 | purely a presentation attribute, it has no influence on the behaviour. |
131 | |
132 | \returns the duration time (in seconds) of the page |
133 | */ |
134 | double page::duration() const |
135 | { |
136 | return d->page->getDuration(); |
137 | } |
138 | |
139 | /** |
140 | Returns the size of one rect of the page. |
141 | |
142 | \returns the size of the specified page rect |
143 | */ |
144 | rectf page::page_rect(page_box_enum box) const |
145 | { |
146 | const PDFRectangle *r = nullptr; |
147 | switch (box) { |
148 | case media_box: |
149 | r = d->page->getMediaBox(); |
150 | break; |
151 | case crop_box: |
152 | r = d->page->getCropBox(); |
153 | break; |
154 | case bleed_box: |
155 | r = d->page->getBleedBox(); |
156 | break; |
157 | case trim_box: |
158 | r = d->page->getTrimBox(); |
159 | break; |
160 | case art_box: |
161 | r = d->page->getArtBox(); |
162 | break; |
163 | } |
164 | if (r) { |
165 | return detail::pdfrectangle_to_rectf(pdfrect: *r); |
166 | } |
167 | return rectf(); |
168 | } |
169 | |
170 | /** |
171 | \returns the label of the page, if any |
172 | */ |
173 | ustring page::label() const |
174 | { |
175 | GooString goo; |
176 | if (!d->doc->doc->getCatalog()->indexToLabel(index: d->index, label: &goo)) { |
177 | return ustring(); |
178 | } |
179 | |
180 | return detail::unicode_GooString_to_ustring(str: &goo); |
181 | } |
182 | |
183 | /** |
184 | The transition from this page to the next one. |
185 | |
186 | If it is set, then a PDF viewer in a presentation should perform the |
187 | specified transition effect when switching from this page to the next one. |
188 | |
189 | \returns the transition effect for the switch to the next page, if any |
190 | */ |
191 | page_transition *page::transition() const |
192 | { |
193 | if (!d->transition) { |
194 | Object o = d->page->getTrans(); |
195 | if (o.isDict()) { |
196 | d->transition = new page_transition(&o); |
197 | } |
198 | } |
199 | return d->transition; |
200 | } |
201 | |
202 | /** |
203 | Search the page for some text. |
204 | |
205 | \param text the text to search |
206 | \param[in,out] r the area where to start search, which will be set to the area |
207 | of the match (if any) |
208 | \param direction in which direction search for text |
209 | \param case_sensitivity whether search in a case sensitive way |
210 | \param rotation the rotation assumed for the page |
211 | */ |
212 | bool page::search(const ustring &text, rectf &r, search_direction_enum direction, case_sensitivity_enum case_sensitivity, rotation_enum rotation) const |
213 | { |
214 | const size_t len = text.length(); |
215 | |
216 | if (len == 0) { |
217 | return false; |
218 | } |
219 | |
220 | std::vector<Unicode> u(len); |
221 | for (size_t i = 0; i < len; ++i) { |
222 | u[i] = text[i]; |
223 | } |
224 | |
225 | const bool sCase = case_sensitivity == case_sensitive; |
226 | const int rotation_value = (int)rotation * 90; |
227 | |
228 | bool found = false; |
229 | double rect_left = r.left(); |
230 | double rect_top = r.top(); |
231 | double rect_right = r.right(); |
232 | double rect_bottom = r.bottom(); |
233 | |
234 | TextOutputDev td(nullptr, true, 0, false, false); |
235 | d->doc->doc->displayPage(out: &td, page: d->index + 1, hDPI: 72, vDPI: 72, rotate: rotation_value, useMediaBox: false, crop: true, printing: false); |
236 | TextPage *text_page = td.takeText(); |
237 | |
238 | switch (direction) { |
239 | case search_from_top: |
240 | found = text_page->findText(s: &u[0], len, startAtTop: true, stopAtBottom: true, startAtLast: false, stopAtLast: false, caseSensitive: sCase, backward: false, wholeWord: false, xMin: &rect_left, yMin: &rect_top, xMax: &rect_right, yMax: &rect_bottom); |
241 | break; |
242 | case search_next_result: |
243 | found = text_page->findText(s: &u[0], len, startAtTop: false, stopAtBottom: true, startAtLast: true, stopAtLast: false, caseSensitive: sCase, backward: false, wholeWord: false, xMin: &rect_left, yMin: &rect_top, xMax: &rect_right, yMax: &rect_bottom); |
244 | break; |
245 | case search_previous_result: |
246 | found = text_page->findText(s: &u[0], len, startAtTop: false, stopAtBottom: true, startAtLast: true, stopAtLast: false, caseSensitive: sCase, backward: true, wholeWord: false, xMin: &rect_left, yMin: &rect_top, xMax: &rect_right, yMax: &rect_bottom); |
247 | break; |
248 | } |
249 | |
250 | text_page->decRefCnt(); |
251 | r.set_left(rect_left); |
252 | r.set_top(rect_top); |
253 | r.set_right(rect_right); |
254 | r.set_bottom(rect_bottom); |
255 | |
256 | return found; |
257 | } |
258 | |
259 | /** |
260 | Returns the text in the page, in its physical layout. |
261 | |
262 | \param r if not empty, it will be extracted the text in it; otherwise, the |
263 | text of the whole page |
264 | |
265 | \returns the text of the page in the specified rect or in the whole page |
266 | */ |
267 | ustring page::text(const rectf &r) const |
268 | { |
269 | return text(r, layout_mode: physical_layout); |
270 | } |
271 | |
272 | static void appendToGooString(void *stream, const char *text, int len) |
273 | { |
274 | ((GooString *)stream)->append(str: text, lengthA: len); |
275 | } |
276 | |
277 | /** |
278 | Returns the text in the page. |
279 | |
280 | \param rect if not empty, it will be extracted the text in it; otherwise, the |
281 | text of the whole page |
282 | \param layout_mode the layout of the text |
283 | |
284 | \returns the text of the page in the specified rect or in the whole page |
285 | |
286 | \since 0.16 |
287 | */ |
288 | ustring page::text(const rectf &r, text_layout_enum layout_mode) const |
289 | { |
290 | std::unique_ptr<GooString> out(new GooString()); |
291 | const bool use_raw_order = (layout_mode == raw_order_layout); |
292 | const bool use_physical_layout = (layout_mode == physical_layout); |
293 | TextOutputDev td(&appendToGooString, out.get(), use_physical_layout, 0, use_raw_order, false); |
294 | if (r.is_empty()) { |
295 | d->doc->doc->displayPage(out: &td, page: d->index + 1, hDPI: 72, vDPI: 72, rotate: 0, useMediaBox: false, crop: true, printing: false); |
296 | } else { |
297 | d->doc->doc->displayPageSlice(out: &td, page: d->index + 1, hDPI: 72, vDPI: 72, rotate: 0, useMediaBox: false, crop: true, printing: false, sliceX: r.left(), sliceY: r.top(), sliceW: r.width(), sliceH: r.height()); |
298 | } |
299 | return ustring::from_utf8(str: out->c_str()); |
300 | } |
301 | |
302 | /* |
303 | * text_box_font_info object for text_box |
304 | */ |
305 | text_box_font_info_data::~text_box_font_info_data() = default; |
306 | |
307 | /* |
308 | * text_box object for page::text_list() |
309 | */ |
310 | text_box_data::~text_box_data() = default; |
311 | |
312 | text_box::~text_box() = default; |
313 | |
314 | text_box &text_box::operator=(text_box &&a) noexcept = default; |
315 | text_box::text_box(text_box &&a) noexcept = default; |
316 | |
317 | text_box::text_box(text_box_data *data) : m_data { data } { } |
318 | |
319 | ustring text_box::text() const |
320 | { |
321 | return m_data->text; |
322 | } |
323 | |
324 | rectf text_box::bbox() const |
325 | { |
326 | return m_data->bbox; |
327 | } |
328 | |
329 | int text_box::rotation() const |
330 | { |
331 | return m_data->rotation; |
332 | } |
333 | |
334 | rectf text_box::char_bbox(size_t i) const |
335 | { |
336 | if (i < m_data->char_bboxes.size()) { |
337 | return m_data->char_bboxes[i]; |
338 | } |
339 | return rectf(0, 0, 0, 0); |
340 | } |
341 | |
342 | bool text_box::has_space_after() const |
343 | { |
344 | return m_data->has_space_after; |
345 | } |
346 | |
347 | bool text_box::has_font_info() const |
348 | { |
349 | return (m_data->text_box_font != nullptr); |
350 | } |
351 | |
352 | text_box::writing_mode_enum text_box::get_wmode(int i) const |
353 | { |
354 | if (this->has_font_info()) { |
355 | return m_data->text_box_font->wmodes[i]; |
356 | } else { |
357 | return text_box::invalid_wmode; |
358 | } |
359 | } |
360 | |
361 | double text_box::get_font_size() const |
362 | { |
363 | if (this->has_font_info()) { |
364 | return m_data->text_box_font->font_size; |
365 | } else { |
366 | return -1; |
367 | } |
368 | } |
369 | |
370 | std::string text_box::get_font_name(int i) const |
371 | { |
372 | if (!this->has_font_info()) { |
373 | return std::string("*ignored*" ); |
374 | } |
375 | |
376 | int j = m_data->text_box_font->glyph_to_cache_index[i]; |
377 | if (j < 0) { |
378 | return std::string("" ); |
379 | } |
380 | return m_data->text_box_font->font_info_cache[j].name(); |
381 | } |
382 | |
383 | std::vector<text_box> page::text_list(int opt_flag) const |
384 | { |
385 | std::vector<text_box> output_list; |
386 | |
387 | /* config values are same with Qt5 Page::TextList() */ |
388 | auto output_dev = std::make_unique<TextOutputDev>(args: nullptr, /* char* fileName */ |
389 | args: false, /* bool physLayoutA */ |
390 | args: 0, /* double fixedPitchA */ |
391 | args: false, /* bool rawOrderA */ |
392 | args: false /* bool append */ |
393 | ); |
394 | |
395 | /* |
396 | * config values are same with Qt5 Page::TextList(), |
397 | * but rotation is fixed to zero. |
398 | * Few people use non-zero values. |
399 | */ |
400 | d->doc->doc->displayPageSlice(out: output_dev.get(), page: d->index + 1, /* page */ |
401 | hDPI: 72, vDPI: 72, rotate: 0, /* hDPI, vDPI, rot */ |
402 | useMediaBox: false, crop: false, printing: false, /* useMediaBox, crop, printing */ |
403 | sliceX: -1, sliceY: -1, sliceW: -1, sliceH: -1, /* sliceX, sliceY, sliceW, sliceH */ |
404 | abortCheckCbk: nullptr, abortCheckCbkData: nullptr, /* abortCheckCbk(), abortCheckCbkData */ |
405 | annotDisplayDecideCbk: nullptr, annotDisplayDecideCbkData: nullptr, /* annotDisplayDecideCbk(), annotDisplayDecideCbkData */ |
406 | copyXRef: true); /* copyXRef */ |
407 | |
408 | if (std::unique_ptr<TextWordList> word_list { output_dev->makeWordList() }) { |
409 | |
410 | output_list.reserve(n: word_list->getLength()); |
411 | for (int i = 0; i < word_list->getLength(); i++) { |
412 | TextWord *word = word_list->get(idx: i); |
413 | |
414 | std::unique_ptr<GooString> gooWord { word->getText() }; |
415 | ustring ustr = ustring::from_utf8(str: gooWord->c_str()); |
416 | |
417 | double xMin, yMin, xMax, yMax; |
418 | word->getBBox(xMinA: &xMin, yMinA: &yMin, xMaxA: &xMax, yMaxA: &yMax); |
419 | |
420 | text_box tb { new text_box_data { .text: ustr, .bbox: { xMin, yMin, xMax - xMin, yMax - yMin }, .rotation: word->getRotation(), .char_bboxes: {}, .has_space_after: word->hasSpaceAfter() == true, .text_box_font: nullptr } }; |
421 | |
422 | std::unique_ptr<text_box_font_info_data> tb_font_info = nullptr; |
423 | if (opt_flag & page::text_list_include_font) { |
424 | d->init_font_info_cache(); |
425 | |
426 | std::unique_ptr<text_box_font_info_data> tb_font { new text_box_font_info_data { |
427 | .font_size: word->getFontSize(), // double font_size |
428 | .wmodes: {}, // std::vector<text_box::writing_mode> wmodes; |
429 | .font_info_cache: d->font_info_cache, // std::vector<font_info> font_info_cache; |
430 | .glyph_to_cache_index: {} // std::vector<int> glyph_to_cache_index; |
431 | } }; |
432 | |
433 | tb_font_info = std::move(tb_font); |
434 | }; |
435 | |
436 | tb.m_data->char_bboxes.reserve(n: word->getLength()); |
437 | for (int j = 0; j < word->getLength(); j++) { |
438 | word->getCharBBox(charIdx: j, xMinA: &xMin, yMinA: &yMin, xMaxA: &xMax, yMaxA: &yMax); |
439 | tb.m_data->char_bboxes.emplace_back(args&: xMin, args&: yMin, args: xMax - xMin, args: yMax - yMin); |
440 | } |
441 | |
442 | if (tb_font_info && d->font_info_cache_initialized) { |
443 | tb_font_info->glyph_to_cache_index.reserve(n: word->getLength()); |
444 | for (int j = 0; j < word->getLength(); j++) { |
445 | const TextFontInfo *cur_text_font_info = word->getFontInfo(idx: j); |
446 | |
447 | // filter-out the invalid WMode value here. |
448 | switch (cur_text_font_info->getWMode()) { |
449 | case 0: |
450 | tb_font_info->wmodes.push_back(x: text_box::horizontal_wmode); |
451 | break; |
452 | case 1: |
453 | tb_font_info->wmodes.push_back(x: text_box::vertical_wmode); |
454 | break; |
455 | default: |
456 | tb_font_info->wmodes.push_back(x: text_box::invalid_wmode); |
457 | }; |
458 | |
459 | tb_font_info->glyph_to_cache_index.push_back(x: -1); |
460 | for (size_t k = 0; k < tb_font_info->font_info_cache.size(); k++) { |
461 | if (cur_text_font_info->matches(ref: &(tb_font_info->font_info_cache[k].d->ref))) { |
462 | tb_font_info->glyph_to_cache_index[j] = k; |
463 | break; |
464 | } |
465 | } |
466 | } |
467 | tb.m_data->text_box_font = std::move(tb_font_info); |
468 | } |
469 | |
470 | output_list.push_back(x: std::move(tb)); |
471 | } |
472 | } |
473 | |
474 | return output_list; |
475 | } |
476 | |
477 | std::vector<text_box> page::text_list() const |
478 | { |
479 | return text_list(opt_flag: 0); |
480 | } |
481 | |