1 | //======================================================================== |
2 | // |
3 | // pdftohtml.cc |
4 | // |
5 | // |
6 | // Copyright 1999-2000 G. Ovtcharov |
7 | //======================================================================== |
8 | |
9 | //======================================================================== |
10 | // |
11 | // Modified under the Poppler project - http://poppler.freedesktop.org |
12 | // |
13 | // All changes made under the Poppler project to this file are licensed |
14 | // under GPL version 2 or later |
15 | // |
16 | // Copyright (C) 2007-2008, 2010, 2012, 2015-2020, 2022 Albert Astals Cid <aacid@kde.org> |
17 | // Copyright (C) 2010 Hib Eris <hib@hiberis.nl> |
18 | // Copyright (C) 2010 Mike Slegeir <tehpola@yahoo.com> |
19 | // Copyright (C) 2010, 2013 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp> |
20 | // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac@cdacmumbai.in) and Onkar Potdar (onkar@cdacmumbai.in) |
21 | // Copyright (C) 2011 Steven Murdoch <Steven.Murdoch@cl.cam.ac.uk> |
22 | // Copyright (C) 2012 Igor Slepchin <igor.redhat@gmail.com> |
23 | // Copyright (C) 2012 Ihar Filipau <thephilips@gmail.com> |
24 | // Copyright (C) 2012 Luis Parravicini <lparravi@gmail.com> |
25 | // Copyright (C) 2014 Pino Toscano <pino@kde.org> |
26 | // Copyright (C) 2015 William Bader <williambader@hotmail.com> |
27 | // Copyright (C) 2017, 2021 Adrian Johnson <ajohnson@redneon.com> |
28 | // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich |
29 | // Copyright (C) 2018 Thibaut Brard <thibaut.brard@gmail.com> |
30 | // Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de> |
31 | // Copyright (C) 2019, 2021, 2024 Oliver Sander <oliver.sander@tu-dresden.de> |
32 | // Copyright (C) 2021 Hubert Figuiere <hub@figuiere.net> |
33 | // Copyright (C) 2024 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk> |
34 | // |
35 | // To see a description of the changes please see the Changelog file that |
36 | // came with your tarball or type make ChangeLog if you are building from git |
37 | // |
38 | //======================================================================== |
39 | |
40 | #include "config.h" |
41 | #include <poppler-config.h> |
42 | #include <cstdio> |
43 | #include <cstdlib> |
44 | #include <cstddef> |
45 | #include <cstring> |
46 | #ifdef HAVE_DIRENT_H |
47 | # include <dirent.h> |
48 | #endif |
49 | #include <ctime> |
50 | #include "parseargs.h" |
51 | #include "goo/GooString.h" |
52 | #include "goo/gbase64.h" |
53 | #include "goo/gbasename.h" |
54 | #include "goo/gmem.h" |
55 | #include "Object.h" |
56 | #include "Stream.h" |
57 | #include "Array.h" |
58 | #include "Dict.h" |
59 | #include "XRef.h" |
60 | #include "Catalog.h" |
61 | #include "Page.h" |
62 | #include "Outline.h" |
63 | #include "PDFDoc.h" |
64 | #include "PDFDocFactory.h" |
65 | #include "HtmlOutputDev.h" |
66 | #include "SplashOutputDev.h" |
67 | #include "splash/SplashBitmap.h" |
68 | #include "GlobalParams.h" |
69 | #include "PDFDocEncoding.h" |
70 | #include "Error.h" |
71 | #include "DateInfo.h" |
72 | #include "goo/gfile.h" |
73 | #include "Win32Console.h" |
74 | #include "InMemoryFile.h" |
75 | #include "UTF.h" |
76 | |
77 | static int firstPage = 1; |
78 | static int lastPage = 0; |
79 | static bool rawOrder = true; |
80 | bool printCommands = true; |
81 | static bool printHelp = false; |
82 | bool printHtml = false; |
83 | bool complexMode = false; |
84 | bool singleHtml = false; // singleHtml |
85 | bool dataUrls = false; |
86 | bool ignore = false; |
87 | static char extension[5] = "png" ; |
88 | static double scale = 1.5; |
89 | bool noframes = false; |
90 | bool stout = false; |
91 | bool xml = false; |
92 | bool noRoundedCoordinates = false; |
93 | static bool errQuiet = false; |
94 | static bool noDrm = false; |
95 | double wordBreakThreshold = 10; // 10%, below converted into a coefficient - 0.1 |
96 | |
97 | bool showHidden = false; |
98 | bool noMerge = false; |
99 | bool fontFullName = false; |
100 | static char ownerPassword[33] = "" ; |
101 | static char userPassword[33] = "" ; |
102 | static bool printVersion = false; |
103 | |
104 | static std::unique_ptr<GooString> getInfoString(Dict *infoDict, const char *key); |
105 | static GooString *getInfoDate(Dict *infoDict, const char *key); |
106 | |
107 | static char textEncName[128] = "" ; |
108 | |
109 | static const ArgDesc argDesc[] = { { .arg: "-f" , .kind: argInt, .val: &firstPage, .size: 0, .usage: "first page to convert" }, |
110 | { .arg: "-l" , .kind: argInt, .val: &lastPage, .size: 0, .usage: "last page to convert" }, |
111 | /*{"-raw", argFlag, &rawOrder, 0, |
112 | "keep strings in content stream order"},*/ |
113 | { .arg: "-q" , .kind: argFlag, .val: &errQuiet, .size: 0, .usage: "don't print any messages or errors" }, |
114 | { .arg: "-h" , .kind: argFlag, .val: &printHelp, .size: 0, .usage: "print usage information" }, |
115 | { .arg: "-?" , .kind: argFlag, .val: &printHelp, .size: 0, .usage: "print usage information" }, |
116 | { .arg: "-help" , .kind: argFlag, .val: &printHelp, .size: 0, .usage: "print usage information" }, |
117 | { .arg: "--help" , .kind: argFlag, .val: &printHelp, .size: 0, .usage: "print usage information" }, |
118 | { .arg: "-p" , .kind: argFlag, .val: &printHtml, .size: 0, .usage: "exchange .pdf links by .html" }, |
119 | { .arg: "-c" , .kind: argFlag, .val: &complexMode, .size: 0, .usage: "generate complex document" }, |
120 | { .arg: "-s" , .kind: argFlag, .val: &singleHtml, .size: 0, .usage: "generate single document that includes all pages" }, |
121 | #ifdef HAVE_IN_MEMORY_FILE |
122 | { .arg: "-dataurls" , .kind: argFlag, .val: &dataUrls, .size: 0, .usage: "use data URLs instead of external images in HTML" }, |
123 | #endif |
124 | { .arg: "-i" , .kind: argFlag, .val: &ignore, .size: 0, .usage: "ignore images" }, |
125 | { .arg: "-noframes" , .kind: argFlag, .val: &noframes, .size: 0, .usage: "generate no frames" }, |
126 | { .arg: "-stdout" , .kind: argFlag, .val: &stout, .size: 0, .usage: "use standard output" }, |
127 | { .arg: "-zoom" , .kind: argFP, .val: &scale, .size: 0, .usage: "zoom the pdf document (default 1.5)" }, |
128 | { .arg: "-xml" , .kind: argFlag, .val: &xml, .size: 0, .usage: "output for XML post-processing" }, |
129 | { .arg: "-noroundcoord" , .kind: argFlag, .val: &noRoundedCoordinates, .size: 0, .usage: "do not round coordinates (with XML output only)" }, |
130 | { .arg: "-hidden" , .kind: argFlag, .val: &showHidden, .size: 0, .usage: "output hidden text" }, |
131 | { .arg: "-nomerge" , .kind: argFlag, .val: &noMerge, .size: 0, .usage: "do not merge paragraphs" }, |
132 | { .arg: "-enc" , .kind: argString, .val: textEncName, .size: sizeof(textEncName), .usage: "output text encoding name" }, |
133 | { .arg: "-fmt" , .kind: argString, .val: extension, .size: sizeof(extension), .usage: "image file format for Splash output (png or jpg)" }, |
134 | { .arg: "-v" , .kind: argFlag, .val: &printVersion, .size: 0, .usage: "print copyright and version info" }, |
135 | { .arg: "-opw" , .kind: argString, .val: ownerPassword, .size: sizeof(ownerPassword), .usage: "owner password (for encrypted files)" }, |
136 | { .arg: "-upw" , .kind: argString, .val: userPassword, .size: sizeof(userPassword), .usage: "user password (for encrypted files)" }, |
137 | { .arg: "-nodrm" , .kind: argFlag, .val: &noDrm, .size: 0, .usage: "override document DRM settings" }, |
138 | { .arg: "-wbt" , .kind: argFP, .val: &wordBreakThreshold, .size: 0, .usage: "word break threshold (default 10 percent)" }, |
139 | { .arg: "-fontfullname" , .kind: argFlag, .val: &fontFullName, .size: 0, .usage: "outputs font full name" }, |
140 | {} }; |
141 | |
142 | class SplashOutputDevNoText : public SplashOutputDev |
143 | { |
144 | public: |
145 | SplashOutputDevNoText(SplashColorMode colorModeA, int bitmapRowPadA, bool reverseVideoA, SplashColorPtr paperColorA, bool bitmapTopDownA = true) |
146 | : SplashOutputDev(colorModeA, bitmapRowPadA, reverseVideoA, paperColorA, bitmapTopDownA) { } |
147 | ~SplashOutputDevNoText() override; |
148 | |
149 | void drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, const Unicode *u, int uLen) override { } |
150 | bool beginType3Char(GfxState *state, double x, double y, double dx, double dy, CharCode code, const Unicode *u, int uLen) override { return false; } |
151 | void endType3Char(GfxState *state) override { } |
152 | void beginTextObject(GfxState *state) override { } |
153 | void endTextObject(GfxState *state) override { } |
154 | bool interpretType3Chars() override { return false; } |
155 | }; |
156 | |
157 | SplashOutputDevNoText::~SplashOutputDevNoText() = default; |
158 | |
159 | int main(int argc, char *argv[]) |
160 | { |
161 | std::unique_ptr<PDFDoc> doc; |
162 | GooString *fileName = nullptr; |
163 | std::unique_ptr<GooString> docTitle; |
164 | std::unique_ptr<GooString> author; |
165 | std::unique_ptr<GooString> keywords; |
166 | std::unique_ptr<GooString> subject; |
167 | GooString *date = nullptr; |
168 | GooString *htmlFileName = nullptr; |
169 | HtmlOutputDev *htmlOut = nullptr; |
170 | SplashOutputDev *splashOut = nullptr; |
171 | bool doOutline; |
172 | bool ok; |
173 | std::optional<GooString> ownerPW, userPW; |
174 | Object info; |
175 | int exit_status = EXIT_FAILURE; |
176 | |
177 | Win32Console win32Console(&argc, &argv); |
178 | // parse args |
179 | ok = parseArgs(args: argDesc, argc: &argc, argv); |
180 | if (!ok || argc < 2 || argc > 3 || printHelp || printVersion) { |
181 | fprintf(stderr, format: "pdftohtml version %s\n" , PACKAGE_VERSION); |
182 | fprintf(stderr, format: "%s\n" , popplerCopyright); |
183 | fprintf(stderr, format: "%s\n" , "Copyright 1999-2003 Gueorgui Ovtcharov and Rainer Dorsch" ); |
184 | fprintf(stderr, format: "%s\n\n" , xpdfCopyright); |
185 | if (!printVersion) { |
186 | printUsage(program: "pdftohtml" , otherArgs: "<PDF-file> [<html-file> <xml-file>]" , args: argDesc); |
187 | } |
188 | exit(status: printHelp || printVersion ? 0 : 1); |
189 | } |
190 | |
191 | // init error file |
192 | // errorInit(); |
193 | |
194 | // read config file |
195 | globalParams = std::make_unique<GlobalParams>(); |
196 | |
197 | if (errQuiet) { |
198 | globalParams->setErrQuiet(errQuiet); |
199 | printCommands = false; // I'm not 100% what is the difference between them |
200 | } |
201 | |
202 | if (textEncName[0]) { |
203 | globalParams->setTextEncoding(textEncName); |
204 | if (!globalParams->getTextEncoding()) { |
205 | goto error; |
206 | } |
207 | } |
208 | |
209 | // convert from user-friendly percents into a coefficient |
210 | wordBreakThreshold /= 100.0; |
211 | |
212 | // open PDF file |
213 | if (ownerPassword[0]) { |
214 | ownerPW = GooString(ownerPassword); |
215 | } |
216 | if (userPassword[0]) { |
217 | userPW = GooString(userPassword); |
218 | } |
219 | |
220 | fileName = new GooString(argv[1]); |
221 | |
222 | if (fileName->cmp(sA: "-" ) == 0) { |
223 | delete fileName; |
224 | fileName = new GooString("fd://0" ); |
225 | } |
226 | |
227 | doc = PDFDocFactory().createPDFDoc(uri: *fileName, ownerPassword: ownerPW, userPassword: userPW); |
228 | |
229 | if (!doc->isOk()) { |
230 | goto error; |
231 | } |
232 | |
233 | // check for copy permission |
234 | if (!doc->okToCopy()) { |
235 | if (!noDrm) { |
236 | error(category: errNotAllowed, pos: -1, msg: "Copying of text from this document is not allowed." ); |
237 | goto error; |
238 | } |
239 | fprintf(stderr, format: "Document has copy-protection bit set.\n" ); |
240 | } |
241 | |
242 | // construct text file name |
243 | if (argc == 3) { |
244 | GooString *tmp = new GooString(argv[2]); |
245 | if (!xml) { |
246 | if (tmp->getLength() >= 5) { |
247 | const char *p = tmp->c_str() + tmp->getLength() - 5; |
248 | if (!strcmp(s1: p, s2: ".html" ) || !strcmp(s1: p, s2: ".HTML" )) { |
249 | htmlFileName = new GooString(tmp->c_str(), tmp->getLength() - 5); |
250 | } |
251 | } |
252 | } else { |
253 | if (tmp->getLength() >= 4) { |
254 | const char *p = tmp->c_str() + tmp->getLength() - 4; |
255 | if (!strcmp(s1: p, s2: ".xml" ) || !strcmp(s1: p, s2: ".XML" )) { |
256 | htmlFileName = new GooString(tmp->c_str(), tmp->getLength() - 4); |
257 | } |
258 | } |
259 | } |
260 | if (!htmlFileName) { |
261 | htmlFileName = new GooString(tmp); |
262 | } |
263 | delete tmp; |
264 | } else if (fileName->cmp(sA: "fd://0" ) == 0) { |
265 | error(category: errCommandLine, pos: -1, msg: "You have to provide an output filename when reading from stdin." ); |
266 | goto error; |
267 | } else { |
268 | const char *p = fileName->c_str() + fileName->getLength() - 4; |
269 | if (!strcmp(s1: p, s2: ".pdf" ) || !strcmp(s1: p, s2: ".PDF" )) { |
270 | htmlFileName = new GooString(fileName->c_str(), fileName->getLength() - 4); |
271 | } else { |
272 | htmlFileName = fileName->copy(); |
273 | } |
274 | // htmlFileName->append(".html"); |
275 | } |
276 | |
277 | if (scale > 3.0) { |
278 | scale = 3.0; |
279 | } |
280 | if (scale < 0.5) { |
281 | scale = 0.5; |
282 | } |
283 | |
284 | if (complexMode) { |
285 | // noframes=false; |
286 | stout = false; |
287 | } |
288 | |
289 | if (stout) { |
290 | noframes = true; |
291 | complexMode = false; |
292 | } |
293 | |
294 | if (xml) { |
295 | complexMode = true; |
296 | singleHtml = false; |
297 | noframes = true; |
298 | noMerge = true; |
299 | } |
300 | |
301 | // get page range |
302 | if (firstPage < 1) { |
303 | firstPage = 1; |
304 | } |
305 | if (lastPage < 1 || lastPage > doc->getNumPages()) { |
306 | lastPage = doc->getNumPages(); |
307 | } |
308 | if (lastPage < firstPage) { |
309 | error(category: errCommandLine, pos: -1, msg: "Wrong page range given: the first page ({0:d}) can not be after the last page ({1:d})." , firstPage, lastPage); |
310 | goto error; |
311 | } |
312 | |
313 | info = doc->getDocInfo(); |
314 | if (info.isDict()) { |
315 | docTitle = getInfoString(infoDict: info.getDict(), key: "Title" ); |
316 | author = getInfoString(infoDict: info.getDict(), key: "Author" ); |
317 | keywords = getInfoString(infoDict: info.getDict(), key: "Keywords" ); |
318 | subject = getInfoString(infoDict: info.getDict(), key: "Subject" ); |
319 | date = getInfoDate(infoDict: info.getDict(), key: "ModDate" ); |
320 | if (!date) { |
321 | date = getInfoDate(infoDict: info.getDict(), key: "CreationDate" ); |
322 | } |
323 | } |
324 | if (!docTitle) { |
325 | docTitle = std::make_unique<GooString>(args&: htmlFileName); |
326 | } |
327 | |
328 | if (!singleHtml) { |
329 | rawOrder = complexMode; // todo: figure out what exactly rawOrder do :) |
330 | } else { |
331 | rawOrder = singleHtml; |
332 | } |
333 | |
334 | doOutline = doc->getOutline()->getItems() != nullptr; |
335 | // write text file |
336 | htmlOut = new HtmlOutputDev(doc->getCatalog(), htmlFileName->c_str(), docTitle->c_str(), author ? author->c_str() : nullptr, keywords ? keywords->c_str() : nullptr, subject ? subject->c_str() : nullptr, date ? date->c_str() : nullptr, |
337 | rawOrder, firstPage, doOutline); |
338 | if (date) { |
339 | delete date; |
340 | } |
341 | |
342 | if ((complexMode || singleHtml) && !xml && !ignore) { |
343 | // White paper color |
344 | SplashColor color; |
345 | color[0] = color[1] = color[2] = 255; |
346 | // If the user specified "jpg" use JPEG, otherwise PNG |
347 | SplashImageFileFormat format = strcmp(s1: extension, s2: "jpg" ) ? splashFormatPng : splashFormatJpeg; |
348 | |
349 | splashOut = new SplashOutputDevNoText(splashModeRGB8, 4, false, color); |
350 | splashOut->startDoc(docA: doc.get()); |
351 | |
352 | for (int pg = firstPage; pg <= lastPage; ++pg) { |
353 | InMemoryFile imf; |
354 | doc->displayPage(out: splashOut, page: pg, hDPI: 72 * scale, vDPI: 72 * scale, rotate: 0, useMediaBox: true, crop: false, printing: false); |
355 | SplashBitmap *bitmap = splashOut->getBitmap(); |
356 | |
357 | const std::unique_ptr<GooString> imgFileName = GooString::format(fmt: "{0:s}{1:03d}.{2:s}" , htmlFileName->c_str(), pg, extension); |
358 | auto f1 = dataUrls ? imf.open(mode: "wb" ) : fopen(filename: imgFileName->c_str(), modes: "wb" ); |
359 | if (!f1) { |
360 | fprintf(stderr, format: "Could not open %s\n" , imgFileName->c_str()); |
361 | continue; |
362 | } |
363 | bitmap->writeImgFile(format, f: f1, hDPI: 72 * scale, vDPI: 72 * scale); |
364 | fclose(stream: f1); |
365 | if (dataUrls) { |
366 | htmlOut->addBackgroundImage(img: std::string((format == splashFormatJpeg) ? "data:image/jpeg;base64," : "data:image/png;base64," ) + gbase64Encode(input: imf.getBuffer())); |
367 | } else { |
368 | htmlOut->addBackgroundImage(img: gbasename(filename: imgFileName->c_str())); |
369 | } |
370 | } |
371 | |
372 | delete splashOut; |
373 | } |
374 | |
375 | if (htmlOut->isOk()) { |
376 | doc->displayPages(out: htmlOut, firstPage, lastPage, hDPI: 72 * scale, vDPI: 72 * scale, rotate: 0, useMediaBox: true, crop: false, printing: false); |
377 | htmlOut->dumpDocOutline(doc: doc.get()); |
378 | } |
379 | |
380 | delete htmlOut; |
381 | |
382 | exit_status = EXIT_SUCCESS; |
383 | |
384 | // clean up |
385 | error: |
386 | delete fileName; |
387 | |
388 | if (htmlFileName) { |
389 | delete htmlFileName; |
390 | } |
391 | |
392 | return exit_status; |
393 | } |
394 | |
395 | static std::unique_ptr<GooString> getInfoString(Dict *infoDict, const char *key) |
396 | { |
397 | Object obj; |
398 | // Raw value as read from PDF (may be in pdfDocEncoding or UCS2) |
399 | const GooString *rawString; |
400 | // Value converted to unicode |
401 | Unicode *unicodeString; |
402 | int unicodeLength; |
403 | // Value HTML escaped and converted to desired encoding |
404 | std::unique_ptr<GooString> encodedString; |
405 | // Is rawString UCS2 (as opposed to pdfDocEncoding) |
406 | bool isUnicode; |
407 | |
408 | obj = infoDict->lookup(key); |
409 | if (obj.isString()) { |
410 | rawString = obj.getString(); |
411 | |
412 | // Convert rawString to unicode |
413 | if (hasUnicodeByteOrderMark(s: rawString->toStr())) { |
414 | isUnicode = true; |
415 | unicodeLength = (obj.getString()->getLength() - 2) / 2; |
416 | } else { |
417 | isUnicode = false; |
418 | unicodeLength = obj.getString()->getLength(); |
419 | } |
420 | unicodeString = new Unicode[unicodeLength]; |
421 | |
422 | for (int i = 0; i < unicodeLength; i++) { |
423 | if (isUnicode) { |
424 | unicodeString[i] = ((rawString->getChar(i: (i + 1) * 2) & 0xff) << 8) | (rawString->getChar(i: ((i + 1) * 2) + 1) & 0xff); |
425 | } else { |
426 | unicodeString[i] = pdfDocEncoding[rawString->getChar(i) & 0xff]; |
427 | } |
428 | } |
429 | |
430 | // HTML escape and encode unicode |
431 | encodedString = HtmlFont::HtmlFilter(u: unicodeString, uLen: unicodeLength); |
432 | delete[] unicodeString; |
433 | } |
434 | |
435 | return encodedString; |
436 | } |
437 | |
438 | static GooString *getInfoDate(Dict *infoDict, const char *key) |
439 | { |
440 | Object obj; |
441 | int year, mon, day, hour, min, sec, tz_hour, tz_minute; |
442 | char tz; |
443 | struct tm tmStruct; |
444 | GooString *result = nullptr; |
445 | char buf[256]; |
446 | |
447 | obj = infoDict->lookup(key); |
448 | if (obj.isString()) { |
449 | const GooString *s = obj.getString(); |
450 | // TODO do something with the timezone info |
451 | if (parseDateString(date: s, year: &year, month: &mon, day: &day, hour: &hour, minute: &min, second: &sec, tz: &tz, tzHour: &tz_hour, tzMinute: &tz_minute)) { |
452 | tmStruct.tm_year = year - 1900; |
453 | tmStruct.tm_mon = mon - 1; |
454 | tmStruct.tm_mday = day; |
455 | tmStruct.tm_hour = hour; |
456 | tmStruct.tm_min = min; |
457 | tmStruct.tm_sec = sec; |
458 | tmStruct.tm_wday = -1; |
459 | tmStruct.tm_yday = -1; |
460 | tmStruct.tm_isdst = -1; |
461 | mktime(tp: &tmStruct); // compute the tm_wday and tm_yday fields |
462 | if (strftime(s: buf, maxsize: sizeof(buf), format: "%Y-%m-%dT%H:%M:%S+00:00" , tp: &tmStruct)) { |
463 | result = new GooString(buf); |
464 | } else { |
465 | result = new GooString(s); |
466 | } |
467 | } else { |
468 | result = new GooString(s); |
469 | } |
470 | } |
471 | return result; |
472 | } |
473 | |