1//========================================================================
2//
3// pdftohtml.cc
4//
5//
6// Copyright 1999-2000 G. Ovtcharov
7//========================================================================
8
9//========================================================================
10//
11// Modified under the Poppler project - http://poppler.freedesktop.org
12//
13// All changes made under the Poppler project to this file are licensed
14// under GPL version 2 or later
15//
16// Copyright (C) 2007-2008, 2010, 2012, 2015-2020, 2022 Albert Astals Cid <aacid@kde.org>
17// Copyright (C) 2010 Hib Eris <hib@hiberis.nl>
18// Copyright (C) 2010 Mike Slegeir <tehpola@yahoo.com>
19// Copyright (C) 2010, 2013 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
20// Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac@cdacmumbai.in) and Onkar Potdar (onkar@cdacmumbai.in)
21// Copyright (C) 2011 Steven Murdoch <Steven.Murdoch@cl.cam.ac.uk>
22// Copyright (C) 2012 Igor Slepchin <igor.redhat@gmail.com>
23// Copyright (C) 2012 Ihar Filipau <thephilips@gmail.com>
24// Copyright (C) 2012 Luis Parravicini <lparravi@gmail.com>
25// Copyright (C) 2014 Pino Toscano <pino@kde.org>
26// Copyright (C) 2015 William Bader <williambader@hotmail.com>
27// Copyright (C) 2017, 2021 Adrian Johnson <ajohnson@redneon.com>
28// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
29// Copyright (C) 2018 Thibaut Brard <thibaut.brard@gmail.com>
30// Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de>
31// Copyright (C) 2019, 2021, 2024 Oliver Sander <oliver.sander@tu-dresden.de>
32// Copyright (C) 2021 Hubert Figuiere <hub@figuiere.net>
33// Copyright (C) 2024 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk>
34//
35// To see a description of the changes please see the Changelog file that
36// came with your tarball or type make ChangeLog if you are building from git
37//
38//========================================================================
39
40#include "config.h"
41#include <poppler-config.h>
42#include <cstdio>
43#include <cstdlib>
44#include <cstddef>
45#include <cstring>
46#ifdef HAVE_DIRENT_H
47# include <dirent.h>
48#endif
49#include <ctime>
50#include "parseargs.h"
51#include "goo/GooString.h"
52#include "goo/gbase64.h"
53#include "goo/gbasename.h"
54#include "goo/gmem.h"
55#include "Object.h"
56#include "Stream.h"
57#include "Array.h"
58#include "Dict.h"
59#include "XRef.h"
60#include "Catalog.h"
61#include "Page.h"
62#include "Outline.h"
63#include "PDFDoc.h"
64#include "PDFDocFactory.h"
65#include "HtmlOutputDev.h"
66#include "SplashOutputDev.h"
67#include "splash/SplashBitmap.h"
68#include "GlobalParams.h"
69#include "PDFDocEncoding.h"
70#include "Error.h"
71#include "DateInfo.h"
72#include "goo/gfile.h"
73#include "Win32Console.h"
74#include "InMemoryFile.h"
75#include "UTF.h"
76
// ---------------------------------------------------------------------------
// Command-line option storage.
//
// Each variable below is bound to a switch in argDesc[] and filled in by
// parseArgs(). Variables declared without `static` have external linkage;
// presumably other translation units of the tool read them - confirm before
// renaming any of them or changing their linkage.
// ---------------------------------------------------------------------------

// Page range (-f / -l). A lastPage of 0 is replaced by the page count in main().
static int firstPage = 1;
static int lastPage = 0;

// Overwritten in main() from complexMode / singleHtml before it is used.
static bool rawOrder = true;

// Diagnostics and informational switches.
bool printCommands = true; // cleared in main() when -q is given
static bool errQuiet = false; // -q
static bool printHelp = false; // -h, -?, -help, --help
static bool printVersion = false; // -v

// Output shape.
bool printHtml = false; // -p
bool complexMode = false; // -c
bool singleHtml = false; // -s
bool dataUrls = false; // -dataurls
bool ignore = false; // -i
bool noframes = false; // -noframes
bool stout = false; // -stdout
bool xml = false; // -xml
bool noRoundedCoordinates = false; // -noroundcoord
bool showHidden = false; // -hidden
bool noMerge = false; // -nomerge
bool fontFullName = false; // -fontfullname

// Background image rendering: file extension (-fmt) and zoom factor (-zoom).
static char extension[5] = "png";
static double scale = 1.5;

// Word break threshold (-wbt), entered as a percentage; main() divides it by
// 100 to turn it into a coefficient (10 -> 0.1).
double wordBreakThreshold = 10;

// Encryption handling (-nodrm, -opw, -upw).
static bool noDrm = false;
static char ownerPassword[33] = "";
static char userPassword[33] = "";
103
104static std::unique_ptr<GooString> getInfoString(Dict *infoDict, const char *key);
105static GooString *getInfoDate(Dict *infoDict, const char *key);
106
107static char textEncName[128] = "";
108
109static const ArgDesc argDesc[] = { { .arg: "-f", .kind: argInt, .val: &firstPage, .size: 0, .usage: "first page to convert" },
110 { .arg: "-l", .kind: argInt, .val: &lastPage, .size: 0, .usage: "last page to convert" },
111 /*{"-raw", argFlag, &rawOrder, 0,
112 "keep strings in content stream order"},*/
113 { .arg: "-q", .kind: argFlag, .val: &errQuiet, .size: 0, .usage: "don't print any messages or errors" },
114 { .arg: "-h", .kind: argFlag, .val: &printHelp, .size: 0, .usage: "print usage information" },
115 { .arg: "-?", .kind: argFlag, .val: &printHelp, .size: 0, .usage: "print usage information" },
116 { .arg: "-help", .kind: argFlag, .val: &printHelp, .size: 0, .usage: "print usage information" },
117 { .arg: "--help", .kind: argFlag, .val: &printHelp, .size: 0, .usage: "print usage information" },
118 { .arg: "-p", .kind: argFlag, .val: &printHtml, .size: 0, .usage: "exchange .pdf links by .html" },
119 { .arg: "-c", .kind: argFlag, .val: &complexMode, .size: 0, .usage: "generate complex document" },
120 { .arg: "-s", .kind: argFlag, .val: &singleHtml, .size: 0, .usage: "generate single document that includes all pages" },
121#ifdef HAVE_IN_MEMORY_FILE
122 { .arg: "-dataurls", .kind: argFlag, .val: &dataUrls, .size: 0, .usage: "use data URLs instead of external images in HTML" },
123#endif
124 { .arg: "-i", .kind: argFlag, .val: &ignore, .size: 0, .usage: "ignore images" },
125 { .arg: "-noframes", .kind: argFlag, .val: &noframes, .size: 0, .usage: "generate no frames" },
126 { .arg: "-stdout", .kind: argFlag, .val: &stout, .size: 0, .usage: "use standard output" },
127 { .arg: "-zoom", .kind: argFP, .val: &scale, .size: 0, .usage: "zoom the pdf document (default 1.5)" },
128 { .arg: "-xml", .kind: argFlag, .val: &xml, .size: 0, .usage: "output for XML post-processing" },
129 { .arg: "-noroundcoord", .kind: argFlag, .val: &noRoundedCoordinates, .size: 0, .usage: "do not round coordinates (with XML output only)" },
130 { .arg: "-hidden", .kind: argFlag, .val: &showHidden, .size: 0, .usage: "output hidden text" },
131 { .arg: "-nomerge", .kind: argFlag, .val: &noMerge, .size: 0, .usage: "do not merge paragraphs" },
132 { .arg: "-enc", .kind: argString, .val: textEncName, .size: sizeof(textEncName), .usage: "output text encoding name" },
133 { .arg: "-fmt", .kind: argString, .val: extension, .size: sizeof(extension), .usage: "image file format for Splash output (png or jpg)" },
134 { .arg: "-v", .kind: argFlag, .val: &printVersion, .size: 0, .usage: "print copyright and version info" },
135 { .arg: "-opw", .kind: argString, .val: ownerPassword, .size: sizeof(ownerPassword), .usage: "owner password (for encrypted files)" },
136 { .arg: "-upw", .kind: argString, .val: userPassword, .size: sizeof(userPassword), .usage: "user password (for encrypted files)" },
137 { .arg: "-nodrm", .kind: argFlag, .val: &noDrm, .size: 0, .usage: "override document DRM settings" },
138 { .arg: "-wbt", .kind: argFP, .val: &wordBreakThreshold, .size: 0, .usage: "word break threshold (default 10 percent)" },
139 { .arg: "-fontfullname", .kind: argFlag, .val: &fontFullName, .size: 0, .usage: "outputs font full name" },
140 {} };
141
142class SplashOutputDevNoText : public SplashOutputDev
143{
144public:
145 SplashOutputDevNoText(SplashColorMode colorModeA, int bitmapRowPadA, bool reverseVideoA, SplashColorPtr paperColorA, bool bitmapTopDownA = true)
146 : SplashOutputDev(colorModeA, bitmapRowPadA, reverseVideoA, paperColorA, bitmapTopDownA) { }
147 ~SplashOutputDevNoText() override;
148
149 void drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, const Unicode *u, int uLen) override { }
150 bool beginType3Char(GfxState *state, double x, double y, double dx, double dy, CharCode code, const Unicode *u, int uLen) override { return false; }
151 void endType3Char(GfxState *state) override { }
152 void beginTextObject(GfxState *state) override { }
153 void endTextObject(GfxState *state) override { }
154 bool interpretType3Chars() override { return false; }
155};
156
157SplashOutputDevNoText::~SplashOutputDevNoText() = default;
158
159int main(int argc, char *argv[])
160{
161 std::unique_ptr<PDFDoc> doc;
162 GooString *fileName = nullptr;
163 std::unique_ptr<GooString> docTitle;
164 std::unique_ptr<GooString> author;
165 std::unique_ptr<GooString> keywords;
166 std::unique_ptr<GooString> subject;
167 GooString *date = nullptr;
168 GooString *htmlFileName = nullptr;
169 HtmlOutputDev *htmlOut = nullptr;
170 SplashOutputDev *splashOut = nullptr;
171 bool doOutline;
172 bool ok;
173 std::optional<GooString> ownerPW, userPW;
174 Object info;
175 int exit_status = EXIT_FAILURE;
176
177 Win32Console win32Console(&argc, &argv);
178 // parse args
179 ok = parseArgs(args: argDesc, argc: &argc, argv);
180 if (!ok || argc < 2 || argc > 3 || printHelp || printVersion) {
181 fprintf(stderr, format: "pdftohtml version %s\n", PACKAGE_VERSION);
182 fprintf(stderr, format: "%s\n", popplerCopyright);
183 fprintf(stderr, format: "%s\n", "Copyright 1999-2003 Gueorgui Ovtcharov and Rainer Dorsch");
184 fprintf(stderr, format: "%s\n\n", xpdfCopyright);
185 if (!printVersion) {
186 printUsage(program: "pdftohtml", otherArgs: "<PDF-file> [<html-file> <xml-file>]", args: argDesc);
187 }
188 exit(status: printHelp || printVersion ? 0 : 1);
189 }
190
191 // init error file
192 // errorInit();
193
194 // read config file
195 globalParams = std::make_unique<GlobalParams>();
196
197 if (errQuiet) {
198 globalParams->setErrQuiet(errQuiet);
199 printCommands = false; // I'm not 100% what is the difference between them
200 }
201
202 if (textEncName[0]) {
203 globalParams->setTextEncoding(textEncName);
204 if (!globalParams->getTextEncoding()) {
205 goto error;
206 }
207 }
208
209 // convert from user-friendly percents into a coefficient
210 wordBreakThreshold /= 100.0;
211
212 // open PDF file
213 if (ownerPassword[0]) {
214 ownerPW = GooString(ownerPassword);
215 }
216 if (userPassword[0]) {
217 userPW = GooString(userPassword);
218 }
219
220 fileName = new GooString(argv[1]);
221
222 if (fileName->cmp(sA: "-") == 0) {
223 delete fileName;
224 fileName = new GooString("fd://0");
225 }
226
227 doc = PDFDocFactory().createPDFDoc(uri: *fileName, ownerPassword: ownerPW, userPassword: userPW);
228
229 if (!doc->isOk()) {
230 goto error;
231 }
232
233 // check for copy permission
234 if (!doc->okToCopy()) {
235 if (!noDrm) {
236 error(category: errNotAllowed, pos: -1, msg: "Copying of text from this document is not allowed.");
237 goto error;
238 }
239 fprintf(stderr, format: "Document has copy-protection bit set.\n");
240 }
241
242 // construct text file name
243 if (argc == 3) {
244 GooString *tmp = new GooString(argv[2]);
245 if (!xml) {
246 if (tmp->getLength() >= 5) {
247 const char *p = tmp->c_str() + tmp->getLength() - 5;
248 if (!strcmp(s1: p, s2: ".html") || !strcmp(s1: p, s2: ".HTML")) {
249 htmlFileName = new GooString(tmp->c_str(), tmp->getLength() - 5);
250 }
251 }
252 } else {
253 if (tmp->getLength() >= 4) {
254 const char *p = tmp->c_str() + tmp->getLength() - 4;
255 if (!strcmp(s1: p, s2: ".xml") || !strcmp(s1: p, s2: ".XML")) {
256 htmlFileName = new GooString(tmp->c_str(), tmp->getLength() - 4);
257 }
258 }
259 }
260 if (!htmlFileName) {
261 htmlFileName = new GooString(tmp);
262 }
263 delete tmp;
264 } else if (fileName->cmp(sA: "fd://0") == 0) {
265 error(category: errCommandLine, pos: -1, msg: "You have to provide an output filename when reading from stdin.");
266 goto error;
267 } else {
268 const char *p = fileName->c_str() + fileName->getLength() - 4;
269 if (!strcmp(s1: p, s2: ".pdf") || !strcmp(s1: p, s2: ".PDF")) {
270 htmlFileName = new GooString(fileName->c_str(), fileName->getLength() - 4);
271 } else {
272 htmlFileName = fileName->copy();
273 }
274 // htmlFileName->append(".html");
275 }
276
277 if (scale > 3.0) {
278 scale = 3.0;
279 }
280 if (scale < 0.5) {
281 scale = 0.5;
282 }
283
284 if (complexMode) {
285 // noframes=false;
286 stout = false;
287 }
288
289 if (stout) {
290 noframes = true;
291 complexMode = false;
292 }
293
294 if (xml) {
295 complexMode = true;
296 singleHtml = false;
297 noframes = true;
298 noMerge = true;
299 }
300
301 // get page range
302 if (firstPage < 1) {
303 firstPage = 1;
304 }
305 if (lastPage < 1 || lastPage > doc->getNumPages()) {
306 lastPage = doc->getNumPages();
307 }
308 if (lastPage < firstPage) {
309 error(category: errCommandLine, pos: -1, msg: "Wrong page range given: the first page ({0:d}) can not be after the last page ({1:d}).", firstPage, lastPage);
310 goto error;
311 }
312
313 info = doc->getDocInfo();
314 if (info.isDict()) {
315 docTitle = getInfoString(infoDict: info.getDict(), key: "Title");
316 author = getInfoString(infoDict: info.getDict(), key: "Author");
317 keywords = getInfoString(infoDict: info.getDict(), key: "Keywords");
318 subject = getInfoString(infoDict: info.getDict(), key: "Subject");
319 date = getInfoDate(infoDict: info.getDict(), key: "ModDate");
320 if (!date) {
321 date = getInfoDate(infoDict: info.getDict(), key: "CreationDate");
322 }
323 }
324 if (!docTitle) {
325 docTitle = std::make_unique<GooString>(args&: htmlFileName);
326 }
327
328 if (!singleHtml) {
329 rawOrder = complexMode; // todo: figure out what exactly rawOrder do :)
330 } else {
331 rawOrder = singleHtml;
332 }
333
334 doOutline = doc->getOutline()->getItems() != nullptr;
335 // write text file
336 htmlOut = new HtmlOutputDev(doc->getCatalog(), htmlFileName->c_str(), docTitle->c_str(), author ? author->c_str() : nullptr, keywords ? keywords->c_str() : nullptr, subject ? subject->c_str() : nullptr, date ? date->c_str() : nullptr,
337 rawOrder, firstPage, doOutline);
338 if (date) {
339 delete date;
340 }
341
342 if ((complexMode || singleHtml) && !xml && !ignore) {
343 // White paper color
344 SplashColor color;
345 color[0] = color[1] = color[2] = 255;
346 // If the user specified "jpg" use JPEG, otherwise PNG
347 SplashImageFileFormat format = strcmp(s1: extension, s2: "jpg") ? splashFormatPng : splashFormatJpeg;
348
349 splashOut = new SplashOutputDevNoText(splashModeRGB8, 4, false, color);
350 splashOut->startDoc(docA: doc.get());
351
352 for (int pg = firstPage; pg <= lastPage; ++pg) {
353 InMemoryFile imf;
354 doc->displayPage(out: splashOut, page: pg, hDPI: 72 * scale, vDPI: 72 * scale, rotate: 0, useMediaBox: true, crop: false, printing: false);
355 SplashBitmap *bitmap = splashOut->getBitmap();
356
357 const std::unique_ptr<GooString> imgFileName = GooString::format(fmt: "{0:s}{1:03d}.{2:s}", htmlFileName->c_str(), pg, extension);
358 auto f1 = dataUrls ? imf.open(mode: "wb") : fopen(filename: imgFileName->c_str(), modes: "wb");
359 if (!f1) {
360 fprintf(stderr, format: "Could not open %s\n", imgFileName->c_str());
361 continue;
362 }
363 bitmap->writeImgFile(format, f: f1, hDPI: 72 * scale, vDPI: 72 * scale);
364 fclose(stream: f1);
365 if (dataUrls) {
366 htmlOut->addBackgroundImage(img: std::string((format == splashFormatJpeg) ? "data:image/jpeg;base64," : "data:image/png;base64,") + gbase64Encode(input: imf.getBuffer()));
367 } else {
368 htmlOut->addBackgroundImage(img: gbasename(filename: imgFileName->c_str()));
369 }
370 }
371
372 delete splashOut;
373 }
374
375 if (htmlOut->isOk()) {
376 doc->displayPages(out: htmlOut, firstPage, lastPage, hDPI: 72 * scale, vDPI: 72 * scale, rotate: 0, useMediaBox: true, crop: false, printing: false);
377 htmlOut->dumpDocOutline(doc: doc.get());
378 }
379
380 delete htmlOut;
381
382 exit_status = EXIT_SUCCESS;
383
384 // clean up
385error:
386 delete fileName;
387
388 if (htmlFileName) {
389 delete htmlFileName;
390 }
391
392 return exit_status;
393}
394
395static std::unique_ptr<GooString> getInfoString(Dict *infoDict, const char *key)
396{
397 Object obj;
398 // Raw value as read from PDF (may be in pdfDocEncoding or UCS2)
399 const GooString *rawString;
400 // Value converted to unicode
401 Unicode *unicodeString;
402 int unicodeLength;
403 // Value HTML escaped and converted to desired encoding
404 std::unique_ptr<GooString> encodedString;
405 // Is rawString UCS2 (as opposed to pdfDocEncoding)
406 bool isUnicode;
407
408 obj = infoDict->lookup(key);
409 if (obj.isString()) {
410 rawString = obj.getString();
411
412 // Convert rawString to unicode
413 if (hasUnicodeByteOrderMark(s: rawString->toStr())) {
414 isUnicode = true;
415 unicodeLength = (obj.getString()->getLength() - 2) / 2;
416 } else {
417 isUnicode = false;
418 unicodeLength = obj.getString()->getLength();
419 }
420 unicodeString = new Unicode[unicodeLength];
421
422 for (int i = 0; i < unicodeLength; i++) {
423 if (isUnicode) {
424 unicodeString[i] = ((rawString->getChar(i: (i + 1) * 2) & 0xff) << 8) | (rawString->getChar(i: ((i + 1) * 2) + 1) & 0xff);
425 } else {
426 unicodeString[i] = pdfDocEncoding[rawString->getChar(i) & 0xff];
427 }
428 }
429
430 // HTML escape and encode unicode
431 encodedString = HtmlFont::HtmlFilter(u: unicodeString, uLen: unicodeLength);
432 delete[] unicodeString;
433 }
434
435 return encodedString;
436}
437
438static GooString *getInfoDate(Dict *infoDict, const char *key)
439{
440 Object obj;
441 int year, mon, day, hour, min, sec, tz_hour, tz_minute;
442 char tz;
443 struct tm tmStruct;
444 GooString *result = nullptr;
445 char buf[256];
446
447 obj = infoDict->lookup(key);
448 if (obj.isString()) {
449 const GooString *s = obj.getString();
450 // TODO do something with the timezone info
451 if (parseDateString(date: s, year: &year, month: &mon, day: &day, hour: &hour, minute: &min, second: &sec, tz: &tz, tzHour: &tz_hour, tzMinute: &tz_minute)) {
452 tmStruct.tm_year = year - 1900;
453 tmStruct.tm_mon = mon - 1;
454 tmStruct.tm_mday = day;
455 tmStruct.tm_hour = hour;
456 tmStruct.tm_min = min;
457 tmStruct.tm_sec = sec;
458 tmStruct.tm_wday = -1;
459 tmStruct.tm_yday = -1;
460 tmStruct.tm_isdst = -1;
461 mktime(tp: &tmStruct); // compute the tm_wday and tm_yday fields
462 if (strftime(s: buf, maxsize: sizeof(buf), format: "%Y-%m-%dT%H:%M:%S+00:00", tp: &tmStruct)) {
463 result = new GooString(buf);
464 } else {
465 result = new GooString(s);
466 }
467 } else {
468 result = new GooString(s);
469 }
470 }
471 return result;
472}
473

// Source: poppler/utils/pdftohtml.cc