// pdftohtml.cc source code [poppler/utils/pdftohtml.cc]

//========================================================================
//
// pdftohtml.cc
//
//
// Copyright 1999-2000 G. Ovtcharov
//========================================================================

//========================================================================
//
// Modified under the Poppler project - http://poppler.freedesktop.org
//
// All changes made under the Poppler project to this file are licensed
// under GPL version 2 or later
//
// Copyright (C) 2007-2008, 2010, 2012, 2015-2020, 2022 Albert Astals Cid <aacid@kde.org>
// Copyright (C) 2010 Hib Eris <hib@hiberis.nl>
// Copyright (C) 2010 Mike Slegeir <tehpola@yahoo.com>
// Copyright (C) 2010, 2013 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
// Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac@cdacmumbai.in) and Onkar Potdar (onkar@cdacmumbai.in)
// Copyright (C) 2011 Steven Murdoch <Steven.Murdoch@cl.cam.ac.uk>
// Copyright (C) 2012 Igor Slepchin <igor.redhat@gmail.com>
// Copyright (C) 2012 Ihar Filipau <thephilips@gmail.com>
// Copyright (C) 2012 Luis Parravicini <lparravi@gmail.com>
// Copyright (C) 2014 Pino Toscano <pino@kde.org>
// Copyright (C) 2015 William Bader <williambader@hotmail.com>
// Copyright (C) 2017, 2021 Adrian Johnson <ajohnson@redneon.com>
// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
// Copyright (C) 2018 Thibaut Brard <thibaut.brard@gmail.com>
// Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de>
// Copyright (C) 2019, 2021, 2024 Oliver Sander <oliver.sander@tu-dresden.de>
// Copyright (C) 2021 Hubert Figuiere <hub@figuiere.net>
// Copyright (C) 2024 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
//
//========================================================================

#include "config.h"
#include <poppler-config.h>
#include <cstdio>
#include <cstdlib>
#include <cstddef>
#include <cstring>
#ifdef HAVE_DIRENT_H
#    include <dirent.h>
#endif
#include <ctime>
#include <memory>
#include <optional>
#include <string>
#include <vector>
#include "parseargs.h"
#include "goo/GooString.h"
#include "goo/gbase64.h"
#include "goo/gbasename.h"
#include "goo/gmem.h"
#include "Object.h"
#include "Stream.h"
#include "Array.h"
#include "Dict.h"
#include "XRef.h"
#include "Catalog.h"
#include "Page.h"
#include "Outline.h"
#include "PDFDoc.h"
#include "PDFDocFactory.h"
#include "HtmlOutputDev.h"
#include "SplashOutputDev.h"
#include "splash/SplashBitmap.h"
#include "GlobalParams.h"
#include "PDFDocEncoding.h"
#include "Error.h"
#include "DateInfo.h"
#include "goo/gfile.h"
#include "Win32Console.h"
#include "InMemoryFile.h"
#include "UTF.h"

// ---------------------------------------------------------------------------
// Command-line option state.
//
// Most of these variables are the destinations of a switch in argDesc[]; the
// switch is noted next to each one.  Variables declared without `static` have
// external linkage, so their names and types must stay as they are
// (presumably they are read by the HTML output code in other translation
// units -- confirm before renaming).
// ---------------------------------------------------------------------------
static int firstPage = 1; // -f: first page to convert
static int lastPage = 0; // -l: last page to convert; values < 1 are replaced by the page count in main()
static bool rawOrder = true; // recomputed in main() from complexMode / singleHtml (the -raw switch is disabled)
bool printCommands = true; // cleared in main() when -q is given
static bool printHelp = false; // -h, -?, -help, --help
bool printHtml = false; // -p: exchange .pdf links by .html
bool complexMode = false; // -c: generate complex document
bool singleHtml = false; // -s: generate single document that includes all pages
bool dataUrls = false; // -dataurls: use data URLs instead of external images
bool ignore = false; // -i: ignore images
static char extension[5] = "png"; // -fmt: image file format for Splash output ("png" or "jpg")
static double scale = 1.5; // -zoom: clamped to [0.5, 3.0] in main()
bool noframes = false; // -noframes
bool stout = false; // -stdout: use standard output
bool xml = false; // -xml: output for XML post-processing
bool noRoundedCoordinates = false; // -noroundcoord (XML output only)
static bool errQuiet = false; // -q: don't print any messages or errors
static bool noDrm = false; // -nodrm: override document DRM settings
double wordBreakThreshold = 10; // -wbt: 10%, below converted into a coefficient - 0.1

bool showHidden = false; // -hidden: output hidden text
bool noMerge = false; // -nomerge: do not merge paragraphs
bool fontFullName = false; // -fontfullname: outputs font full name
static char ownerPassword[33] = ""; // -opw: owner password (for encrypted files)
static char userPassword[33] = ""; // -upw: user password (for encrypted files)
static bool printVersion = false; // -v: print copyright and version info

104	static std::unique_ptr<GooString> getInfoString(Dict infoDict, const* char *key);
105	static GooString getInfoDate(Dict infoDict, const char *key);
106
// Destination buffer of the "-enc" switch (output text encoding name).
// main() only applies it when the first character is non-zero, i.e. when the option was given.
static char textEncName[128] = "";

109	static const ArgDesc argDesc[] = { { .arg: "-f", .kind: argInt, .val: &firstPage, .size: `0`, .usage: "first page to convert" },
110	{ .arg: "-l", .kind: argInt, .val: &lastPage, .size: `0`, .usage: "last page to convert" },
111	/{"-raw", argFlag, &rawOrder, 0,*
112	"keep strings in content stream order"},/*
113	{ .arg: "-q", .kind: argFlag, .val: &errQuiet, .size: `0`, .usage: "don't print any messages or errors" },
114	{ .arg: "-h", .kind: argFlag, .val: &printHelp, .size: `0`, .usage: "print usage information" },
115	{ .arg: "-?", .kind: argFlag, .val: &printHelp, .size: `0`, .usage: "print usage information" },
116	{ .arg: "-help", .kind: argFlag, .val: &printHelp, .size: `0`, .usage: "print usage information" },
117	{ .arg: "--help", .kind: argFlag, .val: &printHelp, .size: `0`, .usage: "print usage information" },
118	{ .arg: "-p", .kind: argFlag, .val: &printHtml, .size: `0`, .usage: "exchange .pdf links by .html" },
119	{ .arg: "-c", .kind: argFlag, .val: &complexMode, .size: `0`, .usage: "generate complex document" },
120	{ .arg: "-s", .kind: argFlag, .val: &singleHtml, .size: `0`, .usage: "generate single document that includes all pages" },
121	#ifdef HAVE_IN_MEMORY_FILE
122	{ .arg: "-dataurls", .kind: argFlag, .val: &dataUrls, .size: `0`, .usage: "use data URLs instead of external images in HTML" },
123	#endif
124	{ .arg: "-i", .kind: argFlag, .val: &ignore, .size: `0`, .usage: "ignore images" },
125	{ .arg: "-noframes", .kind: argFlag, .val: &noframes, .size: `0`, .usage: "generate no frames" },
126	{ .arg: "-stdout", .kind: argFlag, .val: &stout, .size: `0`, .usage: "use standard output" },
127	{ .arg: "-zoom", .kind: argFP, .val: &scale, .size: `0`, .usage: "zoom the pdf document (default 1.5)" },
128	{ .arg: "-xml", .kind: argFlag, .val: &xml, .size: `0`, .usage: "output for XML post-processing" },
129	{ .arg: "-noroundcoord", .kind: argFlag, .val: &noRoundedCoordinates, .size: `0`, .usage: "do not round coordinates (with XML output only)" },
130	{ .arg: "-hidden", .kind: argFlag, .val: &showHidden, .size: `0`, .usage: "output hidden text" },
131	{ .arg: "-nomerge", .kind: argFlag, .val: &noMerge, .size: `0`, .usage: "do not merge paragraphs" },
132	{ .arg: "-enc", .kind: argString, .val: textEncName, .size: sizeof(textEncName), .usage: "output text encoding name" },
133	{ .arg: "-fmt", .kind: argString, .val: extension, .size: sizeof(extension), .usage: "image file format for Splash output (png or jpg)" },
134	{ .arg: "-v", .kind: argFlag, .val: &printVersion, .size: `0`, .usage: "print copyright and version info" },
135	{ .arg: "-opw", .kind: argString, .val: ownerPassword, .size: sizeof(ownerPassword), .usage: "owner password (for encrypted files)" },
136	{ .arg: "-upw", .kind: argString, .val: userPassword, .size: sizeof(userPassword), .usage: "user password (for encrypted files)" },
137	{ .arg: "-nodrm", .kind: argFlag, .val: &noDrm, .size: `0`, .usage: "override document DRM settings" },
138	{ .arg: "-wbt", .kind: argFP, .val: &wordBreakThreshold, .size: `0`, .usage: "word break threshold (default 10 percent)" },
139	{ .arg: "-fontfullname", .kind: argFlag, .val: &fontFullName, .size: `0`, .usage: "outputs font full name" },
140	{} };
141
142	class SplashOutputDevNoText : public SplashOutputDev
143	{
144	public:
145	SplashOutputDevNoText(SplashColorMode colorModeA, int bitmapRowPadA, bool reverseVideoA, SplashColorPtr paperColorA, bool bitmapTopDownA = true)
146	: SplashOutputDev (colorModeA, bitmapRowPadA, reverseVideoA, paperColorA, bitmapTopDownA) { }
147	~SplashOutputDevNoText() override;
148
149	void drawChar(GfxState state, double* x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, const Unicode u, int* uLen) override { }
150	bool beginType3Char(GfxState state, double* x, double y, double dx, double dy, CharCode code, const Unicode u, int* uLen) override { return false; }
151	void endType3Char(GfxState *state) override { }
152	void beginTextObject(GfxState *state) override { }
153	void endTextObject(GfxState *state) override { }
154	bool interpretType3Chars() override { return false; }
155	};
156
157	SplashOutputDevNoText::~SplashOutputDevNoText() = default;
158
159	int main(int argc, char *argv[])
160	{
161	std::unique_ptr<PDFDoc> doc;
162	GooString fileName = nullptr*;
163	std::unique_ptr<GooString> docTitle;
164	std::unique_ptr<GooString> author;
165	std::unique_ptr<GooString> keywords;
166	std::unique_ptr<GooString> subject;
167	GooString date = nullptr*;
168	GooString htmlFileName = nullptr*;
169	HtmlOutputDev htmlOut = nullptr*;
170	SplashOutputDev splashOut = nullptr*;
171	bool doOutline;
172	bool ok;
173	std::optional<GooString> ownerPW, userPW;
174	Object info;
175	int exit_status = EXIT_FAILURE;
176
177	Win32Console win32Console(&argc, &argv);
178	// parse args
179	ok = parseArgs(args: argDesc, argc: &argc, argv);
180	if (!ok \|\| argc < `2` \|\| argc > `3` \|\| printHelp \|\| printVersion) {
181	fprintf(stderr, format: "pdftohtml version %s\n", PACKAGE_VERSION);
182	fprintf(stderr, format: "%s\n", popplerCopyright);
183	fprintf(stderr, format: "%s\n", "Copyright 1999-2003 Gueorgui Ovtcharov and Rainer Dorsch");
184	fprintf(stderr, format: "%s\n\n", xpdfCopyright);
185	if (!printVersion) {
186	printUsage(program: "pdftohtml", otherArgs: "<PDF-file> [<html-file> <xml-file>]", args: argDesc);
187	}
188	exit(status: printHelp \|\| printVersion ? `0` : `1`);
189	}
190
191	// init error file
192	// errorInit();
193
194	// read config file
195	globalParams = std::make_unique<GlobalParams>();
196
197	if (errQuiet) {
198	globalParams ->setErrQuiet(errQuiet);
199	printCommands = false; // I'm not 100% what is the difference between them
200	}
201
202	if (textEncName[`0`]) {
203	globalParams ->setTextEncoding(textEncName);
204	if (!globalParams ->getTextEncoding()) {
205	goto error;
206	}
207	}
208
209	// convert from user-friendly percents into a coefficient
210	wordBreakThreshold /= `100.0`;
211
212	// open PDF file
213	if (ownerPassword[`0`]) {
214	ownerPW = GooString (ownerPassword);
215	}
216	if (userPassword[`0`]) {
217	userPW = GooString (userPassword);
218	}
219
220	fileName = new GooString (argv[`1`]);
221
222	if (fileName->cmp(sA: "-") == `0`) {
223	delete fileName;
224	fileName = new GooString ("fd://0");
225	}
226
227	doc = PDFDocFactory ().createPDFDoc(uri: *fileName, ownerPassword: ownerPW, userPassword: userPW);
228
229	if (!doc ->isOk()) {
230	goto error;
231	}
232
233	// check for copy permission
234	if (!doc ->okToCopy()) {
235	if (!noDrm) {
236	error(category: errNotAllowed, pos: -`1`, msg: "Copying of text from this document is not allowed.");
237	goto error;
238	}
239	fprintf(stderr, format: "Document has copy-protection bit set.\n");
240	}
241
242	// construct text file name
243	if (argc == `3`) {
244	GooString tmp = new* GooString (argv[`2`]);
245	if (!xml) {
246	if (tmp->getLength() >= `5`) {
247	const char *p = tmp->c_str() + tmp->getLength() - `5`;
248	if (!strcmp(s1: p, s2: ".html") \|\| !strcmp(s1: p, s2: ".HTML")) {
249	htmlFileName = new GooString (tmp->c_str(), tmp->getLength() - `5`);
250	}
251	}
252	} else {
253	if (tmp->getLength() >= `4`) {
254	const char *p = tmp->c_str() + tmp->getLength() - `4`;
255	if (!strcmp(s1: p, s2: ".xml") \|\| !strcmp(s1: p, s2: ".XML")) {
256	htmlFileName = new GooString (tmp->c_str(), tmp->getLength() - `4`);
257	}
258	}
259	}
260	if (!htmlFileName) {
261	htmlFileName = new GooString (tmp);
262	}
263	delete tmp;
264	} else if (fileName->cmp(sA: "fd://0") == `0`) {
265	error(category: errCommandLine, pos: -`1`, msg: "You have to provide an output filename when reading from stdin.");
266	goto error;
267	} else {
268	const char *p = fileName->c_str() + fileName->getLength() - `4`;
269	if (!strcmp(s1: p, s2: ".pdf") \|\| !strcmp(s1: p, s2: ".PDF")) {
270	htmlFileName = new GooString (fileName->c_str(), fileName->getLength() - `4`);
271	} else {
272	htmlFileName = fileName->copy();
273	}
274	// htmlFileName->append(".html");
275	}
276
277	if (scale > `3.0`) {
278	scale = `3.0`;
279	}
280	if (scale < `0.5`) {
281	scale = `0.5`;
282	}
283
284	if (complexMode) {
285	// noframes=false;
286	stout = false;
287	}
288
289	if (stout) {
290	noframes = true;
291	complexMode = false;
292	}
293
294	if (xml) {
295	complexMode = true;
296	singleHtml = false;
297	noframes = true;
298	noMerge = true;
299	}
300
301	// get page range
302	if (firstPage < `1`) {
303	firstPage = `1`;
304	}
305	if (lastPage < `1` \|\| lastPage > doc ->getNumPages()) {
306	lastPage = doc ->getNumPages();
307	}
308	if (lastPage < firstPage) {
309	error(category: errCommandLine, pos: -`1`, msg: "Wrong page range given: the first page ({0:d}) can not be after the last page ({1:d}).", firstPage, lastPage);
310	goto error;
311	}
312
313	info = doc ->getDocInfo();
314	if (info.isDict()) {
315	docTitle = getInfoString(infoDict: info.getDict(), key: "Title");
316	author = getInfoString(infoDict: info.getDict(), key: "Author");
317	keywords = getInfoString(infoDict: info.getDict(), key: "Keywords");
318	subject = getInfoString(infoDict: info.getDict(), key: "Subject");
319	date = getInfoDate(infoDict: info.getDict(), key: "ModDate");
320	if (!date) {
321	date = getInfoDate(infoDict: info.getDict(), key: "CreationDate");
322	}
323	}
324	if (!docTitle) {
325	docTitle = std::make_unique<GooString>(args&: htmlFileName);
326	}
327
328	if (!singleHtml) {
329	rawOrder = complexMode; // todo: figure out what exactly rawOrder do :)
330	} else {
331	rawOrder = singleHtml;
332	}
333
334	doOutline = doc ->getOutline()->getItems() != nullptr;
335	// write text file
336	htmlOut = new HtmlOutputDev (doc ->getCatalog(), htmlFileName->c_str(), docTitle ->c_str(), author ? author ->c_str() : nullptr, keywords ? keywords ->c_str() : nullptr, subject ? subject ->c_str() : nullptr, date ? date->c_str() : nullptr,
337	rawOrder, firstPage, doOutline);
338	if (date) {
339	delete date;
340	}
341
342	if ((complexMode \|\| singleHtml) && !xml && !ignore) {
343	// White paper color
344	SplashColor color;
345	color[`0`] = color[`1`] = color[`2`] = `255`;
346	// If the user specified "jpg" use JPEG, otherwise PNG
347	SplashImageFileFormat format = strcmp(s1: extension, s2: "jpg") ? splashFormatPng : splashFormatJpeg;
348
349	splashOut = new SplashOutputDevNoText (splashModeRGB8, `4`, false, color);
350	splashOut->startDoc(docA: doc.get());
351
352	for (int pg = firstPage; pg <= lastPage; ++pg) {
353	InMemoryFile imf;
354	doc ->displayPage(out: splashOut, page: pg, hDPI: `72` * scale, vDPI: `72` * scale, rotate: `0`, useMediaBox: true, crop: false, printing: false);
355	SplashBitmap *bitmap = splashOut->getBitmap();
356
357	const std::unique_ptr<GooString> imgFileName = GooString::format(fmt: "{0:s}{1:03d}.{2:s}", htmlFileName->c_str(), pg, extension);
358	auto f1 = dataUrls ? imf.open(mode: "wb") : fopen(filename: imgFileName ->c_str(), modes: "wb");
359	if (!f1) {
360	fprintf(stderr, format: "Could not open %s\n", imgFileName ->c_str());
361	continue;
362	}
363	bitmap->writeImgFile(format, f: f1, hDPI: `72` * scale, vDPI: `72` * scale);
364	fclose(stream: f1);
365	if (dataUrls) {
366	htmlOut->addBackgroundImage(img: std::string ((format == splashFormatJpeg) ? "data:image/jpeg;base64," : "data:image/png;base64,") + gbase64Encode(input: imf.getBuffer()));
367	} else {
368	htmlOut->addBackgroundImage(img: gbasename(filename: imgFileName ->c_str()));
369	}
370	}
371
372	delete splashOut;
373	}
374
375	if (htmlOut->isOk()) {
376	doc ->displayPages(out: htmlOut, firstPage, lastPage, hDPI: `72` * scale, vDPI: `72` * scale, rotate: `0`, useMediaBox: true, crop: false, printing: false);
377	htmlOut->dumpDocOutline(doc: doc.get());
378	}
379
380	delete htmlOut;
381
382	exit_status = EXIT_SUCCESS;
383
384	// clean up
385	error:
386	delete fileName;
387
388	if (htmlFileName) {
389	delete htmlFileName;
390	}
391
392	return exit_status;
393	}
394
395	static std::unique_ptr<GooString> getInfoString(Dict infoDict, const* char *key)
396	{
397	Object obj;
398	// Raw value as read from PDF (may be in pdfDocEncoding or UCS2)
399	const GooString *rawString;
400	// Value converted to unicode
401	Unicode *unicodeString;
402	int unicodeLength;
403	// Value HTML escaped and converted to desired encoding
404	std::unique_ptr<GooString> encodedString;
405	// Is rawString UCS2 (as opposed to pdfDocEncoding)
406	bool isUnicode;
407
408	obj = infoDict->lookup(key);
409	if (obj.isString()) {
410	rawString = obj.getString();
411
412	// Convert rawString to unicode
413	if (hasUnicodeByteOrderMark(s: rawString->toStr())) {
414	isUnicode = true;
415	unicodeLength = (obj.getString()->getLength() - `2`) / `2`;
416	} else {
417	isUnicode = false;
418	unicodeLength = obj.getString()->getLength();
419	}
420	unicodeString = new Unicode[unicodeLength];
421
422	for (int i = `0`; i < unicodeLength; i++) {
423	if (isUnicode) {
424	unicodeString[i] = ((rawString->getChar(i: (i + `1`) * `2`) & `0xff`) << `8`) \| (rawString->getChar(i: ((i + `1`) * `2`) + `1`) & `0xff`);
425	} else {
426	unicodeString[i] = pdfDocEncoding[rawString->getChar(i) & `0xff`];
427	}
428	}
429
430	// HTML escape and encode unicode
431	encodedString = HtmlFont::HtmlFilter(u: unicodeString, uLen: unicodeLength);
432	delete[] unicodeString;
433	}
434
435	return encodedString;
436	}
437
438	static GooString getInfoDate(Dict infoDict, const char *key)
439	{
440	Object obj;
441	int year, mon, day, hour, min, sec, tz_hour, tz_minute;
442	char tz;
443	struct tm tmStruct;
444	GooString result = nullptr*;
445	char buf[`256`];
446
447	obj = infoDict->lookup(key);
448	if (obj.isString()) {
449	const GooString *s = obj.getString();
450	// TODO do something with the timezone info
451	if (parseDateString(date: s, year: &year, month: &mon, day: &day, hour: &hour, minute: &min, second: &sec, tz: &tz, tzHour: &tz_hour, tzMinute: &tz_minute)) {
452	tmStruct.tm_year = year - `1900`;
453	tmStruct.tm_mon = mon - `1`;
454	tmStruct.tm_mday = day;
455	tmStruct.tm_hour = hour;
456	tmStruct.tm_min = min;
457	tmStruct.tm_sec = sec;
458	tmStruct.tm_wday = -`1`;
459	tmStruct.tm_yday = -`1`;
460	tmStruct.tm_isdst = -`1`;
461	mktime(tp: &tmStruct); // compute the tm_wday and tm_yday fields
462	if (strftime(s: buf, maxsize: sizeof(buf), format: "%Y-%m-%dT%H:%M:%S+00:00", tp: &tmStruct)) {
463	result = new GooString (buf);
464	} else {
465	result = new GooString (s);
466	}
467	} else {
468	result = new GooString (s);
469	}
470	}
471	return result;
472	}
473

// Provided by KDAB
//
// Definitions
//
// source code of poppler/utils/pdftohtml.cc