1//========================================================================
2//
3// HtmlOutputDev.h
4//
5// Copyright 1997 Derek B. Noonburg
6//
7// Changed 1999 by G.Ovtcharov
8//========================================================================
9
10//========================================================================
11//
12// Modified under the Poppler project - http://poppler.freedesktop.org
13//
14// All changes made under the Poppler project to this file are licensed
15// under GPL version 2 or later
16//
17// Copyright (C) 2006, 2007, 2009, 2012, 2018-2022 Albert Astals Cid <aacid@kde.org>
18// Copyright (C) 2008, 2009 Warren Toomey <wkt@tuhs.org>
19// Copyright (C) 2009, 2011 Carlos Garcia Campos <carlosgc@gnome.org>
20// Copyright (C) 2009 Kovid Goyal <kovid@kovidgoyal.net>
21// Copyright (C) 2010 Hib Eris <hib@hiberis.nl>
22// Copyright (C) 2011 Joshua Richardson <jric@chegg.com>
23// Copyright (C) 2011 Stephen Reichling <sreichling@chegg.com>
24// Copyright (C) 2012 Igor Slepchin <igor.redhat@gmail.com>
25// Copyright (C) 2012 Fabio D'Urso <fabiodurso@hotmail.it>
26// Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
27// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
28// Copyright (C) 2019 Oliver Sander <oliver.sander@tu-dresden.de>
29// Copyright (C) 2024 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk>
30//
31// To see a description of the changes please see the Changelog file that
32// came with your tarball or type make ChangeLog if you are building from git
33//
34//========================================================================
35
36#ifndef HTMLOUTPUTDEV_H
37#define HTMLOUTPUTDEV_H
38
39#include <cstdio>
40#include "goo/gbasename.h"
41#include "GfxFont.h"
42#include "OutputDev.h"
43#include "HtmlLinks.h"
44#include "HtmlFonts.h"
45#include "Link.h"
46#include "Catalog.h"
47#include "UnicodeMap.h"
48
// Round a floating-point expression to an int by adding 0.5 and truncating.
// The argument is parenthesised so that an expression containing operators
// of lower precedence than '+' (for example a conditional expression) is
// rounded as a whole instead of being split by the macro expansion.
// Note: the cast truncates toward zero, so negative values are not rounded
// to the nearest integer.
#define xoutRound(x) ((int)((x) + 0.5))

// HTML5 document type declaration string.
#define DOCTYPE "<!DOCTYPE html>"
52
// Forward declarations of types that this header only refers to through
// pointers or references (listed alphabetically).
class GfxState;
class GooString;
class HtmlImage;
class OutlineItem;
class PDFDoc;
58//------------------------------------------------------------------------
59// HtmlString
60//------------------------------------------------------------------------
61
// Writing direction of a run of Unicode text. The numeric values are the
// ones the compiler would assign implicitly; they are spelled out here only
// for readability.
enum UnicodeTextDirection
{
    textDirUnknown = 0, // direction not determined
    textDirLeftRight = 1, // left-to-right text
    textDirRightLeft = 2, // right-to-left text
    textDirTopBottom = 3 // top-to-bottom text
};
69
70class HtmlString
71{
72public:
73 // Constructor.
74 HtmlString(GfxState *state, double fontSize, HtmlFontAccu *fonts);
75
76 // Destructor.
77 ~HtmlString();
78
79 HtmlString(const HtmlString &) = delete;
80 HtmlString &operator=(const HtmlString &) = delete;
81
82 // Add a character to the string.
83 void addChar(GfxState *state, double x, double y, double dx, double dy, Unicode u);
84 const HtmlLink *getLink() const { return link; }
85 const HtmlFont &getFont() const { return *fonts->Get(i: fontpos); }
86 void endString(); // postprocessing
87
88private:
89 // aender die text variable
90 const HtmlLink *link;
91 double xMin, xMax; // bounding box x coordinates
92 double yMin, yMax; // bounding box y coordinates
93 int col; // starting column
94 Unicode *text; // the text
95 double *xRight; // right-hand x coord of each char
96 HtmlString *yxNext; // next string in y-major order
97 HtmlString *xyNext; // next string in x-major order
98 int fontpos;
99 std::unique_ptr<GooString> htext;
100 int len; // length of text and xRight
101 int size; // size of text and xRight arrays
102 UnicodeTextDirection dir; // direction (left to right/right to left)
103 HtmlFontAccu *fonts;
104
105 friend class HtmlPage;
106};
107
108//------------------------------------------------------------------------
109// HtmlPage
110//------------------------------------------------------------------------
111
112class HtmlPage
113{
114public:
115 // Constructor.
116 explicit HtmlPage(bool rawOrder);
117
118 // Destructor.
119 ~HtmlPage();
120
121 HtmlPage(const HtmlPage &) = delete;
122 HtmlPage &operator=(const HtmlPage &) = delete;
123
124 // Begin a new string.
125 void beginString(GfxState *state, const GooString *s);
126
127 // Add a character to the current string.
128 void addChar(GfxState *state, double x, double y, double dx, double dy, double ox, double oy, const Unicode *u, int uLen); // unsigned char c);
129
130 void updateFont(GfxState *state);
131
132 // End the current string, sorting it into the list of strings.
133 void endString();
134
135 // Coalesce strings that look like parts of the same line.
136 void coalesce();
137
138 // Find a string. If <top> is true, starts looking at top of page;
139 // otherwise starts looking at <xMin>,<yMin>. If <bottom> is true,
140 // stops looking at bottom of page; otherwise stops looking at
141 // <xMax>,<yMax>. If found, sets the text bounding rectangle and
142 // returns true; otherwise returns false.
143
144 // new functions
145 void AddLink(const HtmlLink &x) { links->AddLink(x); }
146
147 // add an image to the current page
148 void addImage(std::unique_ptr<GooString> &&fname, GfxState *state);
149
150 // number of images on the current page
151 int getNumImages() { return imgList.size(); }
152
153 void dump(FILE *f, int pageNum, const std::vector<std::string> &backgroundImages);
154
155 // Clear the page.
156 void clear();
157
158 void conv();
159
160private:
161 const HtmlFont *getFont(HtmlString *hStr) const { return fonts->Get(i: hStr->fontpos); }
162
163 double fontSize; // current font size
164 bool rawOrder; // keep strings in content stream order
165
166 HtmlString *curStr; // currently active string
167
168 HtmlString *yxStrings; // strings in y-major order
169 HtmlString *xyStrings; // strings in x-major order
170 HtmlString *yxCur1, *yxCur2; // cursors for yxStrings list
171
172 void setDocName(const char *fname);
173 void dumpAsXML(FILE *f, int page);
174 void dumpComplex(FILE *f, int page, const std::vector<std::string> &backgroundImages);
175 int dumpComplexHeaders(FILE *const file, FILE *&pageFile, int page);
176
177 // marks the position of the fonts that belong to current page (for noframes)
178 int fontsPageMarker;
179 HtmlFontAccu *fonts;
180 HtmlLinks *links;
181 std::vector<HtmlImage *> imgList;
182
183 GooString *DocName;
184 int pageWidth;
185 int pageHeight;
186 int firstPage; // used to begin the numeration of pages
187
188 friend class HtmlOutputDev;
189};
190
191//------------------------------------------------------------------------
192// HtmlMetaVar
193//------------------------------------------------------------------------
194class HtmlMetaVar
195{
196public:
197 HtmlMetaVar(const char *_name, const char *_content);
198 ~HtmlMetaVar();
199
200 HtmlMetaVar(const HtmlMetaVar &) = delete;
201 HtmlMetaVar &operator=(const HtmlMetaVar &) = delete;
202
203 GooString *toString() const;
204
205private:
206 GooString *name;
207 GooString *content;
208};
209
210//------------------------------------------------------------------------
211// HtmlOutputDev
212//------------------------------------------------------------------------
213
214class HtmlOutputDev : public OutputDev
215{
216public:
217 // Open a text output file. If <fileName> is nullptr, no file is written
218 // (this is useful, e.g., for searching text). If <useASCII7> is true,
219 // text is converted to 7-bit ASCII; otherwise, text is converted to
220 // 8-bit ISO Latin-1. <useASCII7> should also be set for Japanese
221 // (EUC-JP) text. If <rawOrder> is true, the text is kept in content
222 // stream order.
223 HtmlOutputDev(Catalog *catalogA, const char *fileName, const char *title, const char *author, const char *keywords, const char *subject, const char *date, bool rawOrder, int firstPage = 1, bool outline = false);
224
225 // Destructor.
226 ~HtmlOutputDev() override;
227
228 // Check if file was successfully created.
229 virtual bool isOk() { return ok; }
230
231 //---- get info about output device
232
233 // Does this device use upside-down coordinates?
234 // (Upside-down means (0,0) is the top left corner of the page.)
235 bool upsideDown() override { return true; }
236
237 // Does this device use drawChar() or drawString()?
238 bool useDrawChar() override { return true; }
239
240 // Does this device use beginType3Char/endType3Char? Otherwise,
241 // text in Type 3 fonts will be drawn with drawChar/drawString.
242 bool interpretType3Chars() override { return false; }
243
244 // Does this device need non-text content?
245 bool needNonText() override { return true; }
246
247 //----- initialization and control
248
249 bool checkPageSlice(Page *p, double hDPI, double vDPI, int rotate, bool useMediaBox, bool crop, int sliceX, int sliceY, int sliceW, int sliceH, bool printing, bool (*abortCheckCbk)(void *data) = nullptr,
250 void *abortCheckCbkData = nullptr, bool (*annotDisplayDecideCbk)(Annot *annot, void *user_data) = nullptr, void *annotDisplayDecideCbkData = nullptr) override
251 {
252 docPage = p;
253 return true;
254 }
255
256 // Start a page.
257 void startPage(int pageNum, GfxState *state, XRef *xref) override;
258
259 // End a page.
260 void endPage() override;
261
262 // add a background image to the list of background images,
263 // as this seems to be done outside other processing. takes ownership of img.
264 void addBackgroundImage(const std::string &img);
265
266 //----- update text state
267 void updateFont(GfxState *state) override;
268
269 //----- text drawing
270 void beginString(GfxState *state, const GooString *s) override;
271 void endString(GfxState *state) override;
272 void drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, const Unicode *u, int uLen) override;
273
274 void drawImageMask(GfxState *state, Object *ref, Stream *str, int width, int height, bool invert, bool interpolate, bool inlineImg) override;
275 void drawImage(GfxState *state, Object *ref, Stream *str, int width, int height, GfxImageColorMap *colorMap, bool interpolate, const int *maskColors, bool inlineImg) override;
276
277 // new feature
278 virtual int DevType() { return 1234; }
279
280 int getPageWidth() { return maxPageWidth; }
281 int getPageHeight() { return maxPageHeight; }
282
283 bool dumpDocOutline(PDFDoc *doc);
284
285private:
286 // convert encoding into a HTML standard, or encoding->c_str if not
287 // recognized.
288 static std::string mapEncodingToHtml(const std::string &encoding);
289 void doProcessLink(AnnotLink *link);
290 GooString *getLinkDest(AnnotLink *link);
291 void dumpMetaVars(FILE *);
292 void doFrame(int firstPage);
293 bool newHtmlOutlineLevel(FILE *output, const std::vector<OutlineItem *> *outlines, int level = 1);
294 void newXmlOutlineLevel(FILE *output, const std::vector<OutlineItem *> *outlines);
295 int getOutlinePageNum(OutlineItem *item);
296 void drawJpegImage(GfxState *state, Stream *str);
297 void drawPngImage(GfxState *state, Stream *str, int width, int height, GfxImageColorMap *colorMap, bool isMask = false);
298 std::unique_ptr<GooString> createImageFileName(const char *ext);
299
300 FILE *fContentsFrame;
301 FILE *page; // html file
302 // FILE *tin; // image log file
303 // bool write;
304 bool needClose; // need to close the file?
305 HtmlPage *pages; // text for the current page
306 bool rawOrder; // keep text in content stream order
307 bool doOutline; // output document outline
308 bool ok; // set up ok?
309 bool dumpJPEG;
310 int pageNum;
311 int maxPageWidth;
312 int maxPageHeight;
313 GooString *Docname;
314 GooString *docTitle;
315 std::vector<HtmlMetaVar *> glMetaVars;
316 Catalog *catalog;
317 Page *docPage;
318 std::vector<std::string> backgroundImages;
319 friend class HtmlPage;
320};
321
322#endif
323

// Source file: poppler/utils/HtmlOutputDev.h