1//========================================================================
2//
3// StructElement.h
4//
5// This file is licensed under the GPLv2 or later
6//
7// Copyright 2013, 2014 Igalia S.L.
8// Copyright 2014 Luigi Scarso <luigi.scarso@gmail.com>
9// Copyright 2014, 2018, 2019, 2021, 2023 Albert Astals Cid <aacid@kde.org>
10// Copyright 2018 Adam Reichold <adam.reichold@t-online.de>
11// Copyright 2021, 2023 Adrian Johnson <ajohnson@redneon.com>
12//
13//========================================================================
14
15#ifndef STRUCTELEMENT_H
16#define STRUCTELEMENT_H
17
18#include "goo/GooString.h"
19#include "MarkedContentOutputDev.h"
20#include "Object.h"
21#include "poppler_private_export.h"
22#include <vector>
23#include <set>
24
25class GooString;
26class Dict;
27class StructElement;
28class StructTreeRoot;
29
30class POPPLER_PRIVATE_EXPORT Attribute
31{
32public:
33 enum Type
34 {
35 Unknown = 0, // Uninitialized, parsing error, etc.
36 UserProperty, // User defined attribute (i.e. non-standard)
37
38 // Common standard attributes
39 Placement,
40 WritingMode,
41 BackgroundColor,
42 BorderColor,
43 BorderStyle,
44 BorderThickness,
45 Color,
46 Padding,
47
48 // Block element standard attributes
49 SpaceBefore,
50 SpaceAfter,
51 StartIndent,
52 EndIndent,
53 TextIndent,
54 TextAlign,
55 BBox,
56 Width,
57 Height,
58 BlockAlign,
59 InlineAlign,
60 TBorderStyle,
61 TPadding,
62
63 // Inline element standard attributes
64 BaselineShift,
65 LineHeight,
66 TextDecorationColor,
67 TextDecorationThickness,
68 TextDecorationType,
69 RubyAlign,
70 RubyPosition,
71 GlyphOrientationVertical,
72
73 // Column-only standard attributes
74 ColumnCount,
75 ColumnGap,
76 ColumnWidths,
77
78 // List-only standard attributes
79 ListNumbering,
80
81 // PrintField-only standard attributes
82 Role,
83 checked,
84 Desc,
85
86 // Table-only standard attributes
87 RowSpan,
88 ColSpan,
89 Headers,
90 Scope,
91 Summary,
92 };
93
94 enum Owner
95 {
96 UnknownOwner = 0,
97 // User-defined attributes
98 UserProperties,
99 // Standard attributes
100 Layout,
101 List,
102 PrintField,
103 Table,
104 // Translation to other formats
105 XML_1_00,
106 HTML_3_20,
107 HTML_4_01,
108 OEB_1_00,
109 RTF_1_05,
110 CSS_1_00,
111 CSS_2_00,
112 };
113
114 // Creates a standard attribute. The name is predefined, and the
115 // value is type-checked to conform to the PDF specification.
116 Attribute(Type type, Object *value);
117
118 // Creates an UserProperty attribute, with an arbitrary name and value.
119 Attribute(GooString &&name, Object *value);
120
121 bool isOk() const { return type != Unknown; }
122
123 // Name, type and value can be set only on construction.
124 Type getType() const { return type; }
125 Owner getOwner() const { return owner; }
126 const char *getTypeName() const;
127 const char *getOwnerName() const;
128 const Object *getValue() const { return &value; }
129 static Object *getDefaultValue(Type type);
130
131 // The caller gets the ownership of the return GooString and is responsible of deleting it
132 std::unique_ptr<GooString> getName() const { return std::make_unique<GooString>(args: type == UserProperty ? name.c_str() : getTypeName()); }
133
134 // The revision is optional, and defaults to zero.
135 unsigned int getRevision() const { return revision; }
136 void setRevision(unsigned int revisionA) { revision = revisionA; }
137
138 // Hidden elements should not be displayed by the user agent
139 bool isHidden() const { return hidden; }
140 void setHidden(bool hiddenA) { hidden = hiddenA; }
141
142 // The formatted value may be in the PDF, or be left undefined (nullptr).
143 // In the later case the user agent should provide a default representation.
144 const char *getFormattedValue() const { return formatted ? formatted->c_str() : nullptr; }
145 void setFormattedValue(const char *formattedA);
146
147 ~Attribute();
148
149private:
150 Type type;
151 Owner owner;
152 unsigned int revision;
153 GooString name;
154 Object value;
155 bool hidden;
156 GooString *formatted;
157
158 bool checkType(StructElement *element = nullptr);
159 static Type getTypeForName(const char *name, StructElement *element = nullptr);
160 static Attribute *parseUserProperty(Dict *property);
161
162 friend class StructElement;
163};
164
165class POPPLER_PRIVATE_EXPORT StructElement
166{
167public:
168 enum Type
169 {
170 Unknown = 0,
171 MCID, // MCID reference, used internally
172 OBJR, // Object reference, used internally
173
174 Document,
175 Part,
176 Art,
177 Sect,
178 Div, // Structural elements
179
180 Span,
181 Quote,
182 Note,
183 Reference,
184 BibEntry, // Inline elements
185 Code,
186 Link,
187 Annot,
188 BlockQuote,
189 Caption,
190 NonStruct,
191 TOC,
192 TOCI,
193 Index,
194 Private,
195
196 P,
197 H,
198 H1,
199 H2,
200 H3,
201 H4,
202 H5,
203 H6, // Paragraph-like
204
205 L,
206 LI,
207 Lbl,
208 LBody, // List elements
209
210 Table,
211 TR,
212 TH,
213 TD,
214 THead,
215 TFoot,
216 TBody, // Table elements
217
218 Ruby,
219 RB,
220 RT,
221 RP, // Ruby text elements
222 Warichu,
223 WT,
224 WP,
225
226 Figure,
227 Formula,
228 Form, // Illustration-like elements
229 };
230
231 static const Ref InvalidRef;
232
233 const char *getTypeName() const;
234 Type getType() const { return type; }
235 bool isOk() const { return type != Unknown; }
236 bool isBlock() const;
237 bool isInline() const;
238 bool isGrouping() const;
239
240 inline bool isContent() const { return (type == MCID) || isObjectRef(); }
241 inline bool isObjectRef() const { return (type == OBJR && c->ref != Ref::INVALID()); }
242
243 int getMCID() const { return c->mcid; }
244 Ref getObjectRef() const { return c->ref; }
245 Ref getParentRef() const { return isContent() ? parent->getParentRef() : s->parentRef; }
246 StructElement *getParent() const { return parent; } // returns NULL if parent is StructTreeRoot
247 bool hasPageRef() const;
248 bool getPageRef(Ref &ref) const;
249 bool hasStmRef() const { return stmRef.isRef(); }
250 bool getStmRef(Ref &ref) const;
251 StructTreeRoot *getStructTreeRoot() { return treeRoot; }
252
253 // Optional element identifier.
254 const GooString *getID() const { return isContent() ? nullptr : s->id; }
255 GooString *getID() { return isContent() ? nullptr : s->id; }
256
257 // Optional ISO language name, e.g. en_US
258 GooString *getLanguage()
259 {
260 if (!isContent() && s->language) {
261 return s->language;
262 }
263 return parent ? parent->getLanguage() : nullptr;
264 }
265 const GooString *getLanguage() const
266 {
267 if (!isContent() && s->language) {
268 return s->language;
269 }
270 return parent ? parent->getLanguage() : nullptr;
271 }
272
273 // Optional revision number, defaults to zero.
274 unsigned int getRevision() const { return isContent() ? 0 : s->revision; }
275 void setRevision(unsigned int revision)
276 {
277 if (isContent()) {
278 s->revision = revision;
279 }
280 }
281
282 // Optional element title, in human-readable form.
283 const GooString *getTitle() const { return isContent() ? nullptr : s->title; }
284 GooString *getTitle() { return isContent() ? nullptr : s->title; }
285
286 // Optional element expanded abbreviation text.
287 const GooString *getExpandedAbbr() const { return isContent() ? nullptr : s->expandedAbbr; }
288 GooString *getExpandedAbbr() { return isContent() ? nullptr : s->expandedAbbr; }
289
290 unsigned getNumChildren() const { return isContent() ? 0 : s->elements.size(); }
291 const StructElement *getChild(int i) const { return isContent() ? nullptr : s->elements.at(n: i); }
292 StructElement *getChild(int i) { return isContent() ? nullptr : s->elements.at(n: i); }
293
294 void appendChild(StructElement *element)
295 {
296 if (!isContent() && element && element->isOk()) {
297 s->elements.push_back(x: element);
298 }
299 }
300
301 unsigned getNumAttributes() const { return isContent() ? 0 : s->attributes.size(); }
302 const Attribute *getAttribute(int i) const { return isContent() ? nullptr : s->attributes.at(n: i); }
303 Attribute *getAttribute(int i) { return isContent() ? nullptr : s->attributes.at(n: i); }
304
305 void appendAttribute(Attribute *attribute)
306 {
307 if (!isContent() && attribute) {
308 s->attributes.push_back(x: attribute);
309 }
310 }
311
312 const Attribute *findAttribute(Attribute::Type attributeType, bool inherit = false, Attribute::Owner owner = Attribute::UnknownOwner) const;
313
314 const GooString *getAltText() const { return isContent() ? nullptr : s->altText; }
315 GooString *getAltText() { return isContent() ? nullptr : s->altText; }
316
317 const GooString *getActualText() const { return isContent() ? nullptr : s->actualText; }
318 GooString *getActualText() { return isContent() ? nullptr : s->actualText; }
319
320 // Content text referenced by the element:
321 //
322 // - For MCID reference elements, this is just the text of the
323 // corresponding marked content object in the page stream, regardless
324 // of the setting of the "recursive" flag.
325 // - For other elements, if the "recursive" flag is set, the text
326 // enclosed by *all* the child MCID reference elements of the subtree
327 // is returned. The text is assembled by traversing the leaf MCID
328 // reference elements in logical order.
329 // - In any other case, the function returns nullptr.
330 //
331 // A new string is returned, and the ownership passed to the caller.
332 //
333 GooString *getText(bool recursive = true) const { return appendSubTreeText(string: nullptr, recursive); }
334
335 const TextSpanArray getTextSpans() const
336 {
337 if (!isContent()) {
338 return TextSpanArray();
339 }
340 MarkedContentOutputDev mcdev(getMCID(), stmRef);
341 return getTextSpansInternal(mcdev);
342 }
343
344 ~StructElement();
345
346private:
347 GooString *appendSubTreeText(GooString *string, bool recursive) const;
348 const TextSpanArray &getTextSpansInternal(MarkedContentOutputDev &mcdev) const;
349
350 typedef std::vector<Attribute *> AttrPtrArray;
351 typedef std::vector<StructElement *> ElemPtrArray;
352
353 struct StructData
354 {
355 Ref parentRef;
356 GooString *altText;
357 GooString *actualText;
358 GooString *id;
359 GooString *title;
360 GooString *expandedAbbr;
361 GooString *language;
362 unsigned int revision;
363 ElemPtrArray elements;
364 AttrPtrArray attributes;
365
366 StructData();
367 ~StructData();
368
369 StructData(const StructData &) = delete;
370 StructData &operator=(const StructData &) = delete;
371 };
372
373 // Data in content elements (MCID, MCR)
374 struct ContentData
375 {
376 union {
377 int mcid;
378 Ref ref;
379 };
380
381 explicit ContentData(int mcidA) : mcid(mcidA) { }
382 explicit ContentData(const Ref r) { ref = r; }
383 };
384
385 // Common data
386 Type type;
387 StructTreeRoot *treeRoot;
388 StructElement *parent;
389 mutable Object pageRef;
390 Object stmRef;
391
392 union {
393 StructData *s;
394 ContentData *c;
395 };
396
397 StructElement(Dict *elementDict, StructTreeRoot *treeRootA, StructElement *parentA, RefRecursionChecker &seen);
398 StructElement(int mcid, StructTreeRoot *treeRootA, StructElement *parentA);
399 StructElement(const Ref ref, StructTreeRoot *treeRootA, StructElement *parentA);
400
401 void parse(Dict *elementDict);
402 StructElement *parseChild(const Object *ref, Object *childObj, RefRecursionChecker &seen);
403 void parseChildren(Dict *element, RefRecursionChecker &seen);
404 void parseAttributes(Dict *attributes, bool keepExisting = false);
405
406 friend class StructTreeRoot;
407};
408
409#endif
410

source code of poppler/poppler/StructElement.h