1 | //======================================================================== |
2 | // |
3 | // StructElement.h |
4 | // |
5 | // This file is licensed under the GPLv2 or later |
6 | // |
7 | // Copyright 2013, 2014 Igalia S.L. |
8 | // Copyright 2014 Luigi Scarso <luigi.scarso@gmail.com> |
9 | // Copyright 2014, 2018, 2019, 2021, 2023 Albert Astals Cid <aacid@kde.org> |
10 | // Copyright 2018 Adam Reichold <adam.reichold@t-online.de> |
11 | // Copyright 2021, 2023 Adrian Johnson <ajohnson@redneon.com> |
12 | // |
13 | //======================================================================== |
14 | |
15 | #ifndef STRUCTELEMENT_H |
16 | #define STRUCTELEMENT_H |
17 | |
#include "goo/GooString.h"
#include "MarkedContentOutputDev.h"
#include "Object.h"
#include "poppler_private_export.h"
#include <memory>
#include <set>
#include <vector>
24 | |
25 | class GooString; |
26 | class Dict; |
27 | class StructElement; |
28 | class StructTreeRoot; |
29 | |
30 | class POPPLER_PRIVATE_EXPORT Attribute |
31 | { |
32 | public: |
33 | enum Type |
34 | { |
35 | Unknown = 0, // Uninitialized, parsing error, etc. |
36 | UserProperty, // User defined attribute (i.e. non-standard) |
37 | |
38 | // Common standard attributes |
39 | Placement, |
40 | WritingMode, |
41 | BackgroundColor, |
42 | BorderColor, |
43 | BorderStyle, |
44 | BorderThickness, |
45 | Color, |
46 | Padding, |
47 | |
48 | // Block element standard attributes |
49 | SpaceBefore, |
50 | SpaceAfter, |
51 | StartIndent, |
52 | EndIndent, |
53 | TextIndent, |
54 | TextAlign, |
55 | BBox, |
56 | Width, |
57 | Height, |
58 | BlockAlign, |
59 | InlineAlign, |
60 | TBorderStyle, |
61 | TPadding, |
62 | |
63 | // Inline element standard attributes |
64 | BaselineShift, |
65 | LineHeight, |
66 | TextDecorationColor, |
67 | TextDecorationThickness, |
68 | TextDecorationType, |
69 | RubyAlign, |
70 | RubyPosition, |
71 | GlyphOrientationVertical, |
72 | |
73 | // Column-only standard attributes |
74 | ColumnCount, |
75 | ColumnGap, |
76 | ColumnWidths, |
77 | |
78 | // List-only standard attributes |
79 | ListNumbering, |
80 | |
81 | // PrintField-only standard attributes |
82 | Role, |
83 | checked, |
84 | Desc, |
85 | |
86 | // Table-only standard attributes |
87 | RowSpan, |
88 | ColSpan, |
89 | , |
90 | Scope, |
91 | Summary, |
92 | }; |
93 | |
94 | enum Owner |
95 | { |
96 | UnknownOwner = 0, |
97 | // User-defined attributes |
98 | UserProperties, |
99 | // Standard attributes |
100 | Layout, |
101 | List, |
102 | PrintField, |
103 | Table, |
104 | // Translation to other formats |
105 | XML_1_00, |
106 | HTML_3_20, |
107 | HTML_4_01, |
108 | OEB_1_00, |
109 | RTF_1_05, |
110 | CSS_1_00, |
111 | CSS_2_00, |
112 | }; |
113 | |
114 | // Creates a standard attribute. The name is predefined, and the |
115 | // value is type-checked to conform to the PDF specification. |
116 | Attribute(Type type, Object *value); |
117 | |
118 | // Creates an UserProperty attribute, with an arbitrary name and value. |
119 | Attribute(GooString &&name, Object *value); |
120 | |
121 | bool isOk() const { return type != Unknown; } |
122 | |
123 | // Name, type and value can be set only on construction. |
124 | Type getType() const { return type; } |
125 | Owner getOwner() const { return owner; } |
126 | const char *getTypeName() const; |
127 | const char *getOwnerName() const; |
128 | const Object *getValue() const { return &value; } |
129 | static Object *getDefaultValue(Type type); |
130 | |
131 | // The caller gets the ownership of the return GooString and is responsible of deleting it |
132 | std::unique_ptr<GooString> getName() const { return std::make_unique<GooString>(args: type == UserProperty ? name.c_str() : getTypeName()); } |
133 | |
134 | // The revision is optional, and defaults to zero. |
135 | unsigned int getRevision() const { return revision; } |
136 | void setRevision(unsigned int revisionA) { revision = revisionA; } |
137 | |
138 | // Hidden elements should not be displayed by the user agent |
139 | bool isHidden() const { return hidden; } |
140 | void setHidden(bool hiddenA) { hidden = hiddenA; } |
141 | |
142 | // The formatted value may be in the PDF, or be left undefined (nullptr). |
143 | // In the later case the user agent should provide a default representation. |
144 | const char *getFormattedValue() const { return formatted ? formatted->c_str() : nullptr; } |
145 | void setFormattedValue(const char *formattedA); |
146 | |
147 | ~Attribute(); |
148 | |
149 | private: |
150 | Type type; |
151 | Owner owner; |
152 | unsigned int revision; |
153 | GooString name; |
154 | Object value; |
155 | bool hidden; |
156 | GooString *formatted; |
157 | |
158 | bool checkType(StructElement *element = nullptr); |
159 | static Type getTypeForName(const char *name, StructElement *element = nullptr); |
160 | static Attribute *parseUserProperty(Dict *property); |
161 | |
162 | friend class StructElement; |
163 | }; |
164 | |
165 | class POPPLER_PRIVATE_EXPORT StructElement |
166 | { |
167 | public: |
168 | enum Type |
169 | { |
170 | Unknown = 0, |
171 | MCID, // MCID reference, used internally |
172 | OBJR, // Object reference, used internally |
173 | |
174 | Document, |
175 | Part, |
176 | Art, |
177 | Sect, |
178 | Div, // Structural elements |
179 | |
180 | Span, |
181 | Quote, |
182 | Note, |
183 | Reference, |
184 | BibEntry, // Inline elements |
185 | Code, |
186 | Link, |
187 | Annot, |
188 | BlockQuote, |
189 | Caption, |
190 | NonStruct, |
191 | TOC, |
192 | TOCI, |
193 | Index, |
194 | Private, |
195 | |
196 | P, |
197 | H, |
198 | H1, |
199 | H2, |
200 | H3, |
201 | H4, |
202 | H5, |
203 | H6, // Paragraph-like |
204 | |
205 | L, |
206 | LI, |
207 | Lbl, |
208 | LBody, // List elements |
209 | |
210 | Table, |
211 | TR, |
212 | TH, |
213 | TD, |
214 | THead, |
215 | , |
216 | TBody, // Table elements |
217 | |
218 | Ruby, |
219 | RB, |
220 | RT, |
221 | RP, // Ruby text elements |
222 | Warichu, |
223 | WT, |
224 | WP, |
225 | |
226 | Figure, |
227 | Formula, |
228 | Form, // Illustration-like elements |
229 | }; |
230 | |
231 | static const Ref InvalidRef; |
232 | |
233 | const char *getTypeName() const; |
234 | Type getType() const { return type; } |
235 | bool isOk() const { return type != Unknown; } |
236 | bool isBlock() const; |
237 | bool isInline() const; |
238 | bool isGrouping() const; |
239 | |
240 | inline bool isContent() const { return (type == MCID) || isObjectRef(); } |
241 | inline bool isObjectRef() const { return (type == OBJR && c->ref != Ref::INVALID()); } |
242 | |
243 | int getMCID() const { return c->mcid; } |
244 | Ref getObjectRef() const { return c->ref; } |
245 | Ref getParentRef() const { return isContent() ? parent->getParentRef() : s->parentRef; } |
246 | StructElement *getParent() const { return parent; } // returns NULL if parent is StructTreeRoot |
247 | bool () const; |
248 | bool (Ref &ref) const; |
249 | bool hasStmRef() const { return stmRef.isRef(); } |
250 | bool getStmRef(Ref &ref) const; |
251 | StructTreeRoot *getStructTreeRoot() { return treeRoot; } |
252 | |
253 | // Optional element identifier. |
254 | const GooString *getID() const { return isContent() ? nullptr : s->id; } |
255 | GooString *getID() { return isContent() ? nullptr : s->id; } |
256 | |
257 | // Optional ISO language name, e.g. en_US |
258 | GooString *getLanguage() |
259 | { |
260 | if (!isContent() && s->language) { |
261 | return s->language; |
262 | } |
263 | return parent ? parent->getLanguage() : nullptr; |
264 | } |
265 | const GooString *getLanguage() const |
266 | { |
267 | if (!isContent() && s->language) { |
268 | return s->language; |
269 | } |
270 | return parent ? parent->getLanguage() : nullptr; |
271 | } |
272 | |
273 | // Optional revision number, defaults to zero. |
274 | unsigned int getRevision() const { return isContent() ? 0 : s->revision; } |
275 | void setRevision(unsigned int revision) |
276 | { |
277 | if (isContent()) { |
278 | s->revision = revision; |
279 | } |
280 | } |
281 | |
282 | // Optional element title, in human-readable form. |
283 | const GooString *getTitle() const { return isContent() ? nullptr : s->title; } |
284 | GooString *getTitle() { return isContent() ? nullptr : s->title; } |
285 | |
286 | // Optional element expanded abbreviation text. |
287 | const GooString *getExpandedAbbr() const { return isContent() ? nullptr : s->expandedAbbr; } |
288 | GooString *getExpandedAbbr() { return isContent() ? nullptr : s->expandedAbbr; } |
289 | |
290 | unsigned getNumChildren() const { return isContent() ? 0 : s->elements.size(); } |
291 | const StructElement *getChild(int i) const { return isContent() ? nullptr : s->elements.at(n: i); } |
292 | StructElement *getChild(int i) { return isContent() ? nullptr : s->elements.at(n: i); } |
293 | |
294 | void appendChild(StructElement *element) |
295 | { |
296 | if (!isContent() && element && element->isOk()) { |
297 | s->elements.push_back(x: element); |
298 | } |
299 | } |
300 | |
301 | unsigned getNumAttributes() const { return isContent() ? 0 : s->attributes.size(); } |
302 | const Attribute *getAttribute(int i) const { return isContent() ? nullptr : s->attributes.at(n: i); } |
303 | Attribute *getAttribute(int i) { return isContent() ? nullptr : s->attributes.at(n: i); } |
304 | |
305 | void appendAttribute(Attribute *attribute) |
306 | { |
307 | if (!isContent() && attribute) { |
308 | s->attributes.push_back(x: attribute); |
309 | } |
310 | } |
311 | |
312 | const Attribute *findAttribute(Attribute::Type attributeType, bool inherit = false, Attribute::Owner owner = Attribute::UnknownOwner) const; |
313 | |
314 | const GooString *getAltText() const { return isContent() ? nullptr : s->altText; } |
315 | GooString *getAltText() { return isContent() ? nullptr : s->altText; } |
316 | |
317 | const GooString *getActualText() const { return isContent() ? nullptr : s->actualText; } |
318 | GooString *getActualText() { return isContent() ? nullptr : s->actualText; } |
319 | |
320 | // Content text referenced by the element: |
321 | // |
322 | // - For MCID reference elements, this is just the text of the |
323 | // corresponding marked content object in the page stream, regardless |
324 | // of the setting of the "recursive" flag. |
325 | // - For other elements, if the "recursive" flag is set, the text |
326 | // enclosed by *all* the child MCID reference elements of the subtree |
327 | // is returned. The text is assembled by traversing the leaf MCID |
328 | // reference elements in logical order. |
329 | // - In any other case, the function returns nullptr. |
330 | // |
331 | // A new string is returned, and the ownership passed to the caller. |
332 | // |
333 | GooString *getText(bool recursive = true) const { return appendSubTreeText(string: nullptr, recursive); } |
334 | |
335 | const TextSpanArray getTextSpans() const |
336 | { |
337 | if (!isContent()) { |
338 | return TextSpanArray(); |
339 | } |
340 | MarkedContentOutputDev mcdev(getMCID(), stmRef); |
341 | return getTextSpansInternal(mcdev); |
342 | } |
343 | |
344 | ~StructElement(); |
345 | |
346 | private: |
347 | GooString *appendSubTreeText(GooString *string, bool recursive) const; |
348 | const TextSpanArray &getTextSpansInternal(MarkedContentOutputDev &mcdev) const; |
349 | |
350 | typedef std::vector<Attribute *> AttrPtrArray; |
351 | typedef std::vector<StructElement *> ElemPtrArray; |
352 | |
353 | struct StructData |
354 | { |
355 | Ref parentRef; |
356 | GooString *altText; |
357 | GooString *actualText; |
358 | GooString *id; |
359 | GooString *title; |
360 | GooString *expandedAbbr; |
361 | GooString *language; |
362 | unsigned int revision; |
363 | ElemPtrArray elements; |
364 | AttrPtrArray attributes; |
365 | |
366 | StructData(); |
367 | ~StructData(); |
368 | |
369 | StructData(const StructData &) = delete; |
370 | StructData &operator=(const StructData &) = delete; |
371 | }; |
372 | |
373 | // Data in content elements (MCID, MCR) |
374 | struct ContentData |
375 | { |
376 | union { |
377 | int mcid; |
378 | Ref ref; |
379 | }; |
380 | |
381 | explicit ContentData(int mcidA) : mcid(mcidA) { } |
382 | explicit ContentData(const Ref r) { ref = r; } |
383 | }; |
384 | |
385 | // Common data |
386 | Type type; |
387 | StructTreeRoot *treeRoot; |
388 | StructElement *parent; |
389 | mutable Object ; |
390 | Object stmRef; |
391 | |
392 | union { |
393 | StructData *s; |
394 | ContentData *c; |
395 | }; |
396 | |
397 | StructElement(Dict *elementDict, StructTreeRoot *treeRootA, StructElement *parentA, RefRecursionChecker &seen); |
398 | StructElement(int mcid, StructTreeRoot *treeRootA, StructElement *parentA); |
399 | StructElement(const Ref ref, StructTreeRoot *treeRootA, StructElement *parentA); |
400 | |
401 | void parse(Dict *elementDict); |
402 | StructElement *parseChild(const Object *ref, Object *childObj, RefRecursionChecker &seen); |
403 | void parseChildren(Dict *element, RefRecursionChecker &seen); |
404 | void parseAttributes(Dict *attributes, bool keepExisting = false); |
405 | |
406 | friend class StructTreeRoot; |
407 | }; |
408 | |
409 | #endif |
410 | |