1 | /* |
2 | SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in> |
3 | |
4 | SPDX-License-Identifier: LGPL-2.1-or-later |
5 | */ |
6 | |
7 | |
8 | #include "kfilemetadata_debug.h" |
9 | #include "office2007extractor.h" |
10 | |
11 | #include "dublincoreextractor.h" |
12 | #include <memory> |
13 | |
14 | #include <KZip> |
15 | |
16 | #include <QDomDocument> |
17 | #include <QXmlStreamReader> |
18 | |
19 | using namespace KFileMetaData; |
20 | |
21 | namespace { |
22 | inline QString cpNS() { return QStringLiteral("http://schemas.openxmlformats.org/package/2006/metadata/core-properties" ); } |
23 | inline QString relNS() { return QStringLiteral("http://schemas.openxmlformats.org/package/2006/relationships" ); } |
24 | inline QString extPropNST() { return QStringLiteral("http://schemas.openxmlformats.org/officeDocument/2006/extended-properties" ); } |
25 | inline QString extPropNSS() { return QStringLiteral("http://purl.oclc.org/ooxml/officeDocument/extendedProperties" ); } |
26 | |
27 | inline QString coreProp() { return QStringLiteral("http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" ); } |
28 | inline QString extPropT() { return QStringLiteral("http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties" ); } |
29 | inline QString extPropS() { return QStringLiteral("http://purl.oclc.org/ooxml/officeDocument/relationships/extendedProperties" ); } |
30 | } // namespace |
31 | |
32 | Office2007Extractor::(QObject* parent) |
33 | : ExtractorPlugin(parent) |
34 | { |
35 | |
36 | } |
37 | |
38 | const QStringList supportedMimeTypes = { |
39 | QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.document" ), |
40 | QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.template" ), |
41 | QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.presentation" ), |
42 | QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slide" ), |
43 | QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slideshow" ), |
44 | QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.template" ), |
45 | QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ), |
46 | QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.template" ), |
47 | QStringLiteral("application/vnd.ms-xpsdocument" ), |
48 | QStringLiteral("application/oxps" ), |
49 | QStringLiteral("model/3mf" ), |
50 | }; |
51 | |
52 | QStringList Office2007Extractor::() const |
53 | { |
54 | return supportedMimeTypes; |
55 | } |
56 | |
57 | void Office2007Extractor::(ExtractionResult* result) |
58 | { |
59 | KZip zip(result->inputUrl()); |
60 | if (!zip.open(mode: QIODevice::ReadOnly)) { |
61 | qCWarning(KFILEMETADATA_LOG) << "Failed to open" << zip.fileName() << "-" << zip.errorString(); |
62 | return; |
63 | } |
64 | |
65 | const KArchiveDirectory* rootDir = zip.directory(); |
66 | if (!rootDir) { |
67 | qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (main directory is missing)" ; |
68 | return; |
69 | } |
70 | |
71 | const bool = result->inputFlags() & ExtractionResult::ExtractMetaData; |
72 | |
73 | // Resolve part relationships according to ECMA-376-2 (Open Packaging Conventions, OPC) |
74 | const QDomElement relationsElem = [rootDir]() { |
75 | const KArchiveFile *baseRels = rootDir->file(QStringLiteral("_rels/.rels" )); |
76 | if (!baseRels) { |
77 | qCWarning(KFILEMETADATA_LOG) << "Invalid document structure - missing package relationship" ; |
78 | return QDomElement{}; |
79 | } |
80 | |
81 | QDomDocument relationsDoc; |
82 | relationsDoc.setContent(data: baseRels->data(), options: QDomDocument::ParseOption::UseNamespaceProcessing); |
83 | |
84 | auto relations = relationsDoc.firstChildElement(QStringLiteral("Relationships" )); |
85 | if (relations.isNull()) { |
86 | qCWarning(KFILEMETADATA_LOG) << "Invalid document structure - invalid package relationships" ; |
87 | } |
88 | return relations; |
89 | }(); |
90 | |
91 | auto targetByType = [&relationsElem](const QString &type, const QString &defVal = {}) -> QString { |
92 | for (auto rel = relationsElem.firstChildElement(); !rel.isNull(); rel = rel.nextSiblingElement()) { |
93 | if (rel.namespaceURI() == relNS() && rel.localName() == QStringLiteral("Relationship" ) |
94 | && rel.attribute(QStringLiteral("Type" )) == type) { |
95 | return rel.attribute(QStringLiteral("Target" )); |
96 | } |
97 | } |
98 | return defVal; |
99 | }; |
100 | |
101 | // Core Properties |
102 | const QString corePropertiesFile = targetByType(coreProp(), QStringLiteral("docProps/core.xml" )); |
103 | if (const KArchiveFile *file = extractMetaData ? rootDir->file(name: corePropertiesFile) : nullptr; file) { |
104 | QDomDocument coreDoc(QStringLiteral("core" )); |
105 | coreDoc.setContent(data: file->data(), options: QDomDocument::ParseOption::UseNamespaceProcessing); |
106 | |
107 | QDomElement cpElem = coreDoc.documentElement(); |
108 | |
109 | if (!cpElem.isNull() && cpElem.namespaceURI() == cpNS()) { |
110 | DublinCoreExtractor::extract(result, fragment: cpElem); |
111 | } |
112 | |
113 | auto elem = cpElem.firstChildElement(QStringLiteral("keywords" )); |
114 | if (!elem.isNull() && elem.namespaceURI() == cpNS()) { |
115 | for (auto c = elem.firstChild(); !c.isNull(); c = c.nextSibling()) { |
116 | if (const auto childElem = c.toElement(); childElem.localName() == QStringLiteral("value" ) && !childElem.text().isEmpty()) { |
117 | result->add(property: Property::Keywords, value: childElem.text()); |
118 | } else if (const auto tNode = c.toText(); !tNode.nodeValue().isEmpty()) { |
119 | result->add(property: Property::Keywords, value: tNode.nodeValue()); |
120 | } |
121 | } |
122 | } |
123 | } |
124 | |
125 | // Extended Properties - two valid relation types: "strict" (ECMA-376-1:2016) or "transitional" (ECMA-367-4:2016) |
126 | const QString extPropertiesFile = targetByType(extPropS(), targetByType(extPropT(), QStringLiteral("docProps/app.xml" ))); |
127 | if (const KArchiveFile *file = extractMetaData ? rootDir->file(name: extPropertiesFile) : nullptr; file) { |
128 | QDomDocument appDoc; |
129 | appDoc.setContent(data: file->data(), options: QDomDocument::ParseOption::UseNamespaceProcessing); |
130 | |
131 | QDomElement propsElem = appDoc.documentElement(); |
132 | |
133 | for (auto prop = propsElem.firstChildElement(); !prop.isNull(); prop = prop.nextSiblingElement()) { |
134 | // Look for properties as specified in ECMA-376-1, Annex A.6.2 Extended Properties |
135 | bool ok; |
136 | if (prop.localName() == QStringLiteral("Pages" )) { |
137 | if (int count = prop.text().toInt(ok: &ok); ok == true) { |
138 | result->add(property: Property::PageCount, value: count); |
139 | } |
140 | } else if (prop.localName() == QStringLiteral("Slides" )) { |
141 | if (int count = prop.text().toInt(ok: &ok); ok == true) { |
142 | // Map number of slides to PageCount |
143 | result->add(property: Property::PageCount, value: count); |
144 | } |
145 | } else if (prop.localName() == QStringLiteral("Words" )) { |
146 | if (int count = prop.text().toInt(ok: &ok); ok == true) { |
147 | result->add(property: Property::WordCount, value: count); |
148 | } |
149 | } else if (prop.localName() == QStringLiteral("Lines" )) { |
150 | if (int count = prop.text().toInt(ok: &ok); ok == true) { |
151 | result->add(property: Property::LineCount, value: count); |
152 | } |
153 | } else if (prop.localName() == QStringLiteral("Application" )) { |
154 | QString application = prop.text(); |
155 | if (!application.isEmpty()) { |
156 | result->add(property: Property::Generator, value: application); |
157 | } |
158 | } |
159 | } |
160 | } |
161 | |
162 | // |
163 | // Plain Text |
164 | // |
165 | bool = (result->inputFlags() & ExtractionResult::ExtractPlainText); |
166 | |
167 | if (const auto wordEntry = rootDir->entry(QStringLiteral("word" )); wordEntry) { |
168 | result->addType(type: Type::Document); |
169 | |
170 | if (!extractPlainText) { |
171 | return; |
172 | } |
173 | |
174 | if (!wordEntry->isDirectory()) { |
175 | qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (word is not a directory)" ; |
176 | return; |
177 | } |
178 | |
179 | const KArchiveDirectory* wordDirectory = dynamic_cast<const KArchiveDirectory*>(wordEntry); |
180 | const QStringList wordEntries = wordDirectory->entries(); |
181 | |
182 | if (wordEntries.contains(QStringLiteral("document.xml" ))) { |
183 | const KArchiveFile* file = wordDirectory->file(QStringLiteral("document.xml" )); |
184 | |
185 | if (file) { |
186 | std::unique_ptr<QIODevice> contentIODevice{file->createDevice()}; |
187 | extractTextWithTag(device: contentIODevice.get(), QStringLiteral("w:t" ), result); |
188 | } |
189 | } |
190 | } |
191 | |
192 | else if (const auto xlEntry = rootDir->entry(QStringLiteral("xl" )); xlEntry) { |
193 | result->addType(type: Type::Document); |
194 | result->addType(type: Type::Spreadsheet); |
195 | |
196 | if (!extractPlainText) { |
197 | return; |
198 | } |
199 | |
200 | if (!xlEntry->isDirectory()) { |
201 | qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (xl is not a directory)" ; |
202 | return; |
203 | } |
204 | |
205 | const auto xlDirectory = dynamic_cast<const KArchiveDirectory*>(xlEntry); |
206 | // TODO: Read the sheets from worksheets/*.xml, and dereference all cells |
207 | // values in order |
208 | const KArchiveFile* file = xlDirectory->file(QStringLiteral("sharedStrings.xml" )); |
209 | if (!file) { |
210 | return; |
211 | } |
212 | std::unique_ptr<QIODevice> contentIODevice{file->createDevice()}; |
213 | extractTextWithTag(device: contentIODevice.get(), QStringLiteral("t" ), result); |
214 | } |
215 | |
216 | else if (const auto pptEntry = rootDir->entry(QStringLiteral("ppt" )); pptEntry) { |
217 | result->addType(type: Type::Document); |
218 | result->addType(type: Type::Presentation); |
219 | |
220 | if (!extractPlainText) { |
221 | return; |
222 | } |
223 | |
224 | if (!pptEntry->isDirectory()) { |
225 | qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (ppt is not a directory)" ; |
226 | return; |
227 | } |
228 | |
229 | const auto pptDirectory = dynamic_cast<const KArchiveDirectory*>(pptEntry); |
230 | const auto slidesEntry = pptDirectory->entry(QStringLiteral("slides" )); |
231 | if (!slidesEntry || !slidesEntry->isDirectory()) { |
232 | return; |
233 | } |
234 | |
235 | const auto slidesDirectory = dynamic_cast<const KArchiveDirectory*>(slidesEntry); |
236 | QStringList entries = slidesDirectory->entries(); |
237 | // TODO: Read the actual order from presentation.xml, and follow the |
238 | // references in ppt/_rels/presentation.xml.rel |
239 | std::sort(first: entries.begin(), last: entries.end()); |
240 | for (const QString & entryName : std::as_const(t&: entries)) { |
241 | const KArchiveFile* file = slidesDirectory->file(name: entryName); |
242 | if (!file) { |
243 | continue; |
244 | } |
245 | std::unique_ptr<QIODevice> contentIODevice{file->createDevice()}; |
246 | extractTextWithTag(device: contentIODevice.get(), QStringLiteral("a:t" ), result); |
247 | } |
248 | } |
249 | |
250 | else if (!relationsElem.isNull()) { |
251 | // Any other document type likely following OPC |
252 | result->addType(type: Type::Document); |
253 | } |
254 | } |
255 | |
256 | void Office2007Extractor::(QIODevice* device, const QString& tag, ExtractionResult* result) |
257 | { |
258 | QXmlStreamReader xml(device); |
259 | |
260 | while (!xml.atEnd()) { |
261 | xml.readNext(); |
262 | if (xml.qualifiedName().startsWith(s: tag) && xml.isStartElement()) { |
263 | QString str = xml.readElementText(behaviour: QXmlStreamReader::IncludeChildElements); |
264 | |
265 | if (!str.isEmpty()) { |
266 | result->append(text: str); |
267 | } |
268 | } |
269 | |
270 | if (xml.isEndDocument() || xml.hasError()) { |
271 | break; |
272 | } |
273 | } |
274 | } |
275 | |
276 | #include "moc_office2007extractor.cpp" |
277 | |