1/*
2 SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in>
3
4 SPDX-License-Identifier: LGPL-2.1-or-later
5*/
6
7
8#include "kfilemetadata_debug.h"
9#include "office2007extractor.h"
10
11#include "dublincoreextractor.h"
12#include <memory>
13
14#include <KZip>
15
16#include <QDomDocument>
17#include <QXmlStreamReader>
18
19using namespace KFileMetaData;
20
21namespace {
22inline QString cpNS() { return QStringLiteral("http://schemas.openxmlformats.org/package/2006/metadata/core-properties"); }
23inline QString relNS() { return QStringLiteral("http://schemas.openxmlformats.org/package/2006/relationships"); }
24inline QString extPropNST() { return QStringLiteral("http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"); }
25inline QString extPropNSS() { return QStringLiteral("http://purl.oclc.org/ooxml/officeDocument/extendedProperties"); }
26
27inline QString coreProp() { return QStringLiteral("http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties"); }
28inline QString extPropT() { return QStringLiteral("http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties"); }
29inline QString extPropS() { return QStringLiteral("http://purl.oclc.org/ooxml/officeDocument/relationships/extendedProperties"); }
30} // namespace
31
32Office2007Extractor::Office2007Extractor(QObject* parent)
33 : ExtractorPlugin(parent)
34{
35
36}
37
38const QStringList supportedMimeTypes = {
39 QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
40 QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.template"),
41 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
42 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slide"),
43 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slideshow"),
44 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.template"),
45 QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
46 QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.template"),
47 QStringLiteral("application/vnd.ms-xpsdocument"),
48 QStringLiteral("application/oxps"),
49 QStringLiteral("model/3mf"),
50};
51
52QStringList Office2007Extractor::mimetypes() const
53{
54 return supportedMimeTypes;
55}
56
57void Office2007Extractor::extract(ExtractionResult* result)
58{
59 KZip zip(result->inputUrl());
60 if (!zip.open(mode: QIODevice::ReadOnly)) {
61 qCWarning(KFILEMETADATA_LOG) << "Failed to open" << zip.fileName() << "-" << zip.errorString();
62 return;
63 }
64
65 const KArchiveDirectory* rootDir = zip.directory();
66 if (!rootDir) {
67 qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (main directory is missing)";
68 return;
69 }
70
71 const bool extractMetaData = result->inputFlags() & ExtractionResult::ExtractMetaData;
72
73 // Resolve part relationships according to ECMA-376-2 (Open Packaging Conventions, OPC)
74 const QDomElement relationsElem = [rootDir]() {
75 const KArchiveFile *baseRels = rootDir->file(QStringLiteral("_rels/.rels"));
76 if (!baseRels) {
77 qCWarning(KFILEMETADATA_LOG) << "Invalid document structure - missing package relationship";
78 return QDomElement{};
79 }
80
81 QDomDocument relationsDoc;
82 relationsDoc.setContent(data: baseRels->data(), options: QDomDocument::ParseOption::UseNamespaceProcessing);
83
84 auto relations = relationsDoc.firstChildElement(QStringLiteral("Relationships"));
85 if (relations.isNull()) {
86 qCWarning(KFILEMETADATA_LOG) << "Invalid document structure - invalid package relationships";
87 }
88 return relations;
89 }();
90
91 auto targetByType = [&relationsElem](const QString &type, const QString &defVal = {}) -> QString {
92 for (auto rel = relationsElem.firstChildElement(); !rel.isNull(); rel = rel.nextSiblingElement()) {
93 if (rel.namespaceURI() == relNS() && rel.localName() == QStringLiteral("Relationship")
94 && rel.attribute(QStringLiteral("Type")) == type) {
95 return rel.attribute(QStringLiteral("Target"));
96 }
97 }
98 return defVal;
99 };
100
101 // Core Properties
102 const QString corePropertiesFile = targetByType(coreProp(), QStringLiteral("docProps/core.xml"));
103 if (const KArchiveFile *file = extractMetaData ? rootDir->file(name: corePropertiesFile) : nullptr; file) {
104 QDomDocument coreDoc(QStringLiteral("core"));
105 coreDoc.setContent(data: file->data(), options: QDomDocument::ParseOption::UseNamespaceProcessing);
106
107 QDomElement cpElem = coreDoc.documentElement();
108
109 if (!cpElem.isNull() && cpElem.namespaceURI() == cpNS()) {
110 DublinCoreExtractor::extract(result, fragment: cpElem);
111 }
112
113 auto elem = cpElem.firstChildElement(QStringLiteral("keywords"));
114 if (!elem.isNull() && elem.namespaceURI() == cpNS()) {
115 for (auto c = elem.firstChild(); !c.isNull(); c = c.nextSibling()) {
116 if (const auto childElem = c.toElement(); childElem.localName() == QStringLiteral("value") && !childElem.text().isEmpty()) {
117 result->add(property: Property::Keywords, value: childElem.text());
118 } else if (const auto tNode = c.toText(); !tNode.nodeValue().isEmpty()) {
119 result->add(property: Property::Keywords, value: tNode.nodeValue());
120 }
121 }
122 }
123 }
124
125 // Extended Properties - two valid relation types: "strict" (ECMA-376-1:2016) or "transitional" (ECMA-367-4:2016)
126 const QString extPropertiesFile = targetByType(extPropS(), targetByType(extPropT(), QStringLiteral("docProps/app.xml")));
127 if (const KArchiveFile *file = extractMetaData ? rootDir->file(name: extPropertiesFile) : nullptr; file) {
128 QDomDocument appDoc;
129 appDoc.setContent(data: file->data(), options: QDomDocument::ParseOption::UseNamespaceProcessing);
130
131 QDomElement propsElem = appDoc.documentElement();
132
133 for (auto prop = propsElem.firstChildElement(); !prop.isNull(); prop = prop.nextSiblingElement()) {
134 // Look for properties as specified in ECMA-376-1, Annex A.6.2 Extended Properties
135 bool ok;
136 if (prop.localName() == QStringLiteral("Pages")) {
137 if (int count = prop.text().toInt(ok: &ok); ok == true) {
138 result->add(property: Property::PageCount, value: count);
139 }
140 } else if (prop.localName() == QStringLiteral("Slides")) {
141 if (int count = prop.text().toInt(ok: &ok); ok == true) {
142 // Map number of slides to PageCount
143 result->add(property: Property::PageCount, value: count);
144 }
145 } else if (prop.localName() == QStringLiteral("Words")) {
146 if (int count = prop.text().toInt(ok: &ok); ok == true) {
147 result->add(property: Property::WordCount, value: count);
148 }
149 } else if (prop.localName() == QStringLiteral("Lines")) {
150 if (int count = prop.text().toInt(ok: &ok); ok == true) {
151 result->add(property: Property::LineCount, value: count);
152 }
153 } else if (prop.localName() == QStringLiteral("Application")) {
154 QString application = prop.text();
155 if (!application.isEmpty()) {
156 result->add(property: Property::Generator, value: application);
157 }
158 }
159 }
160 }
161
162 //
163 // Plain Text
164 //
165 bool extractPlainText = (result->inputFlags() & ExtractionResult::ExtractPlainText);
166
167 if (const auto wordEntry = rootDir->entry(QStringLiteral("word")); wordEntry) {
168 result->addType(type: Type::Document);
169
170 if (!extractPlainText) {
171 return;
172 }
173
174 if (!wordEntry->isDirectory()) {
175 qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (word is not a directory)";
176 return;
177 }
178
179 const KArchiveDirectory* wordDirectory = dynamic_cast<const KArchiveDirectory*>(wordEntry);
180 const QStringList wordEntries = wordDirectory->entries();
181
182 if (wordEntries.contains(QStringLiteral("document.xml"))) {
183 const KArchiveFile* file = wordDirectory->file(QStringLiteral("document.xml"));
184
185 if (file) {
186 std::unique_ptr<QIODevice> contentIODevice{file->createDevice()};
187 extractTextWithTag(device: contentIODevice.get(), QStringLiteral("w:t"), result);
188 }
189 }
190 }
191
192 else if (const auto xlEntry = rootDir->entry(QStringLiteral("xl")); xlEntry) {
193 result->addType(type: Type::Document);
194 result->addType(type: Type::Spreadsheet);
195
196 if (!extractPlainText) {
197 return;
198 }
199
200 if (!xlEntry->isDirectory()) {
201 qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (xl is not a directory)";
202 return;
203 }
204
205 const auto xlDirectory = dynamic_cast<const KArchiveDirectory*>(xlEntry);
206 // TODO: Read the sheets from worksheets/*.xml, and dereference all cells
207 // values in order
208 const KArchiveFile* file = xlDirectory->file(QStringLiteral("sharedStrings.xml"));
209 if (!file) {
210 return;
211 }
212 std::unique_ptr<QIODevice> contentIODevice{file->createDevice()};
213 extractTextWithTag(device: contentIODevice.get(), QStringLiteral("t"), result);
214 }
215
216 else if (const auto pptEntry = rootDir->entry(QStringLiteral("ppt")); pptEntry) {
217 result->addType(type: Type::Document);
218 result->addType(type: Type::Presentation);
219
220 if (!extractPlainText) {
221 return;
222 }
223
224 if (!pptEntry->isDirectory()) {
225 qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (ppt is not a directory)";
226 return;
227 }
228
229 const auto pptDirectory = dynamic_cast<const KArchiveDirectory*>(pptEntry);
230 const auto slidesEntry = pptDirectory->entry(QStringLiteral("slides"));
231 if (!slidesEntry || !slidesEntry->isDirectory()) {
232 return;
233 }
234
235 const auto slidesDirectory = dynamic_cast<const KArchiveDirectory*>(slidesEntry);
236 QStringList entries = slidesDirectory->entries();
237 // TODO: Read the actual order from presentation.xml, and follow the
238 // references in ppt/_rels/presentation.xml.rel
239 std::sort(first: entries.begin(), last: entries.end());
240 for (const QString & entryName : std::as_const(t&: entries)) {
241 const KArchiveFile* file = slidesDirectory->file(name: entryName);
242 if (!file) {
243 continue;
244 }
245 std::unique_ptr<QIODevice> contentIODevice{file->createDevice()};
246 extractTextWithTag(device: contentIODevice.get(), QStringLiteral("a:t"), result);
247 }
248 }
249
250 else if (!relationsElem.isNull()) {
251 // Any other document type likely following OPC
252 result->addType(type: Type::Document);
253 }
254}
255
256void Office2007Extractor::extractTextWithTag(QIODevice* device, const QString& tag, ExtractionResult* result)
257{
258 QXmlStreamReader xml(device);
259
260 while (!xml.atEnd()) {
261 xml.readNext();
262 if (xml.qualifiedName().startsWith(s: tag) && xml.isStartElement()) {
263 QString str = xml.readElementText(behaviour: QXmlStreamReader::IncludeChildElements);
264
265 if (!str.isEmpty()) {
266 result->append(text: str);
267 }
268 }
269
270 if (xml.isEndDocument() || xml.hasError()) {
271 break;
272 }
273 }
274}
275
276#include "moc_office2007extractor.cpp"
277

// Source: kfilemetadata/src/extractors/office2007extractor.cpp