1/*
2 SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in>
3 SPDX-FileCopyrightText: 2012 Jörg Ehrichs <joerg.ehrichs@gmx.de>
4 SPDX-FileCopyrightText: 2016 Christoph Cullmann <cullmann@kde.org>
5
6 SPDX-License-Identifier: LGPL-2.1-or-later
7*/
8
9
10#include "datetimeparser_p.h"
11#include "kfilemetadata_debug.h"
12#include "odfextractor.h"
13#include <memory>
14
15#include <KZip>
16
17#include <QDomDocument>
18#include <QFile>
19#include <QXmlStreamReader>
20
21namespace {
22
23inline QString dcNS() { return QStringLiteral("http://purl.org/dc/elements/1.1/"); }
24inline QString metaNS() { return QStringLiteral("urn:oasis:names:tc:opendocument:xmlns:meta:1.0"); }
25inline QString officeNS() { return QStringLiteral("urn:oasis:names:tc:opendocument:xmlns:office:1.0"); }
26inline QString bodyTag() { return QStringLiteral("body"); }
27
28QDomElement firstChildElementNS(const QDomNode &node, const QString &nsURI, const QString &localName)
29{
30 for (auto e = node.firstChildElement(); !e.isNull(); e = e.nextSiblingElement()) {
31 if (e.localName() == localName && e.namespaceURI() == nsURI) {
32 return e;
33 }
34 }
35
36 return QDomElement();
37}
38
39const QStringList supportedMimeTypes = {
40 QStringLiteral("application/vnd.oasis.opendocument.text"),
41 QStringLiteral("application/vnd.oasis.opendocument.text-template"),
42 QStringLiteral("application/vnd.oasis.opendocument.text-master"),
43 QStringLiteral("application/vnd.oasis.opendocument.text-master-template"),
44 QStringLiteral("application/vnd.oasis.opendocument.text-flat-xml"),
45 QStringLiteral("application/vnd.oasis.opendocument.presentation"),
46 QStringLiteral("application/vnd.oasis.opendocument.presentation-template"),
47 QStringLiteral("application/vnd.oasis.opendocument.presentation-flat-xml"),
48 QStringLiteral("application/vnd.oasis.opendocument.spreadsheet"),
49 QStringLiteral("application/vnd.oasis.opendocument.spreadsheet-template"),
50 QStringLiteral("application/vnd.oasis.opendocument.spreadsheet-flat-xml"),
51 QStringLiteral("application/vnd.oasis.opendocument.graphics"),
52 QStringLiteral("application/vnd.oasis.opendocument.graphics-template"),
53 QStringLiteral("application/vnd.oasis.opendocument.graphics-flat-xml"),
54};
55
56}
57
58using namespace KFileMetaData;
59
60OdfExtractor::OdfExtractor(QObject* parent)
61 : ExtractorPlugin(parent)
62{
63
64}
65
66QStringList OdfExtractor::mimetypes() const
67{
68 return supportedMimeTypes;
69}
70
71void OdfExtractor::extract(ExtractionResult* result)
72{
73 if (result->inputMimetype().endsWith(s: QLatin1String("-flat-xml"))) {
74 QFile file(result->inputUrl());
75 if (!file.open(flags: QIODevice::ReadOnly | QIODevice::Text)) {
76 return;
77 }
78
79 result->addType(type: Type::Document);
80 if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation-flat-xml")) {
81 result->addType(type: Type::Presentation);
82 } else if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet-flat-xml")) {
83 result->addType(type: Type::Spreadsheet);
84 } else if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.graphics-flat-xml")) {
85 result->addType(type: Type::Image);
86 }
87
88 if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
89 parseMetaData(QStringLiteral("document"), data: file.readAll(), result);
90 }
91
92 if (result->inputFlags() & ExtractionResult::ExtractPlainText) {
93 file.seek(offset: 0);
94 extractPlainText(device: &file, result);
95 }
96
97 return;
98 }
99
100 KZip zip(result->inputUrl());
101 if (!zip.open(mode: QIODevice::ReadOnly)) {
102 qCWarning(KFILEMETADATA_LOG) << "Failed to open" << zip.fileName() << "-" << zip.errorString();
103 return;
104 }
105
106 const KArchiveDirectory* directory = zip.directory();
107 if (!directory) {
108 qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (main directory is missing)";
109 return;
110 }
111
112 // we need a meta xml file in the archive!
113 const auto metaXml = directory->file(QStringLiteral("meta.xml"));
114 if (!metaXml) {
115 qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (meta.xml is missing)";
116 return;
117 }
118
119 if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
120 parseMetaData(QStringLiteral("document-meta"), data: metaXml->data(), result);
121 }
122
123 result->addType(type: Type::Document);
124 if ((result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation")) ||
125 (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.presentation-template"))) {
126 result->addType(type: Type::Presentation);
127 }
128 else if ((result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet")) ||
129 (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.spreadsheet-template"))) {
130 result->addType(type: Type::Spreadsheet);
131 }
132 else if (result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.graphics") ||
133 result->inputMimetype() == QLatin1String("application/vnd.oasis.opendocument.graphics-template")) {
134 result->addType(type: Type::Image);
135 }
136
137 if (!(result->inputFlags() & ExtractionResult::ExtractPlainText)) {
138 return;
139 }
140
141 // for content indexing, we need content xml file
142 const auto contentXml = directory->file(QStringLiteral("content.xml"));
143 if (!contentXml) {
144 qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (content.xml is missing)";
145 return;
146 }
147
148 std::unique_ptr<QIODevice> contentIODevice{contentXml->createDevice()};
149 extractPlainText(device: contentIODevice.get(), result);
150}
151
152void OdfExtractor::parseMetaData(const QString &documentElementId, const QByteArray &data, ExtractionResult *result)
153{
154 QDomDocument metaData(QStringLiteral("metaData"));
155 metaData.setContent(data, options: QDomDocument::ParseOption::UseNamespaceProcessing);
156
157 // parse metadata ...
158 QDomElement meta = firstChildElementNS(node: firstChildElementNS(node: metaData,
159 nsURI: officeNS(), localName: documentElementId),
160 nsURI: officeNS(), QStringLiteral("meta"));
161
162 QDomNode n = meta.firstChild();
163 while (!n.isNull()) {
164 QDomElement e = n.toElement();
165 if (!e.isNull()) {
166 const QString namespaceURI = e.namespaceURI();
167 const QString localName = e.localName();
168
169 // Dublin Core
170 if (namespaceURI == dcNS()) {
171 if (localName == QLatin1String("description")) {
172 result->add(property: Property::Description, value: e.text());
173 } else if (localName == QLatin1String("subject")) {
174 result->add(property: Property::Subject, value: e.text());
175 } else if (localName == QLatin1String("title")) {
176 result->add(property: Property::Title, value: e.text());
177 } else if (localName == QLatin1String("creator")) {
178 result->add(property: Property::Author, value: e.text());
179 } else if (localName == QLatin1String("language")) {
180 result->add(property: Property::Language, value: e.text());
181 }
182 }
183 // Meta Properties
184 else if (namespaceURI == metaNS()) {
185 if (localName == QLatin1String("document-statistic")) {
186 bool ok = false;
187 int pageCount = e.attributeNS(nsURI: metaNS(), QStringLiteral("page-count")).toInt(ok: &ok);
188 if (ok) {
189 result->add(property: Property::PageCount, value: pageCount);
190 }
191
192 int wordCount = e.attributeNS(nsURI: metaNS(), QStringLiteral("word-count")).toInt(ok: &ok);
193 if (ok) {
194 result->add(property: Property::WordCount, value: wordCount);
195 }
196 } else if (localName == QLatin1String("keyword")) {
197 QString keywords = e.text();
198 result->add(property: Property::Keywords, value: keywords);
199 } else if (localName == QLatin1String("generator")) {
200 result->add(property: Property::Generator, value: e.text());
201 } else if (localName == QLatin1String("creation-date")) {
202 QDateTime dt = Parser::dateTimeFromString(dateString: e.text());
203 if (!dt.isNull()) {
204 result->add(property: Property::CreationDate, value: dt);
205 }
206 }
207 }
208 }
209 n = n.nextSibling();
210 }
211}
212
213void OdfExtractor::extractPlainText(QIODevice *device, ExtractionResult *result)
214{
215 bool inOfficeBody = false;
216
217 QXmlStreamReader xml(device);
218 while (!xml.atEnd()) {
219 xml.readNext();
220
221 if (xml.isStartElement() && !inOfficeBody && xml.namespaceUri() == officeNS() && xml.name() == bodyTag()) {
222 inOfficeBody = true;
223 } else if (xml.isEndElement() && inOfficeBody && xml.namespaceUri() == officeNS() && xml.name() == bodyTag()) {
224 break;
225 }
226
227 if (inOfficeBody && xml.isCharacters() && !xml.isWhitespace()) {
228 const QString str = xml.text().toString();
229 result->append(text: str);
230 }
231
232 if (xml.hasError() || xml.isEndDocument()) {
233 break;
234 }
235 }
236}
237
238#include "moc_odfextractor.cpp"
239

source code of kfilemetadata/src/extractors/odfextractor.cpp