| 1 | /* |
| 2 | SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in> |
| 3 | |
| 4 | SPDX-License-Identifier: LGPL-2.1-or-later |
| 5 | */ |
| 6 | |
| 7 | |
| 8 | #include "kfilemetadata_debug.h" |
| 9 | #include "office2007extractor.h" |
| 10 | |
| 11 | #include "dublincoreextractor.h" |
| 12 | #include <memory> |
| 13 | |
| 14 | #include <KZip> |
| 15 | |
| 16 | #include <QDomDocument> |
| 17 | #include <QXmlStreamReader> |
| 18 | |
| 19 | using namespace KFileMetaData; |
| 20 | |
| 21 | namespace { |
| 22 | inline QString cpNS() { return QStringLiteral("http://schemas.openxmlformats.org/package/2006/metadata/core-properties" ); } |
| 23 | inline QString relNS() { return QStringLiteral("http://schemas.openxmlformats.org/package/2006/relationships" ); } |
| 24 | inline QString extPropNST() { return QStringLiteral("http://schemas.openxmlformats.org/officeDocument/2006/extended-properties" ); } |
| 25 | inline QString extPropNSS() { return QStringLiteral("http://purl.oclc.org/ooxml/officeDocument/extendedProperties" ); } |
| 26 | |
| 27 | inline QString coreProp() { return QStringLiteral("http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" ); } |
| 28 | inline QString extPropT() { return QStringLiteral("http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties" ); } |
| 29 | inline QString extPropS() { return QStringLiteral("http://purl.oclc.org/ooxml/officeDocument/relationships/extendedProperties" ); } |
| 30 | } // namespace |
| 31 | |
| 32 | Office2007Extractor::(QObject* parent) |
| 33 | : ExtractorPlugin(parent) |
| 34 | { |
| 35 | |
| 36 | } |
| 37 | |
| 38 | const QStringList supportedMimeTypes = { |
| 39 | QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.document" ), |
| 40 | QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.template" ), |
| 41 | QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.presentation" ), |
| 42 | QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slide" ), |
| 43 | QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slideshow" ), |
| 44 | QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.template" ), |
| 45 | QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ), |
| 46 | QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.template" ), |
| 47 | QStringLiteral("application/vnd.ms-xpsdocument" ), |
| 48 | QStringLiteral("application/oxps" ), |
| 49 | QStringLiteral("model/3mf" ), |
| 50 | }; |
| 51 | |
| 52 | QStringList Office2007Extractor::() const |
| 53 | { |
| 54 | return supportedMimeTypes; |
| 55 | } |
| 56 | |
| 57 | void Office2007Extractor::(ExtractionResult* result) |
| 58 | { |
| 59 | KZip zip(result->inputUrl()); |
| 60 | if (!zip.open(mode: QIODevice::ReadOnly)) { |
| 61 | qCWarning(KFILEMETADATA_LOG) << "Failed to open" << zip.fileName() << "-" << zip.errorString(); |
| 62 | return; |
| 63 | } |
| 64 | |
| 65 | const KArchiveDirectory* rootDir = zip.directory(); |
| 66 | if (!rootDir) { |
| 67 | qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (main directory is missing)" ; |
| 68 | return; |
| 69 | } |
| 70 | |
| 71 | const bool = result->inputFlags() & ExtractionResult::ExtractMetaData; |
| 72 | |
| 73 | // Resolve part relationships according to ECMA-376-2 (Open Packaging Conventions, OPC) |
| 74 | const QDomElement relationsElem = [rootDir]() { |
| 75 | const KArchiveFile *baseRels = rootDir->file(QStringLiteral("_rels/.rels" )); |
| 76 | if (!baseRels) { |
| 77 | qCWarning(KFILEMETADATA_LOG) << "Invalid document structure - missing package relationship" ; |
| 78 | return QDomElement{}; |
| 79 | } |
| 80 | |
| 81 | QDomDocument relationsDoc; |
| 82 | relationsDoc.setContent(data: baseRels->data(), options: QDomDocument::ParseOption::UseNamespaceProcessing); |
| 83 | |
| 84 | auto relations = relationsDoc.firstChildElement(QStringLiteral("Relationships" )); |
| 85 | if (relations.isNull()) { |
| 86 | qCWarning(KFILEMETADATA_LOG) << "Invalid document structure - invalid package relationships" ; |
| 87 | } |
| 88 | return relations; |
| 89 | }(); |
| 90 | |
| 91 | auto targetByType = [&relationsElem](const QString &type, const QString &defVal = {}) -> QString { |
| 92 | for (auto rel = relationsElem.firstChildElement(); !rel.isNull(); rel = rel.nextSiblingElement()) { |
| 93 | if (rel.namespaceURI() == relNS() && rel.localName() == QStringLiteral("Relationship" ) |
| 94 | && rel.attribute(QStringLiteral("Type" )) == type) { |
| 95 | return rel.attribute(QStringLiteral("Target" )); |
| 96 | } |
| 97 | } |
| 98 | return defVal; |
| 99 | }; |
| 100 | |
| 101 | // Core Properties |
| 102 | const QString corePropertiesFile = targetByType(coreProp(), QStringLiteral("docProps/core.xml" )); |
| 103 | if (const KArchiveFile *file = extractMetaData ? rootDir->file(name: corePropertiesFile) : nullptr; file) { |
| 104 | QDomDocument coreDoc(QStringLiteral("core" )); |
| 105 | coreDoc.setContent(data: file->data(), options: QDomDocument::ParseOption::UseNamespaceProcessing); |
| 106 | |
| 107 | QDomElement cpElem = coreDoc.documentElement(); |
| 108 | |
| 109 | if (!cpElem.isNull() && cpElem.namespaceURI() == cpNS()) { |
| 110 | DublinCoreExtractor::extract(result, fragment: cpElem); |
| 111 | } |
| 112 | |
| 113 | auto elem = cpElem.firstChildElement(QStringLiteral("keywords" )); |
| 114 | if (!elem.isNull() && elem.namespaceURI() == cpNS()) { |
| 115 | for (auto c = elem.firstChild(); !c.isNull(); c = c.nextSibling()) { |
| 116 | if (const auto childElem = c.toElement(); childElem.localName() == QStringLiteral("value" ) && !childElem.text().isEmpty()) { |
| 117 | result->add(property: Property::Keywords, value: childElem.text()); |
| 118 | } else if (const auto tNode = c.toText(); !tNode.nodeValue().isEmpty()) { |
| 119 | result->add(property: Property::Keywords, value: tNode.nodeValue()); |
| 120 | } |
| 121 | } |
| 122 | } |
| 123 | } |
| 124 | |
| 125 | // Extended Properties - two valid relation types: "strict" (ECMA-376-1:2016) or "transitional" (ECMA-367-4:2016) |
| 126 | const QString extPropertiesFile = targetByType(extPropS(), targetByType(extPropT(), QStringLiteral("docProps/app.xml" ))); |
| 127 | if (const KArchiveFile *file = extractMetaData ? rootDir->file(name: extPropertiesFile) : nullptr; file) { |
| 128 | QDomDocument appDoc; |
| 129 | appDoc.setContent(data: file->data(), options: QDomDocument::ParseOption::UseNamespaceProcessing); |
| 130 | |
| 131 | QDomElement propsElem = appDoc.documentElement(); |
| 132 | |
| 133 | for (auto prop = propsElem.firstChildElement(); !prop.isNull(); prop = prop.nextSiblingElement()) { |
| 134 | // Look for properties as specified in ECMA-376-1, Annex A.6.2 Extended Properties |
| 135 | bool ok; |
| 136 | if (prop.localName() == QStringLiteral("Pages" )) { |
| 137 | if (int count = prop.text().toInt(ok: &ok); ok == true) { |
| 138 | result->add(property: Property::PageCount, value: count); |
| 139 | } |
| 140 | } else if (prop.localName() == QStringLiteral("Slides" )) { |
| 141 | if (int count = prop.text().toInt(ok: &ok); ok == true) { |
| 142 | // Map number of slides to PageCount |
| 143 | result->add(property: Property::PageCount, value: count); |
| 144 | } |
| 145 | } else if (prop.localName() == QStringLiteral("Words" )) { |
| 146 | if (int count = prop.text().toInt(ok: &ok); ok == true) { |
| 147 | result->add(property: Property::WordCount, value: count); |
| 148 | } |
| 149 | } else if (prop.localName() == QStringLiteral("Lines" )) { |
| 150 | if (int count = prop.text().toInt(ok: &ok); ok == true) { |
| 151 | result->add(property: Property::LineCount, value: count); |
| 152 | } |
| 153 | } else if (prop.localName() == QStringLiteral("Application" )) { |
| 154 | QString application = prop.text(); |
| 155 | if (!application.isEmpty()) { |
| 156 | result->add(property: Property::Generator, value: application); |
| 157 | } |
| 158 | } |
| 159 | } |
| 160 | } |
| 161 | |
| 162 | // |
| 163 | // Plain Text |
| 164 | // |
| 165 | bool = (result->inputFlags() & ExtractionResult::ExtractPlainText); |
| 166 | |
| 167 | if (const auto wordEntry = rootDir->entry(QStringLiteral("word" )); wordEntry) { |
| 168 | result->addType(type: Type::Document); |
| 169 | |
| 170 | if (!extractPlainText) { |
| 171 | return; |
| 172 | } |
| 173 | |
| 174 | if (!wordEntry->isDirectory()) { |
| 175 | qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (word is not a directory)" ; |
| 176 | return; |
| 177 | } |
| 178 | |
| 179 | const KArchiveDirectory* wordDirectory = dynamic_cast<const KArchiveDirectory*>(wordEntry); |
| 180 | const QStringList wordEntries = wordDirectory->entries(); |
| 181 | |
| 182 | if (wordEntries.contains(QStringLiteral("document.xml" ))) { |
| 183 | const KArchiveFile* file = wordDirectory->file(QStringLiteral("document.xml" )); |
| 184 | |
| 185 | if (file) { |
| 186 | std::unique_ptr<QIODevice> contentIODevice{file->createDevice()}; |
| 187 | extractTextWithTag(device: contentIODevice.get(), QStringLiteral("w:t" ), result); |
| 188 | } |
| 189 | } |
| 190 | } |
| 191 | |
| 192 | else if (const auto xlEntry = rootDir->entry(QStringLiteral("xl" )); xlEntry) { |
| 193 | result->addType(type: Type::Document); |
| 194 | result->addType(type: Type::Spreadsheet); |
| 195 | |
| 196 | if (!extractPlainText) { |
| 197 | return; |
| 198 | } |
| 199 | |
| 200 | if (!xlEntry->isDirectory()) { |
| 201 | qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (xl is not a directory)" ; |
| 202 | return; |
| 203 | } |
| 204 | |
| 205 | const auto xlDirectory = dynamic_cast<const KArchiveDirectory*>(xlEntry); |
| 206 | // TODO: Read the sheets from worksheets/*.xml, and dereference all cells |
| 207 | // values in order |
| 208 | const KArchiveFile* file = xlDirectory->file(QStringLiteral("sharedStrings.xml" )); |
| 209 | if (!file) { |
| 210 | return; |
| 211 | } |
| 212 | std::unique_ptr<QIODevice> contentIODevice{file->createDevice()}; |
| 213 | extractTextWithTag(device: contentIODevice.get(), QStringLiteral("t" ), result); |
| 214 | } |
| 215 | |
| 216 | else if (const auto pptEntry = rootDir->entry(QStringLiteral("ppt" )); pptEntry) { |
| 217 | result->addType(type: Type::Document); |
| 218 | result->addType(type: Type::Presentation); |
| 219 | |
| 220 | if (!extractPlainText) { |
| 221 | return; |
| 222 | } |
| 223 | |
| 224 | if (!pptEntry->isDirectory()) { |
| 225 | qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (ppt is not a directory)" ; |
| 226 | return; |
| 227 | } |
| 228 | |
| 229 | const auto pptDirectory = dynamic_cast<const KArchiveDirectory*>(pptEntry); |
| 230 | const auto slidesEntry = pptDirectory->entry(QStringLiteral("slides" )); |
| 231 | if (!slidesEntry || !slidesEntry->isDirectory()) { |
| 232 | return; |
| 233 | } |
| 234 | |
| 235 | const auto slidesDirectory = dynamic_cast<const KArchiveDirectory*>(slidesEntry); |
| 236 | QStringList entries = slidesDirectory->entries(); |
| 237 | // TODO: Read the actual order from presentation.xml, and follow the |
| 238 | // references in ppt/_rels/presentation.xml.rel |
| 239 | std::sort(first: entries.begin(), last: entries.end()); |
| 240 | for (const QString & entryName : std::as_const(t&: entries)) { |
| 241 | const KArchiveFile* file = slidesDirectory->file(name: entryName); |
| 242 | if (!file) { |
| 243 | continue; |
| 244 | } |
| 245 | std::unique_ptr<QIODevice> contentIODevice{file->createDevice()}; |
| 246 | extractTextWithTag(device: contentIODevice.get(), QStringLiteral("a:t" ), result); |
| 247 | } |
| 248 | } |
| 249 | |
| 250 | else if (!relationsElem.isNull()) { |
| 251 | // Any other document type likely following OPC |
| 252 | result->addType(type: Type::Document); |
| 253 | } |
| 254 | } |
| 255 | |
| 256 | void Office2007Extractor::(QIODevice* device, const QString& tag, ExtractionResult* result) |
| 257 | { |
| 258 | QXmlStreamReader xml(device); |
| 259 | |
| 260 | while (!xml.atEnd()) { |
| 261 | xml.readNext(); |
| 262 | if (xml.qualifiedName().startsWith(s: tag) && xml.isStartElement()) { |
| 263 | QString str = xml.readElementText(behaviour: QXmlStreamReader::IncludeChildElements); |
| 264 | |
| 265 | if (!str.isEmpty()) { |
| 266 | result->append(text: str); |
| 267 | } |
| 268 | } |
| 269 | |
| 270 | if (xml.isEndDocument() || xml.hasError()) { |
| 271 | break; |
| 272 | } |
| 273 | } |
| 274 | } |
| 275 | |
| 276 | #include "moc_office2007extractor.cpp" |
| 277 | |