/*
    This file is part of a KMetaData File Extractor
    SPDX-FileCopyrightText: 2013 Denis Steckelmacher <steckdenis@yahoo.fr>

    SPDX-License-Identifier: LGPL-2.1-or-later
*/
| 7 | |
| 8 | #include "officeextractor.h" |
| 9 | #include "kfilemetadata_debug.h" |
| 10 | |
| 11 | #include <QRegularExpression> |
| 12 | #include <QStandardPaths> |
| 13 | |
| 14 | #include <QProcess> |
| 15 | |
| 16 | using namespace KFileMetaData; |
| 17 | |
| 18 | OfficeExtractor::(QObject* parent) |
| 19 | : ExtractorPlugin(parent) |
| 20 | { |
| 21 | // Find the executables of catdoc, catppt and xls2csv. If an executable cannot |
| 22 | // be found, indexing its corresponding MIME type will be disabled |
| 23 | findExe(QStringLiteral("application/msword" ), QStringLiteral("catdoc" ), fullPath&: m_catdoc); |
| 24 | findExe(QStringLiteral("application/vnd.ms-excel" ), QStringLiteral("xls2csv" ), fullPath&: m_xls2csv); |
| 25 | findExe(QStringLiteral("application/vnd.ms-powerpoint" ), QStringLiteral("catppt" ), fullPath&: m_catppt); |
| 26 | } |
| 27 | |
| 28 | void OfficeExtractor::(const QString& mimeType, const QString& name, QString& fullPath) |
| 29 | { |
| 30 | fullPath = QStandardPaths::findExecutable(executableName: name); |
| 31 | |
| 32 | if (!fullPath.isEmpty()) { |
| 33 | m_available_mime_types << mimeType; |
| 34 | } else { |
| 35 | qCDebug(KFILEMETADATA_LOG) << "Could not find executable in PATH:" << name; |
| 36 | } |
| 37 | } |
| 38 | |
| 39 | QStringList OfficeExtractor::() const |
| 40 | { |
| 41 | return m_available_mime_types; |
| 42 | } |
| 43 | |
| 44 | |
| 45 | void OfficeExtractor::(ExtractionResult* result) |
| 46 | { |
| 47 | QStringList args; |
| 48 | |
| 49 | args << QStringLiteral("-s" ) << QStringLiteral("cp1252" ); // FIXME: Store somewhere a map between the user's language and the encoding of the Windows files it may use ? |
| 50 | args << QStringLiteral("-d" ) << QStringLiteral("utf8" ); |
| 51 | |
| 52 | const bool = result->inputFlags() & ExtractionResult::ExtractPlainText; |
| 53 | |
| 54 | const QString fileUrl = result->inputUrl(); |
| 55 | const QString mimeType = result->inputMimetype(); |
| 56 | if (mimeType == QLatin1String("application/msword" )) { |
| 57 | result->addType(type: Type::Document); |
| 58 | |
| 59 | if (!extractPlainText) { |
| 60 | return; |
| 61 | } |
| 62 | |
| 63 | args << QStringLiteral("-w" ); |
| 64 | if (const auto contents = textFromFile(fileUrl, command: m_catdoc, arguments&: args); !contents.isEmpty()) { |
| 65 | // Now that we have the plain text content, count words, lines and characters |
| 66 | // (original code from plaintextextractor.cpp, authored by Vishesh Handa) |
| 67 | int lines = contents.count(c: QLatin1Char('\n')); |
| 68 | int words = contents.count(re: QRegularExpression(QStringLiteral("\\b\\w+\\b" ), QRegularExpression::UseUnicodePropertiesOption)); |
| 69 | |
| 70 | result->add(property: Property::WordCount, value: words); |
| 71 | result->add(property: Property::LineCount, value: lines); |
| 72 | result->append(text: contents); |
| 73 | } |
| 74 | } else if (mimeType == QLatin1String("application/vnd.ms-excel" )) { |
| 75 | result->addType(type: Type::Document); |
| 76 | result->addType(type: Type::Spreadsheet); |
| 77 | |
| 78 | if (!extractPlainText) { |
| 79 | return; |
| 80 | } |
| 81 | |
| 82 | args << QStringLiteral("-c" ) << QStringLiteral(" " ); |
| 83 | args << QStringLiteral("-b" ) << QStringLiteral(" " ); |
| 84 | args << QStringLiteral("-q" ) << QStringLiteral("0" ); |
| 85 | if (const auto contents = textFromFile(fileUrl, command: m_xls2csv, arguments&: args); !contents.isEmpty()) { |
| 86 | result->append(text: contents); |
| 87 | } |
| 88 | } else if (mimeType == QLatin1String("application/vnd.ms-powerpoint" )) { |
| 89 | result->addType(type: Type::Document); |
| 90 | result->addType(type: Type::Presentation); |
| 91 | |
| 92 | if (!extractPlainText) { |
| 93 | return; |
| 94 | } |
| 95 | |
| 96 | if (const auto contents = textFromFile(fileUrl, command: m_catppt, arguments&: args); !contents.isEmpty()) { |
| 97 | result->append(text: contents); |
| 98 | } |
| 99 | } |
| 100 | } |
| 101 | |
| 102 | QString OfficeExtractor::(const QString& fileUrl, const QString& command, QStringList& arguments) |
| 103 | { |
| 104 | if (command.isEmpty()) { |
| 105 | return {}; |
| 106 | } |
| 107 | |
| 108 | arguments << fileUrl; |
| 109 | |
| 110 | // Start a process and read its standard output |
| 111 | QProcess process; |
| 112 | |
| 113 | process.setReadChannel(QProcess::StandardOutput); |
| 114 | process.start(program: command, arguments, mode: QIODevice::ReadOnly); |
| 115 | process.waitForFinished(); |
| 116 | |
| 117 | if (process.exitStatus() != QProcess::NormalExit || process.exitCode() != 0) { |
| 118 | return QString(); |
| 119 | } else { |
| 120 | return QString::fromUtf8(ba: process.readAll()); |
| 121 | } |
| 122 | } |
| 123 | |
| 124 | #include "moc_officeextractor.cpp" |
| 125 | |