/*
    This file is part of a KMetaData File Extractor
    SPDX-FileCopyrightText: 2013 Denis Steckelmacher <steckdenis@yahoo.fr>

    SPDX-License-Identifier: LGPL-2.1-or-later
*/
7
8#include "officeextractor.h"
9#include "kfilemetadata_debug.h"
10
11#include <QRegularExpression>
12#include <QStandardPaths>
13
14#include <QProcess>
15
16using namespace KFileMetaData;
17
18OfficeExtractor::OfficeExtractor(QObject* parent)
19 : ExtractorPlugin(parent)
20{
21 // Find the executables of catdoc, catppt and xls2csv. If an executable cannot
22 // be found, indexing its corresponding MIME type will be disabled
23 findExe(QStringLiteral("application/msword"), QStringLiteral("catdoc"), fullPath&: m_catdoc);
24 findExe(QStringLiteral("application/vnd.ms-excel"), QStringLiteral("xls2csv"), fullPath&: m_xls2csv);
25 findExe(QStringLiteral("application/vnd.ms-powerpoint"), QStringLiteral("catppt"), fullPath&: m_catppt);
26}
27
28void OfficeExtractor::findExe(const QString& mimeType, const QString& name, QString& fullPath)
29{
30 fullPath = QStandardPaths::findExecutable(executableName: name);
31
32 if (!fullPath.isEmpty()) {
33 m_available_mime_types << mimeType;
34 } else {
35 qCDebug(KFILEMETADATA_LOG) << "Could not find executable in PATH:" << name;
36 }
37}
38
39QStringList OfficeExtractor::mimetypes() const
40{
41 return m_available_mime_types;
42}
43
44
45void OfficeExtractor::extract(ExtractionResult* result)
46{
47 QStringList args;
48
49 args << QStringLiteral("-s") << QStringLiteral("cp1252"); // FIXME: Store somewhere a map between the user's language and the encoding of the Windows files it may use ?
50 args << QStringLiteral("-d") << QStringLiteral("utf8");
51
52 const bool extractPlainText = result->inputFlags() & ExtractionResult::ExtractPlainText;
53
54 const QString fileUrl = result->inputUrl();
55 const QString mimeType = result->inputMimetype();
56 if (mimeType == QLatin1String("application/msword")) {
57 result->addType(type: Type::Document);
58
59 if (!extractPlainText) {
60 return;
61 }
62
63 args << QStringLiteral("-w");
64 if (const auto contents = textFromFile(fileUrl, command: m_catdoc, arguments&: args); !contents.isEmpty()) {
65 // Now that we have the plain text content, count words, lines and characters
66 // (original code from plaintextextractor.cpp, authored by Vishesh Handa)
67 int lines = contents.count(c: QLatin1Char('\n'));
68 int words = contents.count(re: QRegularExpression(QStringLiteral("\\b\\w+\\b"), QRegularExpression::UseUnicodePropertiesOption));
69
70 result->add(property: Property::WordCount, value: words);
71 result->add(property: Property::LineCount, value: lines);
72 result->append(text: contents);
73 }
74 } else if (mimeType == QLatin1String("application/vnd.ms-excel")) {
75 result->addType(type: Type::Document);
76 result->addType(type: Type::Spreadsheet);
77
78 if (!extractPlainText) {
79 return;
80 }
81
82 args << QStringLiteral("-c") << QStringLiteral(" ");
83 args << QStringLiteral("-b") << QStringLiteral(" ");
84 args << QStringLiteral("-q") << QStringLiteral("0");
85 if (const auto contents = textFromFile(fileUrl, command: m_xls2csv, arguments&: args); !contents.isEmpty()) {
86 result->append(text: contents);
87 }
88 } else if (mimeType == QLatin1String("application/vnd.ms-powerpoint")) {
89 result->addType(type: Type::Document);
90 result->addType(type: Type::Presentation);
91
92 if (!extractPlainText) {
93 return;
94 }
95
96 if (const auto contents = textFromFile(fileUrl, command: m_catppt, arguments&: args); !contents.isEmpty()) {
97 result->append(text: contents);
98 }
99 }
100}
101
102QString OfficeExtractor::textFromFile(const QString& fileUrl, const QString& command, QStringList& arguments)
103{
104 if (command.isEmpty()) {
105 return {};
106 }
107
108 arguments << fileUrl;
109
110 // Start a process and read its standard output
111 QProcess process;
112
113 process.setReadChannel(QProcess::StandardOutput);
114 process.start(program: command, arguments, mode: QIODevice::ReadOnly);
115 process.waitForFinished();
116
117 if (process.exitStatus() != QProcess::NormalExit || process.exitCode() != 0) {
118 return QString();
119 } else {
120 return QString::fromUtf8(ba: process.readAll());
121 }
122}
123
124#include "moc_officeextractor.cpp"
125

// Source: kfilemetadata/src/extractors/officeextractor.cpp