/*
    This file is part of a KMetaData File Extractor
    SPDX-FileCopyrightText: 2013 Denis Steckelmacher <steckdenis@yahoo.fr>

    SPDX-License-Identifier: LGPL-2.1-or-later
*/
7 | |
8 | #include "officeextractor.h" |
9 | #include "kfilemetadata_debug.h" |
10 | |
11 | #include <QRegularExpression> |
12 | #include <QStandardPaths> |
13 | |
14 | #include <QProcess> |
15 | |
16 | using namespace KFileMetaData; |
17 | |
18 | OfficeExtractor::(QObject* parent) |
19 | : ExtractorPlugin(parent) |
20 | { |
21 | // Find the executables of catdoc, catppt and xls2csv. If an executable cannot |
22 | // be found, indexing its corresponding MIME type will be disabled |
23 | findExe(QStringLiteral("application/msword" ), QStringLiteral("catdoc" ), fullPath&: m_catdoc); |
24 | findExe(QStringLiteral("application/vnd.ms-excel" ), QStringLiteral("xls2csv" ), fullPath&: m_xls2csv); |
25 | findExe(QStringLiteral("application/vnd.ms-powerpoint" ), QStringLiteral("catppt" ), fullPath&: m_catppt); |
26 | } |
27 | |
28 | void OfficeExtractor::(const QString& mimeType, const QString& name, QString& fullPath) |
29 | { |
30 | fullPath = QStandardPaths::findExecutable(executableName: name); |
31 | |
32 | if (!fullPath.isEmpty()) { |
33 | m_available_mime_types << mimeType; |
34 | } else { |
35 | qCDebug(KFILEMETADATA_LOG) << "Could not find executable in PATH:" << name; |
36 | } |
37 | } |
38 | |
39 | QStringList OfficeExtractor::() const |
40 | { |
41 | return m_available_mime_types; |
42 | } |
43 | |
44 | |
45 | void OfficeExtractor::(ExtractionResult* result) |
46 | { |
47 | QStringList args; |
48 | |
49 | args << QStringLiteral("-s" ) << QStringLiteral("cp1252" ); // FIXME: Store somewhere a map between the user's language and the encoding of the Windows files it may use ? |
50 | args << QStringLiteral("-d" ) << QStringLiteral("utf8" ); |
51 | |
52 | const bool = result->inputFlags() & ExtractionResult::ExtractPlainText; |
53 | |
54 | const QString fileUrl = result->inputUrl(); |
55 | const QString mimeType = result->inputMimetype(); |
56 | if (mimeType == QLatin1String("application/msword" )) { |
57 | result->addType(type: Type::Document); |
58 | |
59 | if (!extractPlainText) { |
60 | return; |
61 | } |
62 | |
63 | args << QStringLiteral("-w" ); |
64 | if (const auto contents = textFromFile(fileUrl, command: m_catdoc, arguments&: args); !contents.isEmpty()) { |
65 | // Now that we have the plain text content, count words, lines and characters |
66 | // (original code from plaintextextractor.cpp, authored by Vishesh Handa) |
67 | int lines = contents.count(c: QLatin1Char('\n')); |
68 | int words = contents.count(re: QRegularExpression(QStringLiteral("\\b\\w+\\b" ), QRegularExpression::UseUnicodePropertiesOption)); |
69 | |
70 | result->add(property: Property::WordCount, value: words); |
71 | result->add(property: Property::LineCount, value: lines); |
72 | result->append(text: contents); |
73 | } |
74 | } else if (mimeType == QLatin1String("application/vnd.ms-excel" )) { |
75 | result->addType(type: Type::Document); |
76 | result->addType(type: Type::Spreadsheet); |
77 | |
78 | if (!extractPlainText) { |
79 | return; |
80 | } |
81 | |
82 | args << QStringLiteral("-c" ) << QStringLiteral(" " ); |
83 | args << QStringLiteral("-b" ) << QStringLiteral(" " ); |
84 | args << QStringLiteral("-q" ) << QStringLiteral("0" ); |
85 | if (const auto contents = textFromFile(fileUrl, command: m_xls2csv, arguments&: args); !contents.isEmpty()) { |
86 | result->append(text: contents); |
87 | } |
88 | } else if (mimeType == QLatin1String("application/vnd.ms-powerpoint" )) { |
89 | result->addType(type: Type::Document); |
90 | result->addType(type: Type::Presentation); |
91 | |
92 | if (!extractPlainText) { |
93 | return; |
94 | } |
95 | |
96 | if (const auto contents = textFromFile(fileUrl, command: m_catppt, arguments&: args); !contents.isEmpty()) { |
97 | result->append(text: contents); |
98 | } |
99 | } |
100 | } |
101 | |
102 | QString OfficeExtractor::(const QString& fileUrl, const QString& command, QStringList& arguments) |
103 | { |
104 | if (command.isEmpty()) { |
105 | return {}; |
106 | } |
107 | |
108 | arguments << fileUrl; |
109 | |
110 | // Start a process and read its standard output |
111 | QProcess process; |
112 | |
113 | process.setReadChannel(QProcess::StandardOutput); |
114 | process.start(program: command, arguments, mode: QIODevice::ReadOnly); |
115 | process.waitForFinished(); |
116 | |
117 | if (process.exitStatus() != QProcess::NormalExit || process.exitCode() != 0) { |
118 | return QString(); |
119 | } else { |
120 | return QString::fromUtf8(ba: process.readAll()); |
121 | } |
122 | } |
123 | |
124 | #include "moc_officeextractor.cpp" |
125 | |