1// SPDX-FileCopyrightText: 2008 by Jakub Stachowski <qbast@go2.pl>
2// SPDX-License-Identifier: GPL-2.0-or-later
3
4#include "mobipocket.h"
5#include "decompressor.h"
6#include "pdb_p.h"
7#include "qmobipocket_debug.h"
8
9#include <QBuffer>
10#include <QIODevice>
11#include <QImageReader>
12#include <QRegularExpression>
13#include <QStringConverter>
14#include <QtEndian>
15
16namespace Mobipocket
17{
18
19struct DocumentPrivate
20{
21 DocumentPrivate(QIODevice *d)
22 : pdb(d)
23 {
24 }
25 PDB pdb;
26 std::unique_ptr<Decompressor> dec;
27 quint16 ntextrecords = 0;
28 quint16 maxRecordSize = 0;
29 bool valid = false;
30
31 // number of first record holding image. Usually it is directly after end of text, but not always
32 quint16 firstImageRecord = 0;
33 QMap<Document::MetaKey, QString> metadata;
34 QStringDecoder toUtf16;
35 bool drm = false;
36 quint32 extraflags = 0;
37
38 // index of Thumbnail image in image list. May be specified in EXTH.
39 int thumbnailIndex = -1;
40 // index of Cover image in image list. May be specified in EXTH.
41 int coverIndex = -1;
42
43 void init();
44 void findFirstImage();
45 void parseEXTH(QByteArrayView data);
46 void parseHtmlHead(const QString &data);
47};
48
49void DocumentPrivate::parseHtmlHead(const QString &data)
50{
51 static const QRegularExpression title(QLatin1String("<dc:title.*>(.*)</dc:title>"),
52 QRegularExpression::CaseInsensitiveOption | QRegularExpression::InvertedGreedinessOption);
53 static const QRegularExpression author(QLatin1String("<dc:creator.*>(.*)</dc:creator>"),
54 QRegularExpression::CaseInsensitiveOption | QRegularExpression::InvertedGreedinessOption);
55 static const QRegularExpression copyright(QLatin1String("<dc:rights.*>(.*)</dc:rights>"),
56 QRegularExpression::CaseInsensitiveOption | QRegularExpression::InvertedGreedinessOption);
57 static const QRegularExpression subject(QLatin1String("<dc:subject.*>(.*)</dc:subject>"),
58 QRegularExpression::CaseInsensitiveOption | QRegularExpression::InvertedGreedinessOption);
59 static const QRegularExpression description(QLatin1String("<dc:description.*>(.*)</dc:description>"),
60 QRegularExpression::CaseInsensitiveOption | QRegularExpression::InvertedGreedinessOption);
61
62 // title could have been already taken from MOBI record
63 if (!metadata.contains(key: Document::Title)) {
64 if (const auto titleMatch = title.match(subject: data); titleMatch.hasMatch())
65 metadata[Document::Title] = titleMatch.captured(nth: 1);
66 }
67 if (const auto authorMatch = author.match(subject: data); authorMatch.hasMatch())
68 metadata[Document::Author] = authorMatch.captured(nth: 1);
69 if (const auto copyrightMatch = copyright.match(subject: data); copyrightMatch.hasMatch())
70 metadata[Document::Copyright] = copyrightMatch.captured(nth: 1);
71 if (const auto subjectMatch = subject.match(subject: data); subjectMatch.hasMatch())
72 metadata[Document::Subject] = subjectMatch.captured(nth: 1);
73 if (const auto descriptionMatch = description.match(subject: data); descriptionMatch.hasMatch())
74 metadata[Document::Description] = descriptionMatch.captured(nth: 1);
75}
76
77namespace
78{
79 const QVector<QByteArray> getHuffRecords(const PDB &pdb)
80 {
81 const QByteArray header = pdb.getRecord(i: 0);
82 if (header[1] != 'H') {
83 return {};
84 }
85
86 quint32 huff_ofs = qFromBigEndian<quint32>(src: header.constData() + 0x70);
87 quint32 huff_num = qFromBigEndian<quint32>(src: header.constData() + 0x74);
88
89 // Check for overflow and out-of-bounds access
90 if (((huff_ofs + huff_num) < huff_num) || ((huff_ofs + huff_num) > pdb.recordCount())) {
91 return {};
92 }
93
94 QVector<QByteArray> records(huff_num);
95 for (quint32 i = 0; i < huff_num; i++) {
96 if (auto r = pdb.getRecord(i: huff_ofs + i); r.isNull()) {
97 return {};
98 } else {
99 records[i] = r;
100 }
101 }
102 return records;
103 };
104}
105
106void DocumentPrivate::init()
107{
108 quint32 encoding = 0;
109
110 if (!pdb.isValid())
111 return;
112 QByteArray mhead = pdb.getRecord(i: 0);
113 if (mhead.isNull() || mhead.size() < 14)
114 return;
115
116 dec = Decompressor::create(type: mhead[1], auxData: getHuffRecords(pdb));
117 if ((int)mhead[12] != 0 || (int)mhead[13] != 0)
118 drm = true;
119 if (!dec)
120 return;
121
122 ntextrecords = qFromBigEndian<quint16>(src: mhead.constData() + 8);
123 maxRecordSize = qFromBigEndian<quint16>(src: mhead.constData() + 10);
124 if (mhead.size() > 31)
125 encoding = qFromBigEndian<quint32>(src: mhead.constData() + 28);
126 if (encoding == 65001) {
127 toUtf16 = QStringDecoder(QStringDecoder::Utf8);
128 } else {
129 toUtf16 = QStringDecoder("windows-1252");
130 if (!toUtf16.isValid()) {
131 qCWarning(QMOBIPOCKET_LOG) << "Text codec \"windows-1252\" not supported by Qt library, falling back to Latin1";
132 toUtf16 = QStringDecoder(QStringConverter::Latin1);
133 }
134 }
135 if (mhead.size() >= 92)
136 parseEXTH(data: mhead);
137
138 if (mhead.size() >= 244) {
139 quint32 exthoffs = qFromBigEndian<quint32>(src: mhead.constData() + 20);
140 if ((exthoffs + 16) > 244) {
141 extraflags = qFromBigEndian<quint32>(src: mhead.constData() + 240);
142 }
143 }
144
145 // try getting metadata from HTML if nothing or only title was recovered from MOBI and EXTH records
146 if (metadata.size() < 2 && !drm)
147 parseHtmlHead(data: toUtf16(dec->decompress(data: pdb.getRecord(i: 1))));
148 valid = true;
149}
150
151void DocumentPrivate::findFirstImage()
152{
153 firstImageRecord = ntextrecords + 1;
154 while (firstImageRecord < pdb.recordCount()) {
155 QByteArray rec = pdb.getRecord(i: firstImageRecord);
156 if (rec.isNull())
157 return;
158 QBuffer buf(&rec);
159 buf.open(openMode: QIODevice::ReadOnly);
160 QImageReader r(&buf);
161 if (r.canRead())
162 return;
163 firstImageRecord++;
164 }
165}
166
167void DocumentPrivate::parseEXTH(QByteArrayView data)
168{
169 // try to get name
170 if (data.size() >= 92) {
171 qint32 nameoffset = qFromBigEndian<quint32>(src: data.constData() + 84);
172 qint32 namelen = qFromBigEndian<quint32>(src: data.constData() + 88);
173 if ((nameoffset + namelen) <= data.size()) {
174 metadata[Document::Title] = toUtf16(data.mid(pos: nameoffset, n: namelen));
175 }
176 }
177
178 quint32 exthoffs = qFromBigEndian<quint32>(src: data.constData() + 20);
179 if (exthoffs + 28 > quint32(data.size())) {
180 return;
181 }
182
183 if (data.mid(pos: exthoffs + 16, n: 4) != "EXTH")
184 return;
185 quint32 records = qFromBigEndian<quint32>(src: data.constData() + exthoffs + 24);
186 quint32 offset = exthoffs + 28;
187 for (unsigned int i = 0; i < records; i++) {
188 if (offset + 8 > quint32(data.size()))
189 break;
190 quint32 type = qFromBigEndian<quint32>(src: data.constData() + offset);
191 quint32 len = qFromBigEndian<quint32>(src: data.constData() + offset + 4);
192 if (offset + len > quint32(data.size()))
193 break;
194 switch (type) {
195 case 100:
196 metadata[Document::Author] = toUtf16(data.mid(pos: offset + 8, n: len - 8));
197 break;
198 case 103:
199 metadata[Document::Description] = toUtf16(data.mid(pos: offset + 8, n: len - 8));
200 break;
201 case 105:
202 metadata[Document::Subject] = toUtf16(data.mid(pos: offset + 8, n: len - 8));
203 break;
204 case 109:
205 metadata[Document::Copyright] = toUtf16(data.mid(pos: offset + 8, n: len - 8));
206 break;
207 case 201:
208 coverIndex = qFromBigEndian<quint32>(src: data.constData() + offset + 8);
209 break;
210 case 202:
211 thumbnailIndex = qFromBigEndian<quint32>(src: data.constData() + offset + 8);
212 break;
213 default:
214 // ignore
215 break;
216 }
217 offset += len;
218 }
219}
220
221Document::Document(QIODevice *dev)
222 : d(new DocumentPrivate(dev))
223{
224 Q_ASSERT(dev->openMode() & QIODevice::ReadOnly);
225 Q_ASSERT(!dev->isSequential());
226 d->init();
227}
228
229Document::~Document()
230{
231 delete d;
232}
233
234namespace
235{
236constexpr qsizetype preTrailingDataLength(QByteArrayView data, quint32 flags)
237{
238 if (flags == 0) {
239 return data.size();
240 }
241
242 for (int i = 31; i > 0; i--) {
243 if ((flags & (1u << i)) == 0) {
244 continue;
245 }
246
247 qsizetype chopN = 0;
248 for (int j = 0; j < 4; j++) {
249 if (j + 1 > data.size()) {
250 return 0;
251 }
252 quint8 l = data.at(n: data.size() - (j + 1));
253 chopN |= (l & 0x7f) << (7 * j);
254 if (l & 0x80) {
255 break;
256 }
257 }
258 data.chop(n: std::min<qsizetype>(a: chopN, b: data.size()));
259 }
260 if ((flags & 0x1) && !data.isEmpty()) {
261 quint8 l = data.back() & 0x3;
262 data.chop(n: std::min<qsizetype>(a: l + 1, b: data.size()));
263 }
264 return data.size();
265}
266static_assert(preTrailingDataLength(data: {"0\x00", 2}, flags: 0x0) == 2);
267static_assert(preTrailingDataLength(data: {"0\x00", 2}, flags: 0x1) == 1);
268static_assert(preTrailingDataLength(data: {"0\x01", 2}, flags: 0x1) == 0);
269static_assert(preTrailingDataLength(data: {"0\x02", 2}, flags: 0x1) == 0);
270static_assert(preTrailingDataLength(data: {"abcd\x03", 5}, flags: 0x1) == 1);
271static_assert(preTrailingDataLength(data: {"abcd\x81", 5}, flags: 0x2) == 4);
272static_assert(preTrailingDataLength(data: {"\x02\x01", 2}, flags: 0x2) == 0);
273static_assert(preTrailingDataLength(data: {"\x80\x02", 2}, flags: 0x2) == 0);
274static_assert(preTrailingDataLength(data: {"abcd\x85", 5}, flags: 0x2) == 0);
275static_assert(preTrailingDataLength(data: {"abc\x01\x7f\x82", 6}, flags: 0x2) == 4);
276static_assert(preTrailingDataLength(data: {"abc\x01\x80\x02", 6}, flags: 0x2) == 4);
277static_assert(preTrailingDataLength(data: {"abc\x01\x7f\x82", 6}, flags: 0x3) == 2);
278static_assert(preTrailingDataLength(data: {"abc\x81\x80\x02", 6}, flags: 0x6) == 3);
279static_assert(preTrailingDataLength(data: {"abc\x00\x81\x81", 6}, flags: 0x7) == 3);
280} // namespace
281
282QString Document::text(int size) const
283{
284 QByteArray whole;
285 for (int i = 1; i < d->ntextrecords + 1; i++) {
286 auto record = d->pdb.getRecord(i);
287 record.resize(size: preTrailingDataLength(data: record, flags: d->extraflags));
288 QByteArray decompressedRecord = d->dec->decompress(data: record);
289 whole += decompressedRecord;
290 if (!d->dec->isValid()) {
291 d->valid = false;
292 return QString();
293 }
294 if (size != -1 && whole.size() > size)
295 break;
296 }
297 return d->toUtf16(whole);
298}
299
300int Document::imageCount() const
301{
302 // FIXME: don't count FLIS and FCIS records
303 return d->pdb.recordCount() - d->ntextrecords;
304}
305
306bool Document::isValid() const
307{
308 return d->valid;
309}
310
311QImage Document::getImage(int i) const
312{
313 if (!d->firstImageRecord)
314 d->findFirstImage();
315
316 if ((i < 0) || (i > std::numeric_limits<quint16>::max()) //
317 || (d->firstImageRecord + i) >= d->pdb.recordCount()) {
318 return {};
319 }
320
321 QByteArray rec = d->pdb.getRecord(i: d->firstImageRecord + i);
322 return (rec.isNull()) ? QImage() : QImage::fromData(data: rec);
323}
324
325QMap<Document::MetaKey, QString> Document::metadata() const
326{
327 return d->metadata;
328}
329
330bool Document::hasDRM() const
331{
332 return d->drm;
333}
334
335QImage Document::thumbnail() const
336{
337 if (QImage img = getImage(i: d->thumbnailIndex); !img.isNull()) {
338 return img;
339 }
340
341 // Fall back to cover image, or return an empty image
342 return getImage(i: d->coverIndex);
343}
344
345}
346

source code of kdegraphics-mobipocket/lib/mobipocket.cpp