| 1 | // SPDX-FileCopyrightText: 2008 by Jakub Stachowski <qbast@go2.pl> |
| 2 | // SPDX-License-Identifier: GPL-2.0-or-later |
| 3 | |
| 4 | #include "mobipocket.h" |
| 5 | #include "decompressor.h" |
| 6 | #include "pdb_p.h" |
| 7 | #include "qmobipocket_debug.h" |
| 8 | |
| 9 | #include <QBuffer> |
| 10 | #include <QIODevice> |
| 11 | #include <QImageReader> |
| 12 | #include <QRegularExpression> |
| 13 | #include <QStringConverter> |
| 14 | #include <QtEndian> |
| 15 | |
| 16 | namespace Mobipocket |
| 17 | { |
| 18 | |
| 19 | struct DocumentPrivate |
| 20 | { |
| 21 | DocumentPrivate(QIODevice *d) |
| 22 | : pdb(d) |
| 23 | { |
| 24 | } |
| 25 | PDB pdb; |
| 26 | std::unique_ptr<Decompressor> dec; |
| 27 | quint16 ntextrecords = 0; |
| 28 | quint16 maxRecordSize = 0; |
| 29 | bool valid = false; |
| 30 | |
| 31 | // number of first record holding image. Usually it is directly after end of text, but not always |
| 32 | quint16 firstImageRecord = 0; |
| 33 | QMap<Document::MetaKey, QString> metadata; |
| 34 | QStringDecoder toUtf16; |
| 35 | bool drm = false; |
| 36 | quint32 = 0; |
| 37 | |
| 38 | // index of Thumbnail image in image list. May be specified in EXTH. |
| 39 | int thumbnailIndex = -1; |
| 40 | // index of Cover image in image list. May be specified in EXTH. |
| 41 | int coverIndex = -1; |
| 42 | |
| 43 | void init(); |
| 44 | void findFirstImage(); |
| 45 | void parseEXTH(QByteArrayView data); |
| 46 | void parseHtmlHead(const QString &data); |
| 47 | }; |
| 48 | |
| 49 | void DocumentPrivate::(const QString &data) |
| 50 | { |
| 51 | static const QRegularExpression title(QLatin1String("<dc:title.*>(.*)</dc:title>" ), |
| 52 | QRegularExpression::CaseInsensitiveOption | QRegularExpression::InvertedGreedinessOption); |
| 53 | static const QRegularExpression author(QLatin1String("<dc:creator.*>(.*)</dc:creator>" ), |
| 54 | QRegularExpression::CaseInsensitiveOption | QRegularExpression::InvertedGreedinessOption); |
| 55 | static const QRegularExpression copyright(QLatin1String("<dc:rights.*>(.*)</dc:rights>" ), |
| 56 | QRegularExpression::CaseInsensitiveOption | QRegularExpression::InvertedGreedinessOption); |
| 57 | static const QRegularExpression subject(QLatin1String("<dc:subject.*>(.*)</dc:subject>" ), |
| 58 | QRegularExpression::CaseInsensitiveOption | QRegularExpression::InvertedGreedinessOption); |
| 59 | static const QRegularExpression description(QLatin1String("<dc:description.*>(.*)</dc:description>" ), |
| 60 | QRegularExpression::CaseInsensitiveOption | QRegularExpression::InvertedGreedinessOption); |
| 61 | |
| 62 | // title could have been already taken from MOBI record |
| 63 | if (!metadata.contains(key: Document::Title)) { |
| 64 | if (const auto titleMatch = title.match(subject: data); titleMatch.hasMatch()) |
| 65 | metadata[Document::Title] = titleMatch.captured(nth: 1); |
| 66 | } |
| 67 | if (const auto authorMatch = author.match(subject: data); authorMatch.hasMatch()) |
| 68 | metadata[Document::Author] = authorMatch.captured(nth: 1); |
| 69 | if (const auto copyrightMatch = copyright.match(subject: data); copyrightMatch.hasMatch()) |
| 70 | metadata[Document::Copyright] = copyrightMatch.captured(nth: 1); |
| 71 | if (const auto subjectMatch = subject.match(subject: data); subjectMatch.hasMatch()) |
| 72 | metadata[Document::Subject] = subjectMatch.captured(nth: 1); |
| 73 | if (const auto descriptionMatch = description.match(subject: data); descriptionMatch.hasMatch()) |
| 74 | metadata[Document::Description] = descriptionMatch.captured(nth: 1); |
| 75 | } |
| 76 | |
| 77 | namespace |
| 78 | { |
| 79 | const QVector<QByteArray> getHuffRecords(const PDB &pdb) |
| 80 | { |
| 81 | const QByteArray = pdb.getRecord(i: 0); |
| 82 | if (header[1] != 'H') { |
| 83 | return {}; |
| 84 | } |
| 85 | |
| 86 | quint32 huff_ofs = qFromBigEndian<quint32>(src: header.constData() + 0x70); |
| 87 | quint32 huff_num = qFromBigEndian<quint32>(src: header.constData() + 0x74); |
| 88 | |
| 89 | // Check for overflow and out-of-bounds access |
| 90 | if (((huff_ofs + huff_num) < huff_num) || ((huff_ofs + huff_num) > pdb.recordCount())) { |
| 91 | return {}; |
| 92 | } |
| 93 | |
| 94 | QVector<QByteArray> records(huff_num); |
| 95 | for (quint32 i = 0; i < huff_num; i++) { |
| 96 | if (auto r = pdb.getRecord(i: huff_ofs + i); r.isNull()) { |
| 97 | return {}; |
| 98 | } else { |
| 99 | records[i] = r; |
| 100 | } |
| 101 | } |
| 102 | return records; |
| 103 | }; |
| 104 | } |
| 105 | |
| 106 | void DocumentPrivate::init() |
| 107 | { |
| 108 | quint32 encoding = 0; |
| 109 | |
| 110 | if (!pdb.isValid()) |
| 111 | return; |
| 112 | QByteArray mhead = pdb.getRecord(i: 0); |
| 113 | if (mhead.isNull() || mhead.size() < 14) |
| 114 | return; |
| 115 | |
| 116 | dec = Decompressor::create(type: mhead[1], auxData: getHuffRecords(pdb)); |
| 117 | if ((int)mhead[12] != 0 || (int)mhead[13] != 0) |
| 118 | drm = true; |
| 119 | if (!dec) |
| 120 | return; |
| 121 | |
| 122 | ntextrecords = qFromBigEndian<quint16>(src: mhead.constData() + 8); |
| 123 | maxRecordSize = qFromBigEndian<quint16>(src: mhead.constData() + 10); |
| 124 | if (mhead.size() > 31) |
| 125 | encoding = qFromBigEndian<quint32>(src: mhead.constData() + 28); |
| 126 | if (encoding == 65001) { |
| 127 | toUtf16 = QStringDecoder(QStringDecoder::Utf8); |
| 128 | } else { |
| 129 | toUtf16 = QStringDecoder("windows-1252" ); |
| 130 | if (!toUtf16.isValid()) { |
| 131 | qCWarning(QMOBIPOCKET_LOG) << "Text codec \"windows-1252\" not supported by Qt library, falling back to Latin1" ; |
| 132 | toUtf16 = QStringDecoder(QStringConverter::Latin1); |
| 133 | } |
| 134 | } |
| 135 | if (mhead.size() >= 92) |
| 136 | parseEXTH(data: mhead); |
| 137 | |
| 138 | if (mhead.size() >= 244) { |
| 139 | quint32 exthoffs = qFromBigEndian<quint32>(src: mhead.constData() + 20); |
| 140 | if ((exthoffs + 16) > 244) { |
| 141 | extraflags = qFromBigEndian<quint32>(src: mhead.constData() + 240); |
| 142 | } |
| 143 | } |
| 144 | |
| 145 | // try getting metadata from HTML if nothing or only title was recovered from MOBI and EXTH records |
| 146 | if (metadata.size() < 2 && !drm) |
| 147 | parseHtmlHead(data: toUtf16(dec->decompress(data: pdb.getRecord(i: 1)))); |
| 148 | valid = true; |
| 149 | } |
| 150 | |
| 151 | void DocumentPrivate::findFirstImage() |
| 152 | { |
| 153 | firstImageRecord = ntextrecords + 1; |
| 154 | while (firstImageRecord < pdb.recordCount()) { |
| 155 | QByteArray rec = pdb.getRecord(i: firstImageRecord); |
| 156 | if (rec.isNull()) |
| 157 | return; |
| 158 | QBuffer buf(&rec); |
| 159 | buf.open(openMode: QIODevice::ReadOnly); |
| 160 | QImageReader r(&buf); |
| 161 | if (r.canRead()) |
| 162 | return; |
| 163 | firstImageRecord++; |
| 164 | } |
| 165 | } |
| 166 | |
| 167 | void DocumentPrivate::parseEXTH(QByteArrayView data) |
| 168 | { |
| 169 | // try to get name |
| 170 | if (data.size() >= 92) { |
| 171 | qint32 nameoffset = qFromBigEndian<quint32>(src: data.constData() + 84); |
| 172 | qint32 namelen = qFromBigEndian<quint32>(src: data.constData() + 88); |
| 173 | if ((nameoffset + namelen) <= data.size()) { |
| 174 | metadata[Document::Title] = toUtf16(data.mid(pos: nameoffset, n: namelen)); |
| 175 | } |
| 176 | } |
| 177 | |
| 178 | quint32 exthoffs = qFromBigEndian<quint32>(src: data.constData() + 20); |
| 179 | if (exthoffs + 28 > quint32(data.size())) { |
| 180 | return; |
| 181 | } |
| 182 | |
| 183 | if (data.mid(pos: exthoffs + 16, n: 4) != "EXTH" ) |
| 184 | return; |
| 185 | quint32 records = qFromBigEndian<quint32>(src: data.constData() + exthoffs + 24); |
| 186 | quint32 offset = exthoffs + 28; |
| 187 | for (unsigned int i = 0; i < records; i++) { |
| 188 | if (offset + 8 > quint32(data.size())) |
| 189 | break; |
| 190 | quint32 type = qFromBigEndian<quint32>(src: data.constData() + offset); |
| 191 | quint32 len = qFromBigEndian<quint32>(src: data.constData() + offset + 4); |
| 192 | if (offset + len > quint32(data.size())) |
| 193 | break; |
| 194 | switch (type) { |
| 195 | case 100: |
| 196 | metadata[Document::Author] = toUtf16(data.mid(pos: offset + 8, n: len - 8)); |
| 197 | break; |
| 198 | case 103: |
| 199 | metadata[Document::Description] = toUtf16(data.mid(pos: offset + 8, n: len - 8)); |
| 200 | break; |
| 201 | case 105: |
| 202 | metadata[Document::Subject] = toUtf16(data.mid(pos: offset + 8, n: len - 8)); |
| 203 | break; |
| 204 | case 109: |
| 205 | metadata[Document::Copyright] = toUtf16(data.mid(pos: offset + 8, n: len - 8)); |
| 206 | break; |
| 207 | case 201: |
| 208 | coverIndex = qFromBigEndian<quint32>(src: data.constData() + offset + 8); |
| 209 | break; |
| 210 | case 202: |
| 211 | thumbnailIndex = qFromBigEndian<quint32>(src: data.constData() + offset + 8); |
| 212 | break; |
| 213 | default: |
| 214 | // ignore |
| 215 | break; |
| 216 | } |
| 217 | offset += len; |
| 218 | } |
| 219 | } |
| 220 | |
| 221 | Document::Document(QIODevice *dev) |
| 222 | : d(new DocumentPrivate(dev)) |
| 223 | { |
| 224 | Q_ASSERT(dev->openMode() & QIODevice::ReadOnly); |
| 225 | Q_ASSERT(!dev->isSequential()); |
| 226 | d->init(); |
| 227 | } |
| 228 | |
| 229 | Document::~Document() |
| 230 | { |
| 231 | delete d; |
| 232 | } |
| 233 | |
| 234 | namespace |
| 235 | { |
| 236 | constexpr qsizetype preTrailingDataLength(QByteArrayView data, quint32 flags) |
| 237 | { |
| 238 | if (flags == 0) { |
| 239 | return data.size(); |
| 240 | } |
| 241 | |
| 242 | for (int i = 31; i > 0; i--) { |
| 243 | if ((flags & (1u << i)) == 0) { |
| 244 | continue; |
| 245 | } |
| 246 | |
| 247 | qsizetype chopN = 0; |
| 248 | for (int j = 0; j < 4; j++) { |
| 249 | if (j + 1 > data.size()) { |
| 250 | return 0; |
| 251 | } |
| 252 | quint8 l = data.at(n: data.size() - (j + 1)); |
| 253 | chopN |= (l & 0x7f) << (7 * j); |
| 254 | if (l & 0x80) { |
| 255 | break; |
| 256 | } |
| 257 | } |
| 258 | data.chop(n: std::min<qsizetype>(a: chopN, b: data.size())); |
| 259 | } |
| 260 | if ((flags & 0x1) && !data.isEmpty()) { |
| 261 | quint8 l = data.back() & 0x3; |
| 262 | data.chop(n: std::min<qsizetype>(a: l + 1, b: data.size())); |
| 263 | } |
| 264 | return data.size(); |
| 265 | } |
| 266 | static_assert(preTrailingDataLength(data: {"0\x00" , 2}, flags: 0x0) == 2); |
| 267 | static_assert(preTrailingDataLength(data: {"0\x00" , 2}, flags: 0x1) == 1); |
| 268 | static_assert(preTrailingDataLength(data: {"0\x01" , 2}, flags: 0x1) == 0); |
| 269 | static_assert(preTrailingDataLength(data: {"0\x02" , 2}, flags: 0x1) == 0); |
| 270 | static_assert(preTrailingDataLength(data: {"abcd\x03" , 5}, flags: 0x1) == 1); |
| 271 | static_assert(preTrailingDataLength(data: {"abcd\x81" , 5}, flags: 0x2) == 4); |
| 272 | static_assert(preTrailingDataLength(data: {"\x02\x01" , 2}, flags: 0x2) == 0); |
| 273 | static_assert(preTrailingDataLength(data: {"\x80\x02" , 2}, flags: 0x2) == 0); |
| 274 | static_assert(preTrailingDataLength(data: {"abcd\x85" , 5}, flags: 0x2) == 0); |
| 275 | static_assert(preTrailingDataLength(data: {"abc\x01\x7f\x82" , 6}, flags: 0x2) == 4); |
| 276 | static_assert(preTrailingDataLength(data: {"abc\x01\x80\x02" , 6}, flags: 0x2) == 4); |
| 277 | static_assert(preTrailingDataLength(data: {"abc\x01\x7f\x82" , 6}, flags: 0x3) == 2); |
| 278 | static_assert(preTrailingDataLength(data: {"abc\x81\x80\x02" , 6}, flags: 0x6) == 3); |
| 279 | static_assert(preTrailingDataLength(data: {"abc\x00\x81\x81" , 6}, flags: 0x7) == 3); |
| 280 | } // namespace |
| 281 | |
| 282 | QString Document::text(int size) const |
| 283 | { |
| 284 | QByteArray whole; |
| 285 | for (int i = 1; i < d->ntextrecords + 1; i++) { |
| 286 | auto record = d->pdb.getRecord(i); |
| 287 | record.resize(size: preTrailingDataLength(data: record, flags: d->extraflags)); |
| 288 | QByteArray decompressedRecord = d->dec->decompress(data: record); |
| 289 | whole += decompressedRecord; |
| 290 | if (!d->dec->isValid()) { |
| 291 | d->valid = false; |
| 292 | return QString(); |
| 293 | } |
| 294 | if (size != -1 && whole.size() > size) |
| 295 | break; |
| 296 | } |
| 297 | return d->toUtf16(whole); |
| 298 | } |
| 299 | |
| 300 | int Document::imageCount() const |
| 301 | { |
| 302 | // FIXME: don't count FLIS and FCIS records |
| 303 | return d->pdb.recordCount() - d->ntextrecords; |
| 304 | } |
| 305 | |
| 306 | bool Document::isValid() const |
| 307 | { |
| 308 | return d->valid; |
| 309 | } |
| 310 | |
| 311 | QImage Document::getImage(int i) const |
| 312 | { |
| 313 | if (!d->firstImageRecord) |
| 314 | d->findFirstImage(); |
| 315 | |
| 316 | if ((i < 0) || (i > std::numeric_limits<quint16>::max()) // |
| 317 | || (d->firstImageRecord + i) >= d->pdb.recordCount()) { |
| 318 | return {}; |
| 319 | } |
| 320 | |
| 321 | QByteArray rec = d->pdb.getRecord(i: d->firstImageRecord + i); |
| 322 | return (rec.isNull()) ? QImage() : QImage::fromData(data: rec); |
| 323 | } |
| 324 | |
| 325 | QMap<Document::MetaKey, QString> Document::metadata() const |
| 326 | { |
| 327 | return d->metadata; |
| 328 | } |
| 329 | |
| 330 | bool Document::hasDRM() const |
| 331 | { |
| 332 | return d->drm; |
| 333 | } |
| 334 | |
| 335 | QImage Document::thumbnail() const |
| 336 | { |
| 337 | if (QImage img = getImage(i: d->thumbnailIndex); !img.isNull()) { |
| 338 | return img; |
| 339 | } |
| 340 | |
| 341 | // Fall back to cover image, or return an empty image |
| 342 | return getImage(i: d->coverIndex); |
| 343 | } |
| 344 | |
| 345 | } |
| 346 | |