| 1 | /* |
| 2 | This file is part of the KDE libraries |
| 3 | |
| 4 | SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com> |
| 5 | |
| 6 | SPDX-License-Identifier: LGPL-2.0-or-later |
| 7 | */ |
| 8 | #ifndef KENCODINGPROBER_H |
| 9 | #define KENCODINGPROBER_H |
| 10 | |
| 11 | // enable debug of private probers |
| 12 | // #define DEBUG_PROBE |
| 13 | |
| 14 | #include <kcodecs_export.h> |
| 15 | |
| 16 | #ifdef DEBUG_PROBE |
| 17 | #include <QDebug> |
| 18 | #endif |
| 19 | |
| 20 | #include <QCoreApplication> |
| 21 | #include <QString> |
| 22 | #include <memory> |
| 23 | |
| 24 | class KEncodingProberPrivate; |
| 25 | |
| 26 | /*! |
| 27 | * \class KEncodingProber |
| 28 | * \inmodule KCodecs |
| 29 | * |
| 30 | * \brief Provides encoding detection(probe) capabilities. |
| 31 | * |
| 32 | * Probe the encoding of raw data only. |
| 33 | * In the case it can't find it, return the most possible encoding it guessed. |
| 34 | * |
| 35 | * Always do Unicode probe regardless the ProberType |
| 36 | * |
| 37 | * Feed data to it several times with feed() until ProberState changes to FoundIt/NotMe, |
| 38 | * or confidence() returns a value you find acceptable. |
| 39 | * |
| 40 | * Intended lifetime of the object: one instance per ProberType. |
| 41 | * |
| 42 | * Typical use: |
| 43 | * \code |
| 44 | * QByteArray data, moredata; |
| 45 | * ... |
| 46 | * KEncodingProber prober(KEncodingProber::Chinese); |
| 47 | * prober.feed(data); |
| 48 | * prober.feed(moredata); |
| 49 | * if (prober.confidence() > 0.6) |
| 50 | * encoding = prober.encoding(); |
| 51 | * \endcode |
| 52 | * |
| 53 | * At least 256 characters are needed to change the ProberState from Probing to FoundIt. |
| 54 | * If you don't have so many characters to probe, |
| 55 | * decide whether to accept the encoding it guessed so far according to the Confidence by yourself. |
| 56 | * |
| 57 | */ |
| 58 | class KCODECS_EXPORT KEncodingProber |
| 59 | { |
| 60 | Q_DECLARE_TR_FUNCTIONS(KEncodingProber) |
| 61 | |
| 62 | public: |
| 63 | /*! |
| 64 | * \value FoundIt Sure find the encoding |
| 65 | * \value NotMe Sure not included in current ProberType's all supported encodings |
| 66 | * \value Probing Need more data to make a decision |
| 67 | */ |
| 68 | enum ProberState { |
| 69 | FoundIt, |
| 70 | NotMe, |
| 71 | Probing, |
| 72 | }; |
| 73 | |
| 74 | /*! |
| 75 | * \value None |
| 76 | * \value Universal |
| 77 | * \value Arabic |
| 78 | * \value Baltic |
| 79 | * \value CentralEuropean |
| 80 | * \value ChineseSimplified |
| 81 | * \value ChineseTraditional |
| 82 | * \value Cyrillic |
| 83 | * \value Greek |
| 84 | * \value Hebrew |
| 85 | * \value Japanese |
| 86 | * \value Korean |
| 87 | * \value NorthernSaami |
| 88 | * \value Other |
| 89 | * \value SouthEasternEurope |
| 90 | * \value Thai |
| 91 | * \value Turkish |
| 92 | * \value Unicode |
| 93 | * \value WesternEuropean |
| 94 | */ |
| 95 | enum ProberType { |
| 96 | None, |
| 97 | Universal, |
| 98 | Arabic, |
| 99 | Baltic, |
| 100 | CentralEuropean, |
| 101 | ChineseSimplified, |
| 102 | ChineseTraditional, |
| 103 | Cyrillic, |
| 104 | Greek, |
| 105 | Hebrew, |
| 106 | Japanese, |
| 107 | Korean, |
| 108 | NorthernSaami, |
| 109 | Other, |
| 110 | SouthEasternEurope, |
| 111 | Thai, |
| 112 | Turkish, |
| 113 | Unicode, |
| 114 | WesternEuropean, |
| 115 | }; |
| 116 | |
| 117 | /*! |
| 118 | * Default ProberType is Universal(detect all possible encodings) |
| 119 | */ |
| 120 | KEncodingProber(ProberType proberType = Universal); |
| 121 | |
| 122 | ~KEncodingProber(); |
| 123 | |
| 124 | KEncodingProber(const KEncodingProber &) = delete; |
| 125 | KEncodingProber &operator=(const KEncodingProber &) = delete; |
| 126 | |
| 127 | /*! |
| 128 | * reset the prober's internal state and data. |
| 129 | */ |
| 130 | void reset(); |
| 131 | |
| 132 | /*! |
| 133 | * The main class method |
| 134 | * |
| 135 | * Feed \a data to the prober |
| 136 | * |
| 137 | * Returns the ProberState after probing the fed data. |
| 138 | */ |
| 139 | ProberState feed(QByteArrayView data); |
| 140 | // for API compatibility |
| 141 | inline ProberState feed(const char *data, qsizetype len) |
| 142 | { |
| 143 | return feed(data: {data, len}); |
| 144 | } |
| 145 | |
| 146 | /*! |
| 147 | * Returns the prober's current ProberState |
| 148 | * |
| 149 | */ |
| 150 | ProberState state() const; |
| 151 | |
| 152 | /*! |
| 153 | * Returns a QByteArray with the name of the best encoding it has guessed so far |
| 154 | * \since 4.2.2 |
| 155 | */ |
| 156 | QByteArray encoding() const; |
| 157 | |
| 158 | /*! |
| 159 | * Returns the confidence(sureness) of encoding it guessed so far (0.0 ~ 0.99), not very reliable for single byte encodings |
| 160 | */ |
| 161 | float confidence() const; |
| 162 | |
| 163 | ProberType proberType() const; |
| 164 | |
| 165 | /*! |
| 166 | * change current prober's ProberType and reset the prober |
| 167 | * |
| 168 | * \a proberType the new type |
| 169 | */ |
| 170 | void setProberType(ProberType proberType); |
| 171 | |
| 172 | /*! |
| 173 | * Returns the ProberType for \a lang (e.g. proberTypeForName("Chinese Simplified") will return KEncodingProber::ChineseSimplified |
| 174 | */ |
| 175 | static ProberType proberTypeForName(const QString &lang); |
| 176 | |
| 177 | /*! |
| 178 | * map ProberType to language string |
| 179 | * |
| 180 | * \a proberType the proper type |
| 181 | * |
| 182 | * Returns the language string |
| 183 | */ |
| 184 | static QString nameForProberType(ProberType proberType); |
| 185 | |
| 186 | private: |
| 187 | std::unique_ptr<KEncodingProberPrivate> const d; |
| 188 | }; |
| 189 | |
| 190 | #endif |
| 191 | |