1 | /* |
2 | This file is part of the KDE libraries |
3 | |
4 | SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com> |
5 | |
6 | SPDX-License-Identifier: LGPL-2.0-or-later |
7 | */ |
8 | #ifndef KENCODINGPROBER_H |
9 | #define KENCODINGPROBER_H |
10 | |
11 | // enable debug of private probers |
12 | // #define DEBUG_PROBE |
13 | |
14 | #include <kcodecs_export.h> |
15 | |
16 | #ifdef DEBUG_PROBE |
17 | #include <QDebug> |
18 | #endif |
19 | |
20 | #include <QCoreApplication> |
21 | #include <QString> |
22 | #include <memory> |
23 | |
24 | class KEncodingProberPrivate; |
25 | |
26 | /** |
27 | * @class KEncodingProber kencodingprober.h KEncodingProber |
28 | * |
29 | * @short Provides encoding detection(probe) capabilities. |
30 | * |
31 | * Probe the encoding of raw data only. |
32 | * In the case it can't find it, return the most possible encoding it guessed. |
33 | * |
34 | * Always do Unicode probe regardless the ProberType |
35 | * |
36 | * Feed data to it several times with feed() until ProberState changes to FoundIt/NotMe, |
37 | * or confidence() returns a value you find acceptable. |
38 | * |
39 | * Intended lifetime of the object: one instance per ProberType. |
40 | * |
41 | * Typical use: |
42 | * \code |
43 | * QByteArray data, moredata; |
44 | * ... |
45 | * KEncodingProber prober(KEncodingProber::Chinese); |
46 | * prober.feed(data); |
47 | * prober.feed(moredata); |
48 | * if (prober.confidence() > 0.6) |
49 | * encoding = prober.encoding(); |
50 | * \endcode |
51 | * |
52 | * At least 256 characters are needed to change the ProberState from Probing to FoundIt. |
53 | * If you don't have so many characters to probe, |
54 | * decide whether to accept the encoding it guessed so far according to the Confidence by yourself. |
55 | * |
56 | * @short Guess encoding of char array |
57 | * |
58 | */ |
59 | class KCODECS_EXPORT KEncodingProber |
60 | { |
61 | Q_DECLARE_TR_FUNCTIONS(KEncodingProber) |
62 | |
63 | public: |
64 | enum ProberState { |
65 | FoundIt, /**< Sure find the encoding */ |
66 | NotMe, /**< Sure not included in current ProberType's all supported encodings */ |
67 | Probing, /**< Need more data to make a decision */ |
68 | }; |
69 | |
70 | enum ProberType { |
71 | None, |
72 | Universal, |
73 | Arabic, |
74 | Baltic, |
75 | CentralEuropean, |
76 | ChineseSimplified, |
77 | ChineseTraditional, |
78 | Cyrillic, |
79 | Greek, |
80 | Hebrew, |
81 | Japanese, |
82 | Korean, |
83 | NorthernSaami, |
84 | Other, |
85 | SouthEasternEurope, |
86 | Thai, |
87 | Turkish, |
88 | Unicode, |
89 | WesternEuropean, |
90 | }; |
91 | |
92 | /** |
93 | * Default ProberType is Universal(detect all possible encodings) |
94 | */ |
95 | KEncodingProber(ProberType proberType = Universal); |
96 | |
97 | ~KEncodingProber(); |
98 | |
99 | KEncodingProber(const KEncodingProber &) = delete; |
100 | KEncodingProber &operator=(const KEncodingProber &) = delete; |
101 | |
102 | /** |
103 | * reset the prober's internal state and data. |
104 | */ |
105 | void reset(); |
106 | |
107 | /** |
108 | * The main class method |
109 | * |
110 | * feed data to the prober |
111 | * |
112 | * @returns the ProberState after probing the fed data. |
113 | */ |
114 | ProberState feed(QByteArrayView data); |
115 | // for API compatibility |
116 | inline ProberState feed(const char *data, qsizetype len) |
117 | { |
118 | return feed(data: {data, len}); |
119 | } |
120 | |
121 | /** |
122 | * @returns the prober's current ProberState |
123 | * |
124 | */ |
125 | ProberState state() const; |
126 | |
127 | /** |
128 | * @returns a QByteArray with the name of the best encoding it has guessed so far |
129 | * @since 4.2.2 |
130 | */ |
131 | QByteArray encoding() const; |
132 | |
133 | /** |
134 | * @returns the confidence(sureness) of encoding it guessed so far (0.0 ~ 0.99), not very reliable for single byte encodings |
135 | */ |
136 | float confidence() const; |
137 | |
138 | ProberType proberType() const; |
139 | |
140 | /** |
141 | * change current prober's ProberType and reset the prober |
142 | */ |
143 | void setProberType(ProberType proberType); |
144 | |
145 | /** |
146 | * @return the ProberType for lang (e.g. proberTypeForName("Chinese Simplified") will return KEncodingProber::ChineseSimplified |
147 | */ |
148 | static ProberType proberTypeForName(const QString &lang); |
149 | |
150 | /** |
151 | * map ProberType to language string |
152 | */ |
153 | static QString nameForProberType(ProberType proberType); |
154 | |
155 | private: |
156 | std::unique_ptr<KEncodingProberPrivate> const d; |
157 | }; |
158 | |
159 | #endif |
160 | |