1/*
2 This file is part of the KDE libraries
3
4 SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com>
5
6 SPDX-License-Identifier: LGPL-2.0-or-later
7*/
8#ifndef KENCODINGPROBER_H
9#define KENCODINGPROBER_H
10
11// enable debug of private probers
12// #define DEBUG_PROBE
13
14#include <kcodecs_export.h>
15
16#ifdef DEBUG_PROBE
17#include <QDebug>
18#endif
19
20#include <QCoreApplication>
21#include <QString>
22#include <memory>
23
24class KEncodingProberPrivate;
25
26/**
27 * @class KEncodingProber kencodingprober.h KEncodingProber
28 *
29 * @short Provides encoding detection(probe) capabilities.
30 *
31 * Probe the encoding of raw data only.
 * If the encoding cannot be determined with certainty, the most probable encoding guessed so far is returned.
33 *
 * A Unicode probe is always performed, regardless of the ProberType.
35 *
36 * Feed data to it several times with feed() until ProberState changes to FoundIt/NotMe,
37 * or confidence() returns a value you find acceptable.
38 *
39 * Intended lifetime of the object: one instance per ProberType.
40 *
41 * Typical use:
42 * \code
43 * QByteArray data, moredata;
44 * ...
 * KEncodingProber prober(KEncodingProber::ChineseSimplified);
46 * prober.feed(data);
47 * prober.feed(moredata);
48 * if (prober.confidence() > 0.6)
49 * encoding = prober.encoding();
50 * \endcode
51 *
52 * At least 256 characters are needed to change the ProberState from Probing to FoundIt.
 * If you do not have that many characters to probe,
 * decide for yourself, based on confidence(), whether to accept the encoding guessed so far.
55 *
56 * @short Guess encoding of char array
57 *
58 */
59class KCODECS_EXPORT KEncodingProber
60{
61 Q_DECLARE_TR_FUNCTIONS(KEncodingProber)
62
63public:
64 enum ProberState {
65 FoundIt, /**< Sure find the encoding */
66 NotMe, /**< Sure not included in current ProberType's all supported encodings */
67 Probing, /**< Need more data to make a decision */
68 };
69
70 enum ProberType {
71 None,
72 Universal,
73 Arabic,
74 Baltic,
75 CentralEuropean,
76 ChineseSimplified,
77 ChineseTraditional,
78 Cyrillic,
79 Greek,
80 Hebrew,
81 Japanese,
82 Korean,
83 NorthernSaami,
84 Other,
85 SouthEasternEurope,
86 Thai,
87 Turkish,
88 Unicode,
89 WesternEuropean,
90 };
91
92 /**
93 * Default ProberType is Universal(detect all possible encodings)
94 */
95 KEncodingProber(ProberType proberType = Universal);
96
97 ~KEncodingProber();
98
99 KEncodingProber(const KEncodingProber &) = delete;
100 KEncodingProber &operator=(const KEncodingProber &) = delete;
101
102 /**
103 * reset the prober's internal state and data.
104 */
105 void reset();
106
107 /**
108 * The main class method
109 *
110 * feed data to the prober
111 *
112 * @returns the ProberState after probing the fed data.
113 */
114 ProberState feed(QByteArrayView data);
115 // for API compatibility
116 inline ProberState feed(const char *data, qsizetype len)
117 {
118 return feed(data: {data, len});
119 }
120
121 /**
122 * @returns the prober's current ProberState
123 *
124 */
125 ProberState state() const;
126
127 /**
128 * @returns a QByteArray with the name of the best encoding it has guessed so far
129 * @since 4.2.2
130 */
131 QByteArray encoding() const;
132
133 /**
134 * @returns the confidence(sureness) of encoding it guessed so far (0.0 ~ 0.99), not very reliable for single byte encodings
135 */
136 float confidence() const;
137
138 ProberType proberType() const;
139
140 /**
141 * change current prober's ProberType and reset the prober
142 */
143 void setProberType(ProberType proberType);
144
145 /**
146 * @return the ProberType for lang (e.g. proberTypeForName("Chinese Simplified") will return KEncodingProber::ChineseSimplified
147 */
148 static ProberType proberTypeForName(const QString &lang);
149
150 /**
151 * map ProberType to language string
152 */
153 static QString nameForProberType(ProberType proberType);
154
155private:
156 std::unique_ptr<KEncodingProberPrivate> const d;
157};
158
159#endif
160

source code of kcodecs/src/kencodingprober.h