1/* This file is part of the KDE libraries
2
3 SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl>
4
5 SPDX-License-Identifier: LGPL-2.0-or-later
6*/
7
8#ifndef ABSTRACTTOKENIZER_H
9#define ABSTRACTTOKENIZER_H
10
11#include "sonnetcore_export.h"
12#include <QString>
13
14#include <memory>
15
16namespace Sonnet
17{
18struct Token {
19 QStringView token = nullptr;
20 int positionInBuffer = -1;
21
22 QString toString() const
23 {
24 return token.toString();
25 }
26
27 Q_DECL_CONSTEXPR int length() const
28 {
29 return token.size();
30 }
31
32 /*
33 * position in buffer of which the token is a view.
34 */
35 Q_DECL_CONSTEXPR int position() const
36 {
37 return positionInBuffer;
38 }
39
40 Q_DECL_CONSTEXPR bool isNull() const
41 {
42 return token.isNull();
43 }
44
45 Q_DECL_CONSTEXPR bool isEmpty() const
46 {
47 return token.isEmpty();
48 }
49
50 Q_DECL_CONSTEXPR QChar at(qsizetype n) const
51 {
52 return token.at(n);
53 }
54};
55
56/*!
57 * \internal
58 * AbstractTokenizer breaks text into smaller pieces - words, sentences, paragraphs.
59 *
60 * AbstractTokenizer is an abstract class that must be subclassed to be used. It provides API modelled
61 * after Java-style iterators. During tokenization buffer can be modified using provided replace() method.
62 */
63class AbstractTokenizer
64{
65public:
66 virtual ~AbstractTokenizer()
67 {
68 }
69
70 /*!
71 * Sets text to tokenize. It also resets tokenizer state.
72 */
73 virtual void setBuffer(const QString &buffer = QString()) = 0;
74 /*!
75 * Returns true if there is another token available.
76 * Returns true if another token is available, false if not.
77 */
78 virtual bool hasNext() const = 0;
79
80 /*!
81 * Returns next token or null QString if there is none
82 */
83 virtual Token next() = 0;
84
85 /*! Returns content of currently tokenized buffer*/
86 virtual QString buffer() const = 0;
87
88 /*!
89 * Replace part of text in current buffer. Always use this function instead of directly
90 * changing data in underlying buffer or tokenizer's internal state may become inconsistent.
91 */
92 virtual void replace(int position, int len, const QString &newWord) = 0;
93};
94
95class BreakTokenizerPrivate;
96
97/*!
98 * \brief WordTokenizer splits supplied buffer into individual words.
99 *
100 * WordTokenizer splits buffer into words according to rules from Unicode standard 5.1.
101 * If purpose is to check spelling, use isSpellcheckable() to determine if current word should be
102 * checked or ignored.
103 *
104 * Usage example:
105 *
106 * \code
107 * WordTokenizer t(buffer);
108 * Speller sp;
109 * while (t.hasNext()) {
110 * Token word=t.next();
111 * if (!t.isSpellcheckable()) continue;
112 * qDebug() << word.toString() << " " << sp.isCorrect(word.toString());
113 * }
114 * \endcode
115 *
116 * This example checks spelling of given buffer.
117 * \since 4.3
118 * \internal
119 */
120class SONNETCORE_EXPORT WordTokenizer : public AbstractTokenizer
121{
122public:
123 /*!
124 * Constructor for word tokenizer
125 * \a buffer
126 */
127 WordTokenizer(const QString &buffer = QString());
128 ~WordTokenizer() override;
129
130 void setBuffer(const QString &buffer) override;
131 bool hasNext() const override;
132 Token next() override;
133 QString buffer() const override;
134 void replace(int position, int len, const QString &newWord) override;
135
136 /*! Returns true if this word should be spell checked. This ignores email addresses, URLs and other things according to configuration */
137 bool isSpellcheckable() const;
138
139 /*! If ignore uppercase is true, then any word containing only uppercase letters will be considered unsuitable for spell check */
140 void setIgnoreUppercase(bool val);
141
142private:
143 SONNETCORE_NO_EXPORT bool isUppercase(QStringView word) const;
144
145private:
146 std::unique_ptr<BreakTokenizerPrivate> const d;
147};
148
149/*!
150 * \internal
151 *
152 * \brief SentenceTokenizer splits supplied buffer into individual sentences.
153 *
154 * SentenceTokenizer splits buffer into sentences according to rules from Unicode standard 5.1.
155 * \since 4.3
156 */
157class SONNETCORE_EXPORT SentenceTokenizer : public AbstractTokenizer
158{
159public:
160 /*!
161 */
162 SentenceTokenizer(const QString &buffer = QString());
163 ~SentenceTokenizer() override;
164 void setBuffer(const QString &buffer) override;
165 bool hasNext() const override;
166 Token next() override;
167 QString buffer() const override;
168 void replace(int position, int len, const QString &newWord) override;
169
170private:
171 std::unique_ptr<BreakTokenizerPrivate> const d;
172};
173}
174#endif
175

// source code of sonnet/src/core/tokenizer_p.h