1 | /* This file is part of the KDE libraries |
2 | |
3 | SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl> |
4 | |
5 | SPDX-License-Identifier: LGPL-2.0-or-later |
6 | */ |
7 | |
8 | #ifndef ABSTRACTTOKENIZER_H |
9 | #define ABSTRACTTOKENIZER_H |
10 | |
11 | #include "sonnetcore_export.h" |
12 | #include <QString> |
13 | |
14 | #include <memory> |
15 | |
16 | namespace Sonnet |
17 | { |
18 | struct Token { |
19 | QStringView token = nullptr; |
20 | int positionInBuffer = -1; |
21 | |
22 | QString toString() const |
23 | { |
24 | return token.toString(); |
25 | } |
26 | |
27 | /** |
28 | * @brief length of this token |
29 | */ |
30 | Q_DECL_CONSTEXPR int length() const |
31 | { |
32 | return token.size(); |
33 | } |
34 | |
35 | /** |
36 | * @brief position in buffer of which the @ref token is a view |
37 | */ |
38 | Q_DECL_CONSTEXPR int position() const |
39 | { |
40 | return positionInBuffer; |
41 | } |
42 | |
43 | Q_DECL_CONSTEXPR bool isNull() const |
44 | { |
45 | return token.isNull(); |
46 | } |
47 | |
48 | Q_DECL_CONSTEXPR bool isEmpty() const |
49 | { |
50 | return token.isEmpty(); |
51 | } |
52 | |
53 | Q_DECL_CONSTEXPR QChar at(qsizetype n) const |
54 | { |
55 | return token.at(n); |
56 | } |
57 | }; |
58 | |
59 | /** |
60 | * @short AbstractTokenizer breaks text into smaller pieces - words, sentences, paragraphs. |
61 | * |
62 | * AbstractTokenizer is an abstract class that must be subclassed to be used. It provides API modelled |
63 | * after Java-style iterators. During tokenization buffer can be modified using provided replace() method. |
64 | * |
65 | * @since 4.3 |
66 | */ |
67 | class AbstractTokenizer |
68 | { |
69 | public: |
70 | virtual ~AbstractTokenizer() |
71 | { |
72 | } |
73 | |
74 | /** |
75 | * Sets text to tokenize. It also resets tokenizer state. |
76 | */ |
77 | virtual void setBuffer(const QString &buffer = QString()) = 0; |
78 | /** |
79 | * Returns true if there is another token available. |
80 | * @return true if another token is available, false if not. |
81 | */ |
82 | virtual bool hasNext() const = 0; |
83 | |
84 | /** |
85 | * Returns next token or null QString if there is none |
86 | */ |
87 | virtual Token next() = 0; |
88 | |
89 | /** Returns content of currently tokenized buffer*/ |
90 | virtual QString buffer() const = 0; |
91 | |
92 | /** |
93 | * Replace part of text in current buffer. Always use this function instead of directly |
94 | * changing data in underlying buffer or tokenizer's internal state may become inconsistent. |
95 | */ |
96 | virtual void replace(int position, int len, const QString &newWord) = 0; |
97 | }; |
98 | |
// Forward declaration only: the type is not defined in this header.
// WordTokenizer and SentenceTokenizer below each hold one through a std::unique_ptr.
class BreakTokenizerPrivate;
100 | |
101 | /** |
102 | @short WordTokenizer splits supplied buffer into individual words. |
103 | |
104 | WordTokenizer splits buffer into words according to rules from Unicode standard 5.1. |
105 | If purpose is to check spelling, use isSpellcheckable() to determine if current word should be |
106 | checked or ignored. |
107 | |
108 | Usage example: |
109 | |
110 | @code |
111 | WordTokenizer t(buffer); |
112 | Speller sp; |
113 | while (t.hasNext()) { |
114 | Token word=t.next(); |
115 | if (!t.isSpellcheckable()) continue; |
116 | qDebug() << word.toString() << " " << sp.isCorrect(word.toString()); |
117 | } |
118 | @endcode |
119 | |
120 | This example checks spelling of given buffer |
121 | * @since 4.3 |
122 | */ |
123 | class SONNETCORE_EXPORT WordTokenizer : public AbstractTokenizer |
124 | { |
125 | public: |
126 | /** |
127 | * Constructor for word tokenizer |
128 | * @param buffer |
129 | */ |
130 | WordTokenizer(const QString &buffer = QString()); |
131 | ~WordTokenizer() override; |
132 | |
133 | void setBuffer(const QString &buffer) override; |
134 | bool hasNext() const override; |
135 | Token next() override; |
136 | QString buffer() const override; |
137 | void replace(int position, int len, const QString &newWord) override; |
138 | |
139 | /** Returns true if this word should be spell checked. This ignores email addresses, URLs and other things according to configuration */ |
140 | bool isSpellcheckable() const; |
141 | |
142 | /** If ignore uppercase is true, then any word containing only uppercase letters will be considered unsuitable for spell check */ |
143 | void setIgnoreUppercase(bool val); |
144 | |
145 | private: |
146 | SONNETCORE_NO_EXPORT bool isUppercase(QStringView word) const; |
147 | |
148 | private: |
149 | std::unique_ptr<BreakTokenizerPrivate> const d; |
150 | }; |
151 | |
152 | /** |
153 | @short SentenceTokenizer splits supplied buffer into individual sentences. |
154 | |
155 | SentenceTokenizer splits buffer into sentences according to rules from Unicode standard 5.1. |
156 | * @since 4.3 |
157 | */ |
158 | class SONNETCORE_EXPORT SentenceTokenizer : public AbstractTokenizer |
159 | { |
160 | public: |
161 | SentenceTokenizer(const QString &buffer = QString()); |
162 | ~SentenceTokenizer() override; |
163 | void setBuffer(const QString &buffer) override; |
164 | bool hasNext() const override; |
165 | Token next() override; |
166 | QString buffer() const override; |
167 | void replace(int position, int len, const QString &newWord) override; |
168 | |
169 | private: |
170 | std::unique_ptr<BreakTokenizerPrivate> const d; |
171 | }; |
172 | } |
173 | #endif |
174 | |