1 | /* This file is part of the KDE libraries |
2 | |
3 | SPDX-FileCopyrightText: 2004 Zack Rusin <zack@kde.org> |
4 | SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net> |
5 | SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl> |
6 | |
7 | SPDX-License-Identifier: LGPL-2.0-or-later |
8 | */ |
9 | |
10 | #include <QList> |
11 | #include <QString> |
12 | |
13 | #include "textbreaks_p.h" |
14 | #include "tokenizer_p.h" |
15 | |
16 | namespace Sonnet |
17 | { |
18 | class BreakTokenizerPrivate |
19 | { |
20 | public: |
21 | enum Type { |
22 | Words, |
23 | Sentences, |
24 | }; |
25 | |
26 | BreakTokenizerPrivate(Type s) |
27 | : breakFinder(new TextBreaks) |
28 | , itemPosition(-1) |
29 | , cacheValid(false) |
30 | , type(s) |
31 | { |
32 | } |
33 | |
34 | ~BreakTokenizerPrivate() |
35 | { |
36 | delete breakFinder; |
37 | } |
38 | |
39 | TextBreaks::Positions breaks() const; |
40 | void invalidate(); |
41 | void shiftBreaks(int from, int offset); |
42 | void replace(int pos, int len, const QString &newWord); |
43 | |
44 | TextBreaks *const breakFinder; |
45 | QString buffer; |
46 | |
47 | int itemPosition = -1; |
48 | mutable bool cacheValid; |
49 | Token last; |
50 | const Type type; |
51 | bool inAddress = false; |
52 | bool ignoreUppercase = false; |
53 | |
54 | bool hasNext() const; |
55 | Token next(); |
56 | void setBuffer(const QString &b) |
57 | { |
58 | invalidate(); |
59 | buffer = b; |
60 | } |
61 | |
62 | private: |
63 | void regenerateCache() const; |
64 | mutable TextBreaks::Positions cachedBreaks; |
65 | }; |
66 | |
67 | void BreakTokenizerPrivate::invalidate() |
68 | { |
69 | cacheValid = false; |
70 | itemPosition = -1; |
71 | } |
72 | |
73 | bool BreakTokenizerPrivate::hasNext() const |
74 | { |
75 | if (itemPosition >= (breaks().size() - 1)) { |
76 | return false; |
77 | } |
78 | |
79 | return true; |
80 | } |
81 | |
82 | TextBreaks::Positions BreakTokenizerPrivate::breaks() const |
83 | { |
84 | if (!cacheValid) { |
85 | regenerateCache(); |
86 | } |
87 | |
88 | return cachedBreaks; |
89 | } |
90 | |
91 | void BreakTokenizerPrivate::shiftBreaks(int from, int offset) |
92 | { |
93 | for (int i = 0; i < cachedBreaks.size(); i++) { |
94 | if (cachedBreaks[i].start > from) { |
95 | cachedBreaks[i].start = cachedBreaks[i].start - offset; |
96 | } |
97 | } |
98 | } |
99 | |
100 | void BreakTokenizerPrivate::regenerateCache() const |
101 | { |
102 | if (!breakFinder || buffer.isEmpty()) { |
103 | cachedBreaks = TextBreaks::Positions(); |
104 | } |
105 | |
106 | if (breakFinder) { |
107 | breakFinder->setText(buffer); |
108 | |
109 | if (type == Sentences) { |
110 | cachedBreaks = breakFinder->sentenceBreaks(); |
111 | } else if (type == Words) { |
112 | cachedBreaks = breakFinder->wordBreaks(); |
113 | } |
114 | } |
115 | |
116 | cacheValid = true; |
117 | } |
118 | |
119 | Token BreakTokenizerPrivate::next() |
120 | { |
121 | Token block; |
122 | |
123 | if (!hasNext()) { |
124 | last = block; |
125 | return block; |
126 | } |
127 | |
128 | itemPosition++; |
129 | |
130 | const TextBreaks::Positions breaks = this->breaks(); |
131 | const TextBreaks::Position &textBreak = breaks.at(i: itemPosition); |
132 | QStringView token = QStringView(buffer).mid(pos: textBreak.start, n: textBreak.length); |
133 | last = {.token: token, .positionInBuffer: textBreak.start}; |
134 | return last; |
135 | } |
136 | |
137 | void BreakTokenizerPrivate::replace(int pos, int len, const QString &newWord) |
138 | { |
139 | buffer.replace(i: pos, len, after: newWord); |
140 | int offset = len - newWord.length(); |
141 | if (cacheValid) { |
142 | shiftBreaks(from: pos, offset); |
143 | } |
144 | } |
145 | |
146 | /*-----------------------------------------------------------*/ |
147 | |
// Creates a tokenizer whose private implementation splits at word
// granularity, then loads @p buffer as the text to tokenize.
WordTokenizer::WordTokenizer(const QString &buffer)
    : d(new BreakTokenizerPrivate(BreakTokenizerPrivate::Words))
{
    setBuffer(buffer);
}
153 | |
// Defaulted here, in the file where BreakTokenizerPrivate is fully defined.
WordTokenizer::~WordTokenizer() = default;
155 | |
// Returns whether another word is available; forwards to the private implementation.
bool WordTokenizer::hasNext() const
{
    return d->hasNext();
}
160 | |
// Replaces the text to tokenize with @p buffer; forwards to the private
// implementation, which also restarts iteration.
void WordTokenizer::setBuffer(const QString &buffer)
{
    d->setBuffer(buffer);
}
165 | |
166 | Token WordTokenizer::next() |
167 | { |
168 | Token n = d->next(); |
169 | |
170 | // end of address of url? |
171 | if (d->inAddress && n.position() > 0 && d->buffer[n.position() - 1].isSpace()) { |
172 | d->inAddress = false; |
173 | } |
174 | |
175 | // check if this word starts an email address of url |
176 | if (!d->inAddress || hasNext()) { |
177 | const int pos = n.position() + n.length(); |
178 | if ((pos < d->buffer.length()) && d->buffer[pos] == QLatin1Char('@')) { |
179 | d->inAddress = true; |
180 | } |
181 | if ((pos + 2 < d->buffer.length()) && d->buffer[pos] == QLatin1Char(':') && d->buffer[pos + 1] == QLatin1Char('/') |
182 | && d->buffer[pos + 2] == QLatin1Char('/')) { |
183 | d->inAddress = true; |
184 | } |
185 | } |
186 | return n; |
187 | } |
188 | |
// Returns the text currently held by the private implementation.
QString WordTokenizer::buffer() const
{
    return d->buffer;
}
193 | |
194 | bool WordTokenizer::isUppercase(QStringView word) const |
195 | { |
196 | for (int i = 0; i < word.length(); ++i) { |
197 | if (word.at(n: i).isLetter() && !word.at(n: i).isUpper()) { |
198 | return false; |
199 | } |
200 | } |
201 | return true; |
202 | } |
203 | |
// Stores whether isSpellcheckable() should reject all-uppercase words.
void WordTokenizer::setIgnoreUppercase(bool val)
{
    d->ignoreUppercase = val;
}
208 | |
// Replaces @p len characters at @p pos in the buffer with @p newWord;
// forwards to the private implementation.
void WordTokenizer::replace(int pos, int len, const QString &newWord)
{
    d->replace(pos, len, newWord);
}
213 | |
214 | bool WordTokenizer::isSpellcheckable() const |
215 | { |
216 | if (d->last.isNull() || d->last.isEmpty()) { |
217 | return false; |
218 | } |
219 | if (!d->last.at(n: 0).isLetter()) { |
220 | return false; |
221 | } |
222 | if (d->inAddress) { |
223 | return false; |
224 | } |
225 | if (d->ignoreUppercase && isUppercase(word: d->last.token)) { |
226 | return false; |
227 | } |
228 | return true; |
229 | } |
230 | |
231 | /* --------------------------------------------------------------------*/ |
232 | |
// Creates a tokenizer whose private implementation splits at sentence
// granularity, then loads @p buffer as the text to tokenize.
SentenceTokenizer::SentenceTokenizer(const QString &buffer)
    : d(new BreakTokenizerPrivate(BreakTokenizerPrivate::Sentences))
{
    setBuffer(buffer);
}
238 | |
// Defaulted here, in the file where BreakTokenizerPrivate is fully defined.
SentenceTokenizer::~SentenceTokenizer() = default;
240 | |
// Returns whether another sentence is available; forwards to the private implementation.
bool SentenceTokenizer::hasNext() const
{
    return d->hasNext();
}
245 | |
// Replaces the text to tokenize with @p buffer; forwards to the private
// implementation, which also restarts iteration.
void SentenceTokenizer::setBuffer(const QString &buffer)
{
    d->setBuffer(buffer);
}
250 | |
// Returns the next sentence token exactly as produced by the private implementation.
Token SentenceTokenizer::next()
{
    return d->next();
}
255 | |
// Returns the text currently held by the private implementation.
QString SentenceTokenizer::buffer() const
{
    return d->buffer;
}
260 | |
// Replaces @p len characters at @p pos in the buffer with @p newWord;
// forwards to the private implementation.
void SentenceTokenizer::replace(int pos, int len, const QString &newWord)
{
    d->replace(pos, len, newWord);
}
265 | } |
266 | |