/*  This file is part of the KDE libraries

    SPDX-FileCopyrightText: 2004 Zack Rusin <zack@kde.org>
    SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net>
    SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl>

    SPDX-License-Identifier: LGPL-2.0-or-later
*/
9
#include <QList>
#include <QString>

#include <memory>

#include "textbreaks_p.h"
#include "tokenizer_p.h"
15
16namespace Sonnet
17{
18class BreakTokenizerPrivate
19{
20public:
21 enum Type {
22 Words,
23 Sentences,
24 };
25
26 BreakTokenizerPrivate(Type s)
27 : breakFinder(new TextBreaks)
28 , itemPosition(-1)
29 , cacheValid(false)
30 , type(s)
31 {
32 }
33
34 ~BreakTokenizerPrivate()
35 {
36 delete breakFinder;
37 }
38
39 TextBreaks::Positions breaks() const;
40 void invalidate();
41 void shiftBreaks(int from, int offset);
42 void replace(int pos, int len, const QString &newWord);
43
44 TextBreaks *const breakFinder;
45 QString buffer;
46
47 int itemPosition = -1;
48 mutable bool cacheValid;
49 Token last;
50 const Type type;
51 bool inAddress = false;
52 bool ignoreUppercase = false;
53
54 bool hasNext() const;
55 Token next();
56 void setBuffer(const QString &b)
57 {
58 invalidate();
59 buffer = b;
60 }
61
62private:
63 void regenerateCache() const;
64 mutable TextBreaks::Positions cachedBreaks;
65};
66
67void BreakTokenizerPrivate::invalidate()
68{
69 cacheValid = false;
70 itemPosition = -1;
71}
72
73bool BreakTokenizerPrivate::hasNext() const
74{
75 if (itemPosition >= (breaks().size() - 1)) {
76 return false;
77 }
78
79 return true;
80}
81
82TextBreaks::Positions BreakTokenizerPrivate::breaks() const
83{
84 if (!cacheValid) {
85 regenerateCache();
86 }
87
88 return cachedBreaks;
89}
90
91void BreakTokenizerPrivate::shiftBreaks(int from, int offset)
92{
93 for (int i = 0; i < cachedBreaks.size(); i++) {
94 if (cachedBreaks[i].start > from) {
95 cachedBreaks[i].start = cachedBreaks[i].start - offset;
96 }
97 }
98}
99
100void BreakTokenizerPrivate::regenerateCache() const
101{
102 if (!breakFinder || buffer.isEmpty()) {
103 cachedBreaks = TextBreaks::Positions();
104 }
105
106 if (breakFinder) {
107 breakFinder->setText(buffer);
108
109 if (type == Sentences) {
110 cachedBreaks = breakFinder->sentenceBreaks();
111 } else if (type == Words) {
112 cachedBreaks = breakFinder->wordBreaks();
113 }
114 }
115
116 cacheValid = true;
117}
118
119Token BreakTokenizerPrivate::next()
120{
121 Token block;
122
123 if (!hasNext()) {
124 last = block;
125 return block;
126 }
127
128 itemPosition++;
129
130 const TextBreaks::Positions breaks = this->breaks();
131 const TextBreaks::Position &textBreak = breaks.at(i: itemPosition);
132 QStringView token = QStringView(buffer).mid(pos: textBreak.start, n: textBreak.length);
133 last = {.token: token, .positionInBuffer: textBreak.start};
134 return last;
135}
136
137void BreakTokenizerPrivate::replace(int pos, int len, const QString &newWord)
138{
139 buffer.replace(i: pos, len, after: newWord);
140 int offset = len - newWord.length();
141 if (cacheValid) {
142 shiftBreaks(from: pos, offset);
143 }
144}
145
/*-----------------------------------------------------------*/
147
148WordTokenizer::WordTokenizer(const QString &buffer)
149 : d(new BreakTokenizerPrivate(BreakTokenizerPrivate::Words))
150{
151 setBuffer(buffer);
152}
153
154WordTokenizer::~WordTokenizer() = default;
155
156bool WordTokenizer::hasNext() const
157{
158 return d->hasNext();
159}
160
161void WordTokenizer::setBuffer(const QString &buffer)
162{
163 d->setBuffer(buffer);
164}
165
166Token WordTokenizer::next()
167{
168 Token n = d->next();
169
170 // end of address of url?
171 if (d->inAddress && n.position() > 0 && d->buffer[n.position() - 1].isSpace()) {
172 d->inAddress = false;
173 }
174
175 // check if this word starts an email address of url
176 if (!d->inAddress || hasNext()) {
177 const int pos = n.position() + n.length();
178 if ((pos < d->buffer.length()) && d->buffer[pos] == QLatin1Char('@')) {
179 d->inAddress = true;
180 }
181 if ((pos + 2 < d->buffer.length()) && d->buffer[pos] == QLatin1Char(':') && d->buffer[pos + 1] == QLatin1Char('/')
182 && d->buffer[pos + 2] == QLatin1Char('/')) {
183 d->inAddress = true;
184 }
185 }
186 return n;
187}
188
189QString WordTokenizer::buffer() const
190{
191 return d->buffer;
192}
193
194bool WordTokenizer::isUppercase(QStringView word) const
195{
196 for (int i = 0; i < word.length(); ++i) {
197 if (word.at(n: i).isLetter() && !word.at(n: i).isUpper()) {
198 return false;
199 }
200 }
201 return true;
202}
203
204void WordTokenizer::setIgnoreUppercase(bool val)
205{
206 d->ignoreUppercase = val;
207}
208
209void WordTokenizer::replace(int pos, int len, const QString &newWord)
210{
211 d->replace(pos, len, newWord);
212}
213
214bool WordTokenizer::isSpellcheckable() const
215{
216 if (d->last.isNull() || d->last.isEmpty()) {
217 return false;
218 }
219 if (!d->last.at(n: 0).isLetter()) {
220 return false;
221 }
222 if (d->inAddress) {
223 return false;
224 }
225 if (d->ignoreUppercase && isUppercase(word: d->last.token)) {
226 return false;
227 }
228 return true;
229}
230
/* --------------------------------------------------------------------*/
232
233SentenceTokenizer::SentenceTokenizer(const QString &buffer)
234 : d(new BreakTokenizerPrivate(BreakTokenizerPrivate::Sentences))
235{
236 setBuffer(buffer);
237}
238
239SentenceTokenizer::~SentenceTokenizer() = default;
240
241bool SentenceTokenizer::hasNext() const
242{
243 return d->hasNext();
244}
245
246void SentenceTokenizer::setBuffer(const QString &buffer)
247{
248 d->setBuffer(buffer);
249}
250
251Token SentenceTokenizer::next()
252{
253 return d->next();
254}
255
256QString SentenceTokenizer::buffer() const
257{
258 return d->buffer;
259}
260
261void SentenceTokenizer::replace(int pos, int len, const QString &newWord)
262{
263 d->replace(pos, len, newWord);
264}
265}

// Source code of sonnet/src/core/tokenizer.cpp