1/* This file is part of the KDE libraries
2
3 SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl>
4
5 SPDX-License-Identifier: LGPL-2.0-or-later
6*/
7
8#ifndef ABSTRACTTOKENIZER_H
9#define ABSTRACTTOKENIZER_H
10
11#include "sonnetcore_export.h"
12#include <QString>
13
14#include <memory>
15
16namespace Sonnet
17{
18struct Token {
19 QStringView token = nullptr;
20 int positionInBuffer = -1;
21
22 QString toString() const
23 {
24 return token.toString();
25 }
26
27 /**
28 * @brief length of this token
29 */
30 Q_DECL_CONSTEXPR int length() const
31 {
32 return token.size();
33 }
34
35 /**
36 * @brief position in buffer of which the @ref token is a view
37 */
38 Q_DECL_CONSTEXPR int position() const
39 {
40 return positionInBuffer;
41 }
42
43 Q_DECL_CONSTEXPR bool isNull() const
44 {
45 return token.isNull();
46 }
47
48 Q_DECL_CONSTEXPR bool isEmpty() const
49 {
50 return token.isEmpty();
51 }
52
53 Q_DECL_CONSTEXPR QChar at(qsizetype n) const
54 {
55 return token.at(n);
56 }
57};
58
59/**
60 * @short AbstractTokenizer breaks text into smaller pieces - words, sentences, paragraphs.
61 *
62 * AbstractTokenizer is an abstract class that must be subclassed to be used. It provides API modelled
63 * after Java-style iterators. During tokenization buffer can be modified using provided replace() method.
64 *
65 * @since 4.3
66 */
67class AbstractTokenizer
68{
69public:
70 virtual ~AbstractTokenizer()
71 {
72 }
73
74 /**
75 * Sets text to tokenize. It also resets tokenizer state.
76 */
77 virtual void setBuffer(const QString &buffer = QString()) = 0;
78 /**
79 * Returns true if there is another token available.
80 * @return true if another token is available, false if not.
81 */
82 virtual bool hasNext() const = 0;
83
84 /**
85 * Returns next token or null QString if there is none
86 */
87 virtual Token next() = 0;
88
89 /** Returns content of currently tokenized buffer*/
90 virtual QString buffer() const = 0;
91
92 /**
93 * Replace part of text in current buffer. Always use this function instead of directly
94 * changing data in underlying buffer or tokenizer's internal state may become inconsistent.
95 */
96 virtual void replace(int position, int len, const QString &newWord) = 0;
97};
98
// Only forward-declared in this header; WordTokenizer and SentenceTokenizer
// each hold a std::unique_ptr to an instance of it as their `d` member.
class BreakTokenizerPrivate;
100
101/**
102@short WordTokenizer splits supplied buffer into individual words.
103
104WordTokenizer splits buffer into words according to rules from Unicode standard 5.1.
105If purpose is to check spelling, use isSpellcheckable() to determine if current word should be
106checked or ignored.
107
108Usage example:
109
110@code
111WordTokenizer t(buffer);
112Speller sp;
113while (t.hasNext()) {
114 Token word=t.next();
115 if (!t.isSpellcheckable()) continue;
116 qDebug() << word.toString() << " " << sp.isCorrect(word.toString());
117}
118@endcode
119
120This example checks spelling of given buffer
121 * @since 4.3
122*/
123class SONNETCORE_EXPORT WordTokenizer : public AbstractTokenizer
124{
125public:
126 /**
127 * Constructor for word tokenizer
128 * @param buffer
129 */
130 WordTokenizer(const QString &buffer = QString());
131 ~WordTokenizer() override;
132
133 void setBuffer(const QString &buffer) override;
134 bool hasNext() const override;
135 Token next() override;
136 QString buffer() const override;
137 void replace(int position, int len, const QString &newWord) override;
138
139 /** Returns true if this word should be spell checked. This ignores email addresses, URLs and other things according to configuration */
140 bool isSpellcheckable() const;
141
142 /** If ignore uppercase is true, then any word containing only uppercase letters will be considered unsuitable for spell check */
143 void setIgnoreUppercase(bool val);
144
145private:
146 SONNETCORE_NO_EXPORT bool isUppercase(QStringView word) const;
147
148private:
149 std::unique_ptr<BreakTokenizerPrivate> const d;
150};
151
152/**
153@short SentenceTokenizer splits supplied buffer into individual sentences.
154
155SentenceTokenizer splits buffer into sentences according to rules from Unicode standard 5.1.
156 * @since 4.3
157*/
158class SONNETCORE_EXPORT SentenceTokenizer : public AbstractTokenizer
159{
160public:
161 SentenceTokenizer(const QString &buffer = QString());
162 ~SentenceTokenizer() override;
163 void setBuffer(const QString &buffer) override;
164 bool hasNext() const override;
165 Token next() override;
166 QString buffer() const override;
167 void replace(int position, int len, const QString &newWord) override;
168
169private:
170 std::unique_ptr<BreakTokenizerPrivate> const d;
171};
172}
173#endif
174

// source code of sonnet/src/core/tokenizer_p.h