1 | /* This file is part of the KDE libraries |
2 | |
3 | SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl> |
4 | |
5 | SPDX-License-Identifier: LGPL-2.0-or-later |
6 | */ |
7 | |
8 | #ifndef ABSTRACTTOKENIZER_H |
9 | #define ABSTRACTTOKENIZER_H |
10 | |
11 | #include "sonnetcore_export.h" |
12 | #include <QString> |
13 | |
14 | #include <memory> |
15 | |
16 | namespace Sonnet |
17 | { |
18 | struct Token { |
19 | QStringView token = nullptr; |
20 | int positionInBuffer = -1; |
21 | |
22 | QString toString() const |
23 | { |
24 | return token.toString(); |
25 | } |
26 | |
27 | Q_DECL_CONSTEXPR int length() const |
28 | { |
29 | return token.size(); |
30 | } |
31 | |
32 | /* |
33 | * position in buffer of which the token is a view. |
34 | */ |
35 | Q_DECL_CONSTEXPR int position() const |
36 | { |
37 | return positionInBuffer; |
38 | } |
39 | |
40 | Q_DECL_CONSTEXPR bool isNull() const |
41 | { |
42 | return token.isNull(); |
43 | } |
44 | |
45 | Q_DECL_CONSTEXPR bool isEmpty() const |
46 | { |
47 | return token.isEmpty(); |
48 | } |
49 | |
50 | Q_DECL_CONSTEXPR QChar at(qsizetype n) const |
51 | { |
52 | return token.at(n); |
53 | } |
54 | }; |
55 | |
56 | /*! |
57 | * \internal |
58 | * AbstractTokenizer breaks text into smaller pieces - words, sentences, paragraphs. |
59 | * |
60 | * AbstractTokenizer is an abstract class that must be subclassed to be used. It provides API modelled |
61 | * after Java-style iterators. During tokenization buffer can be modified using provided replace() method. |
62 | */ |
63 | class AbstractTokenizer |
64 | { |
65 | public: |
66 | virtual ~AbstractTokenizer() |
67 | { |
68 | } |
69 | |
70 | /*! |
71 | * Sets text to tokenize. It also resets tokenizer state. |
72 | */ |
73 | virtual void setBuffer(const QString &buffer = QString()) = 0; |
74 | /*! |
75 | * Returns true if there is another token available. |
76 | * Returns true if another token is available, false if not. |
77 | */ |
78 | virtual bool hasNext() const = 0; |
79 | |
80 | /*! |
81 | * Returns next token or null QString if there is none |
82 | */ |
83 | virtual Token next() = 0; |
84 | |
85 | /*! Returns content of currently tokenized buffer*/ |
86 | virtual QString buffer() const = 0; |
87 | |
88 | /*! |
89 | * Replace part of text in current buffer. Always use this function instead of directly |
90 | * changing data in underlying buffer or tokenizer's internal state may become inconsistent. |
91 | */ |
92 | virtual void replace(int position, int len, const QString &newWord) = 0; |
93 | }; |
94 | |
95 | class BreakTokenizerPrivate; |
96 | |
97 | /*! |
98 | * \brief WordTokenizer splits supplied buffer into individual words. |
99 | * |
100 | * WordTokenizer splits buffer into words according to rules from Unicode standard 5.1. |
101 | * If purpose is to check spelling, use isSpellcheckable() to determine if current word should be |
102 | * checked or ignored. |
103 | * |
104 | * Usage example: |
105 | * |
106 | * \code |
107 | * WordTokenizer t(buffer); |
108 | * Speller sp; |
109 | * while (t.hasNext()) { |
110 | * Token word=t.next(); |
111 | * if (!t.isSpellcheckable()) continue; |
112 | * qDebug() << word.toString() << " " << sp.isCorrect(word.toString()); |
113 | * } |
114 | * \endcode |
115 | * |
116 | * This example checks spelling of given buffer. |
117 | * \since 4.3 |
118 | * \internal |
119 | */ |
120 | class SONNETCORE_EXPORT WordTokenizer : public AbstractTokenizer |
121 | { |
122 | public: |
123 | /*! |
124 | * Constructor for word tokenizer |
125 | * \a buffer |
126 | */ |
127 | WordTokenizer(const QString &buffer = QString()); |
128 | ~WordTokenizer() override; |
129 | |
130 | void setBuffer(const QString &buffer) override; |
131 | bool hasNext() const override; |
132 | Token next() override; |
133 | QString buffer() const override; |
134 | void replace(int position, int len, const QString &newWord) override; |
135 | |
136 | /*! Returns true if this word should be spell checked. This ignores email addresses, URLs and other things according to configuration */ |
137 | bool isSpellcheckable() const; |
138 | |
139 | /*! If ignore uppercase is true, then any word containing only uppercase letters will be considered unsuitable for spell check */ |
140 | void setIgnoreUppercase(bool val); |
141 | |
142 | private: |
143 | SONNETCORE_NO_EXPORT bool isUppercase(QStringView word) const; |
144 | |
145 | private: |
146 | std::unique_ptr<BreakTokenizerPrivate> const d; |
147 | }; |
148 | |
149 | /*! |
150 | * \internal |
151 | * |
152 | * \brief SentenceTokenizer splits supplied buffer into individual sentences. |
153 | * |
154 | * SentenceTokenizer splits buffer into sentences according to rules from Unicode standard 5.1. |
155 | * \since 4.3 |
156 | */ |
157 | class SONNETCORE_EXPORT SentenceTokenizer : public AbstractTokenizer |
158 | { |
159 | public: |
160 | /*! |
161 | */ |
162 | SentenceTokenizer(const QString &buffer = QString()); |
163 | ~SentenceTokenizer() override; |
164 | void setBuffer(const QString &buffer) override; |
165 | bool hasNext() const override; |
166 | Token next() override; |
167 | QString buffer() const override; |
168 | void replace(int position, int len, const QString &newWord) override; |
169 | |
170 | private: |
171 | std::unique_ptr<BreakTokenizerPrivate> const d; |
172 | }; |
173 | } |
174 | #endif |
175 | |