| 1 | /* This file is part of the KDE libraries |
| 2 | |
| 3 | SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl> |
| 4 | |
| 5 | SPDX-License-Identifier: LGPL-2.0-or-later |
| 6 | */ |
| 7 | |
| 8 | #ifndef ABSTRACTTOKENIZER_H |
| 9 | #define ABSTRACTTOKENIZER_H |
| 10 | |
| 11 | #include "sonnetcore_export.h" |
| 12 | #include <QString> |
| 13 | |
| 14 | #include <memory> |
| 15 | |
| 16 | namespace Sonnet |
| 17 | { |
| 18 | struct Token { |
| 19 | QStringView token = nullptr; |
| 20 | int positionInBuffer = -1; |
| 21 | |
| 22 | QString toString() const |
| 23 | { |
| 24 | return token.toString(); |
| 25 | } |
| 26 | |
| 27 | Q_DECL_CONSTEXPR int length() const |
| 28 | { |
| 29 | return token.size(); |
| 30 | } |
| 31 | |
| 32 | /* |
| 33 | * position in buffer of which the token is a view. |
| 34 | */ |
| 35 | Q_DECL_CONSTEXPR int position() const |
| 36 | { |
| 37 | return positionInBuffer; |
| 38 | } |
| 39 | |
| 40 | Q_DECL_CONSTEXPR bool isNull() const |
| 41 | { |
| 42 | return token.isNull(); |
| 43 | } |
| 44 | |
| 45 | Q_DECL_CONSTEXPR bool isEmpty() const |
| 46 | { |
| 47 | return token.isEmpty(); |
| 48 | } |
| 49 | |
| 50 | Q_DECL_CONSTEXPR QChar at(qsizetype n) const |
| 51 | { |
| 52 | return token.at(n); |
| 53 | } |
| 54 | }; |
| 55 | |
| 56 | /*! |
| 57 | * \internal |
| 58 | * AbstractTokenizer breaks text into smaller pieces - words, sentences, paragraphs. |
| 59 | * |
| 60 | * AbstractTokenizer is an abstract class that must be subclassed to be used. It provides API modelled |
| 61 | * after Java-style iterators. During tokenization buffer can be modified using provided replace() method. |
| 62 | */ |
| 63 | class AbstractTokenizer |
| 64 | { |
| 65 | public: |
| 66 | virtual ~AbstractTokenizer() |
| 67 | { |
| 68 | } |
| 69 | |
| 70 | /*! |
| 71 | * Sets text to tokenize. It also resets tokenizer state. |
| 72 | */ |
| 73 | virtual void setBuffer(const QString &buffer = QString()) = 0; |
| 74 | /*! |
| 75 | * Returns true if there is another token available. |
| 76 | * Returns true if another token is available, false if not. |
| 77 | */ |
| 78 | virtual bool hasNext() const = 0; |
| 79 | |
| 80 | /*! |
| 81 | * Returns next token or null QString if there is none |
| 82 | */ |
| 83 | virtual Token next() = 0; |
| 84 | |
| 85 | /*! Returns content of currently tokenized buffer*/ |
| 86 | virtual QString buffer() const = 0; |
| 87 | |
| 88 | /*! |
| 89 | * Replace part of text in current buffer. Always use this function instead of directly |
| 90 | * changing data in underlying buffer or tokenizer's internal state may become inconsistent. |
| 91 | */ |
| 92 | virtual void replace(int position, int len, const QString &newWord) = 0; |
| 93 | }; |
| 94 | |
| 95 | class BreakTokenizerPrivate; |
| 96 | |
| 97 | /*! |
| 98 | * \brief WordTokenizer splits supplied buffer into individual words. |
| 99 | * |
| 100 | * WordTokenizer splits buffer into words according to rules from Unicode standard 5.1. |
| 101 | * If purpose is to check spelling, use isSpellcheckable() to determine if current word should be |
| 102 | * checked or ignored. |
| 103 | * |
| 104 | * Usage example: |
| 105 | * |
| 106 | * \code |
| 107 | * WordTokenizer t(buffer); |
| 108 | * Speller sp; |
| 109 | * while (t.hasNext()) { |
| 110 | * Token word=t.next(); |
| 111 | * if (!t.isSpellcheckable()) continue; |
| 112 | * qDebug() << word.toString() << " " << sp.isCorrect(word.toString()); |
| 113 | * } |
| 114 | * \endcode |
| 115 | * |
| 116 | * This example checks spelling of given buffer. |
| 117 | * \since 4.3 |
| 118 | * \internal |
| 119 | */ |
| 120 | class SONNETCORE_EXPORT WordTokenizer : public AbstractTokenizer |
| 121 | { |
| 122 | public: |
| 123 | /*! |
| 124 | * Constructor for word tokenizer |
| 125 | * \a buffer |
| 126 | */ |
| 127 | WordTokenizer(const QString &buffer = QString()); |
| 128 | ~WordTokenizer() override; |
| 129 | |
| 130 | void setBuffer(const QString &buffer) override; |
| 131 | bool hasNext() const override; |
| 132 | Token next() override; |
| 133 | QString buffer() const override; |
| 134 | void replace(int position, int len, const QString &newWord) override; |
| 135 | |
| 136 | /*! Returns true if this word should be spell checked. This ignores email addresses, URLs and other things according to configuration */ |
| 137 | bool isSpellcheckable() const; |
| 138 | |
| 139 | /*! If ignore uppercase is true, then any word containing only uppercase letters will be considered unsuitable for spell check */ |
| 140 | void setIgnoreUppercase(bool val); |
| 141 | |
| 142 | private: |
| 143 | SONNETCORE_NO_EXPORT bool isUppercase(QStringView word) const; |
| 144 | |
| 145 | private: |
| 146 | std::unique_ptr<BreakTokenizerPrivate> const d; |
| 147 | }; |
| 148 | |
| 149 | /*! |
| 150 | * \internal |
| 151 | * |
| 152 | * \brief SentenceTokenizer splits supplied buffer into individual sentences. |
| 153 | * |
| 154 | * SentenceTokenizer splits buffer into sentences according to rules from Unicode standard 5.1. |
| 155 | * \since 4.3 |
| 156 | */ |
| 157 | class SONNETCORE_EXPORT SentenceTokenizer : public AbstractTokenizer |
| 158 | { |
| 159 | public: |
| 160 | /*! |
| 161 | */ |
| 162 | SentenceTokenizer(const QString &buffer = QString()); |
| 163 | ~SentenceTokenizer() override; |
| 164 | void setBuffer(const QString &buffer) override; |
| 165 | bool hasNext() const override; |
| 166 | Token next() override; |
| 167 | QString buffer() const override; |
| 168 | void replace(int position, int len, const QString &newWord) override; |
| 169 | |
| 170 | private: |
| 171 | std::unique_ptr<BreakTokenizerPrivate> const d; |
| 172 | }; |
| 173 | } |
| 174 | #endif |
| 175 | |