tokenizer_p.h source code [sonnet/src/core/tokenizer_p.h]

/*  This file is part of the KDE libraries

    SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl>

    SPDX-License-Identifier: LGPL-2.0-or-later
*/
7
8	#ifndef ABSTRACTTOKENIZER_H
9	#define ABSTRACTTOKENIZER_H
10
11	#include "sonnetcore_export.h"
12	#include <QString>
13
14	#include <memory>
15
16	namespace Sonnet
17	{
18	struct Token {
19	QStringView token = nullptr;
20	int positionInBuffer = -`1`;
21
22	QString toString() const
23	{
24	return token.toString();
25	}
26
27	/**
28	* @brief length of this token
29	*/
30	Q_DECL_CONSTEXPR int length() const
31	{
32	return token.size();
33	}
34
35	/**
36	* @brief position in buffer of which the @ref token is a view
37	*/
38	Q_DECL_CONSTEXPR int position() const
39	{
40	return positionInBuffer;
41	}
42
43	Q_DECL_CONSTEXPR bool isNull() const
44	{
45	return token.isNull();
46	}
47
48	Q_DECL_CONSTEXPR bool isEmpty() const
49	{
50	return token.isEmpty();
51	}
52
53	Q_DECL_CONSTEXPR QChar at(qsizetype n) const
54	{
55	return token.at(n);
56	}
57	};
58
/**
 * @short AbstractTokenizer breaks text into smaller pieces - words, sentences, paragraphs.
 *
 * AbstractTokenizer is an abstract class that must be subclassed to be used. It provides an API modelled
 * after Java-style iterators. During tokenization the buffer can be modified using the provided replace() method.
 *
 * @since 4.3
 */
67	class AbstractTokenizer
68	{
69	public:
70	virtual ~AbstractTokenizer()
71	{
72	}
73
74	/**
75	* Sets text to tokenize. It also resets tokenizer state.
76	*/
77	virtual void setBuffer(const QString &buffer = QString()) = `0`;
78	/**
79	* Returns true if there is another token available.
80	* @return true if another token is available, false if not.
81	*/
82	virtual bool hasNext() const = `0`;
83
84	/**
85	* Returns next token or null QString if there is none
86	*/
87	virtual Token next() = `0`;
88
89	/* Returns content of currently tokenized buffer/
90	virtual QString buffer() const = `0`;
91
92	/**
93	* Replace part of text in current buffer. Always use this function instead of directly
94	* changing data in underlying buffer or tokenizer's internal state may become inconsistent.
95	*/
96	virtual void replace(int position, int len, const QString &newWord) = `0`;
97	};
98
99	class BreakTokenizerPrivate;
100
/**
 * @short WordTokenizer splits supplied buffer into individual words.
 *
 * WordTokenizer splits buffer into words according to rules from Unicode standard 5.1.
 * If the purpose is to check spelling, use isSpellcheckable() to determine if the current word
 * should be checked or ignored.
 *
 * Usage example:
 *
 * @code
 * WordTokenizer t(buffer);
 * Speller sp;
 * while (t.hasNext()) {
 *     Token word = t.next();
 *     if (!t.isSpellcheckable()) continue;
 *     qDebug() << word.toString() << " " << sp.isCorrect(word.toString());
 * }
 * @endcode
 *
 * This example checks the spelling of the given buffer.
 *
 * @since 4.3
 */
123	class SONNETCORE_EXPORT WordTokenizer : public AbstractTokenizer
124	{
125	public:
126	/**
127	* Constructor for word tokenizer
128	* @param buffer
129	*/
130	WordTokenizer(const QString &buffer = QString());
131	~WordTokenizer() override;
132
133	void setBuffer(const QString &buffer) override;
134	bool hasNext() const override;
135	Token next() override;
136	QString buffer() const override;
137	void replace(int position, int len, const QString &newWord) override;
138
139	/* Returns true if this word should be spell checked. This ignores email addresses, URLs and other things according to configuration /
140	bool isSpellcheckable() const;
141
142	/* If ignore uppercase is true, then any word containing only uppercase letters will be considered unsuitable for spell check /
143	void setIgnoreUppercase(bool val);
144
145	private:
146	SONNETCORE_NO_EXPORT bool isUppercase(QStringView word) const;
147
148	private:
149	std::unique_ptr<BreakTokenizerPrivate> const d;
150	};
151
/**
 * @short SentenceTokenizer splits supplied buffer into individual sentences.
 *
 * SentenceTokenizer splits buffer into sentences according to rules from Unicode standard 5.1.
 *
 * @since 4.3
 */
158	class SONNETCORE_EXPORT SentenceTokenizer : public AbstractTokenizer
159	{
160	public:
161	SentenceTokenizer(const QString &buffer = QString());
162	~SentenceTokenizer() override;
163	void setBuffer(const QString &buffer) override;
164	bool hasNext() const override;
165	Token next() override;
166	QString buffer() const override;
167	void replace(int position, int len, const QString &newWord) override;
168
169	private:
170	std::unique_ptr<BreakTokenizerPrivate> const d;
171	};
172	}
173	#endif
174

source code of sonnet/src/core/tokenizer_p.h