1 | // Copyright (C) 2022 The Qt Company Ltd. |
2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only |
3 | |
4 | #include "qtexttospeech_flite_processor.h" |
5 | #include "qtexttospeech_flite_plugin.h" |
6 | |
#include <QtCore/QCoreApplication>
#include <QtCore/QDir>
#include <QtCore/QLibrary>
#include <QtCore/QLocale>
#include <QtCore/QMap>
#include <QtCore/QProcessEnvironment>
#include <QtCore/QString>
#include <QtCore/QThread>
11 | |
12 | #include <flite/flite.h> |
13 | |
14 | QT_BEGIN_NAMESPACE |
15 | |
16 | using namespace Qt::StringLiterals; |
17 | |
18 | QTextToSpeechProcessorFlite::QTextToSpeechProcessorFlite(const QAudioDevice &audioDevice) |
19 | : m_audioDevice(audioDevice) |
20 | { |
21 | init(); |
22 | } |
23 | |
24 | QTextToSpeechProcessorFlite::~QTextToSpeechProcessorFlite() |
25 | { |
    for (const VoiceInfo &voice : std::as_const(m_voices))
27 | voice.unregister_func(voice.vox); |
28 | } |
29 | |
30 | const QList<QTextToSpeechProcessorFlite::VoiceInfo> &QTextToSpeechProcessorFlite::voices() const |
31 | { |
32 | return m_voices; |
33 | } |
34 | |
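// Arms m_tokenTimer to fire when the current token is due, based on how many
// milliseconds of audio the sink has already played.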
35 | void QTextToSpeechProcessorFlite::startTokenTimer() |
36 | { |
37 | qCDebug(lcSpeechTtsFlite) << "Starting token timer with" << m_tokens.count() - m_currentToken << "left" ; |
38 | |
39 | const TokenData &token = m_tokens.at(i: m_currentToken); |
40 | const qint64 playedTime = m_audioSink->processedUSecs() / 1000; |
41 | m_tokenTimer.start(msec: qMax(a: token.startTime - playedTime, b: 0), t: Qt::PreciseTimer, obj: this); |
42 | } |
43 | |
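// Static callback, installed as flite's audio streaming handler when speaking.
// Tracks word boundaries via flite's "Token" relation so sayingWord() can be
// emitted at the right time, then forwards the samples to audioOutput().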
44 | int QTextToSpeechProcessorFlite::audioOutputCb(const cst_wave *w, int start, int size, |
45 | int last, cst_audio_streaming_info *asi) |
46 | { |
47 | QTextToSpeechProcessorFlite *processor = static_cast<QTextToSpeechProcessorFlite *>(asi->userdata); |
48 | if (processor) { |
        if (asi->item == nullptr)
            asi->item = relation_head(utt_relation(asi->utt, "Token"));

        const float startTime = flite_ffeature_float(asi->item, "R:Token.daughter1.R:SylStructure.daughter1.daughter1.R:Segment.p.end");
        const int startSample = int(startTime * float(w->sample_rate));
        if ((startSample >= start) && (startSample < start + size)) {
            const char *ws = flite_ffeature_string(asi->item, "whitespace");
            const char *prepunc = flite_ffeature_string(asi->item, "prepunctuation");
            if (cst_streq("0", prepunc))
                prepunc = "";
            const char *token = flite_ffeature_string(asi->item, "name");
            const char *postpunc = flite_ffeature_string(asi->item, "punc");
            if (cst_streq("0", postpunc))
                postpunc = "";
            if (token) {
                qCDebug(lcSpeechTtsFlite).nospace() << "Processing token start_time: " << startTime
                    << " content: \"" << ws << prepunc << "'" << token << "'" << postpunc << "\"";
                processor->m_tokens.append(TokenData{
                    qRound(startTime * 1000),
                    QString::fromUtf8(token)
                });
                if (!processor->m_tokenTimer.isActive())
                    processor->startTokenTimer();
            }
            asi->item = item_next(asi->item);
        }
75 | return processor->audioOutput(w, start, size, last, asi); |
76 | } |
77 | return CST_AUDIO_STREAM_STOP; |
78 | } |
79 | |
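// Writes a chunk of synthesized samples into the audio sink's buffer device;
// initializes the audio output on the first chunk and closes the buffer after
// the last one.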
80 | int QTextToSpeechProcessorFlite::audioOutput(const cst_wave *w, int start, int size, |
81 | int last, cst_audio_streaming_info *asi) |
82 | { |
83 | Q_UNUSED(asi); |
84 | Q_ASSERT(QThread::currentThread() == thread()); |
85 | if (size == 0) |
86 | return CST_AUDIO_STREAM_CONT; |
    if (start == 0 && !initAudio(w->sample_rate, w->num_channels))
88 | return CST_AUDIO_STREAM_STOP; |
89 | |
90 | const qsizetype bytesToWrite = size * sizeof(short); |
91 | |
    if (!m_audioBuffer->write(reinterpret_cast<const char *>(&w->samples[start]), bytesToWrite)) {
        setError(QTextToSpeech::ErrorReason::Playback,
                 QCoreApplication::translate("QTextToSpeech", "Audio streaming error."));
        stop();
        return CST_AUDIO_STREAM_STOP;
    }
98 | |
99 | // Stats for debugging |
100 | ++numberChunks; |
101 | totalBytes += bytesToWrite; |
102 | |
    if (last == 1) {
        qCDebug(lcSpeechTtsFlite) << "last data chunk written";
        m_audioBuffer->close();
    }
107 | return CST_AUDIO_STREAM_CONT; |
108 | } |
109 | |
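// Static trampoline that forwards flite's streaming callback to dataOutput()
// on the processor instance.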
110 | int QTextToSpeechProcessorFlite::dataOutputCb(const cst_wave *w, int start, int size, |
111 | int last, cst_audio_streaming_info *asi) |
112 | { |
113 | QTextToSpeechProcessorFlite *processor = static_cast<QTextToSpeechProcessorFlite *>(asi->userdata); |
114 | if (processor) |
115 | return processor->dataOutput(w, start, size, last, asi); |
116 | return CST_AUDIO_STREAM_STOP; |
117 | } |
118 | |
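// Emits the synthesized samples as raw PCM data instead of playing them back.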
119 | int QTextToSpeechProcessorFlite::dataOutput(const cst_wave *w, int start, int size, |
120 | int last, cst_audio_streaming_info *) |
121 | { |
122 | if (start == 0) |
123 | emit stateChanged(QTextToSpeech::Synthesizing); |
124 | |
125 | QAudioFormat format; |
126 | if (w->num_channels == 1) |
127 | format.setChannelConfig(QAudioFormat::ChannelConfigMono); |
128 | else |
129 | format.setChannelCount(w->num_channels); |
130 | format.setSampleRate(w->sample_rate); |
131 | format.setSampleFormat(QAudioFormat::Int16); |
132 | |
133 | if (!format.isValid()) |
134 | return CST_AUDIO_STREAM_STOP; |
135 | |
136 | const qsizetype bytesToWrite = size * format.bytesPerSample(); |
    emit synthesized(format, QByteArray(reinterpret_cast<const char *>(&w->samples[start]), bytesToWrite));
138 | |
139 | if (last == 1) |
140 | emit stateChanged(QTextToSpeech::Ready); |
141 | |
142 | return CST_AUDIO_STREAM_CONT; |
143 | } |
144 | |
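// Handles m_tokenTimer: emits sayingWord() for the token that is now being
// spoken and re-arms the timer for the next one.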
145 | void QTextToSpeechProcessorFlite::timerEvent(QTimerEvent *event) |
146 | { |
147 | if (event->timerId() != m_tokenTimer.timerId()) { |
148 | QObject::timerEvent(event); |
149 | return; |
150 | } |
151 | |
152 | qCDebug(lcSpeechTtsFlite) << "Moving current token" << m_currentToken << m_tokens.size(); |
153 | auto currentToken = m_tokens.at(i: m_currentToken); |
154 | m_index = m_text.indexOf(s: currentToken.text, from: m_index); |
155 | emit sayingWord(word: currentToken.text, begin: m_index, length: currentToken.text.length()); |
156 | m_index += currentToken.text.length(); |
157 | ++m_currentToken; |
158 | if (m_currentToken == m_tokens.size()) |
159 | m_tokenTimer.stop(); |
160 | else |
161 | startTokenTimer(); |
162 | } |
163 | |
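// Common synthesis path for say() and synthesize(): installs outputHandler as
// flite's streaming callback, applies pitch and rate, and runs flite on the text.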
164 | void QTextToSpeechProcessorFlite::processText(const QString &text, int voiceId, double pitch, double rate, OutputHandler outputHandler) |
165 | { |
166 | qCDebug(lcSpeechTtsFlite) << "processText() begin" ; |
167 | if (!checkVoice(voiceId)) |
168 | return; |
169 | |
170 | m_text = text; |
171 | m_tokens.clear(); |
172 | m_currentToken = 0; |
173 | m_index = 0; |
174 | float secsToSpeak = -1; |
    const VoiceInfo &voiceInfo = m_voices.at(voiceId);
    cst_voice *voice = voiceInfo.vox;
    cst_audio_streaming_info *asi = new_audio_streaming_info();
    asi->asc = outputHandler;
    asi->userdata = (void *)this;
    feat_set(voice->features, "streaming_info", audio_streaming_info_val(asi));
    setRateForVoice(voice, rate);
    setPitchForVoice(voice, pitch);
    secsToSpeak = flite_text_to_speech(text.toUtf8().constData(), voice, "none");

    if (secsToSpeak <= 0) {
        setError(QTextToSpeech::ErrorReason::Input,
                 QCoreApplication::translate("QTextToSpeech", "Speech synthesizing failure."));
        return;
    }

    qCDebug(lcSpeechTtsFlite) << "processText() end" << secsToSpeak << "seconds";
192 | } |
193 | |
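// Maps a rate in [-1.0, 1.0] onto flite's "duration_stretch" feature (values
// above 1.0 slow speech down, values below 1.0 speed it up).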
194 | void QTextToSpeechProcessorFlite::setRateForVoice(cst_voice *voice, float rate) |
195 | { |
196 | float stretch = 1.0; |
197 | Q_ASSERT(rate >= -1.0 && rate <= 1.0); |
198 | // Stretch multipliers taken from Speech Dispatcher |
199 | if (rate < 0) |
200 | stretch -= rate * 2; |
201 | if (rate > 0) |
202 | stretch -= rate * (100.0 / 175.0); |
    feat_set_float(voice->features, "duration_stretch", stretch);
204 | } |
205 | |
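// Maps a pitch in [-1.0, 1.0] onto the voice's mean F0 target in Hz.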
206 | void QTextToSpeechProcessorFlite::setPitchForVoice(cst_voice *voice, float pitch) |
207 | { |
    Q_ASSERT(pitch >= -1.0 && pitch <= 1.0);
    // Conversion taken from Speech Dispatcher: maps [-1.0, 1.0] to a mean F0 of 20..180 Hz
    const float f0 = (pitch * 80) + 100;
    feat_set_float(voice->features, "int_f0_target_mean", f0);
213 | } |
214 | |
using registerFnType = cst_voice *(*)();
using unregisterFnType = void (*)(cst_voice *);
217 | |
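// Initializes flite and registers all available voices, statically linked as
// well as dynamically loaded voice libraries. Returns false if no voice could
// be registered.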
218 | bool QTextToSpeechProcessorFlite::init() |
219 | { |
220 | flite_init(); |
221 | |
222 | const QLocale locale(QLocale::English, QLocale::UnitedStates); |
223 | // ### FIXME: hardcode for now, the only voice files we know about are for en_US |
224 | // We could source the language and perhaps the list of voices we want to load |
225 | // (hardcoded below) from an environment variable. |
    const QLatin1StringView langCode("us");
    const QLatin1StringView libPrefix("flite_cmu_%1_%2.so.1");
    const QLatin1StringView registerPrefix("register_cmu_%1_%2");
    const QLatin1StringView unregisterPrefix("unregister_cmu_%1_%2");
230 | |
    for (const auto &voice : fliteAvailableVoices(libPrefix, langCode)) {
        QLibrary library(libPrefix.arg(langCode, voice));
        if (!library.load()) {
            qWarning("Voice library could not be loaded: %s", qPrintable(library.fileName()));
            continue;
        }
        auto registerFn = reinterpret_cast<registerFnType>(library.resolve(
            registerPrefix.arg(langCode, voice).toLatin1().constData()));
        auto unregisterFn = reinterpret_cast<unregisterFnType>(library.resolve(
            unregisterPrefix.arg(langCode, voice).toLatin1().constData()));
        if (registerFn && unregisterFn) {
            const int id = m_voices.count();
            m_voices.append(VoiceInfo{
                id,
                registerFn(),
                unregisterFn,
                voice,
                locale.name(),
                QVoice::Male,
                QVoice::Adult
            });
        } else {
            library.unload();
        }
    }
256 | |
257 | return !m_voices.isEmpty(); |
258 | } |
259 | |
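// Collects voice names from flite's static voice list and from voice libraries
// found in the library search paths.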
260 | QStringList QTextToSpeechProcessorFlite::fliteAvailableVoices(const QString &libPrefix, |
261 | const QString &langCode) const |
262 | { |
263 | // Read statically linked voices |
264 | QStringList voices; |
    for (const cst_val *v = flite_voice_list; v; v = val_cdr(v)) {
        cst_voice *voice = val_voice(val_car(v));
        voices.append(voice->name);
    }
269 | |
270 | // Read available libraries |
271 | // TODO: make default library paths OS dependent |
    const QProcessEnvironment pe = QProcessEnvironment::systemEnvironment();
    QStringList ldPaths = pe.value("LD_LIBRARY_PATH"_L1).split(u':', Qt::SkipEmptyParts);
274 | if (ldPaths.isEmpty()) { |
275 | ldPaths = QStringList{"/usr/lib64"_L1 , "/usr/local/lib64"_L1 , "/lib64"_L1 , |
276 | "/usr/lib/x86_64-linux-gnu"_L1 , "/usr/lib"_L1 }; |
277 | } else { |
278 | ldPaths.removeDuplicates(); |
279 | } |
280 | |
    const QString libPattern = ("lib"_L1 + libPrefix).arg(langCode).arg("*"_L1);
282 | for (const auto &path : ldPaths) { |
283 | QDir dir(path); |
284 | if (!dir.isReadable() || dir.isEmpty()) |
285 | continue; |
286 | dir.setNameFilters({libPattern}); |
287 | dir.setFilter(QDir::Files); |
288 | const QFileInfoList fileList = dir.entryInfoList(); |
289 | for (const auto &file : fileList) { |
            // Extract the voice name between the "libflite_cmu_us_" prefix
            // (16 characters) and the first '.' of the library suffix.
            const QString vox = file.fileName().mid(16, file.fileName().indexOf(u'.') - 16);
            voices.append(vox);
292 | } |
293 | } |
294 | |
295 | voices.removeDuplicates(); |
296 | return voices; |
297 | } |
298 | |
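// Configures m_format from the synthesized wave's sample rate and channel
// layout and (re)creates the audio sink for it.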
299 | bool QTextToSpeechProcessorFlite::initAudio(double rate, int channelCount) |
300 | { |
301 | m_format.setSampleFormat(QAudioFormat::Int16); |
302 | m_format.setSampleRate(rate); |
303 | m_format.setChannelCount(channelCount); |
304 | switch (channelCount) { |
305 | case 1: |
306 | m_format.setChannelConfig(QAudioFormat::ChannelConfigMono); |
307 | break; |
308 | case 2: |
309 | m_format.setChannelConfig(QAudioFormat::ChannelConfigStereo); |
310 | break; |
311 | case 3: |
312 | m_format.setChannelConfig(QAudioFormat::ChannelConfig2Dot1); |
313 | break; |
314 | case 5: |
315 | m_format.setChannelConfig(QAudioFormat::ChannelConfigSurround5Dot0); |
316 | break; |
317 | case 6: |
318 | m_format.setChannelConfig(QAudioFormat::ChannelConfigSurround5Dot1); |
319 | break; |
320 | case 7: |
321 | m_format.setChannelConfig(QAudioFormat::ChannelConfigSurround7Dot0); |
322 | break; |
323 | case 8: |
324 | m_format.setChannelConfig(QAudioFormat::ChannelConfigSurround7Dot1); |
325 | break; |
326 | default: |
327 | m_format.setChannelConfig(QAudioFormat::ChannelConfigUnknown); |
328 | break; |
329 | } |
    if (!checkFormat(m_format))
331 | return false; |
332 | |
333 | createSink(); |
334 | |
335 | m_audioSink->setVolume(m_volume); |
336 | |
337 | return true; |
338 | } |
339 | |
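// Deletes the audio sink, if any; the buffer device is owned by the sink and
// is merely reset here.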
340 | void QTextToSpeechProcessorFlite::deleteSink() |
341 | { |
342 | if (m_audioSink) { |
343 | m_audioSink->disconnect(); |
344 | delete m_audioSink; |
345 | m_audioSink = nullptr; |
346 | m_audioBuffer = nullptr; |
347 | } |
348 | } |
349 | |
350 | void QTextToSpeechProcessorFlite::createSink() |
351 | { |
352 | // Create new sink if none exists or the format has changed |
353 | if (!m_audioSink || (m_audioSink->format() != m_format)) { |
        // No signals while we create the new sink with QIODevice
        const bool sigs = signalsBlocked();
        auto resetSignals = qScopeGuard([this, sigs]() { blockSignals(sigs); });
        blockSignals(true);
        deleteSink();
        m_audioSink = new QAudioSink(m_audioDevice, m_format, this);
        connect(m_audioSink, &QAudioSink::stateChanged, this, &QTextToSpeechProcessorFlite::changeState);
        connect(QThread::currentThread(), &QThread::finished, m_audioSink, &QObject::deleteLater);
362 | } |
363 | m_audioBuffer = m_audioSink->start(); |
364 | if (!m_audioBuffer) { |
365 | deleteSink(); |
        setError(QTextToSpeech::ErrorReason::Playback,
                 QCoreApplication::translate("QTextToSpeech", "Audio Open error: No I/O device available."));
368 | } |
369 | |
370 | numberChunks = 0; |
371 | totalBytes = 0; |
372 | } |
373 | |
374 | // Wrapper for QAudioSink::stateChanged, bypassing early idle bug |
375 | void QTextToSpeechProcessorFlite::changeState(QAudio::State newState) |
376 | { |
377 | if (m_state == newState) |
378 | return; |
379 | |
380 | qCDebug(lcSpeechTtsFlite) << "Audio sink state transition" << m_state << newState; |
381 | |
382 | switch (newState) { |
383 | case QAudio::ActiveState: |
384 | // Once the sink starts playing, start a timer to keep track of the tokens. |
385 | if (!m_tokenTimer.isActive() && m_currentToken < m_tokens.count()) |
386 | startTokenTimer(); |
387 | break; |
388 | case QAudio::SuspendedState: |
389 | case QAudio::IdleState: |
390 | case QAudio::StoppedState: |
391 | m_tokenTimer.stop(); |
392 | break; |
393 | } |
394 | |
395 | m_state = newState; |
396 | const QTextToSpeech::State ttsState = audioStateToTts(audioState: newState); |
397 | emit stateChanged(ttsState); |
398 | } |
399 | |
400 | void QTextToSpeechProcessorFlite::setError(QTextToSpeech::ErrorReason err, const QString &errorString) |
401 | { |
402 | if (err == QTextToSpeech::ErrorReason::NoError) { |
        changeState(QAudio::IdleState);
404 | return; |
405 | } |
406 | |
407 | qCDebug(lcSpeechTtsFlite) << "Error" << err << errorString; |
408 | emit stateChanged(QTextToSpeech::Error); |
    emit errorOccurred(err, errorString);
410 | } |
411 | |
constexpr QTextToSpeech::State QTextToSpeechProcessorFlite::audioStateToTts(QAudio::State audioState)
{
    switch (audioState) {
415 | case QAudio::ActiveState: |
416 | return QTextToSpeech::Speaking; |
417 | case QAudio::IdleState: |
418 | return QTextToSpeech::Ready; |
419 | case QAudio::SuspendedState: |
420 | return QTextToSpeech::Paused; |
421 | case QAudio::StoppedState: |
422 | return QTextToSpeech::Ready; |
423 | } |
424 | Q_UNREACHABLE(); |
425 | } |
426 | |
427 | void QTextToSpeechProcessorFlite::deinitAudio() |
428 | { |
429 | m_tokenTimer.stop(); |
430 | m_index = -1; |
431 | m_currentToken = -1; |
432 | deleteSink(); |
433 | } |
434 | |
435 | // Check format/device and set corresponding error messages |
436 | bool QTextToSpeechProcessorFlite::checkFormat(const QAudioFormat &format) |
437 | { |
438 | QString formatString; |
439 | QDebug(&formatString) << format; |
440 | bool formatOK = true; |
441 | |
442 | // Format must be valid |
443 | if (!format.isValid()) { |
444 | formatOK = false; |
        setError(QTextToSpeech::ErrorReason::Playback,
                 QCoreApplication::translate("QTextToSpeech", "Invalid audio format: %1")
                     .arg(formatString));
448 | } |
449 | |
450 | // Device must exist |
451 | if (m_audioDevice.isNull()) { |
452 | formatOK = false; |
        setError(QTextToSpeech::ErrorReason::Playback,
                 QCoreApplication::translate("QTextToSpeech", "No audio device specified."));
455 | } |
456 | |
457 | // Device must support requested format |
458 | if (!m_audioDevice.isFormatSupported(format)) { |
459 | formatOK = false; |
        setError(QTextToSpeech::ErrorReason::Playback,
                 QCoreApplication::translate("QTextToSpeech", "Audio device does not support format: %1")
                     .arg(formatString));
463 | } |
464 | |
465 | return formatOK; |
466 | } |
467 | |
468 | // Check voice validity |
469 | bool QTextToSpeechProcessorFlite::checkVoice(int voiceId) |
470 | { |
471 | if (voiceId >= 0 && voiceId < m_voices.size()) |
472 | return true; |
473 | |
    setError(QTextToSpeech::ErrorReason::Configuration,
             QCoreApplication::translate("QTextToSpeech", "Invalid voiceId %1.").arg(voiceId));
    return false;
477 | } |
478 | |
479 | // Wrap QAudioSink::state and compensate early idle bug |
480 | QAudio::State QTextToSpeechProcessorFlite::audioSinkState() const |
481 | { |
482 | return (m_audioSink) ? m_state : QAudio::StoppedState; |
483 | } |
484 | |
485 | // Stop current and cancel subsequent utterances |
486 | void QTextToSpeechProcessorFlite::stop() |
487 | { |
488 | if (audioSinkState() == QAudio::ActiveState || audioSinkState() == QAudio::SuspendedState) { |
489 | deinitAudio(); |
490 | // Call manual state change as audio sink has been deleted |
        changeState(QAudio::StoppedState);
492 | } |
493 | } |
494 | |
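// Suspends audio playback; buffered audio data is retained by the sink.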
495 | void QTextToSpeechProcessorFlite::pause() |
496 | { |
497 | if (audioSinkState() == QAudio::ActiveState) |
498 | m_audioSink->suspend(); |
499 | } |
500 | |
501 | void QTextToSpeechProcessorFlite::resume() |
502 | { |
503 | if (audioSinkState() == QAudio::SuspendedState) { |
504 | m_audioSink->resume(); |
        // QAudioSink in push mode transitions to Idle when resumed, even if
        // there is still data to play. Work around this behavior, since we
        // know we are not done yet.
        changeState(QAudio::ActiveState);
509 | } |
510 | } |
511 | |
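// Speaks the text by playing the synthesized audio through the configured
// output device.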
512 | void QTextToSpeechProcessorFlite::say(const QString &text, int voiceId, double pitch, double rate, double volume) |
513 | { |
514 | if (text.isEmpty()) |
515 | return; |
516 | |
517 | if (!checkVoice(voiceId)) |
518 | return; |
519 | |
520 | m_volume = volume; |
    processText(text, voiceId, pitch, rate, QTextToSpeechProcessorFlite::audioOutputCb);
522 | } |
523 | |
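// Synthesizes the text into PCM data and delivers it via the synthesized() signal.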
524 | void QTextToSpeechProcessorFlite::synthesize(const QString &text, int voiceId, double pitch, double rate, double volume) |
525 | { |
526 | if (text.isEmpty()) |
527 | return; |
528 | |
529 | if (!checkVoice(voiceId)) |
530 | return; |
531 | |
532 | m_volume = volume; |
    processText(text, voiceId, pitch, rate, QTextToSpeechProcessorFlite::dataOutputCb);
534 | } |
535 | |
536 | QT_END_NAMESPACE |
537 | |