1// Copyright (C) 2022 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only
3
4#include "qtexttospeech_flite_processor.h"
5#include "qtexttospeech_flite_plugin.h"
6
7#include <QtCore/QCoreApplication>
8#include <QtCore/QString>
9#include <QtCore/QLocale>
10#include <QtCore/QMap>
11
12#include <flite/flite.h>
13
14QT_BEGIN_NAMESPACE
15
16using namespace Qt::StringLiterals;
17
18QTextToSpeechProcessorFlite::QTextToSpeechProcessorFlite(const QAudioDevice &audioDevice)
19 : m_audioDevice(audioDevice)
20{
21 init();
22}
23
24QTextToSpeechProcessorFlite::~QTextToSpeechProcessorFlite()
25{
26 for (const VoiceInfo &voice : std::as_const(t&: m_voices))
27 voice.unregister_func(voice.vox);
28}
29
30const QList<QTextToSpeechProcessorFlite::VoiceInfo> &QTextToSpeechProcessorFlite::voices() const
31{
32 return m_voices;
33}
34
35void QTextToSpeechProcessorFlite::startTokenTimer()
36{
37 qCDebug(lcSpeechTtsFlite) << "Starting token timer with" << m_tokens.count() - m_currentToken << "left";
38
39 const TokenData &token = m_tokens.at(i: m_currentToken);
40 const qint64 playedTime = m_audioSink->processedUSecs() / 1000;
41 m_tokenTimer.start(msec: qMax(a: token.startTime - playedTime, b: 0), t: Qt::PreciseTimer, obj: this);
42}
43
44int QTextToSpeechProcessorFlite::audioOutputCb(const cst_wave *w, int start, int size,
45 int last, cst_audio_streaming_info *asi)
46{
47 QTextToSpeechProcessorFlite *processor = static_cast<QTextToSpeechProcessorFlite *>(asi->userdata);
48 if (processor) {
49 if (asi->item == NULL)
50 asi->item = relation_head(r: utt_relation(u: asi->utt,name: "Token"));
51
52 const float startTime = flite_ffeature_float(item: asi->item, featpath: "R:Token.daughter1.R:SylStructure.daughter1.daughter1.R:Segment.p.end");
53 const int startSample = int(startTime * float(w->sample_rate));
54 if ((startSample >= start) && (startSample < start + size)) {
55 const char *ws = flite_ffeature_string(item: asi->item, featpath: "whitespace");
56 const char *prepunc = flite_ffeature_string(item: asi->item, featpath: "prepunctuation");
57 if (cst_streq("0",prepunc))
58 prepunc = "";
59 const char *token = flite_ffeature_string(item: asi->item, featpath: "name");
60 const char *postpunc = flite_ffeature_string(item: asi->item, featpath: "punc");
61 if (cst_streq("0",postpunc))
62 postpunc = "";
63 if (token) {
64 qCDebug(lcSpeechTtsFlite).nospace() << "Processing token start_time: " << startTime
65 << " content: \"" << ws << prepunc << "'" << token << "'" << postpunc << "\"";
66 processor->m_tokens.append(t: TokenData{
67 .startTime: qRound(f: startTime * 1000),
68 .text: QString::fromUtf8(utf8: token)
69 });
70 if (!processor->m_tokenTimer.isActive())
71 processor->startTokenTimer();
72 }
73 asi->item = item_next(i: asi->item);
74 }
75 return processor->audioOutput(w, start, size, last, asi);
76 }
77 return CST_AUDIO_STREAM_STOP;
78}
79
80int QTextToSpeechProcessorFlite::audioOutput(const cst_wave *w, int start, int size,
81 int last, cst_audio_streaming_info *asi)
82{
83 Q_UNUSED(asi);
84 Q_ASSERT(QThread::currentThread() == thread());
85 if (size == 0)
86 return CST_AUDIO_STREAM_CONT;
87 if (start == 0 && !initAudio(rate: w->sample_rate, channelCount: w->num_channels))
88 return CST_AUDIO_STREAM_STOP;
89
90 const qsizetype bytesToWrite = size * sizeof(short);
91
92 if (!m_audioBuffer->write(data: reinterpret_cast<const char *>(&w->samples[start]), len: bytesToWrite)) {
93 setError(err: QTextToSpeech::ErrorReason::Playback,
94 errorString: QCoreApplication::translate(context: "QTextToSpeech", key: "Audio streaming error."));
95 stop();
96 return CST_AUDIO_STREAM_STOP;
97 }
98
99 // Stats for debugging
100 ++numberChunks;
101 totalBytes += bytesToWrite;
102
103 if (last == 1) {
104 qCDebug(lcSpeechTtsFlite) << "last data chunk written";
105 m_audioBuffer->close();
106 }
107 return CST_AUDIO_STREAM_CONT;
108}
109
110int QTextToSpeechProcessorFlite::dataOutputCb(const cst_wave *w, int start, int size,
111 int last, cst_audio_streaming_info *asi)
112{
113 QTextToSpeechProcessorFlite *processor = static_cast<QTextToSpeechProcessorFlite *>(asi->userdata);
114 if (processor)
115 return processor->dataOutput(w, start, size, last, asi);
116 return CST_AUDIO_STREAM_STOP;
117}
118
119int QTextToSpeechProcessorFlite::dataOutput(const cst_wave *w, int start, int size,
120 int last, cst_audio_streaming_info *)
121{
122 if (start == 0)
123 emit stateChanged(QTextToSpeech::Synthesizing);
124
125 QAudioFormat format;
126 if (w->num_channels == 1)
127 format.setChannelConfig(QAudioFormat::ChannelConfigMono);
128 else
129 format.setChannelCount(w->num_channels);
130 format.setSampleRate(w->sample_rate);
131 format.setSampleFormat(QAudioFormat::Int16);
132
133 if (!format.isValid())
134 return CST_AUDIO_STREAM_STOP;
135
136 const qsizetype bytesToWrite = size * format.bytesPerSample();
137 emit synthesized(format, array: QByteArray(reinterpret_cast<const char *>(&w->samples[start]), bytesToWrite));
138
139 if (last == 1)
140 emit stateChanged(QTextToSpeech::Ready);
141
142 return CST_AUDIO_STREAM_CONT;
143}
144
145void QTextToSpeechProcessorFlite::timerEvent(QTimerEvent *event)
146{
147 if (event->timerId() != m_tokenTimer.timerId()) {
148 QObject::timerEvent(event);
149 return;
150 }
151
152 qCDebug(lcSpeechTtsFlite) << "Moving current token" << m_currentToken << m_tokens.size();
153 auto currentToken = m_tokens.at(i: m_currentToken);
154 m_index = m_text.indexOf(s: currentToken.text, from: m_index);
155 emit sayingWord(word: currentToken.text, begin: m_index, length: currentToken.text.length());
156 m_index += currentToken.text.length();
157 ++m_currentToken;
158 if (m_currentToken == m_tokens.size())
159 m_tokenTimer.stop();
160 else
161 startTokenTimer();
162}
163
164void QTextToSpeechProcessorFlite::processText(const QString &text, int voiceId, double pitch, double rate, OutputHandler outputHandler)
165{
166 qCDebug(lcSpeechTtsFlite) << "processText() begin";
167 if (!checkVoice(voiceId))
168 return;
169
170 m_text = text;
171 m_tokens.clear();
172 m_currentToken = 0;
173 m_index = 0;
174 float secsToSpeak = -1;
175 const VoiceInfo &voiceInfo = m_voices.at(i: voiceId);
176 cst_voice *voice = voiceInfo.vox;
177 cst_audio_streaming_info *asi = new_audio_streaming_info();
178 asi->asc = outputHandler;
179 asi->userdata = (void *)this;
180 feat_set(f: voice->features, name: "streaming_info", v: audio_streaming_info_val(v: asi));
181 setRateForVoice(voice, rate);
182 setPitchForVoice(voice, pitch);
183 secsToSpeak = flite_text_to_speech(text: text.toUtf8().constData(), voice, outtype: "none");
184
185 if (secsToSpeak <= 0) {
186 setError(err: QTextToSpeech::ErrorReason::Input,
187 errorString: QCoreApplication::translate(context: "QTextToSpeech", key: "Speech synthesizing failure."));
188 return;
189 }
190
191 qCDebug(lcSpeechTtsFlite) << "processText() end" << secsToSpeak << "Seconds";
192}
193
194void QTextToSpeechProcessorFlite::setRateForVoice(cst_voice *voice, float rate)
195{
196 float stretch = 1.0;
197 Q_ASSERT(rate >= -1.0 && rate <= 1.0);
198 // Stretch multipliers taken from Speech Dispatcher
199 if (rate < 0)
200 stretch -= rate * 2;
201 if (rate > 0)
202 stretch -= rate * (100.0 / 175.0);
203 feat_set_float(f: voice->features, name: "duration_stretch", v: stretch);
204}
205
206void QTextToSpeechProcessorFlite::setPitchForVoice(cst_voice *voice, float pitch)
207{
208 float f0;
209 Q_ASSERT(pitch >= -1.0 && pitch <= 1.0);
210 // Conversion taken from Speech Dispatcher
211 f0 = (pitch * 80) + 100;
212 feat_set_float(f: voice->features, name: "int_f0_target_mean", v: f0);
213}
214
215typedef cst_voice*(*registerFnType)();
216typedef void(*unregisterFnType)(cst_voice *);
217
218bool QTextToSpeechProcessorFlite::init()
219{
220 flite_init();
221
222 const QLocale locale(QLocale::English, QLocale::UnitedStates);
223 // ### FIXME: hardcode for now, the only voice files we know about are for en_US
224 // We could source the language and perhaps the list of voices we want to load
225 // (hardcoded below) from an environment variable.
226 const QLatin1StringView langCode("us");
227 const QLatin1StringView libPrefix("flite_cmu_%1_%2.so.1");
228 const QLatin1StringView registerPrefix("register_cmu_%1_%2");
229 const QLatin1StringView unregisterPrefix("unregister_cmu_%1_%2");
230
231 for (const auto &voice : fliteAvailableVoices(libPrefix, langCode)) {
232 QLibrary library(libPrefix.arg(args: langCode, args: voice));
233 if (!library.load()) {
234 qWarning(msg: "Voice library could not be loaded: %s", qPrintable(library.fileName()));
235 continue;
236 }
237 auto registerFn = reinterpret_cast<registerFnType>(library.resolve(
238 symbol: registerPrefix.arg(args: langCode, args: voice).toLatin1().constData()));
239 auto unregisterFn = reinterpret_cast<unregisterFnType>(library.resolve(
240 symbol: unregisterPrefix.arg(args: langCode, args: voice).toLatin1().constData()));
241 if (registerFn && unregisterFn) {
242 const int id = m_voices.count();
243 m_voices.append(t: VoiceInfo{
244 .id: id,
245 .vox: registerFn(),
246 .unregister_func: unregisterFn,
247 .name: voice,
248 .locale: locale.name(),
249 .gender: QVoice::Male,
250 .age: QVoice::Adult
251 });
252 } else {
253 library.unload();
254 }
255 }
256
257 return !m_voices.isEmpty();
258}
259
260QStringList QTextToSpeechProcessorFlite::fliteAvailableVoices(const QString &libPrefix,
261 const QString &langCode) const
262{
263 // Read statically linked voices
264 QStringList voices;
265 for (const cst_val *v = flite_voice_list; v; v = val_cdr(v)) {
266 cst_voice *voice = val_voice(v: val_car(v));
267 voices.append(t: voice->name);
268 }
269
270 // Read available libraries
271 // TODO: make default library paths OS dependent
272 const QProcessEnvironment pe;
273 QStringList ldPaths = pe.value(name: "LD_LIBRARY_PATH"_L1).split(sep: ":", behavior: Qt::SkipEmptyParts);
274 if (ldPaths.isEmpty()) {
275 ldPaths = QStringList{"/usr/lib64"_L1, "/usr/local/lib64"_L1, "/lib64"_L1,
276 "/usr/lib/x86_64-linux-gnu"_L1, "/usr/lib"_L1};
277 } else {
278 ldPaths.removeDuplicates();
279 }
280
281 const QString libPattern = ("lib"_L1 + libPrefix).arg(a: langCode).arg(a: "*"_L1);
282 for (const auto &path : ldPaths) {
283 QDir dir(path);
284 if (!dir.isReadable() || dir.isEmpty())
285 continue;
286 dir.setNameFilters({libPattern});
287 dir.setFilter(QDir::Files);
288 const QFileInfoList fileList = dir.entryInfoList();
289 for (const auto &file : fileList) {
290 const QString vox = file.fileName().mid(position: 16, n: file.fileName().indexOf(c: u'.') - 16);
291 voices.append(t: vox);
292 }
293 }
294
295 voices.removeDuplicates();
296 return voices;
297}
298
299bool QTextToSpeechProcessorFlite::initAudio(double rate, int channelCount)
300{
301 m_format.setSampleFormat(QAudioFormat::Int16);
302 m_format.setSampleRate(rate);
303 m_format.setChannelCount(channelCount);
304 switch (channelCount) {
305 case 1:
306 m_format.setChannelConfig(QAudioFormat::ChannelConfigMono);
307 break;
308 case 2:
309 m_format.setChannelConfig(QAudioFormat::ChannelConfigStereo);
310 break;
311 case 3:
312 m_format.setChannelConfig(QAudioFormat::ChannelConfig2Dot1);
313 break;
314 case 5:
315 m_format.setChannelConfig(QAudioFormat::ChannelConfigSurround5Dot0);
316 break;
317 case 6:
318 m_format.setChannelConfig(QAudioFormat::ChannelConfigSurround5Dot1);
319 break;
320 case 7:
321 m_format.setChannelConfig(QAudioFormat::ChannelConfigSurround7Dot0);
322 break;
323 case 8:
324 m_format.setChannelConfig(QAudioFormat::ChannelConfigSurround7Dot1);
325 break;
326 default:
327 m_format.setChannelConfig(QAudioFormat::ChannelConfigUnknown);
328 break;
329 }
330 if (!checkFormat(format: m_format))
331 return false;
332
333 createSink();
334
335 m_audioSink->setVolume(m_volume);
336
337 return true;
338}
339
340void QTextToSpeechProcessorFlite::deleteSink()
341{
342 if (m_audioSink) {
343 m_audioSink->disconnect();
344 delete m_audioSink;
345 m_audioSink = nullptr;
346 m_audioBuffer = nullptr;
347 }
348}
349
350void QTextToSpeechProcessorFlite::createSink()
351{
352 // Create new sink if none exists or the format has changed
353 if (!m_audioSink || (m_audioSink->format() != m_format)) {
354 // No signals while we create new sink with QIODevice
355 const bool sigs = signalsBlocked();
356 auto resetSignals = qScopeGuard(f: [this, sigs](){ blockSignals(b: sigs); });
357 blockSignals(b: true);
358 deleteSink();
359 m_audioSink = new QAudioSink(m_audioDevice, m_format, this);
360 connect(sender: m_audioSink, signal: &QAudioSink::stateChanged, context: this, slot: &QTextToSpeechProcessorFlite::changeState);
361 connect(sender: QThread::currentThread(), signal: &QThread::finished, context: m_audioSink, slot: &QObject::deleteLater);
362 }
363 m_audioBuffer = m_audioSink->start();
364 if (!m_audioBuffer) {
365 deleteSink();
366 setError(err: QTextToSpeech::ErrorReason::Playback,
367 errorString: QCoreApplication::translate(context: "QTextToSpeech", key: "Audio Open error: No I/O device available."));
368 }
369
370 numberChunks = 0;
371 totalBytes = 0;
372}
373
374// Wrapper for QAudioSink::stateChanged, bypassing early idle bug
375void QTextToSpeechProcessorFlite::changeState(QAudio::State newState)
376{
377 if (m_state == newState)
378 return;
379
380 qCDebug(lcSpeechTtsFlite) << "Audio sink state transition" << m_state << newState;
381
382 switch (newState) {
383 case QAudio::ActiveState:
384 // Once the sink starts playing, start a timer to keep track of the tokens.
385 if (!m_tokenTimer.isActive() && m_currentToken < m_tokens.count())
386 startTokenTimer();
387 break;
388 case QAudio::SuspendedState:
389 case QAudio::IdleState:
390 case QAudio::StoppedState:
391 m_tokenTimer.stop();
392 break;
393 }
394
395 m_state = newState;
396 const QTextToSpeech::State ttsState = audioStateToTts(audioState: newState);
397 emit stateChanged(ttsState);
398}
399
400void QTextToSpeechProcessorFlite::setError(QTextToSpeech::ErrorReason err, const QString &errorString)
401{
402 if (err == QTextToSpeech::ErrorReason::NoError) {
403 changeState(newState: QAudio::IdleState);
404 return;
405 }
406
407 qCDebug(lcSpeechTtsFlite) << "Error" << err << errorString;
408 emit stateChanged(QTextToSpeech::Error);
409 emit errorOccurred(error: err, errorString);
410}
411
412constexpr QTextToSpeech::State QTextToSpeechProcessorFlite::audioStateToTts(QAudio::State AudioState)
413{
414 switch (AudioState) {
415 case QAudio::ActiveState:
416 return QTextToSpeech::Speaking;
417 case QAudio::IdleState:
418 return QTextToSpeech::Ready;
419 case QAudio::SuspendedState:
420 return QTextToSpeech::Paused;
421 case QAudio::StoppedState:
422 return QTextToSpeech::Ready;
423 }
424 Q_UNREACHABLE();
425}
426
427void QTextToSpeechProcessorFlite::deinitAudio()
428{
429 m_tokenTimer.stop();
430 m_index = -1;
431 m_currentToken = -1;
432 deleteSink();
433}
434
435// Check format/device and set corresponding error messages
436bool QTextToSpeechProcessorFlite::checkFormat(const QAudioFormat &format)
437{
438 QString formatString;
439 QDebug(&formatString) << format;
440 bool formatOK = true;
441
442 // Format must be valid
443 if (!format.isValid()) {
444 formatOK = false;
445 setError(err: QTextToSpeech::ErrorReason::Playback,
446 errorString: QCoreApplication::translate(context: "QTextToSpeech", key: "Invalid audio format: %1")
447 .arg(a: formatString));
448 }
449
450 // Device must exist
451 if (m_audioDevice.isNull()) {
452 formatOK = false;
453 setError(err: QTextToSpeech::ErrorReason::Playback,
454 errorString: QCoreApplication::translate(context: "QTextToSpeech", key: "No audio device specified."));
455 }
456
457 // Device must support requested format
458 if (!m_audioDevice.isFormatSupported(format)) {
459 formatOK = false;
460 setError(err: QTextToSpeech::ErrorReason::Playback,
461 errorString: QCoreApplication::translate(context: "QTextToSpeech", key: "Audio device does not support format: %1")
462 .arg(a: formatString));
463 }
464
465 return formatOK;
466}
467
468// Check voice validity
469bool QTextToSpeechProcessorFlite::checkVoice(int voiceId)
470{
471 if (voiceId >= 0 && voiceId < m_voices.size())
472 return true;
473
474 setError(err: QTextToSpeech::ErrorReason::Configuration,
475 errorString: QCoreApplication::translate(context: "QTextToSpeech", key: "Invalid voiceId %1.").arg(a: voiceId));
476 return false;;
477}
478
479// Wrap QAudioSink::state and compensate early idle bug
480QAudio::State QTextToSpeechProcessorFlite::audioSinkState() const
481{
482 return (m_audioSink) ? m_state : QAudio::StoppedState;
483}
484
485// Stop current and cancel subsequent utterances
486void QTextToSpeechProcessorFlite::stop()
487{
488 if (audioSinkState() == QAudio::ActiveState || audioSinkState() == QAudio::SuspendedState) {
489 deinitAudio();
490 // Call manual state change as audio sink has been deleted
491 changeState(newState: QAudio::StoppedState);
492 }
493}
494
495void QTextToSpeechProcessorFlite::pause()
496{
497 if (audioSinkState() == QAudio::ActiveState)
498 m_audioSink->suspend();
499}
500
501void QTextToSpeechProcessorFlite::resume()
502{
503 if (audioSinkState() == QAudio::SuspendedState) {
504 m_audioSink->resume();
505 // QAudioSink in push mode transitions to Idle when resumed, even if
506 // there is still data to play. Workaround this weird behavior if we
507 // know we are not done yet.
508 changeState(newState: QAudio::ActiveState);
509 }
510}
511
512void QTextToSpeechProcessorFlite::say(const QString &text, int voiceId, double pitch, double rate, double volume)
513{
514 if (text.isEmpty())
515 return;
516
517 if (!checkVoice(voiceId))
518 return;
519
520 m_volume = volume;
521 processText(text, voiceId, pitch, rate, outputHandler: QTextToSpeechProcessorFlite::audioOutputCb);
522}
523
524void QTextToSpeechProcessorFlite::synthesize(const QString &text, int voiceId, double pitch, double rate, double volume)
525{
526 if (text.isEmpty())
527 return;
528
529 if (!checkVoice(voiceId))
530 return;
531
532 m_volume = volume;
533 processText(text, voiceId, pitch, rate, outputHandler: QTextToSpeechProcessorFlite::dataOutputCb);
534}
535
536QT_END_NAMESPACE
537

// source: qtspeech/src/plugins/tts/flite/qtexttospeech_flite_processor.cpp