1// Copyright (C) 2022 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4#include "qtexttospeech_flite_processor.h"
5#include "qtexttospeech_flite_plugin.h"
6
7#include <QtCore/qcoreapplication.h>
8#include <QtCore/qlocale.h>
9#include <QtCore/qmap.h>
10#include <QtCore/qprocessordetection.h>
11#include <QtCore/qspan.h>
12#include <QtCore/qstring.h>
13
14#include <thread>
15
16#include <flite/flite.h>
17
18QT_BEGIN_NAMESPACE
19
20using namespace Qt::StringLiterals;
21
22namespace {
23
24void setRateForVoice(cst_voice *voice, float rate)
25{
26 float stretch = 1.0;
27 Q_ASSERT(rate >= -1.0 && rate <= 1.0);
28 // Stretch multipliers taken from Speech Dispatcher
29 if (rate < 0)
30 stretch -= rate * 2;
31 if (rate > 0)
32 stretch -= rate * (100.0f / 175.0f);
33 feat_set_float(f: voice->features, name: "duration_stretch", v: stretch);
34}
35
36void setPitchForVoice(cst_voice *voice, float pitch)
37{
38 float f0;
39 Q_ASSERT(pitch >= -1.0 && pitch <= 1.0);
40 // Conversion taken from Speech Dispatcher
41 f0 = (pitch * 80) + 100;
42 feat_set_float(f: voice->features, name: "int_f0_target_mean", v: f0);
43}
44
45// Read available flite voices
46QStringList fliteAvailableVoices(const QString &libPrefix, const QString &langCode)
47{
48 // Read statically linked voices
49 QStringList voices;
50 for (const cst_val *v = flite_voice_list; v; v = val_cdr(v)) {
51 cst_voice *voice = val_voice(v: val_car(v));
52 voices.append(t: voice->name);
53 }
54
55 // Read available libraries
56 static const QStringList ldPaths = [] {
57 const QProcessEnvironment pe;
58 QStringList ldPaths = pe.value(name: u"LD_LIBRARY_PATH"_s).split(sep: u":"_s, behavior: Qt::SkipEmptyParts);
59 if (ldPaths.isEmpty()) {
60 ldPaths = QStringList{
61 // Fedora-style lib64 library paths
62 u"/usr/lib64"_s,
63 u"/usr/local/lib64"_s,
64 u"/lib64"_s,
65
66 // Debian-style multi-arch library paths
67#if defined(Q_PROCESSOR_ARM_V8)
68# if defined(__MUSL__)
69 u"/usr/lib/aarch64-linux-musl"_s,
70# else
71 u"/usr/lib/aarch64-linux-gnu"_s,
72# endif
73#elif defined(Q_PROCESSOR_ARM_V7)
74# if defined(__MUSL__)
75 u"/usr/lib/arm-linux-musleabihf"_s,
76# else
77# if defined(__ARM_PCS_VFP)
78 u"/usr/lib/arm-linux-gnueabihf"_s,
79# else
80 u"/usr/lib/arm-linux-gnueabi"_s,
81# endif
82# endif
83#elif defined(Q_PROCESSOR_X86_64)
84 u"/usr/lib/x86_64-linux-gnu"_s,
85#elif defined(Q_PROCESSOR_X86)
86 u"/usr/lib/i686-linux-gnu"_s,
87 u"/usr/lib/i386-linux-gnu"_s,
88#endif
89
90 // generic paths
91 u"/usr/lib"_s,
92 u"/usr/local/lib"_s,
93 u"/lib"_s,
94 };
95 } else {
96 ldPaths.removeDuplicates();
97 }
98
99 ldPaths.removeIf(pred: [](const QString &path) {
100 QDir dir(path);
101 return !dir.isReadable() || dir.isEmpty();
102 });
103
104 qCDebug(lcSpeechTtsFlite) << "QTextToSpeechProcessorFlite: initialized voice paths to"
105 << ldPaths;
106
107 return ldPaths;
108 }();
109
110 const QString libPattern = QString(u"lib"_s + libPrefix).arg(a: langCode).arg(a: "*"_L1);
111 for (const auto &path : ldPaths) {
112 QDir dir(path);
113 dir.setNameFilters({ libPattern });
114 dir.setFilter(QDir::Files);
115 const QFileInfoList fileList = dir.entryInfoList();
116 for (const auto &file : fileList) {
117 QString vox = file.fileName().mid(position: 16, n: file.fileName().indexOf(ch: u'.') - 16);
118 voices.append(t: std::move(vox));
119 }
120 }
121
122 voices.removeDuplicates();
123 return voices;
124}
125
126QAudioFormat getAudioFormat(const cst_wave &w)
127{
128 QAudioFormat fmt;
129 fmt.setSampleFormat(QAudioFormat::Int16);
130 fmt.setSampleRate(w.sample_rate);
131 fmt.setChannelCount(w.num_channels);
132 fmt.setChannelConfig(QAudioFormat::defaultChannelConfigForChannelCount(channelCount: w.num_channels));
133 return fmt;
134}
135
136} // namespace
137
/*
    Constructs the processor for \a audioDevice and registers all flite
    voices that can be found (see init()).
*/
QTextToSpeechProcessorFlite::QTextToSpeechProcessorFlite(const QAudioDevice &audioDevice)
    : m_audioDevice(audioDevice)
{
    // NOTE(review): init() returns false when no voice could be registered,
    // and that result is ignored here — presumably callers detect the
    // condition via voices().isEmpty(); confirm.
    init();
}
143
144QTextToSpeechProcessorFlite::~QTextToSpeechProcessorFlite()
145{
146 for (const VoiceInfo &voice : std::as_const(t&: m_voices))
147 voice.unregister_func(voice.vox);
148}
149
// Returns the voices discovered by init(); an index into this list is the
// voiceId accepted by say() and synthesize().
const QList<QTextToSpeechProcessorFlite::VoiceInfo> &QTextToSpeechProcessorFlite::voices() const
{
    return m_voices;
}
154
155int QTextToSpeechProcessorFlite::audioOutputCb(const cst_wave *w, int start, int size,
156 int last, cst_audio_streaming_info *asi)
157{
158 auto *processor = static_cast<QTextToSpeechProcessorFlite *>(asi->userdata);
159 Q_ASSERT(processor);
160
161 if (!asi->item)
162 asi->item = relation_head(r: utt_relation(u: asi->utt, name: "Token"));
163
164 const float tokenStartTime = flite_ffeature_float(
165 item: asi->item, featpath: "R:Token.daughter1.R:SylStructure.daughter1.daughter1.R:Segment.p.end");
166 const int tokenStartSample = int(tokenStartTime * float(w->sample_rate));
167 if ((tokenStartSample >= start) && (tokenStartSample < start + size)) {
168 // a new token starts in this chunk
169 processor->audioHandleNewToken(
170 tokenStartTime: std::chrono::milliseconds(std::lround(x: tokenStartTime * 1000)), asi);
171 asi->item = item_next(i: asi->item);
172 }
173 return processor->audioOutput(w, start, size, last, asi);
174}
175
176int QTextToSpeechProcessorFlite::audioOutput(const cst_wave *w, int start, int size, int last,
177 cst_audio_streaming_info *)
178{
179 Q_ASSERT(QThread::currentThread() == thread());
180 if (size == 0)
181 return CST_AUDIO_STREAM_CONT;
182 if (start == 0 && !initAudio(w))
183 return CST_AUDIO_STREAM_STOP;
184
185 QSpan fliteStream{ w->samples + start, size };
186 QSpan fliteBytes = as_bytes(s: fliteStream);
187
188 using namespace std::chrono_literals;
189
190 std::optional<std::chrono::steady_clock::time_point> startTime;
191 qsizetype totalBytesWritten = 0;
192
193 auto handleStreamingError = [&] {
194 setError(err: QTextToSpeech::ErrorReason::Playback,
195 errorString: QCoreApplication::translate(context: "QTextToSpeech", key: "Audio streaming error."));
196 stop();
197 return CST_AUDIO_STREAM_STOP;
198 };
199
200 while (!fliteBytes.isEmpty()) {
201 qsizetype bytesWritten = m_audioIODevice->write(
202 data: reinterpret_cast<const char *>(fliteBytes.data()), len: fliteBytes.size());
203
204 if (bytesWritten < 0) // something really went wrong
205 return handleStreamingError();
206
207 totalBytesWritten += bytesWritten;
208 if (bytesWritten == fliteBytes.size())
209 break;
210
211 if (bytesWritten)
212 fliteBytes = fliteBytes.subspan(pos: bytesWritten); // ranges::drop
213
214 // we could not write (all) data to the QIODevice. Back off and retry for 5 seconds before
215 // we give up. We cannot query the state of the QAudioSink here, as that would require event
216 // loop interaction.
217 constexpr auto timeout = 5s;
218
219 if (!startTime)
220 startTime = std::chrono::steady_clock::now();
221 else if (std::chrono::steady_clock::now() - *startTime > timeout)
222 return handleStreamingError();
223
224 std::this_thread::sleep_for(rtime: 5ms);
225 }
226
227 // Stats for debugging
228 ++numberChunks;
229 totalBytes += totalBytesWritten;
230
231 if (last == 1) {
232 qCDebug(lcSpeechTtsFlite) << "last data chunk written";
233 m_audioIODevice->close();
234 }
235 return CST_AUDIO_STREAM_CONT;
236}
237
238void QTextToSpeechProcessorFlite::audioHandleNewToken(std::chrono::milliseconds tokenStartTime,
239 cst_audio_streaming_info *asi)
240{
241 auto normalizeFeatureString = [&](const char *feature) -> const char * {
242 const char *featureString = flite_ffeature_string(item: asi->item, featpath: feature);
243 if (cst_streq("0", featureString))
244 return "";
245 return featureString;
246 };
247
248 const char *token = flite_ffeature_string(item: asi->item, featpath: "name");
249 if (!token) {
250 Q_UNLIKELY_BRANCH;
251 qCWarning(lcSpeechTtsFlite) << "No token found, skipping";
252 return;
253 }
254
255 qCDebug(lcSpeechTtsFlite).nospace()
256 << "Processing token start_time: " << tokenStartTime << " content: \""
257 << flite_ffeature_string(item: asi->item, featpath: "whitespace")
258 << normalizeFeatureString("prepunctuation") << "'" << token << "'"
259 << normalizeFeatureString("punc") << "\"";
260
261 QString currentToken = QString::fromUtf8(utf8: token);
262 m_index = m_text.indexOf(s: currentToken, from: m_index);
263 emit sayingWord(word: currentToken, begin: m_index, length: currentToken.length());
264}
265
266int QTextToSpeechProcessorFlite::dataOutputCb(const cst_wave *w, int start, int size,
267 int last, cst_audio_streaming_info *asi)
268{
269 auto *processor = static_cast<QTextToSpeechProcessorFlite *>(asi->userdata);
270 Q_ASSERT(processor);
271 return processor->dataOutput(w, start, size, last, asi);
272}
273
274int QTextToSpeechProcessorFlite::dataOutput(const cst_wave *w, int start, int size,
275 int last, cst_audio_streaming_info *)
276{
277 if (start == 0)
278 emit stateChanged(QTextToSpeech::Synthesizing);
279
280 if (!m_synthesisFormat) {
281 QAudioFormat format = getAudioFormat(w: *w);
282 if (!format.isValid())
283 return CST_AUDIO_STREAM_STOP;
284 m_synthesisFormat = format;
285 }
286
287 const qsizetype bytesToWrite = size * m_synthesisFormat->bytesPerSample();
288 emit synthesized(format: *m_synthesisFormat,
289 array: QByteArray(reinterpret_cast<const char *>(&w->samples[start]), bytesToWrite));
290
291 if (last == 1)
292 emit stateChanged(QTextToSpeech::Ready);
293
294 return CST_AUDIO_STREAM_CONT;
295}
296
297void QTextToSpeechProcessorFlite::processText(const QString &text, int voiceId, float pitch,
298 float rate, OutputHandler outputHandler)
299{
300 qCDebug(lcSpeechTtsFlite) << "processText() begin";
301 if (!checkVoice(voiceId))
302 return;
303
304 m_text = text;
305 m_index = 0;
306 float secsToSpeak = -1;
307 const VoiceInfo &voiceInfo = m_voices.at(i: voiceId);
308 cst_voice *voice = voiceInfo.vox;
309 cst_audio_streaming_info *asi = new_audio_streaming_info();
310 asi->asc = outputHandler;
311 asi->userdata = (void *)this;
312 feat_set(f: voice->features, name: "streaming_info", v: audio_streaming_info_val(v: asi));
313 setRateForVoice(voice, rate);
314 setPitchForVoice(voice, pitch);
315 secsToSpeak = flite_text_to_speech(text: text.toUtf8().constData(), voice, outtype: "none");
316
317 if (secsToSpeak <= 0) {
318 setError(err: QTextToSpeech::ErrorReason::Input,
319 errorString: QCoreApplication::translate(context: "QTextToSpeech", key: "Speech synthesizing failure."));
320 return;
321 }
322
323 qCDebug(lcSpeechTtsFlite) << "processText() end" << secsToSpeak << "Seconds";
324}
325
326typedef cst_voice*(*registerFnType)();
327typedef void(*unregisterFnType)(cst_voice *);
328
329bool QTextToSpeechProcessorFlite::init()
330{
331 flite_init();
332
333 const QLocale locale(QLocale::English, QLocale::UnitedStates);
334 // ### FIXME: hardcode for now, the only voice files we know about are for en_US
335 // We could source the language and perhaps the list of voices we want to load
336 // (hardcoded below) from an environment variable.
337 const QString langCode(u"us"_s);
338 const QString libPrefix(u"flite_cmu_%1_%2.so.1"_s);
339 const QString registerPrefix(u"register_cmu_%1_%2"_s);
340 const QString unregisterPrefix(u"unregister_cmu_%1_%2"_s);
341
342 for (const auto &voice : fliteAvailableVoices(libPrefix, langCode)) {
343 QLibrary library(libPrefix.arg(args: langCode, args: voice));
344 if (!library.load()) {
345 qWarning(msg: "Voice library could not be loaded: %s", qPrintable(library.fileName()));
346 continue;
347 }
348 auto registerFn = reinterpret_cast<registerFnType>(library.resolve(
349 symbol: registerPrefix.arg(args: langCode, args: voice).toLatin1().constData()));
350 auto unregisterFn = reinterpret_cast<unregisterFnType>(library.resolve(
351 symbol: unregisterPrefix.arg(args: langCode, args: voice).toLatin1().constData()));
352 if (registerFn && unregisterFn) {
353 const int id = int(m_voices.count());
354 m_voices.append(t: VoiceInfo{
355 .id: id,
356 .vox: registerFn(),
357 .unregister_func: unregisterFn,
358 .name: voice,
359 .locale: locale.name(),
360 .gender: QVoice::Male,
361 .age: QVoice::Adult
362 });
363 } else {
364 library.unload();
365 }
366 }
367
368 return !m_voices.isEmpty();
369}
370
371bool QTextToSpeechProcessorFlite::initAudio(const cst_wave *w)
372{
373 m_format = getAudioFormat(w: *w);
374 if (!checkFormat(format: m_format))
375 return false;
376
377 createSink();
378
379 return bool(m_audioSink);
380}
381
382void QTextToSpeechProcessorFlite::deleteSink()
383{
384 if (m_audioSink) {
385 m_audioSink->disconnect();
386 delete m_audioSink;
387 m_audioSink = nullptr;
388 m_audioIODevice = nullptr;
389 }
390}
391
392void QTextToSpeechProcessorFlite::createSink()
393{
394 using namespace std::chrono;
395 // Create new sink if none exists or the format has changed
396 if (!m_audioSink || (m_audioSink->format() != m_format)) {
397 // No signals while we create new sink with QIODevice
398 const bool sigs = signalsBlocked();
399 auto resetSignals = qScopeGuard(f: [this, sigs](){ blockSignals(b: sigs); });
400 blockSignals(b: true);
401 deleteSink();
402 m_audioSink = new QAudioSink(m_audioDevice, m_format, this);
403 m_audioSink->setVolume(m_volume);
404 constexpr auto bufferDuration = milliseconds(100);
405 m_audioSink->setBufferSize(m_format.bytesForDuration(microseconds: microseconds(bufferDuration).count()));
406 connect(sender: m_audioSink, signal: &QAudioSink::stateChanged, context: this,
407 slot: &QTextToSpeechProcessorFlite::changeState);
408 connect(sender: QThread::currentThread(), signal: &QThread::finished, context: m_audioSink, slot: &QObject::deleteLater);
409 } else {
410 // stop before we can restart with a new QIODevice
411 m_audioSink->reset();
412 }
413
414 m_audioIODevice = m_audioSink->start();
415 if (!m_audioIODevice) {
416 deleteSink();
417 setError(err: QTextToSpeech::ErrorReason::Playback,
418 errorString: QCoreApplication::translate(context: "QTextToSpeech", key: "Audio Open error: No I/O device available."));
419 }
420
421 numberChunks = 0;
422 totalBytes = 0;
423}
424
425// Wrapper for QAudioSink::stateChanged, bypassing early idle bug
426void QTextToSpeechProcessorFlite::changeState(QAudio::State newState)
427{
428 if (m_state == newState)
429 return;
430
431 qCDebug(lcSpeechTtsFlite) << "Audio sink state transition" << m_state << newState;
432
433 m_state = newState;
434 const QTextToSpeech::State ttsState = audioStateToTts(audioState: newState);
435 emit stateChanged(ttsState);
436}
437
438void QTextToSpeechProcessorFlite::setError(QTextToSpeech::ErrorReason err, const QString &errorString)
439{
440 if (err == QTextToSpeech::ErrorReason::NoError) {
441 changeState(newState: QAudio::IdleState);
442 return;
443 }
444
445 qCDebug(lcSpeechTtsFlite) << "Error" << err << errorString;
446 emit stateChanged(QTextToSpeech::Error);
447 emit errorOccurred(error: err, errorString);
448}
449
450constexpr QTextToSpeech::State QTextToSpeechProcessorFlite::audioStateToTts(QAudio::State AudioState)
451{
452 switch (AudioState) {
453 case QAudio::ActiveState:
454 return QTextToSpeech::Speaking;
455 case QAudio::IdleState:
456 return QTextToSpeech::Ready;
457 case QAudio::SuspendedState:
458 return QTextToSpeech::Paused;
459 case QAudio::StoppedState:
460 return QTextToSpeech::Ready;
461 }
462 Q_UNREACHABLE();
463}
464
// Releases the audio sink and invalidates the word-boundary search index.
void QTextToSpeechProcessorFlite::deinitAudio()
{
    m_index = -1;
    deleteSink();
}
470
471// Check format/device and set corresponding error messages
472bool QTextToSpeechProcessorFlite::checkFormat(const QAudioFormat &format)
473{
474 auto streamToString = [](auto &&arg) {
475 QString string;
476 QDebug(&string) << arg;
477 return string;
478 };
479
480 bool formatOK = true;
481
482 // Format must be valid
483 if (!format.isValid()) {
484 formatOK = false;
485 setError(err: QTextToSpeech::ErrorReason::Playback,
486 errorString: QCoreApplication::translate(context: "QTextToSpeech", key: "Invalid audio format: %1")
487 .arg(a: streamToString(format)));
488 }
489
490 // Device must exist
491 if (m_audioDevice.isNull()) {
492 formatOK = false;
493 setError(err: QTextToSpeech::ErrorReason::Playback,
494 errorString: QCoreApplication::translate(context: "QTextToSpeech", key: "No audio device specified."));
495 }
496
497 // Device must support requested format
498 if (!m_audioDevice.isFormatSupported(format)) {
499 formatOK = false;
500 setError(err: QTextToSpeech::ErrorReason::Playback,
501 errorString: QCoreApplication::translate(context: "QTextToSpeech",
502 key: "Audio device does not support format: %1")
503 .arg(a: streamToString(format)));
504 }
505
506 return formatOK;
507}
508
509// Check voice validity
510bool QTextToSpeechProcessorFlite::checkVoice(int voiceId)
511{
512 if (voiceId >= 0 && voiceId < m_voices.size())
513 return true;
514
515 setError(err: QTextToSpeech::ErrorReason::Configuration,
516 errorString: QCoreApplication::translate(context: "QTextToSpeech", key: "Invalid voiceId %1.").arg(a: voiceId));
517 return false;
518}
519
520// Wrap QAudioSink::state and compensate early idle bug
521QAudio::State QTextToSpeechProcessorFlite::audioSinkState() const
522{
523 return (m_audioSink) ? m_state : QAudio::StoppedState;
524}
525
526// Stop current and cancel subsequent utterances
527void QTextToSpeechProcessorFlite::stop()
528{
529 if (audioSinkState() == QAudio::ActiveState || audioSinkState() == QAudio::SuspendedState) {
530 deinitAudio();
531 // Call manual state change as audio sink has been deleted
532 changeState(newState: QAudio::StoppedState);
533 }
534}
535
// Suspends the audio sink while it is actively playing; no-op otherwise.
void QTextToSpeechProcessorFlite::pause()
{
    if (audioSinkState() == QAudio::ActiveState)
        m_audioSink->suspend();
}
541
542void QTextToSpeechProcessorFlite::resume()
543{
544 if (audioSinkState() == QAudio::SuspendedState) {
545 m_audioSink->resume();
546 // QAudioSink in push mode transitions to Idle when resumed, even if
547 // there is still data to play. Workaround this weird behavior if we
548 // know we are not done yet.
549 changeState(newState: QAudio::ActiveState);
550 }
551}
552
553void QTextToSpeechProcessorFlite::say(const QString &text, int voiceId, double pitch, double rate, double volume)
554{
555 if (text.isEmpty())
556 return;
557
558 if (!checkVoice(voiceId))
559 return;
560
561 m_volume = volume;
562 processText(text, voiceId, pitch: float(pitch), rate: float(rate),
563 outputHandler: QTextToSpeechProcessorFlite::audioOutputCb);
564}
565
566void QTextToSpeechProcessorFlite::synthesize(const QString &text, int voiceId, double pitch, double rate, double volume)
567{
568 if (text.isEmpty())
569 return;
570
571 if (!checkVoice(voiceId))
572 return;
573
574 m_synthesisFormat = std::nullopt;
575 m_volume = volume;
576 processText(text, voiceId, pitch: float(pitch), rate: float(rate),
577 outputHandler: QTextToSpeechProcessorFlite::dataOutputCb);
578 m_synthesisFormat = std::nullopt;
579}
580
581QT_END_NAMESPACE
582

// source code of qtspeech/src/plugins/tts/flite/qtexttospeech_flite_processor.cpp