1 | // Copyright (C) 2022 The Qt Company Ltd. |
2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only |
3 | |
4 | #include "qtexttospeech_flite_processor.h" |
5 | #include "qtexttospeech_flite_plugin.h" |
6 | |
#include <QtCore/QCoreApplication>
#include <QtCore/QDir>
#include <QtCore/QLibrary>
#include <QtCore/QLocale>
#include <QtCore/QMap>
#include <QtCore/QProcessEnvironment>
#include <QtCore/QString>
#include <QtCore/QThread>
11 | |
12 | #include <flite/flite.h> |
13 | |
14 | QT_BEGIN_NAMESPACE |
15 | |
16 | using namespace Qt::StringLiterals; |
17 | |
18 | QTextToSpeechProcessorFlite::QTextToSpeechProcessorFlite(const QAudioDevice &audioDevice) |
19 | : m_audioDevice(audioDevice) |
20 | { |
21 | init(); |
22 | } |
23 | |
24 | QTextToSpeechProcessorFlite::~QTextToSpeechProcessorFlite() |
25 | { |
    for (const VoiceInfo &voice : std::as_const(m_voices))
27 | voice.unregister_func(voice.vox); |
28 | } |
29 | |
30 | const QList<QTextToSpeechProcessorFlite::VoiceInfo> &QTextToSpeechProcessorFlite::voices() const |
31 | { |
32 | return m_voices; |
33 | } |
34 | |
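// Arms m_tokenTimer to fire when the current token is due, based on how many
// milliseconds of audio the sink has already played.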
35 | void QTextToSpeechProcessorFlite::startTokenTimer() |
36 | { |
37 | qCDebug(lcSpeechTtsFlite) << "Starting token timer with" << m_tokens.count() - m_currentToken << "left" ; |
38 | |
39 | const TokenData &token = m_tokens.at(i: m_currentToken); |
40 | const qint64 playedTime = m_audioSink->processedUSecs() / 1000; |
41 | m_tokenTimer.start(msec: qMax(a: token.startTime - playedTime, b: 0), t: Qt::PreciseTimer, obj: this); |
42 | } |
43 | |
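// Static callback, installed as flite's audio streaming handler when speaking.
// Tracks word boundaries via flite's "Token" relation so sayingWord() can be
// emitted at the right time, then forwards the samples to audioOutput().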
44 | int QTextToSpeechProcessorFlite::audioOutputCb(const cst_wave *w, int start, int size, |
45 | int last, cst_audio_streaming_info *asi) |
46 | { |
47 | QTextToSpeechProcessorFlite *processor = static_cast<QTextToSpeechProcessorFlite *>(asi->userdata); |
48 | if (processor) { |
        if (asi->item == nullptr)
            asi->item = relation_head(utt_relation(asi->utt, "Token"));

        const float startTime = flite_ffeature_float(asi->item, "R:Token.daughter1.R:SylStructure.daughter1.daughter1.R:Segment.p.end");
        const int startSample = int(startTime * float(w->sample_rate));
        if ((startSample >= start) && (startSample < start + size)) {
            const char *ws = flite_ffeature_string(asi->item, "whitespace");
            const char *prepunc = flite_ffeature_string(asi->item, "prepunctuation");
            if (cst_streq("0", prepunc))
                prepunc = "";
            const char *token = flite_ffeature_string(asi->item, "name");
            const char *postpunc = flite_ffeature_string(asi->item, "punc");
            if (cst_streq("0", postpunc))
                postpunc = "";
            if (token) {
                qCDebug(lcSpeechTtsFlite).nospace() << "Processing token start_time: " << startTime
                    << " content: \"" << ws << prepunc << "'" << token << "'" << postpunc << "\"";
                processor->m_tokens.append(TokenData{
                    qRound(startTime * 1000),
                    QString::fromUtf8(token)
                });
                if (!processor->m_tokenTimer.isActive())
                    processor->startTokenTimer();
            }
            asi->item = item_next(asi->item);
        }
75 | return processor->audioOutput(w, start, size, last, asi); |
76 | } |
77 | return CST_AUDIO_STREAM_STOP; |
78 | } |
79 | |
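// Writes a chunk of synthesized samples into the audio sink's buffer device;
// initializes the audio output on the first chunk and closes the buffer after
// the last one.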
80 | int QTextToSpeechProcessorFlite::audioOutput(const cst_wave *w, int start, int size, |
81 | int last, cst_audio_streaming_info *asi) |
82 | { |
83 | Q_UNUSED(asi); |
84 | Q_ASSERT(QThread::currentThread() == thread()); |
85 | if (size == 0) |
86 | return CST_AUDIO_STREAM_CONT; |
    if (start == 0 && !initAudio(w->sample_rate, w->num_channels))
88 | return CST_AUDIO_STREAM_STOP; |
89 | |
90 | const qsizetype bytesToWrite = size * sizeof(short); |
91 | |
    if (!m_audioBuffer->write(reinterpret_cast<const char *>(&w->samples[start]), bytesToWrite)) {
        setError(QTextToSpeech::ErrorReason::Playback,
                 QCoreApplication::translate("QTextToSpeech", "Audio streaming error."));
        stop();
        return CST_AUDIO_STREAM_STOP;
    }
98 | |
99 | // Stats for debugging |
100 | ++numberChunks; |
101 | totalBytes += bytesToWrite; |
102 | |
    if (last == 1) {
        qCDebug(lcSpeechTtsFlite) << "last data chunk written";
        m_audioBuffer->close();
    }
107 | return CST_AUDIO_STREAM_CONT; |
108 | } |
109 | |
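// Static trampoline that forwards flite's streaming callback to dataOutput()
// on the processor instance.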
110 | int QTextToSpeechProcessorFlite::dataOutputCb(const cst_wave *w, int start, int size, |
111 | int last, cst_audio_streaming_info *asi) |
112 | { |
113 | QTextToSpeechProcessorFlite *processor = static_cast<QTextToSpeechProcessorFlite *>(asi->userdata); |
114 | if (processor) |
115 | return processor->dataOutput(w, start, size, last, asi); |
116 | return CST_AUDIO_STREAM_STOP; |
117 | } |
118 | |
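// Emits the synthesized samples as raw PCM data instead of playing them back.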
119 | int QTextToSpeechProcessorFlite::dataOutput(const cst_wave *w, int start, int size, |
120 | int last, cst_audio_streaming_info *) |
121 | { |
122 | if (start == 0) |
123 | emit stateChanged(QTextToSpeech::Synthesizing); |
124 | |
125 | QAudioFormat format; |
126 | if (w->num_channels == 1) |
127 | format.setChannelConfig(QAudioFormat::ChannelConfigMono); |
128 | else |
129 | format.setChannelCount(w->num_channels); |
130 | format.setSampleRate(w->sample_rate); |
131 | format.setSampleFormat(QAudioFormat::Int16); |
132 | |
133 | if (!format.isValid()) |
134 | return CST_AUDIO_STREAM_STOP; |
135 | |
136 | const qsizetype bytesToWrite = size * format.bytesPerSample(); |
    emit synthesized(format, QByteArray(reinterpret_cast<const char *>(&w->samples[start]), bytesToWrite));
138 | |
139 | if (last == 1) |
140 | emit stateChanged(QTextToSpeech::Ready); |
141 | |
142 | return CST_AUDIO_STREAM_CONT; |
143 | } |
144 | |
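// Handles m_tokenTimer: emits sayingWord() for the token that is now being
// spoken and re-arms the timer for the next one.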
145 | void QTextToSpeechProcessorFlite::timerEvent(QTimerEvent *event) |
146 | { |
147 | if (event->timerId() != m_tokenTimer.timerId()) { |
148 | QObject::timerEvent(event); |
149 | return; |
150 | } |
151 | |
152 | qCDebug(lcSpeechTtsFlite) << "Moving current token" << m_currentToken << m_tokens.size(); |
153 | auto currentToken = m_tokens.at(i: m_currentToken); |
154 | m_index = m_text.indexOf(s: currentToken.text, from: m_index); |
155 | emit sayingWord(word: currentToken.text, begin: m_index, length: currentToken.text.length()); |
156 | m_index += currentToken.text.length(); |
157 | ++m_currentToken; |
158 | if (m_currentToken == m_tokens.size()) |
159 | m_tokenTimer.stop(); |
160 | else |
161 | startTokenTimer(); |
162 | } |
163 | |
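// Common synthesis path for say() and synthesize(): installs outputHandler as
// flite's streaming callback, applies pitch and rate, and runs flite on the text.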
164 | void QTextToSpeechProcessorFlite::processText(const QString &text, int voiceId, double pitch, double rate, OutputHandler outputHandler) |
165 | { |
166 | qCDebug(lcSpeechTtsFlite) << "processText() begin" ; |
167 | if (!checkVoice(voiceId)) |
168 | return; |
169 | |
170 | m_text = text; |
171 | m_tokens.clear(); |
172 | m_currentToken = 0; |
173 | m_index = 0; |
174 | float secsToSpeak = -1; |
    const VoiceInfo &voiceInfo = m_voices.at(voiceId);
    cst_voice *voice = voiceInfo.vox;
    cst_audio_streaming_info *asi = new_audio_streaming_info();
    asi->asc = outputHandler;
    asi->userdata = (void *)this;
    feat_set(voice->features, "streaming_info", audio_streaming_info_val(asi));
    setRateForVoice(voice, rate);
    setPitchForVoice(voice, pitch);
    secsToSpeak = flite_text_to_speech(text.toUtf8().constData(), voice, "none");

    if (secsToSpeak <= 0) {
        setError(QTextToSpeech::ErrorReason::Input,
                 QCoreApplication::translate("QTextToSpeech", "Speech synthesizing failure."));
        return;
    }

    qCDebug(lcSpeechTtsFlite) << "processText() end" << secsToSpeak << "seconds";
192 | } |
193 | |
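// Maps a rate in [-1.0, 1.0] onto flite's "duration_stretch" feature (values
// above 1.0 slow speech down, values below 1.0 speed it up).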
194 | void QTextToSpeechProcessorFlite::setRateForVoice(cst_voice *voice, float rate) |
195 | { |
196 | float stretch = 1.0; |
197 | Q_ASSERT(rate >= -1.0 && rate <= 1.0); |
198 | // Stretch multipliers taken from Speech Dispatcher |
199 | if (rate < 0) |
200 | stretch -= rate * 2; |
201 | if (rate > 0) |
202 | stretch -= rate * (100.0 / 175.0); |
    feat_set_float(voice->features, "duration_stretch", stretch);
204 | } |
205 | |
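// Maps a pitch in [-1.0, 1.0] onto the voice's mean F0 target in Hz.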
206 | void QTextToSpeechProcessorFlite::setPitchForVoice(cst_voice *voice, float pitch) |
207 | { |
    Q_ASSERT(pitch >= -1.0 && pitch <= 1.0);
    // Conversion taken from Speech Dispatcher: maps [-1.0, 1.0] to a mean F0 of 20..180 Hz
    const float f0 = (pitch * 80) + 100;
    feat_set_float(voice->features, "int_f0_target_mean", f0);
213 | } |
214 | |
using registerFnType = cst_voice *(*)();
using unregisterFnType = void (*)(cst_voice *);
217 | |
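// Initializes flite and registers all available voices, statically linked as
// well as dynamically loaded voice libraries. Returns false if no voice could
// be registered.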
218 | bool QTextToSpeechProcessorFlite::init() |
219 | { |
220 | flite_init(); |
221 | |
222 | const QLocale locale(QLocale::English, QLocale::UnitedStates); |
223 | // ### FIXME: hardcode for now, the only voice files we know about are for en_US |
224 | // We could source the language and perhaps the list of voices we want to load |
225 | // (hardcoded below) from an environment variable. |
    const QLatin1StringView langCode("us");
    const QLatin1StringView libPrefix("flite_cmu_%1_%2.so.1");
    const QLatin1StringView registerPrefix("register_cmu_%1_%2");
    const QLatin1StringView unregisterPrefix("unregister_cmu_%1_%2");
230 | |
    for (const auto &voice : fliteAvailableVoices(libPrefix, langCode)) {
        QLibrary library(libPrefix.arg(langCode, voice));
        if (!library.load()) {
            qWarning("Voice library could not be loaded: %s", qPrintable(library.fileName()));
            continue;
        }
        auto registerFn = reinterpret_cast<registerFnType>(library.resolve(
            registerPrefix.arg(langCode, voice).toLatin1().constData()));
        auto unregisterFn = reinterpret_cast<unregisterFnType>(library.resolve(
            unregisterPrefix.arg(langCode, voice).toLatin1().constData()));
        if (registerFn && unregisterFn) {
            const int id = m_voices.count();
            m_voices.append(VoiceInfo{
                id,
                registerFn(),
                unregisterFn,
                voice,
                locale.name(),
                QVoice::Male,
                QVoice::Adult
            });
        } else {
            library.unload();
        }
    }
256 | |
257 | return !m_voices.isEmpty(); |
258 | } |
259 | |
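// Collects voice names from flite's static voice list and from voice libraries
// found in the library search paths.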
260 | QStringList QTextToSpeechProcessorFlite::fliteAvailableVoices(const QString &libPrefix, |
261 | const QString &langCode) const |
262 | { |
263 | // Read statically linked voices |
264 | QStringList voices; |
    for (const cst_val *v = flite_voice_list; v; v = val_cdr(v)) {
        cst_voice *voice = val_voice(val_car(v));
        voices.append(voice->name);
    }
269 | |
270 | // Read available libraries |
271 | // TODO: make default library paths OS dependent |
    const QProcessEnvironment pe = QProcessEnvironment::systemEnvironment();
    QStringList ldPaths = pe.value("LD_LIBRARY_PATH"_L1).split(u':', Qt::SkipEmptyParts);
274 | if (ldPaths.isEmpty()) { |
275 | ldPaths = QStringList{"/usr/lib64"_L1 , "/usr/local/lib64"_L1 , "/lib64"_L1 , |
276 | "/usr/lib/x86_64-linux-gnu"_L1 , "/usr/lib"_L1 }; |
277 | } else { |
278 | ldPaths.removeDuplicates(); |
279 | } |
280 | |
    const QString libPattern = ("lib"_L1 + libPrefix).arg(langCode).arg("*"_L1);
282 | for (const auto &path : ldPaths) { |
283 | QDir dir(path); |
284 | if (!dir.isReadable() || dir.isEmpty()) |
285 | continue; |
286 | dir.setNameFilters({libPattern}); |
287 | dir.setFilter(QDir::Files); |
288 | const QFileInfoList fileList = dir.entryInfoList(); |
289 | for (const auto &file : fileList) { |
            // Extract the voice name between the "libflite_cmu_us_" prefix
            // (16 characters) and the first '.' of the library suffix.
            const QString vox = file.fileName().mid(16, file.fileName().indexOf(u'.') - 16);
            voices.append(vox);
292 | } |
293 | } |
294 | |
295 | voices.removeDuplicates(); |
296 | return voices; |
297 | } |
298 | |
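// Configures m_format from the synthesized wave's sample rate and channel
// layout and (re)creates the audio sink for it.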
299 | bool QTextToSpeechProcessorFlite::initAudio(double rate, int channelCount) |
300 | { |
301 | m_format.setSampleFormat(QAudioFormat::Int16); |
302 | m_format.setSampleRate(rate); |
303 | m_format.setChannelCount(channelCount); |
304 | switch (channelCount) { |
305 | case 1: |
306 | m_format.setChannelConfig(QAudioFormat::ChannelConfigMono); |
307 | break; |
308 | case 2: |
309 | m_format.setChannelConfig(QAudioFormat::ChannelConfigStereo); |
310 | break; |
311 | case 3: |
312 | m_format.setChannelConfig(QAudioFormat::ChannelConfig2Dot1); |
313 | break; |
314 | case 5: |
315 | m_format.setChannelConfig(QAudioFormat::ChannelConfigSurround5Dot0); |
316 | break; |
317 | case 6: |
318 | m_format.setChannelConfig(QAudioFormat::ChannelConfigSurround5Dot1); |
319 | break; |
320 | case 7: |
321 | m_format.setChannelConfig(QAudioFormat::ChannelConfigSurround7Dot0); |
322 | break; |
323 | case 8: |
324 | m_format.setChannelConfig(QAudioFormat::ChannelConfigSurround7Dot1); |
325 | break; |
326 | default: |
327 | m_format.setChannelConfig(QAudioFormat::ChannelConfigUnknown); |
328 | break; |
329 | } |
    if (!checkFormat(m_format))
331 | return false; |
332 | |
333 | createSink(); |
334 | |
335 | m_audioSink->setVolume(m_volume); |
336 | |
337 | return true; |
338 | } |
339 | |
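// Deletes the audio sink, if any; the buffer device is owned by the sink and
// is merely reset here.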
340 | void QTextToSpeechProcessorFlite::deleteSink() |
341 | { |
342 | if (m_audioSink) { |
343 | m_audioSink->disconnect(); |
344 | delete m_audioSink; |
345 | m_audioSink = nullptr; |
346 | m_audioBuffer = nullptr; |
347 | } |
348 | } |
349 | |
350 | void QTextToSpeechProcessorFlite::createSink() |
351 | { |
352 | // Create new sink if none exists or the format has changed |
353 | if (!m_audioSink || (m_audioSink->format() != m_format)) { |
        // No signals while we create the new sink with QIODevice
        const bool sigs = signalsBlocked();
        auto resetSignals = qScopeGuard([this, sigs]() { blockSignals(sigs); });
        blockSignals(true);
        deleteSink();
        m_audioSink = new QAudioSink(m_audioDevice, m_format, this);
        connect(m_audioSink, &QAudioSink::stateChanged, this, &QTextToSpeechProcessorFlite::changeState);
        connect(QThread::currentThread(), &QThread::finished, m_audioSink, &QObject::deleteLater);
362 | } |
363 | m_audioBuffer = m_audioSink->start(); |
364 | if (!m_audioBuffer) { |
365 | deleteSink(); |
        setError(QTextToSpeech::ErrorReason::Playback,
                 QCoreApplication::translate("QTextToSpeech", "Audio Open error: No I/O device available."));
368 | } |
369 | |
370 | numberChunks = 0; |
371 | totalBytes = 0; |
372 | } |
373 | |
374 | // Wrapper for QAudioSink::stateChanged, bypassing early idle bug |
375 | void QTextToSpeechProcessorFlite::changeState(QAudio::State newState) |
376 | { |
377 | if (m_state == newState) |
378 | return; |
379 | |
380 | qCDebug(lcSpeechTtsFlite) << "Audio sink state transition" << m_state << newState; |
381 | |
382 | switch (newState) { |
383 | case QAudio::ActiveState: |
384 | // Once the sink starts playing, start a timer to keep track of the tokens. |
385 | if (!m_tokenTimer.isActive() && m_currentToken < m_tokens.count()) |
386 | startTokenTimer(); |
387 | break; |
388 | case QAudio::SuspendedState: |
389 | case QAudio::IdleState: |
390 | case QAudio::StoppedState: |
391 | m_tokenTimer.stop(); |
392 | break; |
393 | } |
394 | |
395 | m_state = newState; |
396 | const QTextToSpeech::State ttsState = audioStateToTts(audioState: newState); |
397 | emit stateChanged(ttsState); |
398 | } |
399 | |
400 | void QTextToSpeechProcessorFlite::setError(QTextToSpeech::ErrorReason err, const QString &errorString) |
401 | { |
402 | if (err == QTextToSpeech::ErrorReason::NoError) { |
        changeState(QAudio::IdleState);
404 | return; |
405 | } |
406 | |
407 | qCDebug(lcSpeechTtsFlite) << "Error" << err << errorString; |
408 | emit stateChanged(QTextToSpeech::Error); |
    emit errorOccurred(err, errorString);
410 | } |
411 | |
constexpr QTextToSpeech::State QTextToSpeechProcessorFlite::audioStateToTts(QAudio::State audioState)
{
    switch (audioState) {
415 | case QAudio::ActiveState: |
416 | return QTextToSpeech::Speaking; |
417 | case QAudio::IdleState: |
418 | return QTextToSpeech::Ready; |
419 | case QAudio::SuspendedState: |
420 | return QTextToSpeech::Paused; |
421 | case QAudio::StoppedState: |
422 | return QTextToSpeech::Ready; |
423 | } |
424 | Q_UNREACHABLE(); |
425 | } |
426 | |
427 | void QTextToSpeechProcessorFlite::deinitAudio() |
428 | { |
429 | m_tokenTimer.stop(); |
430 | m_index = -1; |
431 | m_currentToken = -1; |
432 | deleteSink(); |
433 | } |
434 | |
435 | // Check format/device and set corresponding error messages |
436 | bool QTextToSpeechProcessorFlite::checkFormat(const QAudioFormat &format) |
437 | { |
438 | QString formatString; |
439 | QDebug(&formatString) << format; |
440 | bool formatOK = true; |
441 | |
442 | // Format must be valid |
443 | if (!format.isValid()) { |
444 | formatOK = false; |
        setError(QTextToSpeech::ErrorReason::Playback,
                 QCoreApplication::translate("QTextToSpeech", "Invalid audio format: %1")
                     .arg(formatString));
448 | } |
449 | |
450 | // Device must exist |
451 | if (m_audioDevice.isNull()) { |
452 | formatOK = false; |
        setError(QTextToSpeech::ErrorReason::Playback,
                 QCoreApplication::translate("QTextToSpeech", "No audio device specified."));
455 | } |
456 | |
457 | // Device must support requested format |
458 | if (!m_audioDevice.isFormatSupported(format)) { |
459 | formatOK = false; |
        setError(QTextToSpeech::ErrorReason::Playback,
                 QCoreApplication::translate("QTextToSpeech", "Audio device does not support format: %1")
                     .arg(formatString));
463 | } |
464 | |
465 | return formatOK; |
466 | } |
467 | |
468 | // Check voice validity |
469 | bool QTextToSpeechProcessorFlite::checkVoice(int voiceId) |
470 | { |
471 | if (voiceId >= 0 && voiceId < m_voices.size()) |
472 | return true; |
473 | |
    setError(QTextToSpeech::ErrorReason::Configuration,
             QCoreApplication::translate("QTextToSpeech", "Invalid voiceId %1.").arg(voiceId));
    return false;
477 | } |
478 | |
479 | // Wrap QAudioSink::state and compensate early idle bug |
480 | QAudio::State QTextToSpeechProcessorFlite::audioSinkState() const |
481 | { |
482 | return (m_audioSink) ? m_state : QAudio::StoppedState; |
483 | } |
484 | |
485 | // Stop current and cancel subsequent utterances |
486 | void QTextToSpeechProcessorFlite::stop() |
487 | { |
488 | if (audioSinkState() == QAudio::ActiveState || audioSinkState() == QAudio::SuspendedState) { |
489 | deinitAudio(); |
490 | // Call manual state change as audio sink has been deleted |
        changeState(QAudio::StoppedState);
492 | } |
493 | } |
494 | |
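// Suspends audio playback; buffered audio data is retained by the sink.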
495 | void QTextToSpeechProcessorFlite::pause() |
496 | { |
497 | if (audioSinkState() == QAudio::ActiveState) |
498 | m_audioSink->suspend(); |
499 | } |
500 | |
501 | void QTextToSpeechProcessorFlite::resume() |
502 | { |
503 | if (audioSinkState() == QAudio::SuspendedState) { |
504 | m_audioSink->resume(); |
        // QAudioSink in push mode transitions to Idle when resumed, even if
        // there is still data to play. Work around this behavior, since we
        // know we are not done yet.
        changeState(QAudio::ActiveState);
509 | } |
510 | } |
511 | |
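// Speaks the text by playing the synthesized audio through the configured
// output device.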
512 | void QTextToSpeechProcessorFlite::say(const QString &text, int voiceId, double pitch, double rate, double volume) |
513 | { |
514 | if (text.isEmpty()) |
515 | return; |
516 | |
517 | if (!checkVoice(voiceId)) |
518 | return; |
519 | |
520 | m_volume = volume; |
    processText(text, voiceId, pitch, rate, QTextToSpeechProcessorFlite::audioOutputCb);
522 | } |
523 | |
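// Synthesizes the text into PCM data and delivers it via the synthesized() signal.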
524 | void QTextToSpeechProcessorFlite::synthesize(const QString &text, int voiceId, double pitch, double rate, double volume) |
525 | { |
526 | if (text.isEmpty()) |
527 | return; |
528 | |
529 | if (!checkVoice(voiceId)) |
530 | return; |
531 | |
532 | m_volume = volume; |
    processText(text, voiceId, pitch, rate, QTextToSpeechProcessorFlite::dataOutputCb);
534 | } |
535 | |
536 | QT_END_NAMESPACE |
537 | |