// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
3
4#include "lupdate.h"
5
6#include "simtexth.h"
7#include "translator.h"
8
9#include <QtCore/QCoreApplication>
10#include <QtCore/QDebug>
11#include <QtCore/QList>
12#include <QtCore/QMap>
13#include <QtCore/QStringList>
14
15QT_BEGIN_NAMESPACE
16
17static bool isDigitFriendly(QChar c)
18{
19 return c.isPunct() || c.isSpace();
20}
21
22static int numberLength(const QString &s, int i)
23{
24 if (i >= s.size() || !s.at(i).isDigit())
25 return 0;
26
27 int pos = i;
28 do {
29 ++i;
30 } while (i < s.size()
31 && (s.at(i).isDigit()
32 || (isDigitFriendly(c: s[i])
33 && i + 1 < s.size()
34 && (s[i + 1].isDigit()
35 || (isDigitFriendly(c: s[i + 1])
36 && i + 2 < s.size()
37 && s[i + 2].isDigit())))));
38 return i - pos;
39}
40
41
42/*
43 Returns a version of 'key' where all numbers have been replaced by zeroes. If
44 there were none, returns "".
45*/
46static QString zeroKey(const QString &key)
47{
48 QString zeroed;
49 bool metSomething = false;
50
51 for (int i = 0; i < key.size(); ++i) {
52 int len = numberLength(s: key, i);
53 if (len > 0) {
54 i += len;
55 zeroed.append(c: QLatin1Char('0'));
56 metSomething = true;
57 } else {
58 zeroed.append(c: key.at(i));
59 }
60 }
61 return metSomething ? zeroed : QString();
62}
63
64static QString translationAttempt(const QString &oldTranslation,
65 const QString &oldSource, const QString &newSource)
66{
67 int p = zeroKey(key: oldSource).count(c: QLatin1Char('0'));
68 QString attempt;
69 QStringList oldNumbers;
70 QStringList newNumbers;
71 QList<bool> met(p);
72 QList<int> matchedYet(p);
73 int i, j;
74 int k = 0, ell, best;
75 int m, n;
76 int pass;
77
78 /*
79 This algorithm is hard to follow, so we'll consider an example
80 all along: oldTranslation is "XeT 3.0", oldSource is "TeX 3.0"
81 and newSource is "XeT 3.1".
82
83 First, we set up two tables: oldNumbers and newNumbers. In our
84 example, oldNumber[0] is "3.0" and newNumber[0] is "3.1".
85 */
86 for (i = 0, j = 0; i < oldSource.size(); i++, j++) {
87 m = numberLength(s: oldSource, i);
88 n = numberLength(s: newSource, i: j);
89 if (m > 0) {
90 oldNumbers.append(t: oldSource.mid(position: i, n: m + 1));
91 newNumbers.append(t: newSource.mid(position: j, n: n + 1));
92 i += m;
93 j += n;
94 met[k] = false;
95 matchedYet[k] = 0;
96 k++;
97 }
98 }
99
100 /*
101 We now go over the old translation, "XeT 3.0", one letter at a
102 time, looking for numbers found in oldNumbers. Whenever such a
103 number is met, it is replaced with its newNumber equivalent. In
104 our example, the "3.0" of "XeT 3.0" becomes "3.1".
105 */
106 for (i = 0; i < oldTranslation.size(); i++) {
107 attempt += oldTranslation[i];
108 for (k = 0; k < p; k++) {
109 if (matchedYet[k] < oldNumbers[k].size() &&
110 oldTranslation[i] == oldNumbers[k][matchedYet[k]]) {
111 matchedYet[k]++;
112 } else {
113 matchedYet[k] = 0;
114 }
115 }
116
117 /*
118 Let's find out if the last character ended a match. We make
119 two passes over the data. In the first pass, we try to
120 match only numbers that weren't matched yet; if that fails,
121 the second pass does the trick. This is useful in some
122 suspicious cases, flagged below.
123 */
124 for (pass = 0; pass < 2; pass++) {
125 best = p; // an impossible value
126 for (k = 0; k < p; k++) {
127 if ((!met[k] || pass > 0) &&
128 matchedYet[k] == oldNumbers[k].size() &&
129 numberLength(s: oldTranslation, i: i + 1 - matchedYet[k]) == matchedYet[k]) {
130 // the longer the better
131 if (best == p || matchedYet[k] > matchedYet[best])
132 best = k;
133 }
134 }
135 if (best != p) {
136 attempt.truncate(pos: attempt.size() - matchedYet[best]);
137 attempt += newNumbers[best];
138 met[best] = true;
139 for (k = 0; k < p; k++)
140 matchedYet[k] = 0;
141 break;
142 }
143 }
144 }
145
146 /*
147 We flag two kinds of suspicious cases. They are identified as
148 such with comments such as "{2000?}" at the end.
149
150 Example of the first kind: old source text "TeX 3.0" translated
151 as "XeT 2.0" is flagged "TeX 2.0 {3.0?}", no matter what the
152 new text is.
153 */
154 for (k = 0; k < p; k++) {
155 if (!met[k])
156 attempt += QLatin1String(" {") + newNumbers[k] + QLatin1String("?}");
157 }
158
159 /*
160 Example of the second kind: "1 of 1" translated as "1 af 1",
161 with new source text "1 of 2", generates "1 af 2 {1 or 2?}"
162 because it's not clear which of "1 af 2" and "2 af 1" is right.
163 */
164 for (k = 0; k < p; k++) {
165 for (ell = 0; ell < p; ell++) {
166 if (k != ell && oldNumbers[k] == oldNumbers[ell] &&
167 newNumbers[k] < newNumbers[ell])
168 attempt += QLatin1String(" {") + newNumbers[k] + QLatin1String(" or ") +
169 newNumbers[ell] + QLatin1String("?}");
170 }
171 }
172 return attempt;
173}
174
175
176/*
177 Augments a Translator with translations easily derived from
178 similar existing (probably obsolete) translations.
179
180 For example, if "TeX 3.0" is translated as "XeT 3.0" and "TeX 3.1"
181 has no translation, "XeT 3.1" is added to the translator and is
182 marked Unfinished.
183
184 Returns the number of additional messages that this heuristic translated.
185*/
186int applyNumberHeuristic(Translator &tor)
187{
188 QMap<QString, QPair<QString, QString> > translated;
189 QList<bool> untranslated(tor.messageCount());
190 int inserted = 0;
191
192 for (int i = 0; i < tor.messageCount(); ++i) {
193 const TranslatorMessage &msg = tor.message(i);
194 bool hasTranslation = msg.isTranslated();
195 if (msg.type() == TranslatorMessage::Unfinished) {
196 if (!hasTranslation)
197 untranslated[i] = true;
198 } else if (hasTranslation && msg.translations().size() == 1) {
199 const QString &key = zeroKey(key: msg.sourceText());
200 if (!key.isEmpty())
201 translated.insert(key, value: qMakePair(value1: msg.sourceText(), value2: msg.translation()));
202 }
203 }
204
205 for (int i = 0; i < tor.messageCount(); ++i) {
206 if (untranslated[i]) {
207 TranslatorMessage &msg = tor.message(i);
208 const QString &key = zeroKey(key: msg.sourceText());
209 if (!key.isEmpty()) {
210 const auto t = translated.constFind(key);
211 if (t != translated.constEnd() && t->first != msg.sourceText()) {
212 msg.setTranslation(translationAttempt(oldTranslation: t->second, oldSource: t->first,
213 newSource: msg.sourceText()));
214 inserted++;
215 }
216 }
217 }
218 }
219 return inserted;
220}
221
222
223/*
224 Augments a Translator with trivially derived translations.
225
226 For example, if "Enabled:" is consistendly translated as "Eingeschaltet:" no
227 matter the context or the comment, "Eingeschaltet:" is added as the
228 translation of any untranslated "Enabled:" text and is marked Unfinished.
229
230 Returns the number of additional messages that this heuristic translated.
231*/
232
233int applySameTextHeuristic(Translator &tor)
234{
235 QMap<QString, QStringList> translated;
236 QMap<QString, bool> avoid; // Want a QTreeSet, in fact
237 QList<bool> untranslated(tor.messageCount());
238 int inserted = 0;
239
240 for (int i = 0; i < tor.messageCount(); ++i) {
241 const TranslatorMessage &msg = tor.message(i);
242 if (!msg.isTranslated()) {
243 if (msg.type() == TranslatorMessage::Unfinished)
244 untranslated[i] = true;
245 } else {
246 const QString &key = msg.sourceText();
247 const auto t = translated.constFind(key);
248 if (t != translated.constEnd()) {
249 /*
250 The same source text is translated at least two
251 different ways. Do nothing then.
252 */
253 if (*t != msg.translations()) {
254 translated.remove(key);
255 avoid.insert(key, value: true);
256 }
257 } else if (!avoid.contains(key)) {
258 translated.insert(key, value: msg.translations());
259 }
260 }
261 }
262
263 for (int i = 0; i < tor.messageCount(); ++i) {
264 if (untranslated[i]) {
265 TranslatorMessage &msg = tor.message(i);
266 const auto t = translated.constFind(key: msg.sourceText());
267 if (t != translated.constEnd()) {
268 msg.setTranslations(*t);
269 ++inserted;
270 }
271 }
272 }
273 return inserted;
274}
275
276
277
278/*
279 Merges two Translator objects. The first one
280 is a set of source texts and translations for a previous version of
281 the internationalized program; the second one is a set of fresh
282 source texts newly extracted from the source code, without any
283 translation yet.
284*/
285
286Translator merge(
287 const Translator &tor, const Translator &virginTor, const QList<Translator> &aliens,
288 UpdateOptions options, QString &err)
289{
290 int known = 0;
291 int neww = 0;
292 int obsoleted = 0;
293 int similarTextHeuristicCount = 0;
294
295 Translator outTor;
296 outTor.setLanguageCode(tor.languageCode());
297 outTor.setSourceLanguageCode(tor.sourceLanguageCode());
298 outTor.setLocationsType(tor.locationsType());
299
300 /*
301 The types of all the messages from the vernacular translator
302 are updated according to the virgin translator.
303 */
304 for (TranslatorMessage m : tor.messages()) {
305 TranslatorMessage::Type newType = TranslatorMessage::Finished;
306
307 if (m.sourceText().isEmpty() && m.id().isEmpty()) {
308 // context/file comment
309 int mvi = virginTor.find(context: m.context());
310 if (mvi >= 0)
311 m.setComment(virginTor.constMessage(i: mvi).comment());
312 } else {
313 TranslatorMessage::ExtraData extras;
314 const TranslatorMessage *mv;
315 int mvi = virginTor.find(msg: m);
316 if (mvi < 0) {
317 if (!(options & HeuristicSimilarText)) {
318 makeObsolete:
319 switch (m.type()) {
320 case TranslatorMessage::Finished:
321 newType = TranslatorMessage::Vanished;
322 obsoleted++;
323 break;
324 case TranslatorMessage::Unfinished:
325 newType = TranslatorMessage::Obsolete;
326 obsoleted++;
327 break;
328 default:
329 newType = m.type();
330 break;
331 }
332 m.clearReferences();
333 } else {
334 mvi = virginTor.find(context: m.context(), comment: m.comment(), refs: m.allReferences());
335 if (mvi < 0) {
336 // did not find it in the virgin, mark it as obsolete
337 goto makeObsolete;
338 }
339 mv = &virginTor.constMessage(i: mvi);
340 // Do not just accept it if its on the same line number,
341 // but different source text.
342 // Also check if the texts are more or less similar before
343 // we consider them to represent the same message...
344 if (getSimilarityScore(str1: m.sourceText(), str2: mv->sourceText()) < textSimilarityThreshold) {
345 // The virgin and vernacular sourceTexts are so different that we could not find it
346 goto makeObsolete;
347 }
348 // It is just slightly modified, assume that it is the same string
349
350 extras = mv->extras();
351
352 // Mark it as unfinished. (Since the source text
353 // was changed it might require re-translating...)
354 newType = TranslatorMessage::Unfinished;
355 ++similarTextHeuristicCount;
356 neww++;
357 goto outdateSource;
358 }
359 } else {
360 mv = &virginTor.message(i: mvi);
361 extras = mv->extras();
362 if (!mv->id().isEmpty()
363 && (mv->context() != m.context()
364 || mv->sourceText() != m.sourceText()
365 || mv->comment() != m.comment())) {
366 known++;
367 newType = TranslatorMessage::Unfinished;
368 m.setContext(mv->context());
369 m.setComment(mv->comment());
370 if (mv->sourceText() != m.sourceText()) {
371 outdateSource:
372 m.setOldSourceText(m.sourceText());
373 m.setSourceText(mv->sourceText());
374 const QString &oldpluralsource = m.extra(ba: QLatin1String("po-msgid_plural"));
375 if (!oldpluralsource.isEmpty())
376 extras.insert(key: QLatin1String("po-old_msgid_plural"), value: oldpluralsource);
377 }
378 } else {
379 switch (m.type()) {
380 case TranslatorMessage::Finished:
381 default:
382 if (m.isPlural() == mv->isPlural()) {
383 newType = TranslatorMessage::Finished;
384 } else {
385 newType = TranslatorMessage::Unfinished;
386 }
387 known++;
388 break;
389 case TranslatorMessage::Unfinished:
390 newType = TranslatorMessage::Unfinished;
391 known++;
392 break;
393 case TranslatorMessage::Vanished:
394 newType = TranslatorMessage::Finished;
395 neww++;
396 break;
397 case TranslatorMessage::Obsolete:
398 newType = TranslatorMessage::Unfinished;
399 neww++;
400 break;
401 }
402 }
403
404 // Always get the filename and linenumber info from the
405 // virgin Translator, in case it has changed location.
406 // This should also enable us to read a file that does not
407 // have the <location> element.
408 // why not use operator=()? Because it overwrites e.g. userData.
409 m.setReferences(mv->allReferences());
410 m.setPlural(mv->isPlural());
411 m.setExtras(extras);
412 m.setExtraComment(mv->extraComment());
413 m.setId(mv->id());
414 }
415 }
416
417 m.setType(newType);
418 outTor.append(msg: m);
419 }
420
421 /*
422 Messages found only in the virgin translator are added to the
423 vernacular translator.
424 */
425 for (const TranslatorMessage &mv : virginTor.messages()) {
426 if (mv.sourceText().isEmpty() && mv.id().isEmpty()) {
427 if (tor.find(context: mv.context()) >= 0)
428 continue;
429 } else {
430 if (tor.find(msg: mv) >= 0)
431 continue;
432 if (options & HeuristicSimilarText) {
433 int mi = tor.find(context: mv.context(), comment: mv.comment(), refs: mv.allReferences());
434 if (mi >= 0) {
435 // The similar message found in tor (ts file) must NOT correspond exactly
436 // to an other message is virginTor
437 if (virginTor.find(msg: tor.constMessage(i: mi)) < 0) {
438 if (getSimilarityScore(str1: tor.constMessage(i: mi).sourceText(), str2: mv.sourceText())
439 >= textSimilarityThreshold)
440 continue;
441 }
442 }
443 }
444 }
445 if (options & NoLocations)
446 outTor.append(msg: mv);
447 else
448 outTor.appendSorted(msg: mv);
449 if (!mv.sourceText().isEmpty() || !mv.id().isEmpty())
450 ++neww;
451 }
452
453 /*
454 "Alien" translators can be used to augment the vernacular translator.
455 */
456 for (const Translator &alf : aliens) {
457 for (TranslatorMessage mv : alf.messages()) {
458 if (mv.sourceText().isEmpty() || !mv.isTranslated())
459 continue;
460 int mvi = outTor.find(msg: mv);
461 if (mvi >= 0) {
462 TranslatorMessage &tm = outTor.message(i: mvi);
463 if (tm.type() != TranslatorMessage::Finished && !tm.isTranslated()) {
464 tm.setTranslations(mv.translations());
465 --neww;
466 ++known;
467 }
468 } else {
469 /*
470 * Don't do simtex search, as the locations are likely to be
471 * completely off anyway, so we'd find nothing.
472 */
473 /*
474 * Add the unmatched messages as obsoletes, so the Linguist GUI
475 * will offer them as possible translations.
476 */
477 mv.clearReferences();
478 mv.setType(mv.type() == TranslatorMessage::Finished
479 ? TranslatorMessage::Vanished : TranslatorMessage::Obsolete);
480 if (options & NoLocations)
481 outTor.append(msg: mv);
482 else
483 outTor.appendSorted(msg: mv);
484 ++known;
485 ++obsoleted;
486 }
487 }
488 }
489
490 /*
491 The same-text heuristic handles cases where a message has an
492 obsolete counterpart with a different context or comment.
493 */
494 int sameTextHeuristicCount = (options & HeuristicSameText) ? applySameTextHeuristic(tor&: outTor) : 0;
495
496 /*
497 The number heuristic handles cases where a message has an
498 obsolete counterpart with mostly numbers differing in the
499 source text.
500 */
501 int sameNumberHeuristicCount = (options & HeuristicNumber) ? applyNumberHeuristic(tor&: outTor) : 0;
502
503 if (options & Verbose) {
504 int totalFound = neww + known;
505 err += QStringLiteral(" Found %1 source text(s) (%2 new and %3 already existing)\n")
506 .arg(a: totalFound).arg(a: neww).arg(a: known);
507
508 if (obsoleted) {
509 if (options & NoObsolete) {
510 err += QStringLiteral(" Removed %1 obsolete entries\n").arg(a: obsoleted);
511 } else {
512 err += QStringLiteral(" Kept %1 obsolete entries\n").arg(a: obsoleted);
513 }
514 }
515
516 if (sameNumberHeuristicCount)
517 err += QStringLiteral(" Number heuristic provided %1 translation(s)\n")
518 .arg(a: sameNumberHeuristicCount);
519 if (sameTextHeuristicCount)
520 err += QStringLiteral(" Same-text heuristic provided %1 translation(s)\n")
521 .arg(a: sameTextHeuristicCount);
522 if (similarTextHeuristicCount)
523 err += QStringLiteral(" Similar-text heuristic provided %1 translation(s)\n")
524 .arg(a: similarTextHeuristicCount);
525 }
526 return outTor;
527}
528
529QT_END_NAMESPACE
530

// source code of qttools/src/linguist/lupdate/merge.cpp