1 | // Copyright (C) 2016 The Qt Company Ltd. |
2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0 |
3 | |
4 | #include "lupdate.h" |
5 | |
6 | #include "simtexth.h" |
7 | #include "translator.h" |
8 | |
9 | #include <QtCore/QCoreApplication> |
10 | #include <QtCore/QDebug> |
11 | #include <QtCore/QList> |
12 | #include <QtCore/QMap> |
13 | #include <QtCore/QStringList> |
14 | |
15 | QT_BEGIN_NAMESPACE |
16 | |
17 | static bool isDigitFriendly(QChar c) |
18 | { |
19 | return c.isPunct() || c.isSpace(); |
20 | } |
21 | |
22 | static int numberLength(const QString &s, int i) |
23 | { |
24 | if (i >= s.size() || !s.at(i).isDigit()) |
25 | return 0; |
26 | |
27 | int pos = i; |
28 | do { |
29 | ++i; |
30 | } while (i < s.size() |
31 | && (s.at(i).isDigit() |
32 | || (isDigitFriendly(c: s[i]) |
33 | && i + 1 < s.size() |
34 | && (s[i + 1].isDigit() |
35 | || (isDigitFriendly(c: s[i + 1]) |
36 | && i + 2 < s.size() |
37 | && s[i + 2].isDigit()))))); |
38 | return i - pos; |
39 | } |
40 | |
41 | |
42 | /* |
43 | Returns a version of 'key' where all numbers have been replaced by zeroes. If |
44 | there were none, returns "". |
45 | */ |
46 | static QString zeroKey(const QString &key) |
47 | { |
48 | QString zeroed; |
49 | bool metSomething = false; |
50 | |
51 | for (int i = 0; i < key.size(); ++i) { |
52 | int len = numberLength(s: key, i); |
53 | if (len > 0) { |
54 | i += len; |
55 | zeroed.append(c: QLatin1Char('0')); |
56 | metSomething = true; |
57 | } else { |
58 | zeroed.append(c: key.at(i)); |
59 | } |
60 | } |
61 | return metSomething ? zeroed : QString(); |
62 | } |
63 | |
64 | static QString translationAttempt(const QString &oldTranslation, |
65 | const QString &oldSource, const QString &newSource) |
66 | { |
67 | int p = zeroKey(key: oldSource).count(c: QLatin1Char('0')); |
68 | QString attempt; |
69 | QStringList oldNumbers; |
70 | QStringList newNumbers; |
71 | QList<bool> met(p); |
72 | QList<int> matchedYet(p); |
73 | int i, j; |
74 | int k = 0, ell, best; |
75 | int m, n; |
76 | int pass; |
77 | |
78 | /* |
79 | This algorithm is hard to follow, so we'll consider an example |
80 | all along: oldTranslation is "XeT 3.0", oldSource is "TeX 3.0" |
81 | and newSource is "XeT 3.1". |
82 | |
83 | First, we set up two tables: oldNumbers and newNumbers. In our |
84 | example, oldNumber[0] is "3.0" and newNumber[0] is "3.1". |
85 | */ |
86 | for (i = 0, j = 0; i < oldSource.size(); i++, j++) { |
87 | m = numberLength(s: oldSource, i); |
88 | n = numberLength(s: newSource, i: j); |
89 | if (m > 0) { |
90 | oldNumbers.append(t: oldSource.mid(position: i, n: m + 1)); |
91 | newNumbers.append(t: newSource.mid(position: j, n: n + 1)); |
92 | i += m; |
93 | j += n; |
94 | met[k] = false; |
95 | matchedYet[k] = 0; |
96 | k++; |
97 | } |
98 | } |
99 | |
100 | /* |
101 | We now go over the old translation, "XeT 3.0", one letter at a |
102 | time, looking for numbers found in oldNumbers. Whenever such a |
103 | number is met, it is replaced with its newNumber equivalent. In |
104 | our example, the "3.0" of "XeT 3.0" becomes "3.1". |
105 | */ |
106 | for (i = 0; i < oldTranslation.size(); i++) { |
107 | attempt += oldTranslation[i]; |
108 | for (k = 0; k < p; k++) { |
109 | if (matchedYet[k] < oldNumbers[k].size() && |
110 | oldTranslation[i] == oldNumbers[k][matchedYet[k]]) { |
111 | matchedYet[k]++; |
112 | } else { |
113 | matchedYet[k] = 0; |
114 | } |
115 | } |
116 | |
117 | /* |
118 | Let's find out if the last character ended a match. We make |
119 | two passes over the data. In the first pass, we try to |
120 | match only numbers that weren't matched yet; if that fails, |
121 | the second pass does the trick. This is useful in some |
122 | suspicious cases, flagged below. |
123 | */ |
124 | for (pass = 0; pass < 2; pass++) { |
125 | best = p; // an impossible value |
126 | for (k = 0; k < p; k++) { |
127 | if ((!met[k] || pass > 0) && |
128 | matchedYet[k] == oldNumbers[k].size() && |
129 | numberLength(s: oldTranslation, i: i + 1 - matchedYet[k]) == matchedYet[k]) { |
130 | // the longer the better |
131 | if (best == p || matchedYet[k] > matchedYet[best]) |
132 | best = k; |
133 | } |
134 | } |
135 | if (best != p) { |
136 | attempt.truncate(pos: attempt.size() - matchedYet[best]); |
137 | attempt += newNumbers[best]; |
138 | met[best] = true; |
139 | for (k = 0; k < p; k++) |
140 | matchedYet[k] = 0; |
141 | break; |
142 | } |
143 | } |
144 | } |
145 | |
146 | /* |
147 | We flag two kinds of suspicious cases. They are identified as |
148 | such with comments such as "{2000?}" at the end. |
149 | |
150 | Example of the first kind: old source text "TeX 3.0" translated |
151 | as "XeT 2.0" is flagged "TeX 2.0 {3.0?}", no matter what the |
152 | new text is. |
153 | */ |
154 | for (k = 0; k < p; k++) { |
155 | if (!met[k]) |
156 | attempt += QLatin1String(" {" ) + newNumbers[k] + QLatin1String("?}" ); |
157 | } |
158 | |
159 | /* |
160 | Example of the second kind: "1 of 1" translated as "1 af 1", |
161 | with new source text "1 of 2", generates "1 af 2 {1 or 2?}" |
162 | because it's not clear which of "1 af 2" and "2 af 1" is right. |
163 | */ |
164 | for (k = 0; k < p; k++) { |
165 | for (ell = 0; ell < p; ell++) { |
166 | if (k != ell && oldNumbers[k] == oldNumbers[ell] && |
167 | newNumbers[k] < newNumbers[ell]) |
168 | attempt += QLatin1String(" {" ) + newNumbers[k] + QLatin1String(" or " ) + |
169 | newNumbers[ell] + QLatin1String("?}" ); |
170 | } |
171 | } |
172 | return attempt; |
173 | } |
174 | |
175 | |
176 | /* |
177 | Augments a Translator with translations easily derived from |
178 | similar existing (probably obsolete) translations. |
179 | |
180 | For example, if "TeX 3.0" is translated as "XeT 3.0" and "TeX 3.1" |
181 | has no translation, "XeT 3.1" is added to the translator and is |
182 | marked Unfinished. |
183 | |
184 | Returns the number of additional messages that this heuristic translated. |
185 | */ |
186 | int applyNumberHeuristic(Translator &tor) |
187 | { |
188 | QMap<QString, QPair<QString, QString> > translated; |
189 | QList<bool> untranslated(tor.messageCount()); |
190 | int inserted = 0; |
191 | |
192 | for (int i = 0; i < tor.messageCount(); ++i) { |
193 | const TranslatorMessage &msg = tor.message(i); |
194 | bool hasTranslation = msg.isTranslated(); |
195 | if (msg.type() == TranslatorMessage::Unfinished) { |
196 | if (!hasTranslation) |
197 | untranslated[i] = true; |
198 | } else if (hasTranslation && msg.translations().size() == 1) { |
199 | const QString &key = zeroKey(key: msg.sourceText()); |
200 | if (!key.isEmpty()) |
201 | translated.insert(key, value: qMakePair(value1: msg.sourceText(), value2: msg.translation())); |
202 | } |
203 | } |
204 | |
205 | for (int i = 0; i < tor.messageCount(); ++i) { |
206 | if (untranslated[i]) { |
207 | TranslatorMessage &msg = tor.message(i); |
208 | const QString &key = zeroKey(key: msg.sourceText()); |
209 | if (!key.isEmpty()) { |
210 | const auto t = translated.constFind(key); |
211 | if (t != translated.constEnd() && t->first != msg.sourceText()) { |
212 | msg.setTranslation(translationAttempt(oldTranslation: t->second, oldSource: t->first, |
213 | newSource: msg.sourceText())); |
214 | inserted++; |
215 | } |
216 | } |
217 | } |
218 | } |
219 | return inserted; |
220 | } |
221 | |
222 | |
223 | /* |
224 | Augments a Translator with trivially derived translations. |
225 | |
226 | For example, if "Enabled:" is consistendly translated as "Eingeschaltet:" no |
227 | matter the context or the comment, "Eingeschaltet:" is added as the |
228 | translation of any untranslated "Enabled:" text and is marked Unfinished. |
229 | |
230 | Returns the number of additional messages that this heuristic translated. |
231 | */ |
232 | |
233 | int applySameTextHeuristic(Translator &tor) |
234 | { |
235 | QMap<QString, QStringList> translated; |
236 | QMap<QString, bool> avoid; // Want a QTreeSet, in fact |
237 | QList<bool> untranslated(tor.messageCount()); |
238 | int inserted = 0; |
239 | |
240 | for (int i = 0; i < tor.messageCount(); ++i) { |
241 | const TranslatorMessage &msg = tor.message(i); |
242 | if (!msg.isTranslated()) { |
243 | if (msg.type() == TranslatorMessage::Unfinished) |
244 | untranslated[i] = true; |
245 | } else { |
246 | const QString &key = msg.sourceText(); |
247 | const auto t = translated.constFind(key); |
248 | if (t != translated.constEnd()) { |
249 | /* |
250 | The same source text is translated at least two |
251 | different ways. Do nothing then. |
252 | */ |
253 | if (*t != msg.translations()) { |
254 | translated.remove(key); |
255 | avoid.insert(key, value: true); |
256 | } |
257 | } else if (!avoid.contains(key)) { |
258 | translated.insert(key, value: msg.translations()); |
259 | } |
260 | } |
261 | } |
262 | |
263 | for (int i = 0; i < tor.messageCount(); ++i) { |
264 | if (untranslated[i]) { |
265 | TranslatorMessage &msg = tor.message(i); |
266 | const auto t = translated.constFind(key: msg.sourceText()); |
267 | if (t != translated.constEnd()) { |
268 | msg.setTranslations(*t); |
269 | ++inserted; |
270 | } |
271 | } |
272 | } |
273 | return inserted; |
274 | } |
275 | |
276 | |
277 | |
278 | /* |
279 | Merges two Translator objects. The first one |
280 | is a set of source texts and translations for a previous version of |
281 | the internationalized program; the second one is a set of fresh |
282 | source texts newly extracted from the source code, without any |
283 | translation yet. |
284 | */ |
285 | |
286 | Translator merge( |
287 | const Translator &tor, const Translator &virginTor, const QList<Translator> &aliens, |
288 | UpdateOptions options, QString &err) |
289 | { |
290 | int known = 0; |
291 | int neww = 0; |
292 | int obsoleted = 0; |
293 | int similarTextHeuristicCount = 0; |
294 | |
295 | Translator outTor; |
296 | outTor.setLanguageCode(tor.languageCode()); |
297 | outTor.setSourceLanguageCode(tor.sourceLanguageCode()); |
298 | outTor.setLocationsType(tor.locationsType()); |
299 | |
300 | /* |
301 | The types of all the messages from the vernacular translator |
302 | are updated according to the virgin translator. |
303 | */ |
304 | for (TranslatorMessage m : tor.messages()) { |
305 | TranslatorMessage::Type newType = TranslatorMessage::Finished; |
306 | |
307 | if (m.sourceText().isEmpty() && m.id().isEmpty()) { |
308 | // context/file comment |
309 | int mvi = virginTor.find(context: m.context()); |
310 | if (mvi >= 0) |
311 | m.setComment(virginTor.constMessage(i: mvi).comment()); |
312 | } else { |
313 | TranslatorMessage::ExtraData ; |
314 | const TranslatorMessage *mv; |
315 | int mvi = virginTor.find(msg: m); |
316 | if (mvi < 0) { |
317 | if (!(options & HeuristicSimilarText)) { |
318 | makeObsolete: |
319 | switch (m.type()) { |
320 | case TranslatorMessage::Finished: |
321 | newType = TranslatorMessage::Vanished; |
322 | obsoleted++; |
323 | break; |
324 | case TranslatorMessage::Unfinished: |
325 | newType = TranslatorMessage::Obsolete; |
326 | obsoleted++; |
327 | break; |
328 | default: |
329 | newType = m.type(); |
330 | break; |
331 | } |
332 | m.clearReferences(); |
333 | } else { |
334 | mvi = virginTor.find(context: m.context(), comment: m.comment(), refs: m.allReferences()); |
335 | if (mvi < 0) { |
336 | // did not find it in the virgin, mark it as obsolete |
337 | goto makeObsolete; |
338 | } |
339 | mv = &virginTor.constMessage(i: mvi); |
340 | // Do not just accept it if its on the same line number, |
341 | // but different source text. |
342 | // Also check if the texts are more or less similar before |
343 | // we consider them to represent the same message... |
344 | if (getSimilarityScore(str1: m.sourceText(), str2: mv->sourceText()) < textSimilarityThreshold) { |
345 | // The virgin and vernacular sourceTexts are so different that we could not find it |
346 | goto makeObsolete; |
347 | } |
348 | // It is just slightly modified, assume that it is the same string |
349 | |
350 | extras = mv->extras(); |
351 | |
352 | // Mark it as unfinished. (Since the source text |
353 | // was changed it might require re-translating...) |
354 | newType = TranslatorMessage::Unfinished; |
355 | ++similarTextHeuristicCount; |
356 | neww++; |
357 | goto outdateSource; |
358 | } |
359 | } else { |
360 | mv = &virginTor.message(i: mvi); |
361 | extras = mv->extras(); |
362 | if (!mv->id().isEmpty() |
363 | && (mv->context() != m.context() |
364 | || mv->sourceText() != m.sourceText() |
365 | || mv->comment() != m.comment())) { |
366 | known++; |
367 | newType = TranslatorMessage::Unfinished; |
368 | m.setContext(mv->context()); |
369 | m.setComment(mv->comment()); |
370 | if (mv->sourceText() != m.sourceText()) { |
371 | outdateSource: |
372 | m.setOldSourceText(m.sourceText()); |
373 | m.setSourceText(mv->sourceText()); |
374 | const QString &oldpluralsource = m.extra(ba: QLatin1String("po-msgid_plural" )); |
375 | if (!oldpluralsource.isEmpty()) |
376 | extras.insert(key: QLatin1String("po-old_msgid_plural" ), value: oldpluralsource); |
377 | } |
378 | } else { |
379 | switch (m.type()) { |
380 | case TranslatorMessage::Finished: |
381 | default: |
382 | if (m.isPlural() == mv->isPlural()) { |
383 | newType = TranslatorMessage::Finished; |
384 | } else { |
385 | newType = TranslatorMessage::Unfinished; |
386 | } |
387 | known++; |
388 | break; |
389 | case TranslatorMessage::Unfinished: |
390 | newType = TranslatorMessage::Unfinished; |
391 | known++; |
392 | break; |
393 | case TranslatorMessage::Vanished: |
394 | newType = TranslatorMessage::Finished; |
395 | neww++; |
396 | break; |
397 | case TranslatorMessage::Obsolete: |
398 | newType = TranslatorMessage::Unfinished; |
399 | neww++; |
400 | break; |
401 | } |
402 | } |
403 | |
404 | // Always get the filename and linenumber info from the |
405 | // virgin Translator, in case it has changed location. |
406 | // This should also enable us to read a file that does not |
407 | // have the <location> element. |
408 | // why not use operator=()? Because it overwrites e.g. userData. |
409 | m.setReferences(mv->allReferences()); |
410 | m.setPlural(mv->isPlural()); |
411 | m.setExtras(extras); |
412 | m.setExtraComment(mv->extraComment()); |
413 | m.setId(mv->id()); |
414 | } |
415 | } |
416 | |
417 | m.setType(newType); |
418 | outTor.append(msg: m); |
419 | } |
420 | |
421 | /* |
422 | Messages found only in the virgin translator are added to the |
423 | vernacular translator. |
424 | */ |
425 | for (const TranslatorMessage &mv : virginTor.messages()) { |
426 | if (mv.sourceText().isEmpty() && mv.id().isEmpty()) { |
427 | if (tor.find(context: mv.context()) >= 0) |
428 | continue; |
429 | } else { |
430 | if (tor.find(msg: mv) >= 0) |
431 | continue; |
432 | if (options & HeuristicSimilarText) { |
433 | int mi = tor.find(context: mv.context(), comment: mv.comment(), refs: mv.allReferences()); |
434 | if (mi >= 0) { |
435 | // The similar message found in tor (ts file) must NOT correspond exactly |
436 | // to an other message is virginTor |
437 | if (virginTor.find(msg: tor.constMessage(i: mi)) < 0) { |
438 | if (getSimilarityScore(str1: tor.constMessage(i: mi).sourceText(), str2: mv.sourceText()) |
439 | >= textSimilarityThreshold) |
440 | continue; |
441 | } |
442 | } |
443 | } |
444 | } |
445 | if (options & NoLocations) |
446 | outTor.append(msg: mv); |
447 | else |
448 | outTor.appendSorted(msg: mv); |
449 | if (!mv.sourceText().isEmpty() || !mv.id().isEmpty()) |
450 | ++neww; |
451 | } |
452 | |
453 | /* |
454 | "Alien" translators can be used to augment the vernacular translator. |
455 | */ |
456 | for (const Translator &alf : aliens) { |
457 | for (TranslatorMessage mv : alf.messages()) { |
458 | if (mv.sourceText().isEmpty() || !mv.isTranslated()) |
459 | continue; |
460 | int mvi = outTor.find(msg: mv); |
461 | if (mvi >= 0) { |
462 | TranslatorMessage &tm = outTor.message(i: mvi); |
463 | if (tm.type() != TranslatorMessage::Finished && !tm.isTranslated()) { |
464 | tm.setTranslations(mv.translations()); |
465 | --neww; |
466 | ++known; |
467 | } |
468 | } else { |
469 | /* |
470 | * Don't do simtex search, as the locations are likely to be |
471 | * completely off anyway, so we'd find nothing. |
472 | */ |
473 | /* |
474 | * Add the unmatched messages as obsoletes, so the Linguist GUI |
475 | * will offer them as possible translations. |
476 | */ |
477 | mv.clearReferences(); |
478 | mv.setType(mv.type() == TranslatorMessage::Finished |
479 | ? TranslatorMessage::Vanished : TranslatorMessage::Obsolete); |
480 | if (options & NoLocations) |
481 | outTor.append(msg: mv); |
482 | else |
483 | outTor.appendSorted(msg: mv); |
484 | ++known; |
485 | ++obsoleted; |
486 | } |
487 | } |
488 | } |
489 | |
490 | /* |
491 | The same-text heuristic handles cases where a message has an |
492 | obsolete counterpart with a different context or comment. |
493 | */ |
494 | int sameTextHeuristicCount = (options & HeuristicSameText) ? applySameTextHeuristic(tor&: outTor) : 0; |
495 | |
496 | /* |
497 | The number heuristic handles cases where a message has an |
498 | obsolete counterpart with mostly numbers differing in the |
499 | source text. |
500 | */ |
501 | int = (options & HeuristicNumber) ? applyNumberHeuristic(tor&: outTor) : 0; |
502 | |
503 | if (options & Verbose) { |
504 | int totalFound = neww + known; |
505 | err += QStringLiteral(" Found %1 source text(s) (%2 new and %3 already existing)\n" ) |
506 | .arg(a: totalFound).arg(a: neww).arg(a: known); |
507 | |
508 | if (obsoleted) { |
509 | if (options & NoObsolete) { |
510 | err += QStringLiteral(" Removed %1 obsolete entries\n" ).arg(a: obsoleted); |
511 | } else { |
512 | err += QStringLiteral(" Kept %1 obsolete entries\n" ).arg(a: obsoleted); |
513 | } |
514 | } |
515 | |
516 | if (sameNumberHeuristicCount) |
517 | err += QStringLiteral(" Number heuristic provided %1 translation(s)\n" ) |
518 | .arg(a: sameNumberHeuristicCount); |
519 | if (sameTextHeuristicCount) |
520 | err += QStringLiteral(" Same-text heuristic provided %1 translation(s)\n" ) |
521 | .arg(a: sameTextHeuristicCount); |
522 | if (similarTextHeuristicCount) |
523 | err += QStringLiteral(" Similar-text heuristic provided %1 translation(s)\n" ) |
524 | .arg(a: similarTextHeuristicCount); |
525 | } |
526 | return outTor; |
527 | } |
528 | |
529 | QT_END_NAMESPACE |
530 | |