1 | /**************************************************************************** |
2 | ** |
3 | ** Copyright (C) 2016 The Qt Company Ltd. |
4 | ** Contact: https://www.qt.io/licensing/ |
5 | ** |
6 | ** This file is part of the Qt Linguist of the Qt Toolkit. |
7 | ** |
8 | ** $QT_BEGIN_LICENSE:GPL-EXCEPT$ |
9 | ** Commercial License Usage |
10 | ** Licensees holding valid commercial Qt licenses may use this file in |
11 | ** accordance with the commercial license agreement provided with the |
12 | ** Software or, alternatively, in accordance with the terms contained in |
13 | ** a written agreement between you and The Qt Company. For licensing terms |
14 | ** and conditions see https://www.qt.io/terms-conditions. For further |
15 | ** information use the contact form at https://www.qt.io/contact-us. |
16 | ** |
17 | ** GNU General Public License Usage |
18 | ** Alternatively, this file may be used under the terms of the GNU |
19 | ** General Public License version 3 as published by the Free Software |
20 | ** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT |
21 | ** included in the packaging of this file. Please review the following |
22 | ** information to ensure the GNU General Public License requirements will |
23 | ** be met: https://www.gnu.org/licenses/gpl-3.0.html. |
24 | ** |
25 | ** $QT_END_LICENSE$ |
26 | ** |
27 | ****************************************************************************/ |
28 | |
29 | #include "lupdate.h" |
30 | |
31 | #include "simtexth.h" |
32 | #include "translator.h" |
33 | |
34 | #include <QtCore/QCoreApplication> |
35 | #include <QtCore/QDebug> |
36 | #include <QtCore/QMap> |
37 | #include <QtCore/QStringList> |
38 | #include <QtCore/QVector> |
39 | |
40 | QT_BEGIN_NAMESPACE |
41 | |
42 | static bool isDigitFriendly(QChar c) |
43 | { |
44 | return c.isPunct() || c.isSpace(); |
45 | } |
46 | |
47 | static int numberLength(const QString &s, int i) |
48 | { |
49 | if (i >= s.size() || !s.at(i).isDigit()) |
50 | return 0; |
51 | |
52 | int pos = i; |
53 | do { |
54 | ++i; |
55 | } while (i < s.size() |
56 | && (s.at(i).isDigit() |
57 | || (isDigitFriendly(c: s[i]) |
58 | && i + 1 < s.size() |
59 | && (s[i + 1].isDigit() |
60 | || (isDigitFriendly(c: s[i + 1]) |
61 | && i + 2 < s.size() |
62 | && s[i + 2].isDigit()))))); |
63 | return i - pos; |
64 | } |
65 | |
66 | |
67 | /* |
68 | Returns a version of 'key' where all numbers have been replaced by zeroes. If |
69 | there were none, returns "". |
70 | */ |
71 | static QString zeroKey(const QString &key) |
72 | { |
73 | QString zeroed; |
74 | bool metSomething = false; |
75 | |
76 | for (int i = 0; i < key.size(); ++i) { |
77 | int len = numberLength(s: key, i); |
78 | if (len > 0) { |
79 | i += len; |
80 | zeroed.append(c: QLatin1Char('0')); |
81 | metSomething = true; |
82 | } else { |
83 | zeroed.append(c: key.at(i)); |
84 | } |
85 | } |
86 | return metSomething ? zeroed : QString(); |
87 | } |
88 | |
89 | static QString translationAttempt(const QString &oldTranslation, |
90 | const QString &oldSource, const QString &newSource) |
91 | { |
92 | int p = zeroKey(key: oldSource).count(c: QLatin1Char('0')); |
93 | QString attempt; |
94 | QStringList oldNumbers; |
95 | QStringList newNumbers; |
96 | QVector<bool> met(p); |
97 | QVector<int> matchedYet(p); |
98 | int i, j; |
99 | int k = 0, ell, best; |
100 | int m, n; |
101 | int pass; |
102 | |
103 | /* |
104 | This algorithm is hard to follow, so we'll consider an example |
105 | all along: oldTranslation is "XeT 3.0", oldSource is "TeX 3.0" |
106 | and newSource is "XeT 3.1". |
107 | |
108 | First, we set up two tables: oldNumbers and newNumbers. In our |
109 | example, oldNumber[0] is "3.0" and newNumber[0] is "3.1". |
110 | */ |
111 | for (i = 0, j = 0; i < oldSource.size(); i++, j++) { |
112 | m = numberLength(s: oldSource, i); |
113 | n = numberLength(s: newSource, i: j); |
114 | if (m > 0) { |
115 | oldNumbers.append(t: oldSource.mid(position: i, n: m + 1)); |
116 | newNumbers.append(t: newSource.mid(position: j, n: n + 1)); |
117 | i += m; |
118 | j += n; |
119 | met[k] = false; |
120 | matchedYet[k] = 0; |
121 | k++; |
122 | } |
123 | } |
124 | |
125 | /* |
126 | We now go over the old translation, "XeT 3.0", one letter at a |
127 | time, looking for numbers found in oldNumbers. Whenever such a |
128 | number is met, it is replaced with its newNumber equivalent. In |
129 | our example, the "3.0" of "XeT 3.0" becomes "3.1". |
130 | */ |
131 | for (i = 0; i < oldTranslation.length(); i++) { |
132 | attempt += oldTranslation[i]; |
133 | for (k = 0; k < p; k++) { |
134 | if (oldTranslation[i] == oldNumbers[k][matchedYet[k]]) |
135 | matchedYet[k]++; |
136 | else |
137 | matchedYet[k] = 0; |
138 | } |
139 | |
140 | /* |
141 | Let's find out if the last character ended a match. We make |
142 | two passes over the data. In the first pass, we try to |
143 | match only numbers that weren't matched yet; if that fails, |
144 | the second pass does the trick. This is useful in some |
145 | suspicious cases, flagged below. |
146 | */ |
147 | for (pass = 0; pass < 2; pass++) { |
148 | best = p; // an impossible value |
149 | for (k = 0; k < p; k++) { |
150 | if ((!met[k] || pass > 0) && |
151 | matchedYet[k] == oldNumbers[k].length() && |
152 | numberLength(s: oldTranslation, i: i + 1 - matchedYet[k]) == matchedYet[k]) { |
153 | // the longer the better |
154 | if (best == p || matchedYet[k] > matchedYet[best]) |
155 | best = k; |
156 | } |
157 | } |
158 | if (best != p) { |
159 | attempt.truncate(pos: attempt.length() - matchedYet[best]); |
160 | attempt += newNumbers[best]; |
161 | met[best] = true; |
162 | for (k = 0; k < p; k++) |
163 | matchedYet[k] = 0; |
164 | break; |
165 | } |
166 | } |
167 | } |
168 | |
169 | /* |
170 | We flag two kinds of suspicious cases. They are identified as |
171 | such with comments such as "{2000?}" at the end. |
172 | |
173 | Example of the first kind: old source text "TeX 3.0" translated |
174 | as "XeT 2.0" is flagged "TeX 2.0 {3.0?}", no matter what the |
175 | new text is. |
176 | */ |
177 | for (k = 0; k < p; k++) { |
178 | if (!met[k]) |
179 | attempt += QLatin1String(" {" ) + newNumbers[k] + QLatin1String("?}" ); |
180 | } |
181 | |
182 | /* |
183 | Example of the second kind: "1 of 1" translated as "1 af 1", |
184 | with new source text "1 of 2", generates "1 af 2 {1 or 2?}" |
185 | because it's not clear which of "1 af 2" and "2 af 1" is right. |
186 | */ |
187 | for (k = 0; k < p; k++) { |
188 | for (ell = 0; ell < p; ell++) { |
189 | if (k != ell && oldNumbers[k] == oldNumbers[ell] && |
190 | newNumbers[k] < newNumbers[ell]) |
191 | attempt += QLatin1String(" {" ) + newNumbers[k] + QLatin1String(" or " ) + |
192 | newNumbers[ell] + QLatin1String("?}" ); |
193 | } |
194 | } |
195 | return attempt; |
196 | } |
197 | |
198 | |
199 | /* |
200 | Augments a Translator with translations easily derived from |
201 | similar existing (probably obsolete) translations. |
202 | |
203 | For example, if "TeX 3.0" is translated as "XeT 3.0" and "TeX 3.1" |
204 | has no translation, "XeT 3.1" is added to the translator and is |
205 | marked Unfinished. |
206 | |
207 | Returns the number of additional messages that this heuristic translated. |
208 | */ |
209 | int applyNumberHeuristic(Translator &tor) |
210 | { |
211 | QMap<QString, QPair<QString, QString> > translated; |
212 | QVector<bool> untranslated(tor.messageCount()); |
213 | int inserted = 0; |
214 | |
215 | for (int i = 0; i < tor.messageCount(); ++i) { |
216 | const TranslatorMessage &msg = tor.message(i); |
217 | bool hasTranslation = msg.isTranslated(); |
218 | if (msg.type() == TranslatorMessage::Unfinished) { |
219 | if (!hasTranslation) |
220 | untranslated[i] = true; |
221 | } else if (hasTranslation && msg.translations().count() == 1) { |
222 | const QString &key = zeroKey(key: msg.sourceText()); |
223 | if (!key.isEmpty()) |
224 | translated.insert(akey: key, avalue: qMakePair(x: msg.sourceText(), y: msg.translation())); |
225 | } |
226 | } |
227 | |
228 | for (int i = 0; i < tor.messageCount(); ++i) { |
229 | if (untranslated[i]) { |
230 | TranslatorMessage &msg = tor.message(i); |
231 | const QString &key = zeroKey(key: msg.sourceText()); |
232 | if (!key.isEmpty()) { |
233 | QMap<QString, QPair<QString, QString> >::ConstIterator t = |
234 | translated.constFind(akey: key); |
235 | if (t != translated.constEnd() && t->first != msg.sourceText()) { |
236 | msg.setTranslation(translationAttempt(oldTranslation: t->second, oldSource: t->first, |
237 | newSource: msg.sourceText())); |
238 | inserted++; |
239 | } |
240 | } |
241 | } |
242 | } |
243 | return inserted; |
244 | } |
245 | |
246 | |
247 | /* |
248 | Augments a Translator with trivially derived translations. |
249 | |
250 | For example, if "Enabled:" is consistendly translated as "Eingeschaltet:" no |
251 | matter the context or the comment, "Eingeschaltet:" is added as the |
252 | translation of any untranslated "Enabled:" text and is marked Unfinished. |
253 | |
254 | Returns the number of additional messages that this heuristic translated. |
255 | */ |
256 | |
257 | int applySameTextHeuristic(Translator &tor) |
258 | { |
259 | QMap<QString, QStringList> translated; |
260 | QMap<QString, bool> avoid; // Want a QTreeSet, in fact |
261 | QVector<bool> untranslated(tor.messageCount()); |
262 | int inserted = 0; |
263 | |
264 | for (int i = 0; i < tor.messageCount(); ++i) { |
265 | const TranslatorMessage &msg = tor.message(i); |
266 | if (!msg.isTranslated()) { |
267 | if (msg.type() == TranslatorMessage::Unfinished) |
268 | untranslated[i] = true; |
269 | } else { |
270 | const QString &key = msg.sourceText(); |
271 | QMap<QString, QStringList>::ConstIterator t = translated.constFind(akey: key); |
272 | if (t != translated.constEnd()) { |
273 | /* |
274 | The same source text is translated at least two |
275 | different ways. Do nothing then. |
276 | */ |
277 | if (*t != msg.translations()) { |
278 | translated.remove(akey: key); |
279 | avoid.insert(akey: key, avalue: true); |
280 | } |
281 | } else if (!avoid.contains(akey: key)) { |
282 | translated.insert(akey: key, avalue: msg.translations()); |
283 | } |
284 | } |
285 | } |
286 | |
287 | for (int i = 0; i < tor.messageCount(); ++i) { |
288 | if (untranslated[i]) { |
289 | TranslatorMessage &msg = tor.message(i); |
290 | QMap<QString, QStringList>::ConstIterator t = translated.constFind(akey: msg.sourceText()); |
291 | if (t != translated.constEnd()) { |
292 | msg.setTranslations(*t); |
293 | ++inserted; |
294 | } |
295 | } |
296 | } |
297 | return inserted; |
298 | } |
299 | |
300 | |
301 | |
302 | /* |
303 | Merges two Translator objects. The first one |
304 | is a set of source texts and translations for a previous version of |
305 | the internationalized program; the second one is a set of fresh |
306 | source texts newly extracted from the source code, without any |
307 | translation yet. |
308 | */ |
309 | |
310 | Translator merge( |
311 | const Translator &tor, const Translator &virginTor, const QList<Translator> &aliens, |
312 | UpdateOptions options, QString &err) |
313 | { |
314 | int known = 0; |
315 | int neww = 0; |
316 | int obsoleted = 0; |
317 | int similarTextHeuristicCount = 0; |
318 | |
319 | Translator outTor; |
320 | outTor.setLanguageCode(tor.languageCode()); |
321 | outTor.setSourceLanguageCode(tor.sourceLanguageCode()); |
322 | outTor.setLocationsType(tor.locationsType()); |
323 | |
324 | /* |
325 | The types of all the messages from the vernacular translator |
326 | are updated according to the virgin translator. |
327 | */ |
328 | foreach (TranslatorMessage m, tor.messages()) { |
329 | TranslatorMessage::Type newType = TranslatorMessage::Finished; |
330 | |
331 | if (m.sourceText().isEmpty() && m.id().isEmpty()) { |
332 | // context/file comment |
333 | int mvi = virginTor.find(context: m.context()); |
334 | if (mvi >= 0) |
335 | m.setComment(virginTor.constMessage(i: mvi).comment()); |
336 | } else { |
337 | TranslatorMessage::ExtraData ; |
338 | const TranslatorMessage *mv; |
339 | int mvi = virginTor.find(msg: m); |
340 | if (mvi < 0) { |
341 | if (!(options & HeuristicSimilarText)) { |
342 | makeObsolete: |
343 | switch (m.type()) { |
344 | case TranslatorMessage::Finished: |
345 | newType = TranslatorMessage::Vanished; |
346 | obsoleted++; |
347 | break; |
348 | case TranslatorMessage::Unfinished: |
349 | newType = TranslatorMessage::Obsolete; |
350 | obsoleted++; |
351 | break; |
352 | default: |
353 | newType = m.type(); |
354 | break; |
355 | } |
356 | m.clearReferences(); |
357 | } else { |
358 | mvi = virginTor.find(context: m.context(), comment: m.comment(), refs: m.allReferences()); |
359 | if (mvi < 0) { |
360 | // did not find it in the virgin, mark it as obsolete |
361 | goto makeObsolete; |
362 | } |
363 | mv = &virginTor.constMessage(i: mvi); |
364 | // Do not just accept it if its on the same line number, |
365 | // but different source text. |
366 | // Also check if the texts are more or less similar before |
367 | // we consider them to represent the same message... |
368 | if (getSimilarityScore(str1: m.sourceText(), str2: mv->sourceText()) < textSimilarityThreshold) { |
369 | // The virgin and vernacular sourceTexts are so different that we could not find it |
370 | goto makeObsolete; |
371 | } |
372 | // It is just slightly modified, assume that it is the same string |
373 | |
374 | extras = mv->extras(); |
375 | |
376 | // Mark it as unfinished. (Since the source text |
377 | // was changed it might require re-translating...) |
378 | newType = TranslatorMessage::Unfinished; |
379 | ++similarTextHeuristicCount; |
380 | neww++; |
381 | goto outdateSource; |
382 | } |
383 | } else { |
384 | mv = &virginTor.message(i: mvi); |
385 | extras = mv->extras(); |
386 | if (!mv->id().isEmpty() |
387 | && (mv->context() != m.context() |
388 | || mv->sourceText() != m.sourceText() |
389 | || mv->comment() != m.comment())) { |
390 | known++; |
391 | newType = TranslatorMessage::Unfinished; |
392 | m.setContext(mv->context()); |
393 | m.setComment(mv->comment()); |
394 | if (mv->sourceText() != m.sourceText()) { |
395 | outdateSource: |
396 | m.setOldSourceText(m.sourceText()); |
397 | m.setSourceText(mv->sourceText()); |
398 | const QString &oldpluralsource = m.extra(ba: QLatin1String("po-msgid_plural" )); |
399 | if (!oldpluralsource.isEmpty()) |
400 | extras.insert(akey: QLatin1String("po-old_msgid_plural" ), avalue: oldpluralsource); |
401 | } |
402 | } else { |
403 | switch (m.type()) { |
404 | case TranslatorMessage::Finished: |
405 | default: |
406 | if (m.isPlural() == mv->isPlural()) { |
407 | newType = TranslatorMessage::Finished; |
408 | } else { |
409 | newType = TranslatorMessage::Unfinished; |
410 | } |
411 | known++; |
412 | break; |
413 | case TranslatorMessage::Unfinished: |
414 | newType = TranslatorMessage::Unfinished; |
415 | known++; |
416 | break; |
417 | case TranslatorMessage::Vanished: |
418 | newType = TranslatorMessage::Finished; |
419 | neww++; |
420 | break; |
421 | case TranslatorMessage::Obsolete: |
422 | newType = TranslatorMessage::Unfinished; |
423 | neww++; |
424 | break; |
425 | } |
426 | } |
427 | |
428 | // Always get the filename and linenumber info from the |
429 | // virgin Translator, in case it has changed location. |
430 | // This should also enable us to read a file that does not |
431 | // have the <location> element. |
432 | // why not use operator=()? Because it overwrites e.g. userData. |
433 | m.setReferences(mv->allReferences()); |
434 | m.setPlural(mv->isPlural()); |
435 | m.setExtras(extras); |
436 | m.setExtraComment(mv->extraComment()); |
437 | m.setId(mv->id()); |
438 | } |
439 | } |
440 | |
441 | m.setType(newType); |
442 | outTor.append(msg: m); |
443 | } |
444 | |
445 | /* |
446 | Messages found only in the virgin translator are added to the |
447 | vernacular translator. |
448 | */ |
449 | foreach (const TranslatorMessage &mv, virginTor.messages()) { |
450 | if (mv.sourceText().isEmpty() && mv.id().isEmpty()) { |
451 | if (tor.find(context: mv.context()) >= 0) |
452 | continue; |
453 | } else { |
454 | if (tor.find(msg: mv) >= 0) |
455 | continue; |
456 | if (options & HeuristicSimilarText) { |
457 | int mi = tor.find(context: mv.context(), comment: mv.comment(), refs: mv.allReferences()); |
458 | if (mi >= 0) { |
459 | // The similar message found in tor (ts file) must NOT correspond exactly |
460 | // to an other message is virginTor |
461 | if (virginTor.find(msg: tor.constMessage(i: mi)) < 0) { |
462 | if (getSimilarityScore(str1: tor.constMessage(i: mi).sourceText(), str2: mv.sourceText()) |
463 | >= textSimilarityThreshold) |
464 | continue; |
465 | } |
466 | } |
467 | } |
468 | } |
469 | if (options & NoLocations) |
470 | outTor.append(msg: mv); |
471 | else |
472 | outTor.appendSorted(msg: mv); |
473 | if (!mv.sourceText().isEmpty() || !mv.id().isEmpty()) |
474 | ++neww; |
475 | } |
476 | |
477 | /* |
478 | "Alien" translators can be used to augment the vernacular translator. |
479 | */ |
480 | foreach (const Translator &alf, aliens) { |
481 | foreach (TranslatorMessage mv, alf.messages()) { |
482 | if (mv.sourceText().isEmpty() || !mv.isTranslated()) |
483 | continue; |
484 | int mvi = outTor.find(msg: mv); |
485 | if (mvi >= 0) { |
486 | TranslatorMessage &tm = outTor.message(i: mvi); |
487 | if (tm.type() != TranslatorMessage::Finished && !tm.isTranslated()) { |
488 | tm.setTranslations(mv.translations()); |
489 | --neww; |
490 | ++known; |
491 | } |
492 | } else { |
493 | /* |
494 | * Don't do simtex search, as the locations are likely to be |
495 | * completely off anyway, so we'd find nothing. |
496 | */ |
497 | /* |
498 | * Add the unmatched messages as obsoletes, so the Linguist GUI |
499 | * will offer them as possible translations. |
500 | */ |
501 | mv.clearReferences(); |
502 | mv.setType(mv.type() == TranslatorMessage::Finished |
503 | ? TranslatorMessage::Vanished : TranslatorMessage::Obsolete); |
504 | if (options & NoLocations) |
505 | outTor.append(msg: mv); |
506 | else |
507 | outTor.appendSorted(msg: mv); |
508 | ++known; |
509 | ++obsoleted; |
510 | } |
511 | } |
512 | } |
513 | |
514 | /* |
515 | The same-text heuristic handles cases where a message has an |
516 | obsolete counterpart with a different context or comment. |
517 | */ |
518 | int sameTextHeuristicCount = (options & HeuristicSameText) ? applySameTextHeuristic(tor&: outTor) : 0; |
519 | |
520 | /* |
521 | The number heuristic handles cases where a message has an |
522 | obsolete counterpart with mostly numbers differing in the |
523 | source text. |
524 | */ |
525 | int = (options & HeuristicNumber) ? applyNumberHeuristic(tor&: outTor) : 0; |
526 | |
527 | if (options & Verbose) { |
528 | int totalFound = neww + known; |
529 | err += LU::tr(sourceText: " Found %n source text(s) (%1 new and %2 already existing)\n" , disambiguation: 0, n: totalFound).arg(a: neww).arg(a: known); |
530 | |
531 | if (obsoleted) { |
532 | if (options & NoObsolete) { |
533 | err += LU::tr(sourceText: " Removed %n obsolete entries\n" , disambiguation: 0, n: obsoleted); |
534 | } else { |
535 | err += LU::tr(sourceText: " Kept %n obsolete entries\n" , disambiguation: 0, n: obsoleted); |
536 | } |
537 | } |
538 | |
539 | if (sameNumberHeuristicCount) |
540 | err += LU::tr(sourceText: " Number heuristic provided %n translation(s)\n" , |
541 | disambiguation: 0, n: sameNumberHeuristicCount); |
542 | if (sameTextHeuristicCount) |
543 | err += LU::tr(sourceText: " Same-text heuristic provided %n translation(s)\n" , |
544 | disambiguation: 0, n: sameTextHeuristicCount); |
545 | if (similarTextHeuristicCount) |
546 | err += LU::tr(sourceText: " Similar-text heuristic provided %n translation(s)\n" , |
547 | disambiguation: 0, n: similarTextHeuristicCount); |
548 | } |
549 | return outTor; |
550 | } |
551 | |
552 | QT_END_NAMESPACE |
553 | |