simtexth.cpp source code [qttools/src/linguist/shared/simtexth.cpp]

1	/****************************************************************************
2	**
3	** Copyright (C) 2016 The Qt Company Ltd.
4	** Contact: https://www.qt.io/licensing/
5	**
6	** This file is part of the Qt Linguist of the Qt Toolkit.
7	**
8	** $QT_BEGIN_LICENSE:GPL-EXCEPT$
9	** Commercial License Usage
10	** Licensees holding valid commercial Qt licenses may use this file in
11	** accordance with the commercial license agreement provided with the
12	** Software or, alternatively, in accordance with the terms contained in
13	** a written agreement between you and The Qt Company. For licensing terms
14	** and conditions see https://www.qt.io/terms-conditions. For further
15	** information use the contact form at https://www.qt.io/contact-us.
16	**
17	** GNU General Public License Usage
18	** Alternatively, this file may be used under the terms of the GNU
19	** General Public License version 3 as published by the Free Software
20	** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
21	** included in the packaging of this file. Please review the following
22	** information to ensure the GNU General Public License requirements will
23	** be met: https://www.gnu.org/licenses/gpl-3.0.html.
24	**
25	** $QT_END_LICENSE$
26	**
27	****************************************************************************/
28
29	#include "simtexth.h"
30	#include "translator.h"
31
32	#include <QtCore/QByteArray>
33	#include <QtCore/QString>
34	#include <QtCore/QList>
35
36
37	QT_BEGIN_NAMESPACE
38
39	typedef QList<TranslatorMessage> TML;
40
41	/*
42	How similar are two texts? The approach used here relies on co-occurrence
43	matrices and is very efficient.
44
45	Let's see with an example: how similar are "here" and "hither"? The
46	co-occurrence matrix M for "here" is M[h,e] = 1, M[e,r] = 1, M[r,e] = 1, and 0
47	elsewhere; the matrix N for "hither" is N[h,i] = 1, N[i,t] = 1, ...,
48	N[h,e] = 1, N[e,r] = 1, and 0 elsewhere. The union U of both matrices is the
49	matrix U[i,j] = max { M[i,j], N[i,j] }, and the intersection V is
50	V[i,j] = min { M[i,j], N[i,j] }. The score for a pair of texts is
51
52	score = (sum of V[i,j] over all i, j) / (sum of U[i,j] over all i, j),
53
54	a formula suggested by Arnt Gulbrandsen. Here we have
55
56	score = 2 / 6,
57
58	or one third.
59
60	The implementation differs from this in a few details. Most importantly,
61	repetitions are ignored; for input "xxx", M[x,x] equals 1, not 2.
62	*/
63
64	/*
65	Every character is assigned to one of 20 buckets so that the co-occurrence
66	matrix requires only 20 20 = 400 bits, not 256 * 256 = 65536 bits or even*
67	more if we want the whole Unicode. Which character falls in which bucket is
68	arbitrary.
69
70	The second half of the table is a replica of the first half, because of
71	laziness.
72	*/
73	static const int indexOf[`256`] = {
74	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
75	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
76	// ! " # $ % & ' ( ) + , - . /*
77	`0`, `2`, `6`, `7`, `10`, `12`, `15`, `19`, `2`, `6`, `7`, `10`, `12`, `15`, `19`, `0`,
78	// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
79	`1`, `3`, `4`, `5`, `8`, `9`, `11`, `13`, `14`, `16`, `2`, `6`, `7`, `10`, `12`, `15`,
80	// @ A B C D E F G H I J K L M N O
81	`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `6`, `10`, `11`, `12`, `13`, `14`,
82	// P Q R S T U V W X Y Z [ \ ] ^ _
83	`15`, `12`, `16`, `17`, `18`, `19`, `2`, `10`, `15`, `7`, `19`, `2`, `6`, `7`, `10`, `0`,
84	// ` a b c d e f g h i j k l m n o
85	`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `6`, `10`, `11`, `12`, `13`, `14`,
86	// p q r s t u v w x y z { \| } ~
87	`15`, `12`, `16`, `17`, `18`, `19`, `2`, `10`, `15`, `7`, `19`, `2`, `6`, `7`, `10`, `0`,
88
89	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
90	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
91	`0`, `2`, `6`, `7`, `10`, `12`, `15`, `19`, `2`, `6`, `7`, `10`, `12`, `15`, `19`, `0`,
92	`1`, `3`, `4`, `5`, `8`, `9`, `11`, `13`, `14`, `16`, `2`, `6`, `7`, `10`, `12`, `15`,
93	`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `6`, `10`, `11`, `12`, `13`, `14`,
94	`15`, `12`, `16`, `17`, `18`, `19`, `2`, `10`, `15`, `7`, `19`, `2`, `6`, `7`, `10`, `0`,
95	`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `6`, `10`, `11`, `12`, `13`, `14`,
96	`15`, `12`, `16`, `17`, `18`, `19`, `2`, `10`, `15`, `7`, `19`, `2`, `6`, `7`, `10`, `0`
97	};
98
99	/*
100	The entry bitCount[i] (for i between 0 and 255) is the number of bits used to
101	represent i in binary.
102	*/
103	static const int bitCount[`256`] = {
104	`0`, `1`, `1`, `2`, `1`, `2`, `2`, `3`, `1`, `2`, `2`, `3`, `2`, `3`, `3`, `4`,
105	`1`, `2`, `2`, `3`, `2`, `3`, `3`, `4`, `2`, `3`, `3`, `4`, `3`, `4`, `4`, `5`,
106	`1`, `2`, `2`, `3`, `2`, `3`, `3`, `4`, `2`, `3`, `3`, `4`, `3`, `4`, `4`, `5`,
107	`2`, `3`, `3`, `4`, `3`, `4`, `4`, `5`, `3`, `4`, `4`, `5`, `4`, `5`, `5`, `6`,
108	`1`, `2`, `2`, `3`, `2`, `3`, `3`, `4`, `2`, `3`, `3`, `4`, `3`, `4`, `4`, `5`,
109	`2`, `3`, `3`, `4`, `3`, `4`, `4`, `5`, `3`, `4`, `4`, `5`, `4`, `5`, `5`, `6`,
110	`2`, `3`, `3`, `4`, `3`, `4`, `4`, `5`, `3`, `4`, `4`, `5`, `4`, `5`, `5`, `6`,
111	`3`, `4`, `4`, `5`, `4`, `5`, `5`, `6`, `4`, `5`, `5`, `6`, `5`, `6`, `6`, `7`,
112	`1`, `2`, `2`, `3`, `2`, `3`, `3`, `4`, `2`, `3`, `3`, `4`, `3`, `4`, `4`, `5`,
113	`2`, `3`, `3`, `4`, `3`, `4`, `4`, `5`, `3`, `4`, `4`, `5`, `4`, `5`, `5`, `6`,
114	`2`, `3`, `3`, `4`, `3`, `4`, `4`, `5`, `3`, `4`, `4`, `5`, `4`, `5`, `5`, `6`,
115	`3`, `4`, `4`, `5`, `4`, `5`, `5`, `6`, `4`, `5`, `5`, `6`, `5`, `6`, `6`, `7`,
116	`2`, `3`, `3`, `4`, `3`, `4`, `4`, `5`, `3`, `4`, `4`, `5`, `4`, `5`, `5`, `6`,
117	`3`, `4`, `4`, `5`, `4`, `5`, `5`, `6`, `4`, `5`, `5`, `6`, `5`, `6`, `6`, `7`,
118	`3`, `4`, `4`, `5`, `4`, `5`, `5`, `6`, `4`, `5`, `5`, `6`, `5`, `6`, `6`, `7`,
119	`4`, `5`, `5`, `6`, `5`, `6`, `6`, `7`, `5`, `6`, `6`, `7`, `6`, `7`, `7`, `8`
120	};
121
122	static inline void setCoOccurence(CoMatrix &m, char c, char d)
123	{
124	int k = indexOf[(uchar) c] + `20` * indexOf[(uchar) d];
125	m.b[k >> `3`] \|= (`1` << (k & `0x7`));
126	}
127
128	CoMatrix::CoMatrix(const QString &str)
129	{
130	QByteArray ba = str.toUtf8();
131	const char *text = ba.constData();
132	char c = `'\0'`, d;
133	memset( s: b, c: `0`, n: `52` );
134	/*
135	The Knuth books are not in the office only for show; they help make
136	loops 30% faster and 20% as readable.
137	*/
138	while ( (d = *text) != `'\0'` ) {
139	setCoOccurence(m&: *this, c, d);
140	if ( (c = *++text) != `'\0'` ) {
141	setCoOccurence(m&: *this, c: d, d: c);
142	text++;
143	}
144	}
145	}
146
147	static inline int worth(const CoMatrix &m)
148	{
149	int w = `0`;
150	for (int i = `0`; i < `50`; i++)
151	w += bitCount[m.b[i]];
152	return w;
153	}
154
155	static inline CoMatrix reunion(const CoMatrix &m, const CoMatrix &n)
156	{
157	CoMatrix p;
158	for (int i = `0`; i < `13`; ++i)
159	p.w[i] = m.w[i] \| n.w[i];
160	return p;
161	}
162
163	static inline CoMatrix intersection(const CoMatrix &m, const CoMatrix &n)
164	{
165	CoMatrix p;
166	for (int i = `0`; i < `13`; ++i)
167	p.w[i] = m.w[i] & n.w[i];
168	return p;
169	}
170
171	StringSimilarityMatcher::StringSimilarityMatcher(const QString &stringToMatch)
172	: m_cm (stringToMatch)
173	{
174	m_length = stringToMatch.length();
175	}
176
177	int StringSimilarityMatcher::getSimilarityScore(const QString &strCandidate)
178	{
179	CoMatrix cmTarget(strCandidate);
180	int delta = qAbs(t: m_length - strCandidate.size());
181	int score = ( (worth(m: intersection(m: m_cm, n: cmTarget)) + `1`) << `10` ) /
182	( worth(m: reunion(m: m_cm, n: cmTarget)) + (delta << `1`) + `1` );
183	return score;
184	}
185
186	CandidateList similarTextHeuristicCandidates(const Translator *tor,
187	const QString &text, int maxCandidates)
188	{
189	QList<int> scores;
190	CandidateList candidates;
191	StringSimilarityMatcher matcher(text);
192
193	foreach (const TranslatorMessage &mtm, tor->messages()) {
194	if (mtm.type() == TranslatorMessage::Unfinished
195	\|\| mtm.translation().isEmpty())
196	continue;
197
198	QString s = mtm.sourceText();
199	int score = matcher.getSimilarityScore(strCandidate: s);
200
201	if (candidates.size() == maxCandidates && score > scores [maxCandidates - `1`] )
202	candidates.removeLast();
203
204	if (candidates.size() < maxCandidates && score >= textSimilarityThreshold) {
205	Candidate cand(mtm.context(), s, mtm.comment(), mtm.translation());
206
207	int i;
208	for (i = `0`; i < candidates.size(); i++) {
209	if (score >= scores.at(i)) {
210	if (score == scores.at(i)) {
211	if (candidates.at(i) == cand)
212	goto continue_outer_loop;
213	} else {
214	break;
215	}
216	}
217	}
218	scores.insert(i, t: score);
219	candidates.insert(i, t: cand);
220	}
221	continue_outer_loop:
222	;
223	}
224	return candidates;
225	}
226
227	QT_END_NAMESPACE
228

source code of qttools/src/linguist/shared/simtexth.cpp