1/**
2 * parsetrigrams.cpp
3 *
4 * Parse a corpus of data and generate trigrams
5 *
6 * SPDX-FileCopyrightText: 2013 Martin Sandsmark <martin.sandsmark@kde.org>
7 *
8 * SPDX-License-Identifier: LGPL-2.1-or-later
9 */
10
11#include "guesslanguage.h"
12
13#include <QDebug>
14#include <QFile>
15#include <QHash>
16#include <QString>
17
18int main(int argc, char *argv[])
19{
20 if (argc < 3) {
21 qWarning() << argv[0] << "corpus.txt outfile.trigram";
22 return -1;
23 }
24
25 QFile file(QString::fromLocal8Bit(ba: argv[1]));
26 if (!file.open(flags: QIODevice::ReadOnly | QFile::Text)) {
27 qWarning() << "Unable to open corpus:" << argv[1];
28 return -1;
29 }
30 QTextStream stream(&file);
31
32 QFile outFile(QString::fromLocal8Bit(ba: argv[2]));
33 if (!outFile.open(flags: QIODevice::WriteOnly)) {
34 qWarning() << "Unable to open output file" << argv[2];
35 return -1;
36 }
37
38 QHash<QString, int> model;
39 qDebug() << "Reading in" << file.size() << "bytes";
40 QString trigram = stream.read(maxlen: 3);
41 QString contents = stream.readAll();
42 qDebug() << "finished reading!";
43 qDebug() << "Building model...";
44 for (int i = 0; i < contents.size(); i++) {
45 if (!contents[i].isPrint()) {
46 continue;
47 }
48 model[trigram]++;
49 trigram[0] = trigram[1];
50 trigram[1] = trigram[2];
51 trigram[2] = contents[i];
52 }
53 qDebug() << "model built!";
54
55 qDebug() << "Sorting...";
56 QMultiMap<int, QString> orderedTrigrams;
57
58 for (auto it = model.cbegin(); it != model.cend(); ++it) {
59 const QString data = it.key();
60 Q_ASSERT(data.size() >= 3);
61 bool hasTwoSpaces = ((data.size() > 1 && data[0].isSpace() && data[1].isSpace()) //
62 || (data.size() > 2 && data[1].isSpace() && data[2].isSpace()));
63
64 if (!hasTwoSpaces) {
65 orderedTrigrams.insert(key: it.value(), value: data);
66 }
67 }
68
69 qDebug() << "Sorted!";
70
71 qDebug() << "Weeding out...";
72
73 auto i = orderedTrigrams.begin();
74 while (orderedTrigrams.size() > Sonnet::MAXGRAMS) {
75 i = orderedTrigrams.erase(it: i);
76 }
77 qDebug() << "Weeded!";
78
79 qDebug() << "Storing...";
80 i = orderedTrigrams.end();
81 int count = 0;
82 QTextStream outStream(&outFile);
83
84 while (i != orderedTrigrams.begin()) {
85 --i;
86 outStream << *i << "\t\t\t" << count++ << '\n';
87 }
88}
89

source code of sonnet/data/gentrigrams.cpp