1 | /** |
2 | * parsetrigrams.cpp |
3 | * |
4 | * Parse a corpus of data and generate trigrams |
5 | * |
6 | * SPDX-FileCopyrightText: 2013 Martin Sandsmark <martin.sandsmark@kde.org> |
7 | * |
8 | * SPDX-License-Identifier: LGPL-2.1-or-later |
9 | */ |
10 | |
11 | #include "guesslanguage.h" |
12 | |
13 | #include <QDebug> |
14 | #include <QFile> |
15 | #include <QHash> |
16 | #include <QString> |
17 | |
18 | int main(int argc, char *argv[]) |
19 | { |
20 | if (argc < 3) { |
21 | qWarning() << argv[0] << "corpus.txt outfile.trigram" ; |
22 | return -1; |
23 | } |
24 | |
25 | QFile file(QString::fromLocal8Bit(ba: argv[1])); |
26 | if (!file.open(flags: QIODevice::ReadOnly | QFile::Text)) { |
27 | qWarning() << "Unable to open corpus:" << argv[1]; |
28 | return -1; |
29 | } |
30 | QTextStream stream(&file); |
31 | |
32 | QFile outFile(QString::fromLocal8Bit(ba: argv[2])); |
33 | if (!outFile.open(flags: QIODevice::WriteOnly)) { |
34 | qWarning() << "Unable to open output file" << argv[2]; |
35 | return -1; |
36 | } |
37 | |
38 | QHash<QString, int> model; |
39 | qDebug() << "Reading in" << file.size() << "bytes" ; |
40 | QString trigram = stream.read(maxlen: 3); |
41 | QString contents = stream.readAll(); |
42 | qDebug() << "finished reading!" ; |
43 | qDebug() << "Building model..." ; |
44 | for (int i = 0; i < contents.size(); i++) { |
45 | if (!contents[i].isPrint()) { |
46 | continue; |
47 | } |
48 | model[trigram]++; |
49 | trigram[0] = trigram[1]; |
50 | trigram[1] = trigram[2]; |
51 | trigram[2] = contents[i]; |
52 | } |
53 | qDebug() << "model built!" ; |
54 | |
55 | qDebug() << "Sorting..." ; |
56 | QMultiMap<int, QString> orderedTrigrams; |
57 | |
58 | for (auto it = model.cbegin(); it != model.cend(); ++it) { |
59 | const QString data = it.key(); |
60 | Q_ASSERT(data.size() >= 3); |
61 | bool hasTwoSpaces = ((data.size() > 1 && data[0].isSpace() && data[1].isSpace()) // |
62 | || (data.size() > 2 && data[1].isSpace() && data[2].isSpace())); |
63 | |
64 | if (!hasTwoSpaces) { |
65 | orderedTrigrams.insert(key: it.value(), value: data); |
66 | } |
67 | } |
68 | |
69 | qDebug() << "Sorted!" ; |
70 | |
71 | qDebug() << "Weeding out..." ; |
72 | |
73 | auto i = orderedTrigrams.begin(); |
74 | while (orderedTrigrams.size() > Sonnet::MAXGRAMS) { |
75 | i = orderedTrigrams.erase(it: i); |
76 | } |
77 | qDebug() << "Weeded!" ; |
78 | |
79 | qDebug() << "Storing..." ; |
80 | i = orderedTrigrams.end(); |
81 | int count = 0; |
82 | QTextStream outStream(&outFile); |
83 | |
84 | while (i != orderedTrigrams.begin()) { |
85 | --i; |
86 | outStream << *i << "\t\t\t" << count++ << '\n'; |
87 | } |
88 | } |
89 | |