| 1 | /** |
| 2 | * parsetrigrams.cpp |
| 3 | * |
| 4 | * Parse a corpus of data and generate trigrams |
| 5 | * |
| 6 | * SPDX-FileCopyrightText: 2013 Martin Sandsmark <martin.sandsmark@kde.org> |
| 7 | * |
| 8 | * SPDX-License-Identifier: LGPL-2.1-or-later |
| 9 | */ |
| 10 | |
| 11 | #include "guesslanguage.h" |
| 12 | |
| 13 | #include <QDebug> |
| 14 | #include <QFile> |
| 15 | #include <QHash> |
| 16 | #include <QString> |
| 17 | |
| 18 | int main(int argc, char *argv[]) |
| 19 | { |
| 20 | if (argc < 3) { |
| 21 | qWarning() << argv[0] << "corpus.txt outfile.trigram" ; |
| 22 | return -1; |
| 23 | } |
| 24 | |
| 25 | QFile file(QString::fromLocal8Bit(ba: argv[1])); |
| 26 | if (!file.open(flags: QIODevice::ReadOnly | QFile::Text)) { |
| 27 | qWarning() << "Unable to open corpus:" << argv[1]; |
| 28 | return -1; |
| 29 | } |
| 30 | QTextStream stream(&file); |
| 31 | |
| 32 | QFile outFile(QString::fromLocal8Bit(ba: argv[2])); |
| 33 | if (!outFile.open(flags: QIODevice::WriteOnly)) { |
| 34 | qWarning() << "Unable to open output file" << argv[2]; |
| 35 | return -1; |
| 36 | } |
| 37 | |
| 38 | QHash<QString, int> model; |
| 39 | qDebug() << "Reading in" << file.size() << "bytes" ; |
| 40 | QString trigram = stream.read(maxlen: 3); |
| 41 | QString contents = stream.readAll(); |
| 42 | qDebug() << "finished reading!" ; |
| 43 | qDebug() << "Building model..." ; |
| 44 | for (int i = 0; i < contents.size(); i++) { |
| 45 | if (!contents[i].isPrint()) { |
| 46 | continue; |
| 47 | } |
| 48 | model[trigram]++; |
| 49 | trigram[0] = trigram[1]; |
| 50 | trigram[1] = trigram[2]; |
| 51 | trigram[2] = contents[i]; |
| 52 | } |
| 53 | qDebug() << "model built!" ; |
| 54 | |
| 55 | qDebug() << "Sorting..." ; |
| 56 | QMultiMap<int, QString> orderedTrigrams; |
| 57 | |
| 58 | for (auto it = model.cbegin(); it != model.cend(); ++it) { |
| 59 | const QString data = it.key(); |
| 60 | Q_ASSERT(data.size() >= 3); |
| 61 | bool hasTwoSpaces = ((data.size() > 1 && data[0].isSpace() && data[1].isSpace()) // |
| 62 | || (data.size() > 2 && data[1].isSpace() && data[2].isSpace())); |
| 63 | |
| 64 | if (!hasTwoSpaces) { |
| 65 | orderedTrigrams.insert(key: it.value(), value: data); |
| 66 | } |
| 67 | } |
| 68 | |
| 69 | qDebug() << "Sorted!" ; |
| 70 | |
| 71 | qDebug() << "Weeding out..." ; |
| 72 | |
| 73 | auto i = orderedTrigrams.begin(); |
| 74 | while (orderedTrigrams.size() > Sonnet::MAXGRAMS) { |
| 75 | i = orderedTrigrams.erase(it: i); |
| 76 | } |
| 77 | qDebug() << "Weeded!" ; |
| 78 | |
| 79 | qDebug() << "Storing..." ; |
| 80 | i = orderedTrigrams.end(); |
| 81 | int count = 0; |
| 82 | QTextStream outStream(&outFile); |
| 83 | |
| 84 | while (i != orderedTrigrams.begin()) { |
| 85 | --i; |
| 86 | outStream << *i << "\t\t\t" << count++ << '\n'; |
| 87 | } |
| 88 | } |
| 89 | |