| 1 | /* |
| 2 | SPDX-FileCopyrightText: 2022 Waqar Ahmed <waqar.17a@gmail.com> |
| 3 | |
| 4 | SPDX-License-Identifier: LGPL-2.0-or-later |
| 5 | */ |
| 6 | #include "kateindentdetecter.h" |
| 7 | |
| 8 | #include "katedocument.h" |
| 9 | |
| 10 | KateIndentDetecter::KateIndentDetecter(KTextEditor::DocumentPrivate *doc) |
| 11 | : m_doc(doc) |
| 12 | { |
| 13 | } |
| 14 | |
/**
 * Outcome of comparing the indentation of two lines.
 */
struct SpacesDiffResult {
    /// Indentation step between the two lines; 0 when none could be determined.
    int spacesDiff{0};
    /// True when the difference looks like manual alignment rather than indentation.
    bool looksLikeAlignment{false};
};
| 19 | |
| 20 | static SpacesDiffResult spacesDiff(const QString &a, int aLength, const QString &b, int bLength) |
| 21 | { |
| 22 | SpacesDiffResult result; |
| 23 | result.spacesDiff = 0; |
| 24 | result.looksLikeAlignment = false; |
| 25 | |
| 26 | // This can go both ways (e.g.): |
| 27 | // - a: "\t" |
| 28 | // - b: "\t " |
| 29 | // => This should count 1 tab and 4 spaces |
| 30 | |
| 31 | int i = 0; |
| 32 | |
| 33 | for (i = 0; i < aLength && i < bLength; i++) { |
| 34 | const auto aCharCode = a.at(i); |
| 35 | const auto bCharCode = b.at(i); |
| 36 | |
| 37 | if (aCharCode != bCharCode) { |
| 38 | break; |
| 39 | } |
| 40 | } |
| 41 | |
| 42 | int aSpacesCnt = 0; |
| 43 | int aTabsCount = 0; |
| 44 | for (int j = i; j < aLength; j++) { |
| 45 | const auto aCharCode = a.at(i: j); |
| 46 | if (aCharCode == QLatin1Char(' ')) { |
| 47 | aSpacesCnt++; |
| 48 | } else { |
| 49 | aTabsCount++; |
| 50 | } |
| 51 | } |
| 52 | |
| 53 | int bSpacesCnt = 0; |
| 54 | int bTabsCount = 0; |
| 55 | for (int j = i; j < bLength; j++) { |
| 56 | const auto bCharCode = b.at(i: j); |
| 57 | if (bCharCode == QLatin1Char(' ')) { |
| 58 | bSpacesCnt++; |
| 59 | } else { |
| 60 | bTabsCount++; |
| 61 | } |
| 62 | } |
| 63 | |
| 64 | if (aSpacesCnt > 0 && aTabsCount > 0) { |
| 65 | return result; |
| 66 | } |
| 67 | if (bSpacesCnt > 0 && bTabsCount > 0) { |
| 68 | return result; |
| 69 | } |
| 70 | |
| 71 | const auto tabsDiff = std::abs(x: aTabsCount - bTabsCount); |
| 72 | const auto spacesDiff = std::abs(x: aSpacesCnt - bSpacesCnt); |
| 73 | |
| 74 | if (tabsDiff == 0) { |
| 75 | // check if the indentation difference might be caused by alignment reasons |
| 76 | // sometime folks like to align their code, but this should not be used as a hint |
| 77 | result.spacesDiff = spacesDiff; |
| 78 | |
| 79 | if (spacesDiff > 0 && 0 <= bSpacesCnt - 1 && bSpacesCnt - 1 < a.length() && bSpacesCnt < b.length()) { |
| 80 | if (b.at(i: bSpacesCnt) != QLatin1Char(' ') && a.at(i: bSpacesCnt - 1) == QLatin1Char(' ')) { |
| 81 | if (a.at(i: a.length() - 1) == QLatin1Char(',')) { |
| 82 | // This looks like an alignment desire: e.g. |
| 83 | // const a = b + c, |
| 84 | // d = b - c; |
| 85 | result.looksLikeAlignment = true; |
| 86 | } |
| 87 | } |
| 88 | } |
| 89 | return result; |
| 90 | } |
| 91 | if (spacesDiff % tabsDiff == 0) { |
| 92 | result.spacesDiff = spacesDiff / tabsDiff; |
| 93 | return result; |
| 94 | } |
| 95 | return result; |
| 96 | } |
| 97 | |
| 98 | KateIndentDetecter::Result KateIndentDetecter::detect(int defaultTabSize, bool defaultInsertSpaces) |
| 99 | { |
| 100 | // Look at most at the first 10k lines |
| 101 | const int linesCount = std::min(a: m_doc->lines(), b: 10000); |
| 102 | |
| 103 | int linesIndentedWithTabsCount = 0; // number of lines that contain at least one tab in indentation |
| 104 | int linesIndentedWithSpacesCount = 0; // number of lines that contain only spaces in indentation |
| 105 | |
| 106 | QString previousLineText; // content of latest line that contained non-whitespace chars |
| 107 | int previousLineIndentation = 0; // index at which latest line contained the first non-whitespace char |
| 108 | |
| 109 | constexpr int ALLOWED_TAB_SIZE_GUESSES[] = {2, 4, 6, 8, 3, 5, 7}; // prefer even guesses for `tabSize`, limit to [2, 8]. |
| 110 | constexpr int MAX_ALLOWED_TAB_SIZE_GUESS = 8; // max(ALLOWED_TAB_SIZE_GUESSES) = 8 |
| 111 | |
| 112 | int spacesDiffCount[MAX_ALLOWED_TAB_SIZE_GUESS + 1] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; // `tabSize` scores |
| 113 | SpacesDiffResult tmp; |
| 114 | |
| 115 | for (int lineNumber = 0; lineNumber < linesCount; lineNumber++) { |
| 116 | const QString currentLineText = m_doc->line(line: lineNumber); |
| 117 | const int currentLineLength = currentLineText.length(); |
| 118 | |
| 119 | bool currentLineHasContent = false; // does `currentLineText` contain non-whitespace chars |
| 120 | int currentLineIndentation = 0; // index at which `currentLineText` contains the first non-whitespace char |
| 121 | int currentLineSpacesCount = 0; // count of spaces found in `currentLineText` indentation |
| 122 | int currentLineTabsCount = 0; // count of tabs found in `currentLineText` indentation |
| 123 | for (int j = 0, lenJ = currentLineLength; j < lenJ; j++) { |
| 124 | const auto charCode = currentLineText.at(i: j); |
| 125 | |
| 126 | if (charCode == QLatin1Char('\t')) { |
| 127 | currentLineTabsCount++; |
| 128 | } else if (charCode == QLatin1Char(' ')) { |
| 129 | currentLineSpacesCount++; |
| 130 | } else { |
| 131 | // Hit non whitespace character on this line |
| 132 | currentLineHasContent = true; |
| 133 | currentLineIndentation = j; |
| 134 | break; |
| 135 | } |
| 136 | } |
| 137 | |
| 138 | // Ignore empty or only whitespace lines |
| 139 | if (!currentLineHasContent) { |
| 140 | continue; |
| 141 | } |
| 142 | |
| 143 | if (currentLineTabsCount > 0) { |
| 144 | linesIndentedWithTabsCount++; |
| 145 | } else if (currentLineSpacesCount > 1) { |
| 146 | linesIndentedWithSpacesCount++; |
| 147 | } |
| 148 | |
| 149 | tmp = spacesDiff(a: previousLineText, aLength: previousLineIndentation, b: currentLineText, bLength: currentLineIndentation); |
| 150 | |
| 151 | if (tmp.looksLikeAlignment) { |
| 152 | // if defaultInsertSpaces === true && the spaces count == tabSize, we may want to count it as valid indentation |
| 153 | // |
| 154 | // - item1 |
| 155 | // - item2 |
| 156 | // |
| 157 | // otherwise skip this line entirely |
| 158 | // |
| 159 | // const a = 1, |
| 160 | // b = 2; |
| 161 | |
| 162 | if (!(defaultInsertSpaces && defaultTabSize == tmp.spacesDiff)) { |
| 163 | continue; |
| 164 | } |
| 165 | } |
| 166 | |
| 167 | const int currentSpacesDiff = tmp.spacesDiff; |
| 168 | if (currentSpacesDiff <= MAX_ALLOWED_TAB_SIZE_GUESS) { |
| 169 | spacesDiffCount[currentSpacesDiff]++; |
| 170 | } |
| 171 | |
| 172 | previousLineText = currentLineText; |
| 173 | previousLineIndentation = currentLineIndentation; |
| 174 | } |
| 175 | |
| 176 | bool insertSpaces = defaultInsertSpaces; |
| 177 | if (linesIndentedWithTabsCount != linesIndentedWithSpacesCount) { |
| 178 | insertSpaces = (linesIndentedWithTabsCount < linesIndentedWithSpacesCount); |
| 179 | } |
| 180 | |
| 181 | int tabSize = defaultTabSize; |
| 182 | |
| 183 | // Guess tabSize only if inserting spaces... |
| 184 | if (insertSpaces) { |
| 185 | int tabSizeScore = 0; |
| 186 | for (const int possibleTabSize : ALLOWED_TAB_SIZE_GUESSES) { |
| 187 | // prefer multiples of two, if ever found one, see bug 474505 and autotests/input/indent_detect/bogus7spaces.md |
| 188 | const int possibleTabSizeScore = spacesDiffCount[possibleTabSize]; |
| 189 | if (possibleTabSizeScore > tabSizeScore && (possibleTabSize % 2 == 0 || tabSizeScore == 0)) { |
| 190 | tabSizeScore = possibleTabSizeScore; |
| 191 | tabSize = possibleTabSize; |
| 192 | } |
| 193 | } |
| 194 | |
| 195 | // Let a tabSize of 2 win even if it is not the maximum |
| 196 | // (only in case 4 was guessed) |
| 197 | if (tabSize == 4 && spacesDiffCount[4] > 0 && spacesDiffCount[2] > 0 && spacesDiffCount[2] >= spacesDiffCount[4] / 2) { |
| 198 | tabSize = 2; |
| 199 | } |
| 200 | |
| 201 | // If no indent detected, check if the file is 1 space indented |
| 202 | if (tabSizeScore == 0) { |
| 203 | const auto it = std::max_element(first: spacesDiffCount, last: spacesDiffCount + 9); |
| 204 | const auto maxIdx = std::distance(first: spacesDiffCount, last: it); |
| 205 | if (maxIdx == 1) { |
| 206 | tabSize = 1; |
| 207 | } |
| 208 | } |
| 209 | } |
| 210 | |
| 211 | return {.indentWidth = tabSize, .indentUsingSpaces = insertSpaces}; |
| 212 | } |
| 213 | |