| 1 | //===--- FuzzySymbolIndex.cpp - Lookup symbols for autocomplete -*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | #include "FuzzySymbolIndex.h" |
| 9 | #include "llvm/Support/Regex.h" |
| 10 | |
| 11 | using clang::find_all_symbols::SymbolAndSignals; |
| 12 | using llvm::StringRef; |
| 13 | |
| 14 | namespace clang { |
| 15 | namespace include_fixer { |
| 16 | namespace { |
| 17 | |
| 18 | class MemSymbolIndex : public FuzzySymbolIndex { |
| 19 | public: |
| 20 | MemSymbolIndex(std::vector<SymbolAndSignals> Symbols) { |
| 21 | for (auto &Symbol : Symbols) { |
| 22 | auto Tokens = tokenize(Text: Symbol.Symbol.getName()); |
| 23 | this->Symbols.emplace_back( |
| 24 | args: StringRef(llvm::join(Begin: Tokens.begin(), End: Tokens.end(), Separator: " " )), |
| 25 | args: std::move(Symbol)); |
| 26 | } |
| 27 | } |
| 28 | |
| 29 | std::vector<SymbolAndSignals> search(StringRef Query) override { |
| 30 | auto Tokens = tokenize(Text: Query); |
| 31 | llvm::Regex Pattern("^" + queryRegexp(Tokens)); |
| 32 | std::vector<SymbolAndSignals> Results; |
| 33 | for (const Entry &E : Symbols) |
| 34 | if (Pattern.match(String: E.first)) |
| 35 | Results.push_back(x: E.second); |
| 36 | return Results; |
| 37 | } |
| 38 | |
| 39 | private: |
| 40 | using Entry = std::pair<llvm::SmallString<32>, SymbolAndSignals>; |
| 41 | std::vector<Entry> Symbols; |
| 42 | }; |
| 43 | |
| 44 | // Helpers for tokenize state machine. |
| 45 | enum TokenizeState { |
| 46 | EMPTY, // No pending characters. |
| 47 | ONE_BIG, // Read one uppercase letter, could be WORD or Word. |
| 48 | BIG_WORD, // Reading an uppercase WORD. |
| 49 | SMALL_WORD, // Reading a lowercase word. |
| 50 | NUMBER // Reading a number. |
| 51 | }; |
| 52 | |
| 53 | enum CharType { UPPER, LOWER, DIGIT, MISC }; |
| 54 | CharType classify(char c) { |
| 55 | if (isupper(c)) |
| 56 | return UPPER; |
| 57 | if (islower(c)) |
| 58 | return LOWER; |
| 59 | if (isdigit(c)) |
| 60 | return DIGIT; |
| 61 | return MISC; |
| 62 | } |
| 63 | |
| 64 | } // namespace |
| 65 | |
| 66 | std::vector<std::string> FuzzySymbolIndex::tokenize(StringRef Text) { |
| 67 | std::vector<std::string> Result; |
| 68 | // State describes the treatment of text from Start to I. |
| 69 | // Once text is Flush()ed into Result, we're done with it and advance Start. |
| 70 | TokenizeState State = EMPTY; |
| 71 | size_t Start = 0; |
| 72 | auto Flush = [&](size_t End) { |
| 73 | if (State != EMPTY) { |
| 74 | Result.push_back(x: Text.substr(Start, N: End - Start).lower()); |
| 75 | State = EMPTY; |
| 76 | } |
| 77 | Start = End; |
| 78 | }; |
| 79 | for (size_t I = 0; I < Text.size(); ++I) { |
| 80 | CharType Type = classify(c: Text[I]); |
| 81 | if (Type == MISC) |
| 82 | Flush(I); |
| 83 | else if (Type == LOWER) |
| 84 | switch (State) { |
| 85 | case BIG_WORD: |
| 86 | Flush(I - 1); // FOOBar: first token is FOO, not FOOB. |
| 87 | [[fallthrough]]; |
| 88 | case ONE_BIG: |
| 89 | State = SMALL_WORD; |
| 90 | [[fallthrough]]; |
| 91 | case SMALL_WORD: |
| 92 | break; |
| 93 | default: |
| 94 | Flush(I); |
| 95 | State = SMALL_WORD; |
| 96 | } |
| 97 | else if (Type == UPPER) |
| 98 | switch (State) { |
| 99 | case ONE_BIG: |
| 100 | State = BIG_WORD; |
| 101 | [[fallthrough]]; |
| 102 | case BIG_WORD: |
| 103 | break; |
| 104 | default: |
| 105 | Flush(I); |
| 106 | State = ONE_BIG; |
| 107 | } |
| 108 | else if (Type == DIGIT && State != NUMBER) { |
| 109 | Flush(I); |
| 110 | State = NUMBER; |
| 111 | } |
| 112 | } |
| 113 | Flush(Text.size()); |
| 114 | return Result; |
| 115 | } |
| 116 | |
| 117 | std::string |
| 118 | FuzzySymbolIndex::queryRegexp(const std::vector<std::string> &Tokens) { |
| 119 | std::string Result; |
| 120 | for (size_t I = 0; I < Tokens.size(); ++I) { |
| 121 | if (I) |
| 122 | Result.append(s: "[[:alnum:]]* " ); |
| 123 | for (size_t J = 0; J < Tokens[I].size(); ++J) { |
| 124 | if (J) |
| 125 | Result.append(s: "([[:alnum:]]* )?" ); |
| 126 | Result.push_back(c: Tokens[I][J]); |
| 127 | } |
| 128 | } |
| 129 | return Result; |
| 130 | } |
| 131 | |
| 132 | llvm::Expected<std::unique_ptr<FuzzySymbolIndex>> |
| 133 | FuzzySymbolIndex::createFromYAML(StringRef FilePath) { |
| 134 | auto Buffer = llvm::MemoryBuffer::getFile(Filename: FilePath, /*IsText=*/true); |
| 135 | if (!Buffer) |
| 136 | return llvm::errorCodeToError(EC: Buffer.getError()); |
| 137 | return std::make_unique<MemSymbolIndex>( |
| 138 | args: find_all_symbols::ReadSymbolInfosFromYAML(Yaml: Buffer.get()->getBuffer())); |
| 139 | } |
| 140 | |
| 141 | } // namespace include_fixer |
| 142 | } // namespace clang |
| 143 | |