clang-tools  7.0.0
Trigram.cpp
Go to the documentation of this file.
1 //===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 
10 #include "Trigram.h"
11 #include "../../FuzzyMatch.h"
12 #include "Token.h"
13 
14 #include "llvm/ADT/ArrayRef.h"
15 #include "llvm/ADT/DenseSet.h"
16 #include "llvm/ADT/StringExtras.h"
17 
18 #include <cctype>
19 #include <queue>
20 #include <string>
21 
22 using namespace llvm;
23 
24 namespace clang {
25 namespace clangd {
26 namespace dex {
27 
28 // FIXME(kbobyrev): Deal with short symbol symbol names. A viable approach would
29 // be generating unigrams and bigrams here, too. This would prevent symbol index
30 // from applying fuzzy matching on a tremendous number of symbols and allow
31 // supplementary retrieval for short queries.
32 //
33 // Short names (total segment length <3 characters) are currently ignored.
34 std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
35  // Apply fuzzy matching text segmentation.
36  std::vector<CharRole> Roles(Identifier.size());
37  calculateRoles(Identifier,
38  llvm::makeMutableArrayRef(Roles.data(), Identifier.size()));
39 
40  std::string LowercaseIdentifier = Identifier.lower();
41 
42  // For each character, store indices of the characters to which fuzzy matching
43  // algorithm can jump. There are 3 possible variants:
44  //
45  // * Next Tail - next character from the same segment
46  // * Next Head - front character of the next segment
47  // * Skip-1-Next Head - front character of the skip-1-next segment
48  //
49  // Next stores tuples of three indices in the presented order, if a variant is
50  // not available then 0 is stored.
51  std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size());
52  unsigned NextTail = 0, NextHead = 0, NextNextHead = 0;
53  for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
54  Next[I] = {{NextTail, NextHead, NextNextHead}};
55  NextTail = Roles[I] == Tail ? I : 0;
56  if (Roles[I] == Head) {
57  NextNextHead = NextHead;
58  NextHead = I;
59  }
60  }
61 
62  DenseSet<Token> UniqueTrigrams;
63  std::array<char, 4> Chars;
64  for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
65  // Skip delimiters.
66  if (Roles[I] != Head && Roles[I] != Tail)
67  continue;
68  for (const unsigned J : Next[I]) {
69  if (!J)
70  continue;
71  for (const unsigned K : Next[J]) {
72  if (!K)
73  continue;
74  Chars = {{LowercaseIdentifier[I], LowercaseIdentifier[J],
75  LowercaseIdentifier[K], 0}};
76  auto Trigram = Token(Token::Kind::Trigram, Chars.data());
77  // Push unique trigrams to the result.
78  if (!UniqueTrigrams.count(Trigram)) {
79  UniqueTrigrams.insert(Trigram);
80  }
81  }
82  }
83  }
84 
85  std::vector<Token> Result;
86  for (const auto &Trigram : UniqueTrigrams)
87  Result.push_back(Trigram);
88 
89  return Result;
90 }
91 
92 // FIXME(kbobyrev): Similarly, to generateIdentifierTrigrams, this ignores short
93 // inputs (total segment length <3 characters).
94 std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
95  // Apply fuzzy matching text segmentation.
96  std::vector<CharRole> Roles(Query.size());
97  calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
98 
99  std::string LowercaseQuery = Query.lower();
100 
101  DenseSet<Token> UniqueTrigrams;
102  std::deque<char> Chars;
103 
104  for (size_t I = 0; I < LowercaseQuery.size(); ++I) {
105  // If current symbol is delimiter, just skip it.
106  if (Roles[I] != Head && Roles[I] != Tail)
107  continue;
108 
109  Chars.push_back(LowercaseQuery[I]);
110 
111  if (Chars.size() > 3)
112  Chars.pop_front();
113  if (Chars.size() == 3) {
114  auto Trigram =
115  Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars)));
116  // Push unique trigrams to the result.
117  if (!UniqueTrigrams.count(Trigram)) {
118  UniqueTrigrams.insert(Trigram);
119  }
120  }
121  }
122 
123  std::vector<Token> Result;
124  for (const auto &Trigram : UniqueTrigrams)
125  Result.push_back(Trigram);
126 
127  return Result;
128 }
129 
130 } // namespace dex
131 } // namespace clangd
132 } // namespace clang
Some operations such as code completion produce a set of candidates.
std::vector< Token > generateIdentifierTrigrams(llvm::StringRef Identifier)
Returns list of unique fuzzy-search trigrams from unqualified symbol.
Definition: Trigram.cpp:34
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
Definition: Token.h:40
std::vector< Token > generateQueryTrigrams(llvm::StringRef Query)
Returns list of unique fuzzy-search trigrams given a query.
Definition: Trigram.cpp:94
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
CharTypeSet calculateRoles(StringRef Text, MutableArrayRef< CharRole > Roles)
Definition: FuzzyMatch.cpp:156