clang-tools  7.0.0
FuzzyMatch.h
Go to the documentation of this file.
1 //===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements fuzzy-matching of strings against identifiers.
11 // It indicates both the existence and quality of a match:
12 // 'eb' matches both 'emplace_back' and 'embed', the former has a better score.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
17 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
18 
19 #include "llvm/ADT/ArrayRef.h"
20 #include "llvm/ADT/Optional.h"
21 #include "llvm/ADT/SmallString.h"
22 #include "llvm/ADT/StringRef.h"
23 #include "llvm/Support/raw_ostream.h"
24 
25 namespace clang {
26 namespace clangd {
27 
28 // Utilities for word segmentation.
29 // FuzzyMatcher already incorporates this logic, so most users don't need this.
30 //
31 // A name like "fooBar_baz" consists of several parts: foo, bar, baz.
32 // Aligning segmentation of word and pattern improves the fuzzy-match.
33 // For example: [lol] matches "LaughingOutLoud" better than "LionPopulation"
34 //
35 // First we classify each character into types (uppercase, lowercase, etc).
36 // Then we look at the sequence: e.g. [upper, lower] is the start of a segment.
37 
38 // We distinguish the types of characters that affect segmentation.
39 // It's not obvious how to segment digits; we treat them as lowercase letters.
40 // As we don't decode UTF-8, we treat bytes over 127 as lowercase too.
41 // This means we require an exact (case-sensitive) match for those characters.
42 enum CharType : unsigned char { // Values double as bit positions in a CharTypeSet.
43  Empty = 0, // Before-the-start and after-the-end (and control chars).
44  Lower = 1, // Lowercase letters, digits, and non-ASCII bytes.
45  Upper = 2, // Uppercase letters.
46  Punctuation = 3, // ASCII punctuation (including space).
47 };
48 // A CharTypeSet is a bitmask representing all the character types in a word.
49 // Bit (1 << T) stands for CharType T, i.e. its bits are 1<<Empty, 1<<Lower, etc.
50 using CharTypeSet = unsigned char;
51 
52 // Each character's Role is the Head or Tail of a segment, or a Separator.
53 // e.g. XMLHttpRequest_Async
54 //      +--+---+------ +----
55 //      ^Head   ^Tail ^Separator
56 enum CharRole : unsigned char {
57  Unknown = 0, // Stray control characters or impossible states.
58  Tail = 1, // Part of a word segment, but not the first character.
59  Head = 2, // The first character of a word segment.
60  Separator = 3, // Punctuation characters that separate word segments.
61 };
62 
63 // Compute segmentation of Text, producing one CharRole per character.
64 // Character roles are stored in Roles (Roles.size() must equal Text.size()).
65 // Returns the set of character types encountered; this may inform
66 // heuristics for dealing with poorly-segmented identifiers like "strndup".
67 CharTypeSet calculateRoles(llvm::StringRef Text,
68  llvm::MutableArrayRef<CharRole> Roles);
69 
70 // A matcher capable of matching and scoring strings against a single pattern.
71 // It's optimized for matching against many strings - match() does not allocate.
72 class FuzzyMatcher {
73 public:
74  // Builds a matcher for Pattern. Characters beyond MaxPat are ignored.
75  FuzzyMatcher(llvm::StringRef Pattern);
76 
77  // If Word matches the pattern, return a score indicating the match quality.
78  // Scores usually fall in a [0,1] range, with 1 being a very good score.
79  // "Super" scores in (1,2] are possible if the pattern is the full word.
80  // Characters beyond MaxWord are ignored.
81  llvm::Optional<float> match(llvm::StringRef Word); // Not const: rewrites the per-word state below.
82 
83  llvm::StringRef pattern() const { return llvm::StringRef(Pat, PatN); } // The stored pattern (first PatN chars of Pat).
84  bool empty() const { return PatN == 0; } // True if the stored pattern has no characters.
85 
86  // Dump internal state from the last match() to the stream, for debugging.
87  // Returns the word with [] around the matched characters, e.g.
88  // [u_p] + "unique_ptr" --> "[u]nique[_p]tr"
89  llvm::SmallString<256> dumpLast(llvm::raw_ostream &) const;
90 
91 private:
92  // We truncate the pattern and the word to bound the cost of matching.
93  constexpr static int MaxPat = 63, MaxWord = 127;
94  // Action describes how a word character was matched to the pattern.
95  // It should be an enum, but this causes bitfield problems:
96  // - for MSVC the enum type must be explicitly unsigned for correctness
97  // - GCC 4.8 complains not all values fit if the type is unsigned
98  using Action = bool;
99  constexpr static Action Miss = false; // Word character was skipped.
100  constexpr static Action Match = true; // Matched against a pattern character.
101 
102  bool init(llvm::StringRef Word); // Sets up most of the per-word data below.
103  void buildGraph(); // Repopulates the Scores table for the current word.
104  bool allowMatch(int P, int W, Action Last) const; // NOTE(review): P/W presumably index Pat/Word - confirm in FuzzyMatch.cpp.
105  int skipPenalty(int W, Action Last) const;
106  int matchBonus(int P, int W, Action Last) const;
107 
108  // Pattern data is initialized by the constructor, then constant.
109  char Pat[MaxPat]; // Pattern data
110  int PatN; // Length
111  char LowPat[MaxPat]; // Pattern in lowercase
112  CharRole PatRole[MaxPat]; // Pattern segmentation info
113  CharTypeSet PatTypeSet; // Bitmask of 1<<CharType for all Pattern characters
114  float ScoreScale; // Normalizes scores for the pattern length.
115 
116  // Word data is initialized on each call to match(), mostly by init().
117  char Word[MaxWord]; // Word data
118  int WordN; // Length
119  char LowWord[MaxWord]; // Word in lowercase
120  CharRole WordRole[MaxWord]; // Word segmentation info
121  CharTypeSet WordTypeSet; // Bitmask of 1<<CharType for all Word characters
122  bool WordContainsPattern; // Simple substring check
123 
124  // Cumulative best-match score table.
125  // Boundary conditions are filled in by the constructor.
126  // The rest is repopulated for each match(), by buildGraph().
127  struct ScoreInfo {
128  signed int Score : 15; // Cumulative score, stored as a 15-bit signed field.
129  Action Prev : 1; // NOTE(review): presumably the Action preceding this cell on the best path - confirm.
130  };
131  ScoreInfo Scores[MaxPat + 1][MaxWord + 1][/* Last Action */ 2]; // Dimensions: (MaxPat+1) x (MaxWord+1) x {Miss, Match}.
132 };
133 
134 } // namespace clangd
135 } // namespace clang
136 
137 #endif
unsigned char CharTypeSet
Definition: FuzzyMatch.h:50
FuzzyMatcher(llvm::StringRef Pattern)
Definition: FuzzyMatch.cpp:78
llvm::SmallString< 256 > dumpLast(llvm::raw_ostream &) const
Definition: FuzzyMatch.cpp:302
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
llvm::Optional< float > match(llvm::StringRef Word)
Definition: FuzzyMatch.cpp:94
llvm::StringRef pattern() const
Definition: FuzzyMatch.h:83
CharTypeSet calculateRoles(StringRef Text, MutableArrayRef< CharRole > Roles)
Definition: FuzzyMatch.cpp:156