17 #include "llvm/ADT/StringSet.h" 18 #include "llvm/Support/ScopedPrinter.h" 31 auto Data = std::make_pair(std::move(Symbols), std::move(Refs));
32 return std::make_unique<Dex>(Data.first, Data.second, Rels, std::move(Data),
39 const Token RestrictedForCodeCompletion =
50 std::vector<Token> generateSearchTokens(
const Symbol &Sym) {
55 for (
const auto &ProximityURI :
59 Result.emplace_back(RestrictedForCodeCompletion);
60 if (!Sym.
Type.empty())
67 void Dex::buildIndex() {
69 std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size());
71 for (
size_t I = 0; I < Symbols.size(); ++I) {
72 const Symbol *Sym = Symbols[I];
73 LookupTable[Sym->
ID] = Sym;
74 ScoredSymbols[I] = {
quality(*Sym), Sym};
79 llvm::sort(ScoredSymbols, std::greater<std::pair<float, const Symbol *>>());
82 SymbolQuality.resize(Symbols.size());
84 for (
size_t I = 0; I < ScoredSymbols.size(); ++I) {
85 SymbolQuality[I] = ScoredSymbols[I].first;
86 Symbols[I] = ScoredSymbols[I].second;
90 llvm::DenseMap<Token, std::vector<DocID>> TempInvertedIndex;
91 for (
DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) {
92 const auto *Sym = Symbols[SymbolRank];
93 for (
const auto &
Token : generateSearchTokens(*Sym))
94 TempInvertedIndex[
Token].push_back(SymbolRank);
98 for (
const auto &TokenToPostingList : TempInvertedIndex)
100 {TokenToPostingList.first,
PostingList(TokenToPostingList.second)});
103 std::unique_ptr<Iterator> Dex::iterator(
const Token &Tok)
const {
104 auto It = InvertedIndex.find(Tok);
105 return It == InvertedIndex.end() ?
Corpus.
none()
106 : It->second.iterator(&It->first);
110 std::unique_ptr<Iterator> Dex::createFileProximityIterator(
111 llvm::ArrayRef<std::string> ProximityPaths)
const {
112 std::vector<std::unique_ptr<Iterator>> BoostingIterators;
114 llvm::StringSet<> ParentURIs;
115 llvm::StringMap<SourceParams> Sources;
116 for (
const auto &
Path : ProximityPaths) {
120 for (
const auto &ProximityURI : PathProximityURIs)
121 ParentURIs.insert(ProximityURI);
133 for (
const auto &ParentURI : ParentURIs.keys()) {
137 PathProximitySignals.
SymbolURI = ParentURI;
138 BoostingIterators.push_back(
142 BoostingIterators.push_back(
Corpus.
all());
147 std::unique_ptr<Iterator>
148 Dex::createTypeBoostingIterator(llvm::ArrayRef<std::string> Types)
const {
149 std::vector<std::unique_ptr<Iterator>> BoostingIterators;
152 auto Boost = PreferredTypeSignals.
evaluate();
153 for (
const auto &T : Types)
154 BoostingIterators.push_back(
156 BoostingIterators.push_back(
Corpus.
all());
165 assert(!StringRef(Req.
Query).contains(
"::") &&
166 "There must be no :: in query.");
171 bool More = !Req.
Query.empty() && Req.
Query.size() < 3;
173 std::vector<std::unique_ptr<Iterator>> Criteria;
178 std::vector<std::unique_ptr<Iterator>> TrigramIterators;
179 for (
const auto &Trigram : TrigramTokens)
180 TrigramIterators.push_back(iterator(Trigram));
184 std::vector<std::unique_ptr<Iterator>> ScopeIterators;
185 for (
const auto &Scope : Req.
Scopes)
188 ScopeIterators.push_back(
193 Criteria.push_back(createFileProximityIterator(Req.
ProximityPaths));
195 Criteria.push_back(createTypeBoostingIterator(Req.
PreferredTypes));
198 Criteria.push_back(iterator(RestrictedForCodeCompletion));
208 SPAN_ATTACH(Tracer,
"query", llvm::to_string(*Root));
209 vlog(
"Dex query tree: {0}", *Root);
211 using IDAndScore = std::pair<DocID, float>;
212 std::vector<IDAndScore> IDAndScores =
consume(*Root);
214 auto Compare = [](
const IDAndScore &LHS,
const IDAndScore &RHS) {
215 return LHS.second > RHS.second;
218 Req.
Limit ? *Req.
Limit : std::numeric_limits<size_t>::max(), Compare);
219 for (
const auto &IDAndScore : IDAndScores) {
220 const DocID SymbolDocID = IDAndScore.first;
221 const auto *Sym = Symbols[SymbolDocID];
222 const llvm::Optional<float>
Score = Filter.
match(Sym->Name);
227 const float FinalScore =
228 (*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second;
231 if (Top.
push({SymbolDocID, FinalScore}))
237 for (
const auto &Item : std::move(Top).items())
245 for (
const auto &ID : Req.
IDs) {
246 auto I = LookupTable.find(ID);
247 if (I != LookupTable.end())
253 llvm::function_ref<
void(
const Ref &)>
Callback)
const {
256 Req.
Limit.getValueOr(std::numeric_limits<uint32_t>::max());
257 for (
const auto &ID : Req.
IDs)
258 for (
const auto &
Ref : Refs.lookup(ID)) {
274 Req.
Limit.getValueOr(std::numeric_limits<uint32_t>::max());
277 auto It = Relations.find(
278 std::make_pair(Subject, static_cast<uint8_t>(Req.
Predicate)));
279 if (It != Relations.end()) {
280 for (
const auto &
Object : It->second) {
292 size_t Bytes = Symbols.size() *
sizeof(
const Symbol *);
293 Bytes += SymbolQuality.size() *
sizeof(float);
294 Bytes += LookupTable.getMemorySize();
295 Bytes += InvertedIndex.getMemorySize();
296 for (
const auto &TokenToPostingList : InvertedIndex)
297 Bytes += TokenToPostingList.second.bytes();
298 Bytes += Refs.getMemorySize();
299 Bytes += Relations.getMemorySize();
300 return Bytes + BackingDataSize;
304 std::vector<std::string> Result;
307 "Non-empty argument of generateProximityURIs() should be a valid " 309 llvm::StringRef Body = ParsedURI->body();
318 Result.emplace_back(ParsedURI->toString());
319 while (!Body.empty() && --Limit > 0) {
322 Body = llvm::sys::path::parent_path(Body, llvm::sys::path::Style::posix);
325 URI(ParsedURI->scheme(), ParsedURI->authority(), Body).
toString());
An immutable symbol container that stores a set of symbols.
llvm::DenseSet< SymbolID > IDs
bool AnyScope
If set to true, allow symbols from any scope.
std::unique_ptr< Iterator > intersect(std::vector< std::unique_ptr< Iterator >> Children) const
Returns AND Iterator which performs the intersection of the PostingLists of its children.
bool RestrictForCodeCompletion
If set to true, only symbols for completion support will be considered.
An efficient structure for storing a large set of symbol references in memory.
This defines Dex - a symbol index implementation based on query iterators over symbol tokens...
llvm::Optional< uint32_t > Limit
If set, limit the number of relations returned from the index.
PostingList is the storage of DocIDs which can be inserted into the Query Tree as a leaf by constructing...
llvm::DenseSet< SymbolID > IDs
Represents a symbol occurrence in the source file.
Path Proximity URI to symbol declaration.
llvm::unique_function< void(llvm::Expected< T >)> Callback
A Callback<T> is a void function that accepts Expected<T>.
llvm::StringRef Scope
The containing namespace. e.g. "" (global), "ns::" (top-level namespace).
std::vector< Token > generateIdentifierTrigrams(llvm::StringRef Identifier)
Returns a list of unique fuzzy-search trigrams from an unqualified symbol.
bool fuzzyFind(const FuzzyFindRequest &Req, llvm::function_ref< void(const Symbol &)> Callback) const override
Constructs iterators over tokens extracted from the query and exhausts it while applying Callback to ...
std::vector< std::string > PreferredTypes
Preferred types of symbols. These are raw representation of OpaqueType.
void vlog(const char *Fmt, Ts &&... Vals)
std::vector< std::string > Scopes
If this is non-empty, symbols must be in at least one of the scopes (e.g.
SymbolID ID
The ID of the symbol.
bool TypeMatchesPreferred
llvm::Optional< float > Score
std::vector< std::pair< DocID, float > > consume(Iterator &It)
Advances the iterator until it is exhausted.
std::unique_ptr< Iterator > unionOf(std::vector< std::unique_ptr< Iterator >> Children) const
Returns OR Iterator which performs the union of the PostingLists of its children. ...
URIDistance * FileProximityMatch
Whether or not this symbol is meant to be used for code completion.
std::vector< std::string > generateProximityURIs(llvm::StringRef URIPath)
Returns search tokens for a number of parent directories of the given path.
std::vector< Token > generateQueryTrigrams(llvm::StringRef Query)
Returns a list of unique fuzzy-search trigrams given a query.
static std::unique_ptr< SymbolIndex > build(SymbolSlab, RefSlab, RelationSlab)
Builds an index from slabs. The index takes ownership of the slab.
std::unique_ptr< Iterator > limit(std::unique_ptr< Iterator > Child, size_t Limit) const
Returns LIMIT iterator, which yields up to N elements of its child iterator.
static const char * toString(OffsetEncoding OE)
std::unique_ptr< Iterator > none() const
Returns FALSE Iterator which iterates over no documents.
std::string Path
A typedef to represent a file path.
std::string Query
A query string for the fuzzy find.
llvm::Optional< float > match(llvm::StringRef Word)
SymbolLocation CanonicalDeclaration
The location of the preferred declaration of the symbol.
bool push(value_type &&V)
uint32_t DocID
Symbol position in the list of all index symbols sorted by a pre-computed symbol quality.
void relations(const RelationsRequest &Req, llvm::function_ref< void(const SymbolID &, const Symbol &)> Callback) const override
llvm::DenseSet< SymbolID > Subjects
void lookup(const LookupRequest &Req, llvm::function_ref< void(const Symbol &)> Callback) const override
Looks up symbols with any of the given symbol IDs and applies Callback on each matched symbol...
llvm::StringRef SymbolURI
These are used to calculate proximity between the index symbol and the query.
static llvm::Expected< URI > create(llvm::StringRef AbsolutePath, llvm::StringRef Scheme)
Creates a URI for a file in the given scheme.
The class represents a C++ symbol, e.g.
//===-- Representation.cpp - ClangDoc Representation -----------*- C++ -*-===//
std::vector< std::string > ProximityPaths
Contextually relevant files (e.g.
llvm::StringRef Name
The unqualified name of the symbol, e.g. "bar" (for ns::bar).
llvm::Optional< uint32_t > Limit
If set, limit the number of references returned from the index.
Symbol index queries consist of specific requirements for the requested symbol, such as high fuzzy ma...
A URI describes the location of a source file.
llvm::Optional< uint32_t > Limit
The number of top candidates to return.
Internal Token type for invalid/special tokens, e.g.
static llvm::Expected< URI > parse(llvm::StringRef Uri)
Parse a URI string "<scheme>:[//<authority>/]<path>".
std::unique_ptr< Iterator > boost(std::unique_ptr< Iterator > Child, float Factor) const
Returns BOOST iterator which multiplies the score of each item by given factor.
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
std::unique_ptr< Iterator > all() const
Returns TRUE Iterator which iterates over "virtual" PostingList containing all items in range [0...
size_t estimateMemoryUsage() const override
Returns estimated size of index (in bytes).
llvm::StringRef Type
Raw representation of the OpaqueType of the symbol, used for scoring purposes.
Records an event whose duration is the lifetime of the Span object.
Type of symbol (see Symbol::Type).
Attributes of a symbol-query pair that affect how much we like it.
#define SPAN_ATTACH(S, Name, Expr)
Attach a key-value pair to a Span event.
bool refs(const RefsRequest &Req, llvm::function_ref< void(const Ref &)> Callback) const override
Finds all occurrences (e.g.
float quality(const Symbol &S)
Computes query-independent quality score for a Symbol.
TopN<T> is a lossy container that preserves only the "best" N elements.