clang-tools  7.0.0
SourceCode.cpp
Go to the documentation of this file.
1 //===--- SourceCode.cpp - Manipulating source code as strings ---*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 #include "SourceCode.h"
10 
11 #include "Logger.h"
12 #include "clang/AST/ASTContext.h"
13 #include "clang/Basic/SourceManager.h"
14 #include "clang/Lex/Lexer.h"
15 #include "llvm/Support/Errc.h"
16 #include "llvm/Support/Error.h"
17 #include "llvm/Support/Path.h"
18 
19 namespace clang {
20 namespace clangd {
21 using namespace llvm;
22 
23 // Here be dragons. LSP positions use columns measured in *UTF-16 code units*!
24 // Clangd uses UTF-8 and byte-offsets internally, so conversion is nontrivial.
25 
26 // Iterates over unicode codepoints in the (UTF-8) string. For each,
27 // invokes CB(UTF-8 length, UTF-16 length), and breaks if it returns true.
28 // Returns true if CB returned true, false if we hit the end of string.
29 template <typename Callback>
30 static bool iterateCodepoints(StringRef U8, const Callback &CB) {
31  for (size_t I = 0; I < U8.size();) {
32  unsigned char C = static_cast<unsigned char>(U8[I]);
33  if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character.
34  if (CB(1, 1))
35  return true;
36  ++I;
37  continue;
38  }
39  // This convenient property of UTF-8 holds for all non-ASCII characters.
40  size_t UTF8Length = countLeadingOnes(C);
41  // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here.
42  // 11111xxx is not valid UTF-8 at all. Assert because it's probably our bug.
43  assert((UTF8Length >= 2 && UTF8Length <= 4) &&
44  "Invalid UTF-8, or transcoding bug?");
45  I += UTF8Length; // Skip over all trailing bytes.
46  // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
47  // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...)
48  if (CB(UTF8Length, UTF8Length == 4 ? 2 : 1))
49  return true;
50  }
51  return false;
52 }
53 
54 // Returns the offset into the string that matches \p Units UTF-16 code units.
55 // Conceptually, this converts to UTF-16, truncates to CodeUnits, converts back
56 // to UTF-8, and returns the length in bytes.
57 static size_t measureUTF16(StringRef U8, int U16Units, bool &Valid) {
58  size_t Result = 0;
59  Valid = U16Units == 0 || iterateCodepoints(U8, [&](int U8Len, int U16Len) {
60  Result += U8Len;
61  U16Units -= U16Len;
62  return U16Units <= 0;
63  });
64  if (U16Units < 0) // Offset was into the middle of a surrogate pair.
65  Valid = false;
66  // Don't return an out-of-range index if we overran.
67  return std::min(Result, U8.size());
68 }
69 
70 // Counts the number of UTF-16 code units needed to represent a string.
71 // Like most strings in clangd, the input is UTF-8 encoded.
72 static size_t utf16Len(StringRef U8) {
73  // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
74  // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
75  size_t Count = 0;
76  iterateCodepoints(U8, [&](int U8Len, int U16Len) {
77  Count += U16Len;
78  return false;
79  });
80  return Count;
81 }
82 
83 llvm::Expected<size_t> positionToOffset(StringRef Code, Position P,
84  bool AllowColumnsBeyondLineLength) {
85  if (P.line < 0)
86  return llvm::make_error<llvm::StringError>(
87  llvm::formatv("Line value can't be negative ({0})", P.line),
88  llvm::errc::invalid_argument);
89  if (P.character < 0)
90  return llvm::make_error<llvm::StringError>(
91  llvm::formatv("Character value can't be negative ({0})", P.character),
92  llvm::errc::invalid_argument);
93  size_t StartOfLine = 0;
94  for (int I = 0; I != P.line; ++I) {
95  size_t NextNL = Code.find('\n', StartOfLine);
96  if (NextNL == StringRef::npos)
97  return llvm::make_error<llvm::StringError>(
98  llvm::formatv("Line value is out of range ({0})", P.line),
99  llvm::errc::invalid_argument);
100  StartOfLine = NextNL + 1;
101  }
102 
103  size_t NextNL = Code.find('\n', StartOfLine);
104  if (NextNL == StringRef::npos)
105  NextNL = Code.size();
106 
107  bool Valid;
108  size_t ByteOffsetInLine = measureUTF16(
109  Code.substr(StartOfLine, NextNL - StartOfLine), P.character, Valid);
110  if (!Valid && !AllowColumnsBeyondLineLength)
111  return llvm::make_error<llvm::StringError>(
112  llvm::formatv("UTF-16 offset {0} is invalid for line {1}", P.character,
113  P.line),
114  llvm::errc::invalid_argument);
115  return StartOfLine + ByteOffsetInLine;
116 }
117 
118 Position offsetToPosition(StringRef Code, size_t Offset) {
119  Offset = std::min(Code.size(), Offset);
120  StringRef Before = Code.substr(0, Offset);
121  int Lines = Before.count('\n');
122  size_t PrevNL = Before.rfind('\n');
123  size_t StartOfLine = (PrevNL == StringRef::npos) ? 0 : (PrevNL + 1);
124  Position Pos;
125  Pos.line = Lines;
126  Pos.character = utf16Len(Before.substr(StartOfLine));
127  return Pos;
128 }
129 
130 Position sourceLocToPosition(const SourceManager &SM, SourceLocation Loc) {
131  // We use the SourceManager's line tables, but its column number is in bytes.
132  FileID FID;
133  unsigned Offset;
134  std::tie(FID, Offset) = SM.getDecomposedSpellingLoc(Loc);
135  Position P;
136  P.line = static_cast<int>(SM.getLineNumber(FID, Offset)) - 1;
137  bool Invalid = false;
138  StringRef Code = SM.getBufferData(FID, &Invalid);
139  if (!Invalid) {
140  auto ColumnInBytes = SM.getColumnNumber(FID, Offset) - 1;
141  auto LineSoFar = Code.substr(Offset - ColumnInBytes, ColumnInBytes);
142  P.character = utf16Len(LineSoFar);
143  }
144  return P;
145 }
146 
147 Range halfOpenToRange(const SourceManager &SM, CharSourceRange R) {
148  // Clang is 1-based, LSP uses 0-based indexes.
149  Position Begin = sourceLocToPosition(SM, R.getBegin());
150  Position End = sourceLocToPosition(SM, R.getEnd());
151 
152  return {Begin, End};
153 }
154 
155 std::pair<size_t, size_t> offsetToClangLineColumn(StringRef Code,
156  size_t Offset) {
157  Offset = std::min(Code.size(), Offset);
158  StringRef Before = Code.substr(0, Offset);
159  int Lines = Before.count('\n');
160  size_t PrevNL = Before.rfind('\n');
161  size_t StartOfLine = (PrevNL == StringRef::npos) ? 0 : (PrevNL + 1);
162  return {Lines + 1, Offset - StartOfLine + 1};
163 }
164 
165 std::pair<llvm::StringRef, llvm::StringRef>
166 splitQualifiedName(llvm::StringRef QName) {
167  size_t Pos = QName.rfind("::");
168  if (Pos == llvm::StringRef::npos)
169  return {StringRef(), QName};
170  return {QName.substr(0, Pos + 2), QName.substr(Pos + 2)};
171 }
172 
173 TextEdit replacementToEdit(StringRef Code, const tooling::Replacement &R) {
174  Range ReplacementRange = {
175  offsetToPosition(Code, R.getOffset()),
176  offsetToPosition(Code, R.getOffset() + R.getLength())};
177  return {ReplacementRange, R.getReplacementText()};
178 }
179 
180 std::vector<TextEdit> replacementsToEdits(StringRef Code,
181  const tooling::Replacements &Repls) {
182  std::vector<TextEdit> Edits;
183  for (const auto &R : Repls)
184  Edits.push_back(replacementToEdit(Code, R));
185  return Edits;
186 }
187 
188 llvm::Optional<std::string>
189 getAbsoluteFilePath(const FileEntry *F, const SourceManager &SourceMgr) {
190  SmallString<64> FilePath = F->tryGetRealPathName();
191  if (FilePath.empty())
192  FilePath = F->getName();
193  if (!llvm::sys::path::is_absolute(FilePath)) {
194  if (!SourceMgr.getFileManager().makeAbsolutePath(FilePath)) {
195  log("Could not turn relative path to absolute: {0}", FilePath);
196  return llvm::None;
197  }
198  }
199  return FilePath.str().str();
200 }
201 
202 } // namespace clangd
203 } // namespace clang
SourceLocation Loc
'#' location in the include directive
std::pair< size_t, size_t > offsetToClangLineColumn(StringRef Code, size_t Offset)
Definition: SourceCode.cpp:155
Some operations such as code completion produce a set of candidates.
static bool iterateCodepoints(StringRef U8, const Callback &CB)
Definition: SourceCode.cpp:30
llvm::unique_function< void(llvm::Expected< T >)> Callback
A Callback<T> is a void function that accepts Expected<T>.
Definition: Function.h:28
Documents should not be synced at all.
llvm::Expected< size_t > positionToOffset(StringRef Code, Position P, bool AllowColumnsBeyondLineLength)
Definition: SourceCode.cpp:83
TextEdit replacementToEdit(StringRef Code, const tooling::Replacement &R)
Definition: SourceCode.cpp:173
void log(const char *Fmt, Ts &&... Vals)
Definition: Logger.h:62
std::vector< TextEdit > replacementsToEdits(StringRef Code, const tooling::Replacements &Repls)
Definition: SourceCode.cpp:180
Position sourceLocToPosition(const SourceManager &SM, SourceLocation Loc)
Turn a SourceLocation into a [line, column] pair.
Definition: SourceCode.cpp:130
Position Pos
int line
Line position in a document (zero-based).
Definition: Protocol.h:91
int character
Character offset on a line in a document (zero-based).
Definition: Protocol.h:96
//===-- Representation.cpp - ClangDoc Representation -----------*- C++ -*-===//
std::pair< llvm::StringRef, llvm::StringRef > splitQualifiedName(llvm::StringRef QName)
From "a::b::c", return {"a::b::", "c"}.
Definition: SourceCode.cpp:166
static size_t utf16Len(StringRef U8)
Definition: SourceCode.cpp:72
static size_t measureUTF16(StringRef U8, int U16Units, bool &Valid)
Definition: SourceCode.cpp:57
llvm::Optional< std::string > getAbsoluteFilePath(const FileEntry *F, const SourceManager &SourceMgr)
Get the absolute file path of a given file entry.
Definition: SourceCode.cpp:189
unsigned Lines
Range halfOpenToRange(const SourceManager &SM, CharSourceRange R)
Definition: SourceCode.cpp:147
Position offsetToPosition(StringRef Code, size_t Offset)
Definition: SourceCode.cpp:118