12 #include "llvm/ADT/STLExtras.h"
13 #include "llvm/Support/Error.h"
14 #include "llvm/Support/MathExtras.h"
// Iterator over a sequence of compressed chunks. One chunk at a time is
// decompressed into DecompressedChunk and walked through CurrentID.
// NOTE(review): this view of the class has gaps (access specifiers, several
// statement bodies and closing braces are not visible here). The comments
// below describe only the statements that are visible.
24 class ChunkIterator :
public Iterator {
// Starts at the first chunk. When Chunks is non-empty, that chunk is
// decompressed immediately and CurrentID is set to its first entry.
26 explicit ChunkIterator(
const Token *Tok, llvm::ArrayRef<Chunk> Chunks)
27 : Tok(Tok), Chunks(Chunks), CurrentChunk(Chunks.begin()) {
28 if (!Chunks.empty()) {
29 DecompressedChunk = CurrentChunk->decompress();
30 CurrentID = DecompressedChunk.begin();
// The iterator is exhausted once CurrentChunk equals Chunks.end().
34 bool reachedEnd()
const override {
return CurrentChunk == Chunks.end(); }
// Precondition: must not be called on an exhausted iterator.
37 void advance()
override {
38 assert(!reachedEnd() &&
39 "Posting List iterator can't advance() at the end.");
// Precondition: must not be called on an exhausted iterator.
// The visible part positions CurrentID on the first entry of the remaining
// decompressed chunk that is not less than ID. std::partition_point needs the
// range to be partitioned by `D < ID` (presumably the entries are ascending --
// confirm).
46 void advanceTo(
DocID ID)
override {
// The message names advanceTo() so a failing assertion points at this method
// rather than at advance().
47 assert(!reachedEnd() &&
48 "Posting List iterator can't advanceTo() at the end.");
53 CurrentID = std::partition_point(CurrentID, DecompressedChunk.end(),
54 [&](
const DocID D) {
return D < ID; });
// Precondition: must not be called on an exhausted iterator.
58 DocID peek()
const override {
59 assert(!reachedEnd() &&
"Posting List iterator can't peek() at the end.");
// NOTE(review): the header of the function owning the next assertion is not
// visible here; its message indicates it belongs to consume().
64 assert(!reachedEnd() &&
65 "Posting List iterator can't consume() at the end.");
// Size estimate: number of chunks times ApproxEntriesPerChunk. No chunk is
// decompressed to compute it.
69 size_t estimateSize()
const override {
70 return Chunks.size() * ApproxEntriesPerChunk;
// Walks every chunk and every DocID obtained by decompressing it. What is
// written to OS per entry is not visible in this view.
74 llvm::raw_ostream &dump(llvm::raw_ostream &
OS)
const override {
79 for (
const Chunk &C : Chunks)
80 for (
const DocID Doc : C.decompress()) {
// Visible steps: test whether CurrentID is still inside DecompressedChunk,
// test whether CurrentChunk is at Chunks.end(), then decompress the chunk
// CurrentChunk points at and reset CurrentID to its first entry.
// NOTE(review): the statements guarded by the two `if`s, and any step that
// moves CurrentChunk, are not visible here.
89 void normalizeCursor() {
91 if (CurrentID != std::end(DecompressedChunk))
95 if (CurrentChunk == Chunks.end())
97 DecompressedChunk = CurrentChunk->decompress();
98 CurrentID = DecompressedChunk.begin();
// Chunk-level seek. It only acts when CurrentChunk is not the last chunk and
// the next chunk's Head is <= ID. In that case it searches the chunks after
// CurrentChunk for the first one whose Head is not less than ID, then
// decompresses the chunk CurrentChunk points at and resets CurrentID.
// NOTE(review): how the partition_point result is stored into CurrentChunk is
// not visible here.
102 void advanceToChunk(
DocID ID) {
103 if ((CurrentChunk != Chunks.end() - 1) &&
104 ((CurrentChunk + 1)->Head <= ID)) {
106 std::partition_point(CurrentChunk + 1, Chunks.end(),
107 [&](
const Chunk &C) {
return C.Head < ID; });
109 DecompressedChunk = CurrentChunk->decompress();
110 CurrentID = DecompressedChunk.begin();
// The chunks being iterated, held as an llvm::ArrayRef.
115 llvm::ArrayRef<Chunk> Chunks;
// Position inside Chunks; equal to Chunks.end() when the iterator is exhausted.
120 decltype(Chunks)::const_iterator CurrentChunk;
// Decompressed contents of the chunk CurrentChunk points at. The inline
// capacity is Chunk::PayloadSize + 1.
121 llvm::SmallVector<DocID, Chunk::PayloadSize + 1> DecompressedChunk;
// Position inside DecompressedChunk.
123 decltype(DecompressedChunk)::iterator CurrentID;
// Multiplier used by estimateSize(): the estimate is
// Chunks.size() * ApproxEntriesPerChunk.
125 static constexpr
size_t ApproxEntriesPerChunk = 15;
// Number of value bits carried by each encoded byte. It matches the 0x7f mask
// used when encoding and decoding, and scales the shift in readVByte().
128 static constexpr
size_t BitsPerEncodingByte = 7;
// Writes Delta into Payload in 7-bit groups. Payload is taken by reference.
// encodeStream() treats a false return as "start a new chunk".
// NOTE(review): several statements of this function (the capacity check, the
// loop head, the shift of Delta and the return statements) are not visible in
// this view; the comments cover only the visible lines.
132 bool encodeVByte(
DocID Delta, llvm::MutableArrayRef<uint8_t> &
Payload) {
// A zero delta is rejected; readVByte() takes its early branch on a leading
// zero byte.
133 assert(Delta != 0 &&
"0 is not a valid PostingList delta.");
// Number of 7-bit groups in Delta, derived from the index of its highest set
// bit.
136 unsigned Width = 1 + llvm::findLastSet(Delta) / BitsPerEncodingByte;
// Low seven bits of the remaining value.
141 uint8_t Encoding = Delta & 0x7f;
// The 0x80 bit is set when Delta is still non-zero at this point (presumably
// after Delta has been shifted in a line not shown -- confirm).
143 Payload.front() = Delta ? Encoding | 0x80 : Encoding;
145 }
while (Delta != 0);
// Splits Documents into chunks. The first document becomes the Head of the
// first chunk. Every following document is passed to encodeVByte() as the
// difference from Last, targeting the remaining payload of the newest chunk.
// Documents must be non-empty (asserted).
// NOTE(review): the update of Last inside the loop and the closing braces are
// not visible in this view.
168 std::vector<Chunk> encodeStream(llvm::ArrayRef<DocID> Documents) {
169 assert(!Documents.empty() &&
"Can't encode empty sequence.");
170 std::vector<Chunk> Result;
171 Result.emplace_back();
172 DocID Last = Result.back().Head = Documents.front();
173 llvm::MutableArrayRef<uint8_t> RemainingPayload = Result.back().Payload;
174 for (
DocID Doc : Documents.drop_front()) {
// When encodeVByte() returns false, a new chunk is opened with Doc as its
// Head, and encoding continues into that chunk's payload.
175 if (!encodeVByte(Doc - Last, RemainingPayload)) {
176 Result.emplace_back();
177 Result.back().Head = Doc;
178 RemainingPayload = Result.back().Payload;
// NOTE(review): this returns a copy built from Result, not Result itself --
// presumably to shed spare capacity; confirm before simplifying.
182 return std::vector<Chunk>(Result);
// Decodes one variable-length value from the front of Bytes. Bytes is taken by
// reference, and each byte that is read is dropped from its front.
// Each byte contributes its low seven bits, placed at bit offset
// BitsPerEncodingByte * Length. A byte whose 0x80 bit is clear is the last
// byte of the value.
// Empty input or a leading zero byte takes the early branch below.
// NOTE(review): the statement under that branch, the declaration of Result,
// the statement under the 0x80 test and the final return are not visible in
// this view.
187 llvm::Optional<DocID> readVByte(llvm::ArrayRef<uint8_t> &Bytes) {
// empty() is tested first so that front() is never evaluated on an empty
// ArrayRef; the || short-circuits before the element access.
188 if (Bytes.empty() || Bytes.front() == 0)
191 bool HasNextByte =
true;
192 for (
size_t Length = 0; HasNextByte && !Bytes.empty(); ++
Length) {
193 assert(
Length <= 5 &&
"Malformed VByte encoding sequence.");
// The masked byte is widened to DocID before shifting. Without the cast the
// uint8_t operand is promoted to int and the shift is carried out in int.
195 Result |= static_cast<DocID>(Bytes.front() & 0x7f) << (BitsPerEncodingByte *
Length);
196 if ((Bytes.front() & 0x80) == 0)
198 Bytes = Bytes.drop_front();
// NOTE(review): the header of this function is not visible in this view. The
// body reads Head and Payload and returns the same SmallVector type that
// ChunkIterator stores, so it is presumably the chunk's decompress() -- confirm.
// The result starts with Head. Each value read by readVByte() is then added to
// the running value Current and the sum is appended.
206 llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Result{
Head};
207 llvm::ArrayRef<uint8_t> Bytes(
Payload);
// Current advances by Delta after every iteration; the loop runs while bytes
// remain. The declaration and assignment of Delta are not visible here.
209 for (
DocID Current =
Head; !Bytes.empty(); Current += Delta) {
210 auto MaybeDelta = readVByte(Bytes);
214 Result.push_back(Current + Delta);
// NOTE(review): this returns a copy built from Result, not Result itself --
// presumably intentional; confirm before simplifying.
216 return llvm::SmallVector<DocID, Chunk::PayloadSize + 1>{Result};
// Constructor initializer (the signature is not visible in this view): Chunks
// is initialized with the result of encodeStream(Documents).
220 : Chunks(encodeStream(Documents)) {}
// Returns a new ChunkIterator built from Tok and Chunks, held in a
// std::unique_ptr (the enclosing function header is not visible in this view).
223 return std::make_unique<ChunkIterator>(Tok, Chunks);