Skip to content

Commit

Permalink
Add raw features collector.
Browse files Browse the repository at this point in the history
  • Loading branch information
toregge committed Sep 19, 2024
1 parent 5acb0a3 commit 5e00ba9
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 50 deletions.
69 changes: 19 additions & 50 deletions searchlib/src/vespa/searchlib/bitcompression/posocccompression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include "posocccompression.h"
#include "posocc_fields_params.h"
#include "raw_features_collector.h"
#include <vespa/searchlib/fef/termfieldmatchdata.h>
#include <vespa/searchlib/fef/termfieldmatchdataarray.h>
#include <vespa/searchlib/index/postinglistparams.h>
Expand Down Expand Up @@ -40,16 +41,22 @@ readHeader(const vespalib::GenericHeader &header,
const_cast<PosOccFieldsParams *>(_fieldsParams)->readHeader(header, prefix);
}

/*
 * Refill the compressed buffer while raw features are being collected.
 *
 * The collector is first asked to copy the raw words it has not yet
 * collected into 'features', then the compressed buffer is read, and
 * finally the collector is told to pick up its new position from this
 * decode context. The three steps must run in exactly this order.
 */
template <bool bigEndian>
void
EG2PosOccDecodeContext<bigEndian>::
collect_raw_features_and_read_compr_buffer(RawFeaturesCollector& raw_features_collector, DocIdAndFeatures& features)
{
    auto& self = *this;
    // Save pending raw words before the buffer content is replaced.
    raw_features_collector.collect_before_read_compr_buffer(self, features);
    self.readComprBuffer();
    // Let the collector re-sync its read pointer with the decode context.
    raw_features_collector.fixup_after_read_compr_buffer(self);
}

template <bool bigEndian>
void
EG2PosOccDecodeContext<bigEndian>::
readFeatures(search::index::DocIdAndFeatures &features)
{
features.clear_features(this->getBitOffset());
features.set_has_raw_data(true);
const uint64_t *rawFeatures = this->getCompr();
uint64_t rawFeaturesStartBitPos = this->getReadOffset();
RawFeaturesCollector raw_features_collector(*this, features);

const PosOccFieldParams &fieldParams = _fieldsParams->getFieldParams()[0];
uint32_t numElements = 1;
Expand All @@ -68,15 +75,10 @@ readFeatures(search::index::DocIdAndFeatures &features)
UC64_SKIPEXPGOLOMB_SMALL_NS(o, K_VALUE_POSOCC_ELEMENTWEIGHT, EC);
}
if (__builtin_expect(oCompr >= valE, false)) {
while (rawFeatures < oCompr) {
features.blob().push_back(*rawFeatures);
++rawFeatures;
}
UC64_DECODECONTEXT_STORE(o, _);
_readContext->readComprBuffer();
collect_raw_features_and_read_compr_buffer(raw_features_collector, features);
valE = _valE;
UC64_DECODECONTEXT_LOAD(o, _);
rawFeatures = oCompr;
}
}
UC64_SKIPEXPGOLOMB_SMALL_NS(o, K_VALUE_POSOCC_ELEMENTLEN, EC);
Expand All @@ -86,39 +88,24 @@ readFeatures(search::index::DocIdAndFeatures &features)
do {
if (__builtin_expect(oCompr >= valE, false)) {
UC64_DECODECONTEXT_STORE(o, _);
while (rawFeatures < oCompr) {
features.blob().push_back(*rawFeatures);
++rawFeatures;
}
_readContext->readComprBuffer();
collect_raw_features_and_read_compr_buffer(raw_features_collector, features);
valE = _valE;
UC64_DECODECONTEXT_LOAD(o, _);
rawFeatures = oCompr;
}
UC64_SKIPEXPGOLOMB_SMALL_NS(o, K_VALUE_POSOCC_FIRST_WORDPOS, EC);
} while (0);
for (uint32_t pos = 1; pos < numPositions; ++pos) {
if (__builtin_expect(oCompr >= valE, false)) {
UC64_DECODECONTEXT_STORE(o, _);
while (rawFeatures < oCompr) {
features.blob().push_back(*rawFeatures);
++rawFeatures;
}
_readContext->readComprBuffer();
collect_raw_features_and_read_compr_buffer(raw_features_collector, features);
valE = _valE;
UC64_DECODECONTEXT_LOAD(o, _);
rawFeatures = oCompr;
}
UC64_SKIPEXPGOLOMB_SMALL_NS(o,K_VALUE_POSOCC_DELTA_WORDPOS, EC);
}
}
UC64_DECODECONTEXT_STORE(o, _);
while (rawFeatures < oCompr) {
features.blob().push_back(*rawFeatures);
++rawFeatures;
}
uint64_t rawFeaturesEndBitPos = this->getReadOffset();
features.set_bit_length(rawFeaturesEndBitPos - rawFeaturesStartBitPos);
raw_features_collector.finish(*this, features);
this->readComprBufferIfNeeded();
}

Expand Down Expand Up @@ -462,10 +449,7 @@ void
EGPosOccDecodeContext<bigEndian>::
readFeatures(search::index::DocIdAndFeatures &features)
{
features.clear_features(this->getBitOffset());
features.set_has_raw_data(true);
const uint64_t *rawFeatures = this->getCompr();
uint64_t rawFeaturesStartBitPos = this->getReadOffset();
RawFeaturesCollector raw_features_collector(*this, features);

const PosOccFieldParams &fieldParams = _fieldsParams->getFieldParams()[0];
uint32_t elementLenK = EGPosOccEncodeContext<bigEndian>::
Expand All @@ -487,14 +471,9 @@ readFeatures(search::index::DocIdAndFeatures &features)
}
if (__builtin_expect(oCompr >= valE, false)) {
UC64_DECODECONTEXT_STORE(o, _);
while (rawFeatures < oCompr) {
features.blob().push_back(*rawFeatures);
++rawFeatures;
}
_readContext->readComprBuffer();
collect_raw_features_and_read_compr_buffer(raw_features_collector, features);
valE = _valE;
UC64_DECODECONTEXT_LOAD(o, _);
rawFeatures = oCompr;
}
}
UC64_DECODEEXPGOLOMB_SMALL_NS(o, elementLenK, EC);
Expand All @@ -507,25 +486,15 @@ readFeatures(search::index::DocIdAndFeatures &features)
for (uint32_t pos = 0; pos < numPositions; ++pos) {
if (__builtin_expect(oCompr >= valE, false)) {
UC64_DECODECONTEXT_STORE(o, _);
while (rawFeatures < oCompr) {
features.blob().push_back(*rawFeatures);
++rawFeatures;
}
_readContext->readComprBuffer();
collect_raw_features_and_read_compr_buffer(raw_features_collector, features);
valE = _valE;
UC64_DECODECONTEXT_LOAD(o, _);
rawFeatures = oCompr;
}
UC64_SKIPEXPGOLOMB_SMALL_NS(o, wordPosK, EC);
}
}
UC64_DECODECONTEXT_STORE(o, _);
while (rawFeatures < oCompr) {
features.blob().push_back(*rawFeatures);
++rawFeatures;
}
uint64_t rawFeaturesEndBitPos = this->getReadOffset();
features.set_bit_length(rawFeaturesEndBitPos - rawFeaturesStartBitPos);
raw_features_collector.finish(*this, features);
this->readComprBufferIfNeeded();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
namespace search::bitcompression {

class PosOccFieldsParams;
class RawFeaturesCollector;

template <bool bigEndian>
class EG2PosOccDecodeContext : public FeatureDecodeContext<bigEndian>
Expand Down Expand Up @@ -79,6 +80,8 @@ class EG2PosOccDecodeContext : public FeatureDecodeContext<bigEndian>
void unpackFeatures(const search::fef::TermFieldMatchDataArray &matchData, uint32_t docId) override;
void setParams(const PostingListParams &params) override;
void getParams(PostingListParams &params) const override;
void collect_raw_features_and_read_compr_buffer(RawFeaturesCollector& raw_features_collector,
search::index::DocIdAndFeatures& features);
};


Expand Down Expand Up @@ -193,6 +196,7 @@ class EGPosOccDecodeContext : public EG2PosOccDecodeContext<bigEndian>
using ParentClass::_readContext;
using ParentClass::_fieldsParams;
using ParentClass::readHeader;
using ParentClass::collect_raw_features_and_read_compr_buffer;
using EC = EncodeContext64<bigEndian>;

EGPosOccDecodeContext(const PosOccFieldsParams *fieldsParams)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#pragma once

#include "compression.h"

namespace search::bitcompression {

/*
 * Helper that copies the raw 64-bit words spanned by one set of features
 * from a decode context into the blob of a DocIdAndFeatures instance, and
 * records the bit length of those features.
 *
 * Usage, as seen from the member functions below:
 *  - construct before decoding starts,
 *  - call collect_before_read_compr_buffer() followed by
 *    fixup_after_read_compr_buffer() around each read of the compressed
 *    buffer,
 *  - call finish() once decoding of the features is done.
 */
class RawFeaturesCollector {
    // Value of dc.getReadOffset() sampled by the constructor.
    uint64_t        _start_offset;
    // Start of the word range that the next collect() call copies.
    const uint64_t* _raw_features;

    // Append each word in the half-open range [_raw_features, compr) to the
    // blob of 'features'. Nothing is appended when the range is empty.
    // Note: _raw_features itself is not advanced here.
    void collect(search::index::DocIdAndFeatures& features, const uint64_t* compr) {
        auto& blob = features.blob();
        for (const uint64_t* word = _raw_features; word < compr; ++word) {
            blob.emplace_back(*word);
        }
    }

public:
    // Remember the current read offset and compressed data pointer of 'dc',
    // then reset 'features' and flag it as carrying raw data.
    RawFeaturesCollector(const DecodeContext64Base& dc, search::index::DocIdAndFeatures& features)
        : _start_offset(dc.getReadOffset()),
          _raw_features(dc.getCompr())
    {
        features.clear_features(dc.getBitOffset());
        features.set_has_raw_data(true);
    }

    // Copy the words up to dc._valI into 'features'. Expected to be paired
    // with fixup_after_read_compr_buffer(), which moves the range start.
    void collect_before_read_compr_buffer(const DecodeContext64Base& dc, search::index::DocIdAndFeatures& features) {
        collect(features, dc._valI);
    }

    // Restart collection from the current dc._valI.
    void fixup_after_read_compr_buffer(const DecodeContext64Base& dc) {
        _raw_features = dc._valI;
    }

    // Copy the remaining words up to dc._valI and store the number of bits
    // consumed since construction as the bit length of 'features'.
    void finish(const DecodeContext64Base& dc, search::index::DocIdAndFeatures& features) {
        collect(features, dc._valI);
        features.set_bit_length(dc.getReadOffset() - _start_offset);
    }
};

}

0 comments on commit 5e00ba9

Please sign in to comment.