Skip to content

Commit

Permalink
MB-59616: Adding vector-base64 field (WIP)
Browse files Browse the repository at this point in the history
 - Added a new field type called vector-base64.
 - Acts similarly to the vector field type in most cases.
 - When a new document arrives in the bleve layer, during
the parsing of all its fields in processProperty, if the
field mapping type is vector-base64, then its value is
decoded into a vector field and processed like a vector.
 - The standard golang base64 library is used for the decode
operation.
  • Loading branch information
Likith101 committed Feb 2, 2024
1 parent d78621d commit 8d37f78
Show file tree
Hide file tree
Showing 6 changed files with 189 additions and 2 deletions.
128 changes: 128 additions & 0 deletions document/field_vector_base64.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
// Copyright (c) 2024 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build vectors
// +build vectors

package document

import (
"encoding/base64"
"encoding/json"
"fmt"

index "github.com/blevesearch/bleve_index_api"
)

// VectorBase64Field pairs a VectorField with the encoded string it was built
// from. Most of its methods forward to the wrapped vectorField.
type VectorBase64Field struct {
// vectorField is the wrapped field that the accessor methods forward to.
vectorField *VectorField
// encodedValue is the encoded string that was passed to NewVectorBase64Field.
encodedValue string
}

// Size returns whatever the wrapped vector field reports as its size.
func (n *VectorBase64Field) Size() int {
	inner := n.vectorField
	return inner.Size()
}

// Name returns the name of the wrapped vector field.
func (n *VectorBase64Field) Name() string {
	inner := n.vectorField
	return inner.Name()
}

// ArrayPositions returns the array positions reported by the wrapped vector
// field.
func (n *VectorBase64Field) ArrayPositions() []uint64 {
	inner := n.vectorField
	return inner.ArrayPositions()
}

// Options returns the indexing options of the wrapped vector field.
func (n *VectorBase64Field) Options() index.FieldIndexingOptions {
	inner := n.vectorField
	return inner.Options()
}

// NumPlainTextBytes returns the plain-text byte count reported by the wrapped
// vector field.
func (n *VectorBase64Field) NumPlainTextBytes() uint64 {
	inner := n.vectorField
	return inner.NumPlainTextBytes()
}

// AnalyzedLength returns the analyzed length reported by the wrapped vector
// field.
func (n *VectorBase64Field) AnalyzedLength() int {
	inner := n.vectorField
	return inner.AnalyzedLength()
}

// EncodedFieldType returns the single-byte type tag for this field, 'e'.
//
// TODO(review): the original author left a CHECK marker on this value —
// confirm that 'e' is the intended tag and does not collide with the tag of
// any other field type.
func (n *VectorBase64Field) EncodedFieldType() byte {
	const typeTag byte = 'e' // CHECK
	return typeTag
}

// AnalyzedTokenFrequencies returns the token frequencies reported by the
// wrapped vector field.
func (n *VectorBase64Field) AnalyzedTokenFrequencies() index.TokenFrequencies {
	inner := n.vectorField
	return inner.AnalyzedTokenFrequencies()
}

// Analyze does nothing: the body is empty, and unlike the other methods it
// does not forward to the wrapped vector field.
func (n *VectorBase64Field) Analyze() {
// CHECK: open question left by the original author — confirm that no
// analysis step is required for this field type.
}

// Value returns the byte value reported by the wrapped vector field.
func (n *VectorBase64Field) Value() []byte {
	inner := n.vectorField
	return inner.Value()
}

// GoString returns a Go-syntax-like description of the field (the format used
// by fmt's %#v verb), built from the wrapped vector field's name, options and
// value.
//
// The type name in the output is document.VectorBase64Field; the previous
// text named a type ("vectorFieldBase64Field") that does not exist.
func (n *VectorBase64Field) GoString() string {
	return fmt.Sprintf("&document.VectorBase64Field{Name:%s, Options: %s, "+
		"Value: %+v}", n.vectorField.Name(), n.vectorField.Options(), n.vectorField.Value())
}

// NewVectorBase64Field decodes encodedValue with decodeVector and wraps the
// resulting vector in a VectorField created with DefaultVectorIndexingOptions.
// The encoded string is retained on the returned field.
//
// arrayPositions is kept in the signature so as not to pollute the API (per
// the original note); here it is passed straight through to
// NewVectorFieldWithIndexingOptions. NOTE(review): whether that constructor
// actually uses it is not visible from this file — confirm.
//
// A decoding failure is returned as-is together with a nil field.
func NewVectorBase64Field(name string, arrayPositions []uint64, encodedValue string,
	dims int, similarity, vectorIndexOptimizedFor string) (*VectorBase64Field, error) {

	decoded, decodeErr := decodeVector(encodedValue)
	if decodeErr != nil {
		return nil, decodeErr
	}

	inner := NewVectorFieldWithIndexingOptions(name, arrayPositions,
		decoded, dims, similarity,
		vectorIndexOptimizedFor, DefaultVectorIndexingOptions)

	field := new(VectorBase64Field)
	field.vectorField = inner
	field.encodedValue = encodedValue
	return field, nil
}

// decodeVector converts a base64 string (standard alphabet, with padding)
// whose decoded payload is a JSON array of numbers into a []float32.
//
// It returns a nil slice and a wrapped error when the input is not valid
// base64 or when the decoded bytes are not a JSON array of numbers. The
// underlying error stays reachable through errors.Is / errors.As.
func decodeVector(encodedValue string) ([]float32, error) {
	raw, err := base64.StdEncoding.DecodeString(encodedValue)
	if err != nil {
		// Report through the return value only; library code should not
		// print to stdout.
		return nil, fmt.Errorf("decoding base64 vector: %w", err)
	}

	var vector []float32
	// json.Unmarshal needs a pointer to the destination. Passing the slice by
	// value made every call fail with *json.InvalidUnmarshalError.
	if err := json.Unmarshal(raw, &vector); err != nil {
		return nil, fmt.Errorf("parsing decoded vector as JSON: %w", err)
	}

	return vector, nil
}

// Vector returns the float32 vector held by the wrapped vector field.
func (n *VectorBase64Field) Vector() []float32 {
	inner := n.vectorField
	return inner.Vector()
}

// Dims returns the dimension count reported by the wrapped vector field.
func (n *VectorBase64Field) Dims() int {
	inner := n.vectorField
	return inner.Dims()
}

// Similarity returns the similarity setting of the wrapped vector field.
func (n *VectorBase64Field) Similarity() string {
	inner := n.vectorField
	return inner.Similarity()
}

// IndexOptimizedFor returns the index-optimization setting of the wrapped
// vector field.
func (n *VectorBase64Field) IndexOptimizedFor() string {
	inner := n.vectorField
	return inner.IndexOptimizedFor()
}
2 changes: 2 additions & 0 deletions mapping/document.go
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,8 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string,
fieldMapping.processGeoShape(property, pathString, path, indexes, context)
} else if fieldMapping.Type == "geopoint" {
fieldMapping.processGeoPoint(property, pathString, path, indexes, context)
} else if fieldMapping.Type == "vector-base64" {
fieldMapping.processVectorBase64(property, pathString, path, indexes, context)
} else {
fieldMapping.processString(propertyValueString, pathString, path, indexes, context)
}
Expand Down
1 change: 0 additions & 1 deletion mapping/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,6 @@ func (im *IndexMappingImpl) determineType(data interface{}) string {

return im.DefaultType
}

func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{}) error {
docType := im.determineType(data)
docMapping := im.mappingForType(docType)
Expand Down
9 changes: 9 additions & 0 deletions mapping/mapping_no_vectors.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,20 @@ func NewVectorFieldMapping() *FieldMapping {
return nil
}

// NewVectorBase64FieldMapping always returns nil in this variant of the
// package, so callers get no field mapping.
// NOTE(review): presumably this file is compiled only when the "vectors"
// build tag is absent — the build constraint is not visible here; confirm.
func NewVectorBase64FieldMapping() *FieldMapping {
return nil
}

// processVector is a no-op in this variant of the package: the body is empty,
// so the property is ignored and context is left untouched.
func (fm *FieldMapping) processVector(propertyMightBeVector interface{},
pathString string, path []string, indexes []uint64, context *walkContext) {

}

// processVectorBase64 is a no-op in this variant of the package: the body is
// empty, so the property is ignored and context is left untouched.
func (fm *FieldMapping) processVectorBase64(propertyMightBeVector interface{},
pathString string, path []string, indexes []uint64, context *walkContext) {

}

// -----------------------------------------------------------------------------
// document validation functions

Expand Down
47 changes: 46 additions & 1 deletion mapping/mapping_vectors.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
package mapping

import (
"encoding/base64"
"encoding/json"
"fmt"
"reflect"

Expand All @@ -43,6 +45,17 @@ func NewVectorFieldMapping() *FieldMapping {
}
}

// NewVectorBase64FieldMapping returns a field mapping of type "vector-base64"
// that is indexed with frequency/norm data skipped, and is neither stored,
// included in the _all field, nor given doc values.
func NewVectorBase64FieldMapping() *FieldMapping {
	fm := new(FieldMapping)
	fm.Type = "vector-base64"
	fm.Store = false
	fm.Index = true
	fm.IncludeInAll = false
	fm.DocValues = false
	fm.SkipFreqNorm = true
	return fm
}

// validate and process a flat vector
func processFlatVector(vecV reflect.Value, dims int) ([]float32, bool) {
if vecV.Len() != dims {
Expand Down Expand Up @@ -121,6 +134,27 @@ func processVector(vecI interface{}, dims int) ([]float32, bool) {
return rv, true
}

// processVectorBase64 interprets vecBase64 as a standard-alphabet base64
// string wrapping a JSON document and returns the decoded JSON value.
//
// The boolean is false, and the value nil, when vecBase64 is not a string,
// is not valid base64, or does not decode to valid JSON.
func processVectorBase64(vecBase64 interface{}) (interface{}, bool) {
	encoded, isString := vecBase64.(string)
	if !isString {
		return nil, false
	}

	raw, decodeErr := base64.StdEncoding.DecodeString(encoded)
	if decodeErr != nil {
		return nil, false
	}

	var decoded interface{}
	if jsonErr := json.Unmarshal(raw, &decoded); jsonErr != nil {
		return nil, false
	}
	return decoded, true
}

func (fm *FieldMapping) processVector(propertyMightBeVector interface{},
pathString string, path []string, indexes []uint64, context *walkContext) {
vector, ok := processVector(propertyMightBeVector, fm.Dims)
Expand All @@ -139,13 +173,24 @@ func (fm *FieldMapping) processVector(propertyMightBeVector interface{},
context.excludedFromAll = append(context.excludedFromAll, fieldName)
}

// processVectorBase64 decodes the property with the package-level
// processVectorBase64 helper and, if decoding succeeds, hands the decoded
// value to fm.processVector with the same path, indexes and context. When
// decoding fails the property is silently skipped.
func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interface{},
	pathString string, path []string, indexes []uint64, context *walkContext) {

	if decoded, ok := processVectorBase64(propertyMightBeVectorBase64); ok {
		fm.processVector(decoded, pathString, path, indexes, context)
	}
}

// -----------------------------------------------------------------------------
// document validation functions

func validateFieldMapping(field *FieldMapping, parentName string,
fieldAliasCtx map[string]*FieldMapping) error {
switch field.Type {
case "vector":
case "vector", "vector-base64":
return validateVectorFieldAlias(field, parentName, fieldAliasCtx)
default: // non-vector field
return validateFieldType(field)
Expand Down
4 changes: 4 additions & 0 deletions mapping_vector.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,7 @@ import "github.com/blevesearch/bleve/v2/mapping"
// NewVectorFieldMapping returns the field mapping produced by
// mapping.NewVectorFieldMapping.
func NewVectorFieldMapping() *mapping.FieldMapping {
	fm := mapping.NewVectorFieldMapping()
	return fm
}

// NewVectorBase64FieldMapping returns the field mapping produced by
// mapping.NewVectorBase64FieldMapping.
func NewVectorBase64FieldMapping() *mapping.FieldMapping {
	fm := mapping.NewVectorBase64FieldMapping()
	return fm
}

0 comments on commit 8d37f78

Please sign in to comment.