Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: introduce Text data type #39874

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@ require (
github.com/gin-gonic/gin v1.9.1
github.com/go-playground/validator/v10 v10.14.0
github.com/gofrs/flock v0.8.1
github.com/golang/protobuf v1.5.4
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/btree v1.1.2
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
github.com/klauspost/compress v1.17.9
github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250208062437-5af22aa4b559
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250212075039-390de935d742
github.com/minio/minio-go/v7 v7.0.73
github.com/pingcap/log v1.1.1-0.20221015072633-39906604fb81
github.com/prometheus/client_golang v1.14.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -660,8 +660,8 @@ github.com/milvus-io/cgosymbolizer v0.0.0-20240722103217-b7dee0e50119 h1:9VXijWu
github.com/milvus-io/cgosymbolizer v0.0.0-20240722103217-b7dee0e50119/go.mod h1:DvXTE/K/RtHehxU8/GtDs4vFtfw64jJ3PaCnFri8CRg=
github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b h1:TfeY0NxYxZzUfIfYe5qYDBzt4ZYRqzUjTR6CvUzjat8=
github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b/go.mod h1:iwW+9cWfIzzDseEBCCeDSN5SD16Tidvy8cwQ7ZY8Qj4=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250208062437-5af22aa4b559 h1:c8n10eBkYU/HYaDUNAaKog4aIA3ZHO+GL7bHN2Ug/MA=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250208062437-5af22aa4b559/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250212075039-390de935d742 h1:BkdzBgzsSLBjsuXrwKWKdKN0C5Bk4U3inW0J7Dq6Yrc=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250212075039-390de935d742/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
github.com/milvus-io/pulsar-client-go v0.12.1 h1:O2JZp1tsYiO7C0MQ4hrUY/aJXnn2Gry6hpm7UodghmE=
github.com/milvus-io/pulsar-client-go v0.12.1/go.mod h1:dkutuH4oS2pXiGm+Ti7fQZ4MRjrMPZ8IJeEGAWMeckk=
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs=
Expand Down
6 changes: 4 additions & 2 deletions internal/core/src/common/ChunkWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,8 @@
break;
}
case milvus::DataType::VARCHAR:
case milvus::DataType::STRING: {
case milvus::DataType::STRING:
case milvus::DataType::TEXT: {
w = std::make_shared<StringChunkWriter>(nullable);
break;
}
Expand Down Expand Up @@ -486,7 +487,8 @@
break;
}
case milvus::DataType::VARCHAR:
case milvus::DataType::STRING: {
case milvus::DataType::STRING:
case milvus::DataType::TEXT: {

Check warning on line 491 in internal/core/src/common/ChunkWriter.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/common/ChunkWriter.cpp#L490-L491

Added lines #L490 - L491 were not covered by tests
w = std::make_shared<StringChunkWriter>(
file, file_offset, nullable);
break;
Expand Down
4 changes: 3 additions & 1 deletion internal/core/src/common/FieldData.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,8 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
return FillFieldData(array_info.first, array_info.second);
}
case DataType::STRING:
case DataType::VARCHAR: {
case DataType::VARCHAR:
case DataType::TEXT: {
AssertInfo(array->type()->id() == arrow::Type::type::STRING,
"inconsistent data type");
auto string_array =
Expand Down Expand Up @@ -311,6 +312,7 @@ InitScalarFieldData(const DataType& type, bool nullable, int64_t cap_rows) {
type, nullable, cap_rows);
case DataType::STRING:
case DataType::VARCHAR:
case DataType::TEXT:
return std::make_shared<FieldData<std::string>>(
type, nullable, cap_rows);
case DataType::JSON:
Expand Down
14 changes: 14 additions & 0 deletions internal/core/src/common/Types.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@
VARCHAR = 21,
ARRAY = 22,
JSON = 23,
// GEOMETRY = 24 // reserved in proto
TEXT = 25,

// Some special Data type, start from after 50
// just for internal use now, may sync proto in future
Expand Down Expand Up @@ -182,6 +184,8 @@
return "array";
case DataType::JSON:
return "json";
case DataType::TEXT:
return "text";

Check warning on line 188 in internal/core/src/common/Types.h

View check run for this annotation

Codecov / codecov/patch

internal/core/src/common/Types.h#L187-L188

Added lines #L187 - L188 were not covered by tests
case DataType::VECTOR_FLOAT:
return "vector_float";
case DataType::VECTOR_BINARY:
Expand Down Expand Up @@ -255,6 +259,7 @@
switch (data_type) {
case DataType::VARCHAR:
case DataType::STRING:
case DataType::TEXT:
return true;
default:
return false;
Expand Down Expand Up @@ -538,6 +543,12 @@
static constexpr const char* Name = "STRING";
};

// Traits for DataType::TEXT: inherits every member of the VARCHAR traits and
// overrides only the type tag and the printable name.
template <>
struct TypeTraits<DataType::TEXT> : TypeTraits<DataType::VARCHAR> {
    // Type tag reported for this specialization.
    static constexpr DataType TypeKind{DataType::TEXT};
    // Upper-case name reported for this specialization.
    static constexpr const char* Name{"TEXT"};
};

template <>
struct TypeTraits<DataType::ARRAY> {
using NativeType = void;
Expand Down Expand Up @@ -620,6 +631,9 @@
case milvus::DataType::VARCHAR:
name = "VARCHAR";
break;
case milvus::DataType::TEXT:
name = "TEXT";
break;

Check warning on line 636 in internal/core/src/common/Types.h

View check run for this annotation

Codecov / codecov/patch

internal/core/src/common/Types.h#L634-L636

Added lines #L634 - L636 were not covered by tests
case milvus::DataType::ARRAY:
name = "ARRAY";
break;
Expand Down
4 changes: 3 additions & 1 deletion internal/core/src/mmap/Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ PaddingSize(const DataType& type) {
return simdjson::SIMDJSON_PADDING;
case DataType::VARCHAR:
case DataType::STRING:
case DataType::TEXT:
return FILE_STRING_PADDING;
break;
case DataType::ARRAY:
Expand Down Expand Up @@ -92,7 +93,8 @@ WriteFieldData(File& file,
BufferedWriter bw = BufferedWriter(file, 1048576);
switch (data_type) {
case DataType::VARCHAR:
case DataType::STRING: {
case DataType::STRING:
case DataType::TEXT: {
// write as: |size|data|size|data......
for (auto i = 0; i < data->get_num_rows(); ++i) {
indices.push_back(total_written);
Expand Down
9 changes: 6 additions & 3 deletions internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,8 @@
int64_t field_data_size = 0;
switch (data_type) {
case milvus::DataType::STRING:
case milvus::DataType::VARCHAR: {
case milvus::DataType::VARCHAR:
case milvus::DataType::TEXT: {
auto var_column =
std::make_shared<ChunkedVariableColumn<std::string>>(
field_meta);
Expand Down Expand Up @@ -571,7 +572,8 @@
if (IsVariableDataType(data_type)) {
switch (data_type) {
case milvus::DataType::STRING:
case milvus::DataType::VARCHAR: {
case milvus::DataType::VARCHAR:
case milvus::DataType::TEXT: {

Check warning on line 576 in internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp#L575-L576

Added lines #L575 - L576 were not covered by tests
// auto var_column = std::make_shared<VariableColumn<std::string>>(
// file,
// total_written,
Expand Down Expand Up @@ -1579,7 +1581,8 @@
}
switch (field_meta.get_data_type()) {
case DataType::VARCHAR:
case DataType::STRING: {
case DataType::STRING:
case DataType::TEXT: {

Check warning on line 1585 in internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp#L1584-L1585

Added lines #L1584 - L1585 were not covered by tests
bulk_subscript_ptr_impl<std::string>(
column.get(),
seg_offsets,
Expand Down
3 changes: 2 additions & 1 deletion internal/core/src/segcore/ConcurrentVector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ VectorBase::set_data_raw(ssize_t element_offset,
return set_data_raw(
element_offset, FIELD_DATA(data, double).data(), element_count);
}
case DataType::VARCHAR: {
case DataType::VARCHAR:
case DataType::TEXT: {
auto& field_data = FIELD_DATA(data, string);
std::vector<std::string> data_raw(field_data.begin(),
field_data.end());
Expand Down
3 changes: 2 additions & 1 deletion internal/core/src/segcore/InsertRecord.h
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,8 @@ struct InsertRecord {
this->append_data<double>(field_id, size_per_chunk);
break;
}
case DataType::VARCHAR: {
case DataType::VARCHAR:
case DataType::TEXT: {
this->append_data<std::string>(field_id, size_per_chunk);
break;
}
Expand Down
3 changes: 2 additions & 1 deletion internal/core/src/segcore/SegmentChunkReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,8 @@ SegmentChunkReader::GetChunkDataAccessor(DataType data_type,
case DataType::DOUBLE:
return GetChunkDataAccessor<double>(
field_id, chunk_id, data_barrier);
case DataType::VARCHAR: {
case DataType::VARCHAR:
case DataType::TEXT: {
return GetChunkDataAccessor<std::string>(
field_id, chunk_id, data_barrier);
}
Expand Down
3 changes: 2 additions & 1 deletion internal/core/src/segcore/SegmentGrowingImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,8 @@ SegmentGrowingImpl::bulk_subscript(FieldId field_id,
->mutable_data());
break;
}
case DataType::VARCHAR: {
case DataType::VARCHAR:
case DataType::TEXT: {
bulk_subscript_ptr_impl<std::string>(vec_ptr,
seg_offsets,
count,
Expand Down
9 changes: 6 additions & 3 deletions internal/core/src/segcore/SegmentSealedImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,8 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
int64_t field_data_size = 0;
switch (data_type) {
case milvus::DataType::STRING:
case milvus::DataType::VARCHAR: {
case milvus::DataType::VARCHAR:
case milvus::DataType::TEXT: {
auto var_column = std::make_shared<
SingleChunkVariableColumn<std::string>>(
num_rows, field_meta, get_block_size());
Expand Down Expand Up @@ -571,7 +572,8 @@ SegmentSealedImpl::MapFieldData(const FieldId field_id, FieldDataInfo& data) {
if (IsVariableDataType(data_type)) {
switch (data_type) {
case milvus::DataType::STRING:
case milvus::DataType::VARCHAR: {
case milvus::DataType::VARCHAR:
case milvus::DataType::TEXT: {
auto var_column =
std::make_shared<SingleChunkVariableColumn<std::string>>(
file,
Expand Down Expand Up @@ -1399,7 +1401,8 @@ SegmentSealedImpl::get_raw_data(FieldId field_id,
}
switch (field_meta.get_data_type()) {
case DataType::VARCHAR:
case DataType::STRING: {
case DataType::STRING:
case DataType::TEXT: {
bulk_subscript_ptr_impl<std::string>(
column.get(),
seg_offsets,
Expand Down
15 changes: 10 additions & 5 deletions internal/core/src/segcore/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,8 @@
} else {
switch (data_type) {
case DataType::STRING:
case DataType::VARCHAR: {
case DataType::VARCHAR:
case DataType::TEXT: {
auto& string_data = FIELD_DATA(data, string);
for (auto& str : string_data) {
result += str.size();
Expand Down Expand Up @@ -187,7 +188,8 @@
break;
}
case DataType::VARCHAR:
case DataType::STRING: {
case DataType::STRING:
case DataType::TEXT: {
for (auto& array_bytes : array_data) {
auto element_num =
array_bytes.string_data().data_size();
Expand Down Expand Up @@ -276,7 +278,8 @@
break;
}
case DataType::VARCHAR:
case DataType::STRING: {
case DataType::STRING:
case DataType::TEXT: {
auto obj = scalar_array->mutable_string_data();
obj->mutable_data()->Reserve(count);
for (auto i = 0; i < count; i++) {
Expand Down Expand Up @@ -430,7 +433,8 @@
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::VARCHAR: {
case DataType::VARCHAR:
case DataType::TEXT: {
auto data = reinterpret_cast<const std::string*>(data_raw);
auto obj = scalar_array->mutable_string_data();
for (auto i = 0; i < count; i++) {
Expand Down Expand Up @@ -660,7 +664,8 @@
*(obj->mutable_data()->Add()) = data[src_offset];
break;
}
case DataType::VARCHAR: {
case DataType::VARCHAR:
case DataType::TEXT: {

Check warning on line 668 in internal/core/src/segcore/Utils.cpp

View check run for this annotation

Codecov / codecov/patch

internal/core/src/segcore/Utils.cpp#L667-L668

Added lines #L667 - L668 were not covered by tests
auto& data = FIELD_DATA(src_field_data, string);
auto obj = scalar_array->mutable_string_data();
*(obj->mutable_data()->Add()) = data[src_offset];
Expand Down
3 changes: 2 additions & 1 deletion internal/core/src/storage/Event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,8 @@ BaseEventData::Serialize() {
}
switch (data_type) {
case DataType::VARCHAR:
case DataType::STRING: {
case DataType::STRING:
case DataType::TEXT: {
for (size_t offset = 0; offset < field_data->get_num_rows();
++offset) {
auto str = static_cast<const std::string*>(
Expand Down
7 changes: 5 additions & 2 deletions internal/core/src/storage/Util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,8 @@ CreateArrowBuilder(DataType data_type) {
return std::make_shared<arrow::DoubleBuilder>();
}
case DataType::VARCHAR:
case DataType::STRING: {
case DataType::STRING:
case DataType::TEXT: {
return std::make_shared<arrow::StringBuilder>();
}
case DataType::ARRAY:
Expand Down Expand Up @@ -357,7 +358,8 @@ CreateArrowSchema(DataType data_type, bool nullable) {
{arrow::field("val", arrow::float64(), nullable)});
}
case DataType::VARCHAR:
case DataType::STRING: {
case DataType::STRING:
case DataType::TEXT: {
return arrow::schema(
{arrow::field("val", arrow::utf8(), nullable)});
}
Expand Down Expand Up @@ -812,6 +814,7 @@ CreateFieldData(const DataType& type,
type, nullable, total_num_rows);
case DataType::STRING:
case DataType::VARCHAR:
case DataType::TEXT:
return std::make_shared<FieldData<std::string>>(
type, nullable, total_num_rows);
case DataType::JSON:
Expand Down
2 changes: 1 addition & 1 deletion internal/flushcommon/pipeline/flow_graph_embedding_node.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@

embeddingData, ok := data.Data[inputFieldId].GetDataRows().([]string)
if !ok {
return fmt.Errorf("BM25 embedding failed: input field data not varchar")
return fmt.Errorf("BM25 embedding failed: input field data not varchar/text")

Check warning on line 89 in internal/flushcommon/pipeline/flow_graph_embedding_node.go

View check run for this annotation

Codecov / codecov/patch

internal/flushcommon/pipeline/flow_graph_embedding_node.go#L89

Added line #L89 was not covered by tests
}

output, err := runner.BatchRun(embeddingData)
Expand Down
7 changes: 7 additions & 0 deletions internal/parser/planparserv2/parser_visitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@
nestedPath = append(nestedPath, identifier)
}

if field.DataType == schemapb.DataType_Text {
return nil, fmt.Errorf("filter on text field (%s) is not supported yet", field.Name)
}

return &ExprWithType{
expr: &planpb.Expr{
Expr: &planpb.Expr_ColumnExpr{
Expand Down Expand Up @@ -494,6 +498,9 @@
if !typeutil.IsStringType(column.dataType) {
return fmt.Errorf("text match operation on non-string is unsupported")
}
if column.dataType == schemapb.DataType_Text {
return fmt.Errorf("text match operation on text field is not supported yet")
}

Check warning on line 503 in internal/parser/planparserv2/parser_visitor.go

View check run for this annotation

Codecov / codecov/patch

internal/parser/planparserv2/parser_visitor.go#L502-L503

Added lines #L502 - L503 were not covered by tests

queryText, err := convertEscapeSingle(ctx.StringLiteral().GetText())
if err != nil {
Expand Down
15 changes: 15 additions & 0 deletions internal/parser/planparserv2/plan_parser_v2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,21 @@ func TestExpr_PhraseMatch(t *testing.T) {
}
}

func TestExpr_TextField(t *testing.T) {
schema := newTestSchema(true)
helper, err := typeutil.CreateSchemaHelper(schema)
assert.NoError(t, err)

invalidExprs := []string{
`TextField == "query"`,
`text_match(TextField, "query")`,
}

for _, exprStr := range invalidExprs {
assertInvalidExpr(t, helper, exprStr)
}
}

func TestExpr_IsNull(t *testing.T) {
schema := newTestSchema(false)
schema.EnableDynamicField = false
Expand Down
1 change: 1 addition & 0 deletions internal/proxy/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,7 @@ func (t *createCollectionTask) PreExecute(ctx context.Context) error {
// valid max length per row parameters
// if max_length not specified, return error
if field.DataType == schemapb.DataType_VarChar ||
field.DataType == schemapb.DataType_Text ||
(field.GetDataType() == schemapb.DataType_Array && field.GetElementType() == schemapb.DataType_VarChar) {
err = validateMaxLengthPerRow(t.schema.Name, field)
if err != nil {
Expand Down
6 changes: 3 additions & 3 deletions internal/proxy/task_insert.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,10 +216,10 @@
return err
}

// check varchar with analyzer was utf-8 format
err = checkVarcharFormat(it.schema, it.insertMsg)
// check that varchar/text fields with an analyzer contain valid UTF-8
err = checkInputUtf8Compatiable(it.schema, it.insertMsg)
if err != nil {
log.Warn("check varchar format failed", zap.Error(err))
log.Warn("check varchar/text format failed", zap.Error(err))

Check warning on line 222 in internal/proxy/task_insert.go

View check run for this annotation

Codecov / codecov/patch

internal/proxy/task_insert.go#L222

Added line #L222 was not covered by tests
return err
}

Expand Down
Loading
Loading