From c237a9cc2b5374f751914909113298118ff3c0bb Mon Sep 17 00:00:00 2001
From: fengjial <jialin_feng@163.com>
Date: Fri, 15 Mar 2024 16:05:36 +0800
Subject: [PATCH] add baidu vdb as retriever (#183)

---
 .../core/components/retriever/README.md       |  86 +---
 .../components/retriever/baidu_vdb/README.md  |  85 ++++
 .../retriever/baidu_vdb/__init__.py           |   0
 .../retriever/baidu_vdb/baiduvdb_retriever.py | 403 ++++++++++++++++++
 .../core/components/retriever/bes/README.md   |  88 ++++
 .../core/components/retriever/bes/__init__.py |   0
 .../retriever/{ => bes}/bes_retriever.py      |   0
 requirements.txt                              |   1 +
 8 files changed, 581 insertions(+), 82 deletions(-)
 create mode 100644 appbuilder/core/components/retriever/baidu_vdb/README.md
 create mode 100644 appbuilder/core/components/retriever/baidu_vdb/__init__.py
 create mode 100644 appbuilder/core/components/retriever/baidu_vdb/baiduvdb_retriever.py
 create mode 100644 appbuilder/core/components/retriever/bes/README.md
 create mode 100644 appbuilder/core/components/retriever/bes/__init__.py
 rename appbuilder/core/components/retriever/{ => bes}/bes_retriever.py (100%)

diff --git a/appbuilder/core/components/retriever/README.md b/appbuilder/core/components/retriever/README.md
index 73a4b494..3374a721 100644
--- a/appbuilder/core/components/retriever/README.md
+++ b/appbuilder/core/components/retriever/README.md
@@ -1,88 +1,10 @@
-# 向量检索（Baidu ElasticSearch Retriever）
+# 向量检索
 
 ## 简介
-向量检索组件（Baidu ElasticSearch Retriever）基于一款Baidu ElasticSearch的内容检索组件，支持根据文本的向量的相似度进行内容检索。
+AppBuilder提供多种向量数据库作为向量检索的底座，当前主要支持百度向量数据库、百度 ElasticSearch。
 
 ### 功能介绍
-向量检索组件（Baidu ElasticSearch Retriever）用于在将文本内容输入到Baidu ElasticSearch，根据文本的向量相似度进行高效的内容检索。
+向量检索组件-VDB（Baidu VDB Retriever）以百度向量数据库作为向量存储和检索的底座。百度向量数据库是一个专注于多维向量数据的存储、检索和分析的企业级分布式数据库服务。基于百度自主研发的向量数据库内核，VectorDB在保证高性能和高可用性的同时，也特别注重易用性和可扩展性。它支持多种索引类型和相似度计算方法，能够满足各类复杂和多样化的数据应用需求。特别值得一提的是，VectorDB能够管理高达数十亿的向量规模，同时保持毫秒级的查询响应时间，非常适合进行大规模的向量检索和分析任务。
 
-### 特色优势
-- 高效准确：基于Baidu ElasticSearch的强大能力，提供高效且准确的内容检索功能。
+向量检索组件-BES（Baidu ElasticSearch Retriever）以百度 ElasticSearch作为向量存储和检索的底座。百度 ElasticSearch是一款专为企业级需求设计的分布式搜索和分析服务，它在全面兼容开源ElasticSearch的基础上，提供了更多增强功能。这款服务的核心优势在于其高性能和高可靠性，它为处理结构化和非结构化数据提供了一个低成本且高效的平台。对于关注数据安全的客户来说，百度ElasticSearch提供了先进的权限管理机制，使得您可以根据业务需求自由地配置集群权限。
 
-### 应用场景
-各种内容检索场景
-
-## 准备工作
-在使用Baidu ElasticSearch Retriever进行内容检索之前，需要到Baidu ElasticSearch官网创建相应的集群，详情见[教程](https://cloud.baidu.com/doc/BES/s/gke3ocf89)。
-
-注：创建集群时请选择7.10.2版本的ES，否则可能无法使用本组件。
-
-## 基本用法
-
-以下是有关如何开始使用BESRetriever的代码示例：
-
-```python
-import os
-import appbuilder
-
-# 请前往千帆AppBuilder官网创建密钥，流程详见：https://cloud.baidu.com/doc/AppBuilder/s/Olq6grrt6#1%E3%80%81%E5%88%9B%E5%BB%BA%E5%AF%86%E9%92%A5
-os.environ["APPBUILDER_TOKEN"] = '...'
-
-embedding = appbuilder.Embedding()
-segments = appbuilder.Message(["文心一言大模型", "百度在线科技有限公司"])
-# 初始化构建索引
-vector_index = appbuilder.BESVectorStoreIndex.from_segments(segments=segments, cluster_id=es_cluster_id, user_name=es_username, 
-                                                            password=es_password, embedding=embedding)
-# 获取当前索引中的全部内容
-all_content = vector_index.get_all_segments()
-print(all_content)
-# 转化为retriever
-retriever = vector_index.as_retriever()
-# 按照query进行检索
-query = appbuilder.Message("文心一言")
-res = retriever(query=query, top_k=1)
-print(res)
-# 删除当前索引中的全部内容
-vector_index.delete_all_segments()
-```
-
-## 参数说明
-
-### 鉴权说明
-使用组件之前，请首先申请并设置鉴权参数，可参考[组件使用流程](https://cloud.baidu.com/doc/AppBuilder/s/Olq6grrt6#1%E3%80%81%E5%88%9B%E5%BB%BA%E5%AF%86%E9%92%A5)。
-```python
-# 设置环境中的TOKEN，以下示例略
-os.environ["APPBUILDER_TOKEN"] = "bce-YOURTOKEN"
-```
-
-### 初始化参数说明：
-
-- segments （Message[List[str]]，必填）：需要入库的文本段落
-- cluster_id （str，必填）：ElacticSearch集群的id，创建集群时获取
-- user_name  （str，必填）：连接ES集群所需的用户名，创建集群时获取
-- password   （str，必填）：连接ES集群所需的密码，创建集群时获取
-- embedding  （obj，非必填）：用于将文本转为向量的模型，默认为Embedding
-
-### 调用参数：
-| 参数名称    | 参数类型   |是否必须 | 描述               | 示例值           |
-|---------|--------|--------|------------------|---------------|
-| message | String |是 | 需要检索的内容          | "中国2023人均GDP" |
-| top_k   | int    |否 | 返回相似度最高的top_k个内容 | 1             |
-
-### 响应参数
-| 参数名称 | 参数类型   | 描述  | 示例值                |
-|------|--------|-----|--------------------|
-| text | string | 检索结果 | "中国2023年人均GDP8.94万元" |
-| score | float  | 相似度 | 0.95               |
-| meta | dict   | 元信息 | ""                   |
-### 响应示例
-```json
-{"text": "中国2023年人均GDP8.94万元", "score": 0.95, "meta": ""}
-```
-
-## 高级用法：
-
-本组件根据向量的相似度进行检索，支持使用不同的embedding方法和索引方式来优化检索的效果。
-
-## 更新记录和贡献
-* 向量检索能力 (2023-12)
\ No newline at end of file
diff --git a/appbuilder/core/components/retriever/baidu_vdb/README.md b/appbuilder/core/components/retriever/baidu_vdb/README.md
new file mode 100644
index 00000000..f7a1c8b9
--- /dev/null
+++ b/appbuilder/core/components/retriever/baidu_vdb/README.md
@@ -0,0 +1,85 @@
+# 向量检索（Baidu VDB Retriever）
+
+## 简介
+向量检索组件（Baidu VDB Retriever）是一款基于百度向量数据库的内容检索组件，支持根据文本的向量相似度进行内容检索。
+
+### 功能介绍
+向量检索组件（Baidu VDB Retriever）用于将文本内容写入百度向量数据库，并根据文本的向量相似度进行高效的内容检索。
+
+### 特色优势
+高效准确：基于百度向量数据库的强大能力，提供高效且准确的内容检索功能。
+
+### 应用场景
+各种内容检索场景
+
+## 准备工作
+在使用Baidu VDB Retriever进行内容检索之前，需要到百度向量数据库官网创建相应的实例，详情见[教程](https://cloud.baidu.com/doc/VDB/s/hlrsoazuf)。
+
+## 基本用法
+
+以下是有关如何开始使用BaiduVDBRetriever的代码示例：
+
+```python
+import os
+import appbuilder
+
+# 请前往千帆AppBuilder官网创建密钥，流程详见：https://cloud.baidu.com/doc/AppBuilder/s/Olq6grrt6#1%E3%80%81%E5%88%9B%E5%BB%BA%E5%AF%86%E9%92%A5
+os.environ["APPBUILDER_TOKEN"] = '...'
+
+embedding = appbuilder.Embedding()
+segments = appbuilder.Message(["文心一言大模型", "百度在线科技有限公司"])
+# 初始化构建索引
+vector_index = appbuilder.BaiduVDBVectorStoreIndex.from_params(
+    instance_id=your_instance_id,
+    api_key=your_api_key,
+    drop_exists=True,
+)
+vector_index.add_segments(segments)
+
+query = appbuilder.Message("文心一言")
+retriever = vector_index.as_retriever()
+res = retriever(query)
+print(res)
+```
+
+## 参数说明
+
+### 鉴权说明
+使用组件之前，请首先申请并设置鉴权参数，可参考[组件使用流程](https://cloud.baidu.com/doc/AppBuilder/s/Olq6grrt6#1%E3%80%81%E5%88%9B%E5%BB%BA%E5%AF%86%E9%92%A5)。
+```python
+# 设置环境中的TOKEN，以下示例略
+os.environ["APPBUILDER_TOKEN"] = "bce-YOURTOKEN"
+```
+
+### 初始化参数说明：
+
+- segments （Message[List[str]]，必填）：需要入库的文本段落，创建索引后通过 `add_segments` 方法传入
+- instance_id（str，必填）：百度向量数据库的实例id，创建实例时获取
+- api_key    （str，必填）：连接向量数据库所需的密码，创建实例时获取
+- account    （str，非必填）：连接向量数据库所需的用户名，默认root
+- embedding  （obj，非必填）：用于将文本转为向量的模型，默认为Embedding
+- drop_exists (bool, 非必填) ：是否清空数据库历史记录，默认为False
+
+### 调用参数：
+| 参数名称    | 参数类型   |是否必须 | 描述               | 示例值           |
+|---------|--------|--------|------------------|---------------|
+| query   | Message |是 | 需要检索的内容          | "中国2023人均GDP" |
+| top_k   | int    |否 | 返回相似度最高的top_k个内容 | 1             |
+
+### 响应参数
+| 参数名称 | 参数类型   | 描述  | 示例值                |
+|------|--------|-----|--------------------|
+| text | string | 检索结果 | "中国2023年人均GDP8.94万元" |
+| score | float  | 相似度 | 0.95               |
+| meta | dict   | 元信息 | ""                   |
+### 响应示例
+```json
+{"text": "中国2023年人均GDP8.94万元", "score": 0.95, "meta": ""}
+```
+
+## 高级用法：
+
+本组件根据向量的相似度进行检索，支持使用不同的embedding方法和索引方式来优化检索的效果。
+
+## 更新记录和贡献
+* 向量检索能力 (2024-03)
diff --git a/appbuilder/core/components/retriever/baidu_vdb/__init__.py b/appbuilder/core/components/retriever/baidu_vdb/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/appbuilder/core/components/retriever/baidu_vdb/baiduvdb_retriever.py b/appbuilder/core/components/retriever/baidu_vdb/baiduvdb_retriever.py
new file mode 100644
index 00000000..d716f05f
--- /dev/null
+++ b/appbuilder/core/components/retriever/baidu_vdb/baiduvdb_retriever.py
@@ -0,0 +1,403 @@
+# Copyright (c) 2023 Baidu, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# -*- coding: utf-8 -*-
+"""
+基于Baidu VDB的retriever
+"""
+import importlib
+import os
+import random
+import string
+import time
+from typing import Dict, Any
+from appbuilder.core.component import Component, Message
+from appbuilder.core.components.embeddings.component import Embedding
+from appbuilder.core.constants import GATEWAY_URL
+from appbuilder.utils.logger_util import logger
+
+DEFAULT_ACCOUNT = "root"
+DEFAULT_DATABASE_NAME = "AppBuilderDatabase"
+DEFAULT_TABLE_NAME = "AppBuilderTable"
+DEFAULT_TIMEOUT_IN_MILLS: int = 30 * 1000  # 30 seconds, expressed in milliseconds
+
+DEFAULT_PARTITION = 1
+DEFAULT_REPLICA = 3
+DEFAULT_INDEX_TYPE = "HNSW"
+DEFAULT_METRIC_TYPE = "L2"
+
+DEFAULT_HNSW_M = 16  # used when `vector_params` has no "M" key
+DEFAULT_HNSW_EF_CONSTRUCTION = 200  # used when `vector_params` has no "efConstruction" key
+DEFAULT_HNSW_EF = 10
+
+DEFAULT_BATCH_SIZE = 1000  # row-count threshold that add_segments compares against
+
+FIELD_ID: str = "id"  # field names of the table schema
+FIELD_TEXT: str = "text"
+FIELD_VECTOR: str = "vector"
+FIELD_METADATA: str = "metadata"
+INDEX_VECTOR: str = "vector_idx"  # name of the vector index built on FIELD_VECTOR
+
+VALUE_NONE_ERROR = "Parameter `{}` can not be None."  # placeholder: parameter name
+NOT_SUPPORT_INDEX_TYPE_ERROR = (  # placeholders: rejected value, list of supported values
+    "Unsupported index type: `{}`, supported index types are {}"
+)
+NOT_SUPPORT_METRIC_TYPE_ERROR = (  # placeholders: rejected value, list of supported values
+    "Unsupported metric type: `{}`, supported metric types are {}"
+)
+
+def _try_import() -> None:
+    """Fail fast with an install hint when `pymochow` cannot be imported."""
+    try:
+        importlib.import_module("pymochow")
+    except ImportError:
+        hint = "`pymochow` package not found, please run `pip install pymochow`"
+        raise ImportError(hint)
+
+class TableParams:
+    """Settings used when creating a Baidu VectorDB table.
+    Reference: https://cloud.baidu.com/doc/VDB/s/mlrsob0p6
+    Args:
+        dimension (int): Dimension of the stored vectors.
+        table_name (str): Name of the table. Default value is DEFAULT_TABLE_NAME.
+        replication (int): Number of replicas of the table.
+        partition (int): Number of partitions of the table.
+        index_type (str): Index type such as HNSW or FLAT. Default value is "HNSW".
+        metric_type (str): L2, COSINE or IP. Default value is "L2".
+        drop_exists (bool): Whether to drop an existing table first. Default value is False.
+        vector_params (Dict): Index build parameters. For HNSW the keys `M` and
+          `efConstruction` are read, for example `{'M': 16, 'efConstruction': 200}`.
+          May be left as None.
+    """
+
+    def __init__(
+        self,
+        dimension: int,
+        table_name: str = DEFAULT_TABLE_NAME,
+        replication: int = DEFAULT_REPLICA,
+        partition: int = DEFAULT_PARTITION,
+        index_type: str = DEFAULT_INDEX_TYPE,
+        metric_type: str = DEFAULT_METRIC_TYPE,
+        drop_exists: bool = False,
+        vector_params: Dict = None,
+    ):
+        self.table_name = table_name
+        self.drop_exists = drop_exists
+        self.dimension = dimension
+        self.partition = partition
+        self.replication = replication
+        self.metric_type = metric_type
+        self.index_type = index_type
+        self.vector_params = vector_params
+
+class BaiduVDBVectorStoreIndex:
+    """
+    Vector store and retrieval helper backed by Baidu VDB (accessed through pymochow).
+    """
+    vdb_uri_prefix = b"/api/v1/bce/vdb/instance/"
+
+    def __init__(
+        self,
+        instance_id: str,
+        api_key: str,
+        account: str = DEFAULT_ACCOUNT,
+        database_name: str = DEFAULT_DATABASE_NAME,
+        table_params: TableParams = TableParams(dimension=384),
+        embedding=None,
+    ):
+        """Connect to the instance, then make sure the database and table exist."""
+        if embedding is None:
+            embedding = Embedding()
+
+        self.embedding = embedding
+
+        self._init_client(instance_id, account, api_key)
+        self._create_database_if_not_exists(database_name)
+        self._create_table(table_params)
+
+    def _init_client(self, instance_id, account, api_key):  # NOTE(review): instance_id is unused — confirm
+        """Create the pymochow client; the AppBuilder token is read from `APPBUILDER_TOKEN`."""
+        import pymochow
+        from pymochow.configuration import Configuration
+        from pymochow.auth.bce_credentials import AppBuilderCredentials
+
+        gateway = os.getenv("GATEWAY_URL") or GATEWAY_URL
+        appbuilder_token = os.getenv("APPBUILDER_TOKEN")
+        if not appbuilder_token:
+            raise ValueError("Environment variable `APPBUILDER_TOKEN` is not set.")
+        config = Configuration(
+            credentials=AppBuilderCredentials(account, api_key, appbuilder_token),
+            endpoint=gateway,
+            uri_perfix=self.vdb_uri_prefix,  # NOTE(review): spelled "perfix" — confirm the keyword
+            connection_timeout_in_mills=DEFAULT_TIMEOUT_IN_MILLS,
+        )
+        self.vdb_client = pymochow.MochowClient(config)
+
+    def _create_database_if_not_exists(self, database_name: str) -> None:
+        """Bind `self.database` to `database_name`, creating the database when absent."""
+        existing = {db.database_name for db in self.vdb_client.list_databases()}
+        if database_name in existing:
+            self.database = self.vdb_client.database(database_name)
+        else:
+            self.database = self.vdb_client.create_database(database_name)
+
+    def _create_table(self, table_params: TableParams) -> None:
+        """Reuse the table if it exists (recreating it when `drop_exists`), else create it."""
+        import pymochow
+
+        if table_params is None:
+            raise ValueError(VALUE_NONE_ERROR.format("table_params"))
+        try:
+            self.table = self.database.describe_table(table_params.table_name)
+            if table_params.drop_exists:
+                self.database.drop_table(table_params.table_name)
+                # wait db release resource
+                time.sleep(5)
+                self._create_table_in_db(table_params)
+        except pymochow.exception.ServerError:  # any server error above falls back to creating the table
+            self._create_table_in_db(table_params)
+
+    def _create_table_in_db(self, table_params: TableParams) -> None:
+        """Create the table (id / metadata / text / vector fields plus one vector index)."""
+        from pymochow.model.enum import FieldType
+        from pymochow.model.schema import Field, Schema, SecondaryIndex, VectorIndex
+        from pymochow.model.table import Partition
+
+        index_type = self._get_index_type(table_params.index_type)
+        metric_type = self._get_metric_type(table_params.metric_type)
+        vector_params = self._get_index_params(index_type, table_params)
+        fields = []
+        # Auto-increment UINT64 primary key that is also the partition key.
+        fields.append(
+            Field(
+                FIELD_ID,
+                FieldType.UINT64,
+                primary_key=True,
+                partition_key=True,
+                auto_increment=True,
+                not_null=True,
+            )
+        )
+        fields.append(Field(FIELD_METADATA, FieldType.STRING))
+        fields.append(Field(FIELD_TEXT, FieldType.STRING))
+        fields.append(
+            Field(
+                FIELD_VECTOR, FieldType.FLOAT_VECTOR, dimension=table_params.dimension
+            )
+        )
+
+        indexes = []
+        indexes.append(
+            VectorIndex(
+                index_name=INDEX_VECTOR,
+                index_type=index_type,
+                field=FIELD_VECTOR,
+                metric_type=metric_type,
+                params=vector_params,
+            )
+        )
+
+        # Build the schema once and hand the same object to create_table.
+        schema = Schema(fields=fields, indexes=indexes)
+        self.table = self.database.create_table(
+            table_name=table_params.table_name,
+            replication=table_params.replication,
+            partition=Partition(partition_num=table_params.partition),
+            schema=schema,
+            enable_dynamic_field=True,
+        )
+        # need wait 10s to wait proxy sync meta
+        time.sleep(10)
+
+    @staticmethod
+    def _get_index_params(index_type: Any, table_params: TableParams) -> Any:
+        """Return `HNSWParams` for an HNSW index, or None for any other index type."""
+        from pymochow.model.enum import IndexType
+        from pymochow.model.schema import HNSWParams
+
+        # Keys missing from the user-supplied dict fall back to the module defaults.
+        vector_params = table_params.vector_params or {}
+
+        if index_type == IndexType.HNSW:
+            return HNSWParams(
+                m=vector_params.get("M", DEFAULT_HNSW_M),
+                efconstruction=vector_params.get(
+                    "efConstruction", DEFAULT_HNSW_EF_CONSTRUCTION
+                ),
+            )
+        return None
+
+    @staticmethod
+    def _get_index_type(index_type_value: str) -> Any:
+        """Map a name to pymochow's `IndexType` (HNSW when empty); ValueError if unsupported."""
+        from pymochow.model.enum import IndexType
+
+        index_type_value = index_type_value or IndexType.HNSW
+        try:
+            return IndexType(index_type_value)
+        except ValueError:
+            # List the accepted values so the error message is actionable.
+            support_index_types = [d.value for d in IndexType.__members__.values()]
+            raise ValueError(
+                NOT_SUPPORT_INDEX_TYPE_ERROR.format(index_type_value, support_index_types)
+            )
+
+    @staticmethod
+    def _get_metric_type(metric_type_value: str) -> Any:
+        """Map a case-insensitive name to `MetricType` (L2 when empty); ValueError if unsupported."""
+        from pymochow.model.enum import MetricType
+
+        if not metric_type_value:
+            return MetricType.L2  # avoid calling `.upper()` on the enum default
+        try:
+            return MetricType(metric_type_value.upper())
+        except ValueError:
+            support_metric_types = [d.value for d in MetricType.__members__.values()]
+            raise ValueError(
+                NOT_SUPPORT_METRIC_TYPE_ERROR.format(metric_type_value, support_metric_types)
+            )
+
+    @property
+    def client(self) -> Any:
+        """Get client."""
+        return self.vdb_client
+
+    def as_retriever(self):
+        """
+        Build a `BaiduVDBRetriever` that shares this index's embedding and table.
+        """
+        return BaiduVDBRetriever(
+            embedding=self.embedding,
+            table=self.table,
+        )
+
+    def add_segments(self, segments: Message, metadata=""):
+        """
+        Embed the given segments and upsert them into the VDB table in batches.
+        Args:
+            segments (Message[List[str]]): text segments to insert.
+            metadata: value stored in the metadata field of every inserted row.
+        """
+        from pymochow.model.table import Row
+
+        segment_vectors = self.embedding.batch(segments)
+        segment_vectors = segment_vectors.content
+        segments = segments.content
+
+        rows = []
+        for segment, vector in zip(segments, segment_vectors):
+            row = Row(text=segment, vector=vector, metadata=metadata)
+            rows.append(row)
+            # Flush every full batch so one request never carries more than DEFAULT_BATCH_SIZE rows.
+            if len(rows) >= DEFAULT_BATCH_SIZE:
+                self.table.upsert(rows=rows)
+                rows = []
+
+        if rows:
+            self.table.upsert(rows=rows)
+
+    @classmethod
+    def from_params(
+        cls,
+        instance_id: str,
+        api_key: str,
+        account: str = DEFAULT_ACCOUNT,
+        database_name: str = DEFAULT_DATABASE_NAME,
+        table_name: str = DEFAULT_TABLE_NAME,
+        drop_exists: bool = False,
+        **kwargs,
+    ):
+        """Build an index; optional kwargs: `dimension` (default 384) and `embedding`."""
+        _try_import()
+        table_params = TableParams(
+            dimension=kwargs.get("dimension", 384), table_name=table_name, drop_exists=drop_exists
+        )
+        # Forward a caller-supplied embedding; None makes __init__ fall back to Embedding().
+        return cls(
+            instance_id=instance_id,
+            account=account,
+            api_key=api_key,
+            database_name=database_name,
+            table_params=table_params,
+            embedding=kwargs.get("embedding"),
+        )
+
+
+class BaiduVDBRetriever(Component):
+    """
+    Vector retrieval component: returns the stored segments that best match a query.
+
+    Examples:
+
+        .. code-block:: python
+
+            import os
+            import time
+            import appbuilder
+            os.environ["APPBUILDER_TOKEN"] = '...'
+            segments = appbuilder.Message(["文心一言大模型", "百度在线科技有限公司"])
+            vector_index = appbuilder.BaiduVDBVectorStoreIndex.from_params(
+                    instance_id=your_instance_id,
+                    api_key=your_api_key,
+            )
+            vector_index.add_segments(segments)
+
+            query = appbuilder.Message("文心一言")
+            time.sleep(5)
+            retriever = vector_index.as_retriever()
+            res = retriever(query)
+    """
+    name: str = "BaiduVectorDBRetriever"
+    tool_desc: Dict[str, Any] = {"description": "a retriever based on Baidu VectorDB"}
+
+    def __init__(self, embedding, table):
+        """Keep the embedding component and the VDB table handle used by `run`."""
+        super().__init__()
+        self.embedding = embedding
+        self.table = table
+
+    def run(self, query: Message, top_k: int = 1):
+        """
+        Embed `query` and search the table for the nearest stored segments.
+        Args:
+            query (Message[str]): the text to search for.
+            top_k (int): number of best-matching results requested from the search.
+        Returns:
+            Message[List[Dict]]: one dict per hit with keys `text`, `meta` and `score`.
+        """
+        from pymochow.model.table import AnnSearch, HNSWSearchParams
+        from pymochow.model.enum import ReadConsistency
+
+        query_embedding = self.embedding(query)
+        anns = AnnSearch(
+            vector_field=FIELD_VECTOR,
+            vector_floats=query_embedding.content,
+            params=HNSWSearchParams(ef=DEFAULT_HNSW_EF, limit=top_k),
+        )
+        res = self.table.search(anns=anns, read_consistency=ReadConsistency.STRONG)
+        rows = res.rows
+        docs = []
+        # No hits (None or an empty list) yields an empty Message.
+        if not rows:
+            return Message(docs)
+
+        for row in rows:
+            row_data = row.get("row", {})
+            docs.append({
+                "text": row_data.get(FIELD_TEXT),
+                "meta": row_data.get(FIELD_METADATA),
+                "score": row.get("score")
+            })
+        return Message(docs)
diff --git a/appbuilder/core/components/retriever/bes/README.md b/appbuilder/core/components/retriever/bes/README.md
new file mode 100644
index 00000000..73a4b494
--- /dev/null
+++ b/appbuilder/core/components/retriever/bes/README.md
@@ -0,0 +1,88 @@
+# 向量检索（Baidu ElasticSearch Retriever）
+
+## 简介
+向量检索组件（Baidu ElasticSearch Retriever）基于一款Baidu ElasticSearch的内容检索组件，支持根据文本的向量的相似度进行内容检索。
+
+### 功能介绍
+向量检索组件（Baidu ElasticSearch Retriever）用于在将文本内容输入到Baidu ElasticSearch，根据文本的向量相似度进行高效的内容检索。
+
+### 特色优势
+- 高效准确：基于Baidu ElasticSearch的强大能力，提供高效且准确的内容检索功能。
+
+### 应用场景
+各种内容检索场景
+
+## 准备工作
+在使用Baidu ElasticSearch Retriever进行内容检索之前，需要到Baidu ElasticSearch官网创建相应的集群，详情见[教程](https://cloud.baidu.com/doc/BES/s/gke3ocf89)。
+
+注：创建集群时请选择7.10.2版本的ES，否则可能无法使用本组件。
+
+## 基本用法
+
+以下是有关如何开始使用BESRetriever的代码示例：
+
+```python
+import os
+import appbuilder
+
+# 请前往千帆AppBuilder官网创建密钥，流程详见：https://cloud.baidu.com/doc/AppBuilder/s/Olq6grrt6#1%E3%80%81%E5%88%9B%E5%BB%BA%E5%AF%86%E9%92%A5
+os.environ["APPBUILDER_TOKEN"] = '...'
+
+embedding = appbuilder.Embedding()
+segments = appbuilder.Message(["文心一言大模型", "百度在线科技有限公司"])
+# 初始化构建索引
+vector_index = appbuilder.BESVectorStoreIndex.from_segments(segments=segments, cluster_id=es_cluster_id, user_name=es_username, 
+                                                            password=es_password, embedding=embedding)
+# 获取当前索引中的全部内容
+all_content = vector_index.get_all_segments()
+print(all_content)
+# 转化为retriever
+retriever = vector_index.as_retriever()
+# 按照query进行检索
+query = appbuilder.Message("文心一言")
+res = retriever(query=query, top_k=1)
+print(res)
+# 删除当前索引中的全部内容
+vector_index.delete_all_segments()
+```
+
+## 参数说明
+
+### 鉴权说明
+使用组件之前，请首先申请并设置鉴权参数，可参考[组件使用流程](https://cloud.baidu.com/doc/AppBuilder/s/Olq6grrt6#1%E3%80%81%E5%88%9B%E5%BB%BA%E5%AF%86%E9%92%A5)。
+```python
+# 设置环境中的TOKEN，以下示例略
+os.environ["APPBUILDER_TOKEN"] = "bce-YOURTOKEN"
+```
+
+### 初始化参数说明：
+
+- segments （Message[List[str]]，必填）：需要入库的文本段落
+- cluster_id （str，必填）：ElasticSearch集群的id，创建集群时获取
+- user_name  （str，必填）：连接ES集群所需的用户名，创建集群时获取
+- password   （str，必填）：连接ES集群所需的密码，创建集群时获取
+- embedding  （obj，非必填）：用于将文本转为向量的模型，默认为Embedding
+
+### 调用参数：
+| 参数名称    | 参数类型   |是否必须 | 描述               | 示例值           |
+|---------|--------|--------|------------------|---------------|
+| query   | Message |是 | 需要检索的内容          | "中国2023人均GDP" |
+| top_k   | int    |否 | 返回相似度最高的top_k个内容 | 1             |
+
+### 响应参数
+| 参数名称 | 参数类型   | 描述  | 示例值                |
+|------|--------|-----|--------------------|
+| text | string | 检索结果 | "中国2023年人均GDP8.94万元" |
+| score | float  | 相似度 | 0.95               |
+| meta | dict   | 元信息 | ""                   |
+### 响应示例
+```json
+{"text": "中国2023年人均GDP8.94万元", "score": 0.95, "meta": ""}
+```
+
+## 高级用法：
+
+本组件根据向量的相似度进行检索，支持使用不同的embedding方法和索引方式来优化检索的效果。
+
+## 更新记录和贡献
+* 向量检索能力 (2023-12)
\ No newline at end of file
diff --git a/appbuilder/core/components/retriever/bes/__init__.py b/appbuilder/core/components/retriever/bes/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/appbuilder/core/components/retriever/bes_retriever.py b/appbuilder/core/components/retriever/bes/bes_retriever.py
similarity index 100%
rename from appbuilder/core/components/retriever/bes_retriever.py
rename to appbuilder/core/components/retriever/bes/bes_retriever.py
diff --git a/requirements.txt b/requirements.txt
index a2aafbfe..19f0f962 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,3 +7,4 @@ urllib3<2.0.0
 tenacity
 pandas
 openpyxl
+pymochow>=1.1.2