From 9551abadc020429da9bca9f6754f74f5010a09af Mon Sep 17 00:00:00 2001 From: zhanghexian1 Date: Tue, 10 Oct 2023 18:04:39 +0800 Subject: [PATCH 01/17] add new field --- .../integrations/vectorstores/vearch.ipynb | 68 +++---- .../langchain/vectorstores/vearch.py | 178 +++++++++++------- 2 files changed, 142 insertions(+), 104 deletions(-) diff --git a/docs/docs_skeleton/docs/integrations/vectorstores/vearch.ipynb b/docs/docs_skeleton/docs/integrations/vectorstores/vearch.ipynb index eca0218a4f32c..11d15411ecfea 100644 --- a/docs/docs_skeleton/docs/integrations/vectorstores/vearch.ipynb +++ b/docs/docs_skeleton/docs/integrations/vectorstores/vearch.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -11,7 +11,7 @@ "text": [ "/export/anaconda3/envs/vearch_cluster_langchain/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", - "Loading checkpoint shards: 100%|██████████| 7/7 [00:07<00:00, 1.01s/it]\n" + "Loading checkpoint shards: 100%|██████████| 7/7 [00:06<00:00, 1.03it/s]\n" ] } ], @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -42,7 +42,7 @@ "ChatGLM:你好👋!我是人工智能助手 ChatGLM2-6B,很高兴见到你,欢迎问我任何问题。\n", "\n", "Human: 你知道凌波微步吗,你知道都有谁学会了吗?\n", - "ChatGLM:凌波微步是一种步伐,最早出自《倚天屠龙记》。在电视剧《人民的名义》中,侯亮平也学会了凌波微步。\n", + "ChatGLM:凌波微步是一种步伐,源自金庸先生的武侠小说《天龙八部》。段誉是学会凌波微步的人之一。\n", "\n" ] } @@ -58,7 +58,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -86,28 +86,28 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "docids ['18ce6747dca04a2c833e60e8dfd83c04', 'aafacb0e46574b378a9f433877ab06a8', '9776bccfdd8643a8b219ccee0596f370']\n", + "['3556d6e803ff4c42b0d30cd2bb452ff2', '0079e8720dd546a3a3392e73009c3697', '95534c7b67cc4a4eb8244ca8541ec2af']\n", "***************after is cluster res*****************\n", - "docids ['1841638988191686991', '-4519586577642625749', '5028230008472292907']\n" + "['-8844872069836508090', '-6664697494158621762', '149322492689170423']\n" ] } ], "source": [ "#first add your document into vearch vectorstore\n", "vearch_standalone = Vearch.from_documents(\n", - " texts,embeddings,path_or_url=\"/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/localdb_new_test\",table_name=\"localdb_new_test\",flag=0)\n", + " texts,embeddings,path_or_url=\"/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/valid_table\",table_name=\"valid_table\",flag=0)\n", "\n", "print(\"***************after is cluster res*****************\")\n", "\n", "vearch_cluster = Vearch.from_documents(\n", - " texts,embeddings,path_or_url=\"http://test-vearch-langchain-router.vectorbase.svc.ht1.n.jd.local\",db_name=\"vearch_cluster_langchian\",table_name=\"tobenumone\",flag=1)\n" + " texts,embeddings,path_or_url=\"http://test-vearch-langchain-router.vectorbase.svc.ht1.n.jd.local\",db_name=\"valid_table_new_2\",table_name=\"valid_table_new_1\",flag=1)\n" ] }, { @@ -152,7 +152,7 @@ "段誉心道:“神仙姊姊所遗的步法,必定精妙之极,遇到强敌时脱身逃走,那就很好,‘再取敌命’也就不必了。”\n", "卷好帛卷,对之作了两个揖,珍而重之地揣入怀中,转身对那玉像道:“神仙姊姊,你吩咐我朝午晚三次练功,段誉不敢有违。今后我对人加倍客气,别人不会来打我,我自然也不会去吸他内力。你这套‘凌波微步’我更要用心练熟,眼见不对,立刻溜之大吉,就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节,却想也不敢去想。\n", "\n", - "********ChatGLM:凌波微步是一门极上乘的轻功,源于《易经》八八六十四卦。使用者按照特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。这门轻功精妙异常,可以使人内力大为提升,但需在练成“北冥神功”后才能真正掌握。凌波微步在金庸先生的《天龙八部》中得到了充分的描写。\n", + "********ChatGLM:凌波微步是一门极上乘的轻功,源于《易经》中的六十四卦。使用者需要按照特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。这门轻功精妙异常,可以让人内力大为提升,但需要谨慎练习,避免误用。据《天龙八部》记载,凌波微步是逍遥派独门轻功身法,段誉曾练过并深得其中的奥义。\n", "\n", "***************************after is cluster res******************************\n", "####################第1段相关文档####################\n", @@ -188,7 +188,7 @@ "段誉心道:“神仙姊姊所遗的步法,必定精妙之极,遇到强敌时脱身逃走,那就很好,‘再取敌命’也就不必了。”\n", "卷好帛卷,对之作了两个揖,珍而重之地揣入怀中,转身对那玉像道:“神仙姊姊,你吩咐我朝午晚三次练功,段誉不敢有违。今后我对人加倍客气,别人不会来打我,我自然也不会去吸他内力。你这套‘凌波微步’我更要用心练熟,眼见不对,立刻溜之大吉,就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节,却想也不敢去想。\n", "\n", - "********ChatGLM:凌波微步是一门极上乘的轻功,源于《易经》中的六十四卦。使用者按照特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。这门轻功精妙异常,可以使人内力增进,但需要谨慎练习,避免伤害他人。凌波微步在逍遥派中尤为流行,但并非所有逍遥派弟子都会凌波微步。\n", + "********ChatGLM:凌波微步是一种轻功身法,源于《易经》中的六十四卦。它通过按照特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈,使内息自然而然地也转了一个周天。凌波微步可以锻炼身体的协调性和灵活性,增强身体的控制力和平衡感。凌波微步是逍遥派独门轻功身法,精妙异常,使用者需按照特定顺序踏着卦象方位行进,并达到一定的内力要求才能真正掌握这门轻功。在《天龙八部》中,段誉曾练凌波微步并精通其中之道。\n", "\n" ] } @@ -229,17 +229,17 @@ "output_type": "stream", "text": [ "Human: 你知道vearch是什么吗?\n", - "ChatGLM:是的,我知道 Vearch。Vearch 是一种用于计算机械系统极化子的工具,它可以用于模拟和优化电路的性能。它是一个基于Matlab的电路仿真软件,可以用于设计和分析各种类型的电路,包括交流电路和直流电路。\n", + "ChatGLM:是的,我知道 Vearch。Vearch 是一种基于矩阵分解的方法,用于解决线性方程组。它是一种高效的算法,特别适用于大规模线性方程组的求解。Vearch 算法将矩阵分解为基向量的叉积形式,这样可以快速地求解线性方程组。\n", "\n", - "docids ['eee5e7468434427eb49829374c1e8220', '2776754da8fc4bb58d3e482006010716', '9223acd6d89d4c2c84ff42677ac0d47c']\n", + "['029aa75548ff4f85908d089888fa84f4', '9e13390d56ce4cf08880579b3f6b340f', 'f7ee0b61bfe0476e988c19559f36d16f']\n", "*****************after is cluster res********************\n", - "docids ['-4311783201092343475', '-2899734009733762895', '1342026762029067927']\n" + "['-6632005459476004618', '-79270890206033064', '7036574374942192674']\n" ] }, { "data": { "text/plain": [ - "['-4311783201092343475', '-2899734009733762895', '1342026762029067927']" + "['-6632005459476004618', '-79270890206033064', '7036574374942192674']" ] }, "execution_count": 7, @@ -286,7 +286,7 @@ "\n", "vearch 是基于C语言,go语言开发的,并提供python接口,可以直接通过pip安装\n", "\n", - "***************ChatGLM:是的,Varch是一个向量数据库,旨在存储和快速搜索模型embedding后的向量。它支持OpenAI、ChatGLM等模型,并可直接通过pip安装。\n", + "***************ChatGLM:是的,我知道 Vearch。Varch 是一款基于 C 语言和 Go 语言开发的向量数据库,旨在存储和快速搜索模型嵌入后的向量,可用于基于个人知识库的大模型应用。Varch 支持 OpenAI、Llama 和 ChatGLM 等模型,并提供了 Python 接口,用户可以通过 pip 安装。Varch 的特点是高可扩展性、高性能和易于使用。\n", "\n", "***************after is cluster res******************\n", "####################第1段相关文档####################\n", @@ -301,7 +301,7 @@ "\n", "vearch 是基于C语言,go语言开发的,并提供python接口,可以直接通过pip安装\n", "\n", - "***************ChatGLM:是的,Varch是一个向量数据库,旨在存储和快速搜索模型embedding后的向量。它支持OpenAI,ChatGLM等模型,并可用于基于个人知识库的大模型应用。Varch基于C语言和Go语言开发,并提供Python接口,可以通过pip安装。\n", + "***************ChatGLM:是的,Vearch是一款存储大语言模型数据的向量数据库,用于存储和快速搜索模型embedding后的向量。它支持OpenAI,Llama,ChatGLM等模型,并且可以与LangChain库集成。Varch是一个基于C语言和Go语言开发的项目,并提供Python接口。通过pip安装,用户可以直接使用Varch进行数据存储和搜索。\n", "\n" ] } @@ -342,22 +342,20 @@ "text": [ "delete vearch standalone docid True\n", "Human: 你知道vearch是什么吗?\n", - "ChatGLM:Vearch是一种用于处理向量的库,可以轻松地将向量转换为矩阵,并提供许多有用的函数和算法,以操作向量。 Vearch支持许多常见的向量操作,例如加法、减法、乘法、除法、矩阵乘法、求和、统计和归一化等。 Vearch还提供了一些高级功能,例如L2正则化、协方差矩阵、稀疏矩阵和奇异值分解等。\n", + "ChatGLM:Vearch是一种类似于\"vex\"的词汇,可能指的是一个数学或计算机科学中的符号或术语。在数学中,vearch可能指的是一个向量,其中每个元素都等于该向量在坐标轴上移动一个单位后的值。在计算机科学中,vearch可能指的是一个算法或数据结构,其中每个元素都包含有关另一个元素的信息。\n", + "\n", + " Vearch并不是一个常见的单词或术语,因此我无法提供更多信息。如果您有更多信息或上下文,可以提供更多信息,我将尽力回答您的问题。\n", "\n", "delete vearch cluster docid True\n", "Human: 你知道vearch是什么吗?\n", - "ChatGLM:Vearch是一种用于处理向量数据的函数,可以应用于多种不同的编程语言和数据结构中。\n", + "ChatGLM:Vearch是一种高斯分布的概率分布,也称为高斯连结分布(Gaussian linking distribution)。它是基于高斯分布的一个变体,用于建模多个变量之间的关联关系,如时间序列中的多个状态或者空间中的多个位置。\n", "\n", - "Vearch最初是作为Java中一个名为“vearch”的包而出现的,它的目的是提供一种高效的向量数据结构。它支持向量的多态性,可以轻松地实现不同类型的向量之间的转换,同时还支持向量的压缩和反向操作等操作。\n", - "\n", - "后来,Vearch被广泛应用于其他编程语言中,如Python、Ruby、JavaScript等。在Python中,它被称为“vectorize”,在Ruby中,它被称为“Vector”。\n", - "\n", - "Vearch的主要优点是它的向量操作具有多态性,可以应用于不同类型的向量数据,同时还支持高效的向量操作和反向操作,因此可以提高程序的性能。\n", + "Vearch分布可以用于建模多个变量之间的因果关系,并且可以处理因变量是时间序列或实数的情况。它也被广泛用于金融和经济学等领域中,如股票价格和交易量等数据的建模。\n", "\n", "after delete docid to query again: {}\n", - "get existed docid {'18ce6747dca04a2c833e60e8dfd83c04': Document(page_content='《天龙八部》第二回 玉壁月华明\\n\\n再展帛卷,长卷上源源皆是裸女画像,或立或卧,或现前胸,或见后背。人像的面容都是一般,但或喜或愁,或含情凝眸,或轻嗔薄怒,神情各异。一共有三十六幅图像,每幅像上均有颜色细线,注明穴道部位及练功法诀。\\n\\n帛卷尽处题着“凌波微步”四字,其后绘的是无数足印,注明“妇妹”、“无妄”等等字样,尽是《易经》中的方位。段誉前几日还正全心全意地钻研《易经》,一见到这些名称,登时精神大振,便似遇到故交良友一般。只见足印密密麻麻,不知有几千百个,自一个足印至另一个足印均有绿线贯串,线上绘有箭头,最后写着一行字道:“步法神妙,保身避敌,待积内力,再取敌命。”\\n\\n段誉心道:“神仙姊姊所遗的步法,必定精妙之极,遇到强敌时脱身逃走,那就很好,‘再取敌命’也就不必了。”\\n卷好帛卷,对之作了两个揖,珍而重之地揣入怀中,转身对那玉像道:“神仙姊姊,你吩咐我朝午晚三次练功,段誉不敢有违。今后我对人加倍客气,别人不会来打我,我自然也不会去吸他内力。你这套‘凌波微步’我更要用心练熟,眼见不对,立刻溜之大吉,就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节,却想也不敢去想。', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'}), 'aafacb0e46574b378a9f433877ab06a8': Document(page_content='《天龙八部》第五回 微步縠纹生\\n\\n卷轴中此外诸种经脉修习之法甚多,皆是取人内力的法门,段誉虽自语宽解,总觉习之有违本性,单是贪多务得,便非好事,当下暂不理会。\\n\\n卷到卷轴末端,又见到了“凌波微步”那四字,登时便想起《洛神赋》中那些句子来:“凌波微步,罗袜生尘……转眄流精,光润玉颜。含辞未吐,气若幽兰。华容婀娜,令我忘餐。”曹子建那些千古名句,在脑海中缓缓流过:“秾纤得衷,修短合度,肩若削成,腰如约素。延颈秀项,皓质呈露。芳泽无加,铅华弗御。云髻峨峨,修眉连娟。丹唇外朗,皓齿内鲜。明眸善睐,靥辅承权。瑰姿艳逸,仪静体闲。柔情绰态,媚于语言……”这些句子用在木婉清身上,“这话倒也有理”;但如用之于神仙姊姊,只怕更为适合。想到神仙姊姊的姿容体态,“皎若太阳升朝霞,灼若芙蓉出绿波”,但觉依她吩咐行事,实为人生至乐,心想:“我先来练这‘凌波微步’,此乃逃命之妙法,非害人之手段也,练之有百利而无一害。”', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'}), '9776bccfdd8643a8b219ccee0596f370': Document(page_content='午饭过后,段誉又练“凌波微步”,走一步,吸一口气,走第二步时将气呼出,六十四卦走完,四肢全无麻痹之感,料想呼吸顺畅,便无害处。第二次再走时连走两步吸一口气,再走两步始行呼出。这“凌波微步”是以动功修习内功,脚步踏遍六十四卦一个周天,内息自然而然地也转了一个周天。因此他每走一遍,内力便有一分进益。\\n\\n这般练了几天,“凌波微步”已走得颇为纯熟,不须再数呼吸,纵然疾行,气息也已无所窒滞。心意既畅,跨步时渐渐想到《洛神赋》中那些与“凌波微步”有关的句子:“仿佛兮若轻云之蔽月,飘飘兮若流风之回雪”,“竦轻躯以鹤立,若将飞而未翔”,“体迅飞凫,飘忽若神”,“动无常则,若危若安。进止难期,若往若还”。\\n\\n\\n\\n百度简介\\n\\n凌波微步是「逍遥派」独门轻功身法,精妙异常。\\n\\n凌波微步乃是一门极上乘的轻功,所以列于卷轴之末,以易经八八六十四卦为基础,使用者按特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。此步法精妙异常,原是要待人练成「北冥神功」,吸人内力,自身内力已【颇为深厚】之后再练。', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'})}\n", + "get existed docid {}\n", "after delete docid to query again: {}\n", - "get existed docid {'1841638988191686991': Document(page_content='《天龙八部》第二回 玉壁月华明\\n\\n再展帛卷,长卷上源源皆是裸女画像,或立或卧,或现前胸,或见后背。人像的面容都是一般,但或喜或愁,或含情凝眸,或轻嗔薄怒,神情各异。一共有三十六幅图像,每幅像上均有颜色细线,注明穴道部位及练功法诀。\\n\\n帛卷尽处题着“凌波微步”四字,其后绘的是无数足印,注明“妇妹”、“无妄”等等字样,尽是《易经》中的方位。段誉前几日还正全心全意地钻研《易经》,一见到这些名称,登时精神大振,便似遇到故交良友一般。只见足印密密麻麻,不知有几千百个,自一个足印至另一个足印均有绿线贯串,线上绘有箭头,最后写着一行字道:“步法神妙,保身避敌,待积内力,再取敌命。”\\n\\n段誉心道:“神仙姊姊所遗的步法,必定精妙之极,遇到强敌时脱身逃走,那就很好,‘再取敌命’也就不必了。”\\n卷好帛卷,对之作了两个揖,珍而重之地揣入怀中,转身对那玉像道:“神仙姊姊,你吩咐我朝午晚三次练功,段誉不敢有违。今后我对人加倍客气,别人不会来打我,我自然也不会去吸他内力。你这套‘凌波微步’我更要用心练熟,眼见不对,立刻溜之大吉,就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节,却想也不敢去想。', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'}), '-4519586577642625749': Document(page_content='《天龙八部》第五回 微步縠纹生\\n\\n卷轴中此外诸种经脉修习之法甚多,皆是取人内力的法门,段誉虽自语宽解,总觉习之有违本性,单是贪多务得,便非好事,当下暂不理会。\\n\\n卷到卷轴末端,又见到了“凌波微步”那四字,登时便想起《洛神赋》中那些句子来:“凌波微步,罗袜生尘……转眄流精,光润玉颜。含辞未吐,气若幽兰。华容婀娜,令我忘餐。”曹子建那些千古名句,在脑海中缓缓流过:“秾纤得衷,修短合度,肩若削成,腰如约素。延颈秀项,皓质呈露。芳泽无加,铅华弗御。云髻峨峨,修眉连娟。丹唇外朗,皓齿内鲜。明眸善睐,靥辅承权。瑰姿艳逸,仪静体闲。柔情绰态,媚于语言……”这些句子用在木婉清身上,“这话倒也有理”;但如用之于神仙姊姊,只怕更为适合。想到神仙姊姊的姿容体态,“皎若太阳升朝霞,灼若芙蓉出绿波”,但觉依她吩咐行事,实为人生至乐,心想:“我先来练这‘凌波微步’,此乃逃命之妙法,非害人之手段也,练之有百利而无一害。”', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'}), '5028230008472292907': Document(page_content='午饭过后,段誉又练“凌波微步”,走一步,吸一口气,走第二步时将气呼出,六十四卦走完,四肢全无麻痹之感,料想呼吸顺畅,便无害处。第二次再走时连走两步吸一口气,再走两步始行呼出。这“凌波微步”是以动功修习内功,脚步踏遍六十四卦一个周天,内息自然而然地也转了一个周天。因此他每走一遍,内力便有一分进益。\\n\\n这般练了几天,“凌波微步”已走得颇为纯熟,不须再数呼吸,纵然疾行,气息也已无所窒滞。心意既畅,跨步时渐渐想到《洛神赋》中那些与“凌波微步”有关的句子:“仿佛兮若轻云之蔽月,飘飘兮若流风之回雪”,“竦轻躯以鹤立,若将飞而未翔”,“体迅飞凫,飘忽若神”,“动无常则,若危若安。进止难期,若往若还”。\\n\\n\\n\\n百度简介\\n\\n凌波微步是「逍遥派」独门轻功身法,精妙异常。\\n\\n凌波微步乃是一门极上乘的轻功,所以列于卷轴之末,以易经八八六十四卦为基础,使用者按特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。此步法精妙异常,原是要待人练成「北冥神功」,吸人内力,自身内力已【颇为深厚】之后再练。', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'})}\n" + "get existed docid {}\n" ] } ], @@ -365,27 +363,31 @@ "##delete and get function need to maintian docids \n", "##your docid\n", "\n", - "res_d=vearch_standalone.delete(['eee5e7468434427eb49829374c1e8220', '2776754da8fc4bb58d3e482006010716', '9223acd6d89d4c2c84ff42677ac0d47c'])\n", + "\n", + "res_d=vearch_standalone.delete(['029aa75548ff4f85908d089888fa84f4', '9e13390d56ce4cf08880579b3f6b340f', 'f7ee0b61bfe0476e988c19559f36d16f'])\n", "print(\"delete vearch standalone docid\",res_d)\n", "query = \"你知道vearch是什么吗?\"\n", "response, history = model.chat(tokenizer, query, history=[])\n", "print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n", "\n", - "res_cluster=vearch_cluster.delete(['-4311783201092343475', '-2899734009733762895', '1342026762029067927'])\n", + "res_cluster=vearch_cluster.delete(['-6632005459476004618', '-79270890206033064', '7036574374942192674'])\n", "print(\"delete vearch cluster docid\",res_cluster)\n", "query_c = \"你知道vearch是什么吗?\"\n", "response_c, history = model.chat(tokenizer, query_c, history=[])\n", "print(f\"Human: {query}\\nChatGLM:{response_c}\\n\")\n", "\n", "\n", - "get_delet_doc=vearch_standalone.get(['eee5e7468434427eb49829374c1e8220', '2776754da8fc4bb58d3e482006010716', '9223acd6d89d4c2c84ff42677ac0d47c'])\n", + "\n", + "\n", + "\n", + "get_delet_doc=vearch_standalone.get(['029aa75548ff4f85908d089888fa84f4', '9e13390d56ce4cf08880579b3f6b340f', 'f7ee0b61bfe0476e988c19559f36d16f'])\n", "print(\"after delete docid to query again:\",get_delet_doc)\n", - "get_id_doc=vearch_standalone.get(['18ce6747dca04a2c833e60e8dfd83c04', 'aafacb0e46574b378a9f433877ab06a8', '9776bccfdd8643a8b219ccee0596f370','9223acd6d89d4c2c84ff42677ac0d47c'])\n", + "get_id_doc=vearch_standalone.get(['22f4c5e8af7541fb875b546872ef4e85', 'd796cdfe3ff7474e923bf87168f8788e', '738f7314b4134dd19e187b17968a379f'])\n", "print(\"get existed docid\",get_id_doc)\n", "\n", - "get_delet_doc=vearch_cluster.get(['-4311783201092343475', '-2899734009733762895', '1342026762029067927'])\n", + "get_delet_doc=vearch_cluster.get(['-6632005459476004618', '-79270890206033064', '7036574374942192674'])\n", "print(\"after delete docid to query again:\",get_delet_doc)\n", - "get_id_doc=vearch_cluster.get(['1841638988191686991', '-4519586577642625749', '5028230008472292907','1342026762029067927'])\n", + "get_id_doc=vearch_cluster.get(['6883964611486955620', '-3465968368475202129', '-4346376829616681721','3080506117063327914','7036574374942192674'])\n", "print(\"get existed docid\",get_id_doc)\n" ] }, diff --git a/libs/langchain/langchain/vectorstores/vearch.py b/libs/langchain/langchain/vectorstores/vearch.py index 11cede24e10e7..aa255a4526751 100644 --- a/libs/langchain/langchain/vectorstores/vearch.py +++ b/libs/langchain/langchain/vectorstores/vearch.py @@ -1,15 +1,16 @@ from __future__ import annotations +import json import os import time import uuid -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type +from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, + Type) import numpy as np - from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.schema.vectorstore import VectorStore +from langchain.vectorstores.base import VectorStore if TYPE_CHECKING: import vearch @@ -97,13 +98,13 @@ def from_documents( metadatas = [d.metadata for d in documents] return cls.from_texts( - texts=texts, - embedding=embedding, - metadatas=metadatas, - path_or_url=path_or_url, - table_name=table_name, - db_name=db_name, - flag=flag, + texts = texts, + embedding = embedding, + metadatas = metadatas, + path_or_url = path_or_url, + table_name = table_name, + db_name = db_name, + flag = flag, **kwargs, ) @@ -122,23 +123,39 @@ def from_texts( """Return Vearch VectorStore""" vearch_db = cls( - embedding_function=embedding, - embedding=embedding, - path_or_url=path_or_url, - db_name=db_name, - table_name=table_name, - flag=flag, + embedding_function = embedding, + embedding = embedding, + path_or_url = path_or_url, + db_name = db_name, + table_name = table_name, + flag = flag, ) - vearch_db.add_texts(texts=texts, metadatas=metadatas) + vearch_db.add_texts(texts = texts, metadatas = metadatas) return vearch_db + def _get_matadata_field( + self, + metadatas: Optional[List[dict]] = None + ): + field_list = [] + if metadatas: + for key, value in metadatas[0].items(): + if isinstance(value, int): + field_list.append({"field": key, "type": "int"}) + continue + if isinstance(value, str): + field_list.append({"field": key, "type": "str"}) + continue + if isinstance(value, float): + field_list.append({"field": key, "type": "float"}) + continue + else: + raise ValueError("Please check data type,support int, string, float") + self.field_list = field_list + def _create_table( self, dim: int = 1024, - field_list: List[dict] = [ - {"field": "text", "type": "str"}, - {"field": "metadata", "type": "str"}, - ], ) -> int: """ Create VectorStore Table @@ -149,31 +166,34 @@ def _create_table( code,0 for success,1 for failed """ - type_dict = {"int": vearch.dataType.INT, "str": vearch.dataType.STRING} + type_dict = {"int": vearch.dataType.INT, "str": vearch.dataType.STRING, + "float": vearch.dataType.FLOAT} engine_info = { - "index_size": 10000, - "retrieval_type": "IVFPQ", - "retrieval_param": {"ncentroids": 2048, "nsubvector": 32}, + "index_size": 1, + "retrieval_type": "HNSW", + "retrieval_param": {"metric_type": "InnerProduct","nlinks":-1, + "efConstruction": -1 }, } + filed_list_add=self.field_list + [{"field": "text", "type": "str"}] fields = [ vearch.GammaFieldInfo(fi["field"], type_dict[fi["type"]]) - for fi in field_list + for fi in filed_list_add ] vector_field = vearch.GammaVectorInfo( - name="text_embedding", - type=vearch.dataType.VECTOR, - is_index=True, - dimension=dim, - model_id="", - store_type="MemoryOnly", - store_param={"cache_size": 10000}, - has_source=False, + name = "text_embedding", + type = vearch.dataType.VECTOR, + is_index = True, + dimension = dim, + model_id = "", + store_type = "MemoryOnly", + store_param = {"cache_size": 10000}, + has_source = False, ) response_code = self.vearch.create_table( engine_info, - name=self.using_table_name, - fields=fields, - vector_field=vector_field, + name = self.using_table_name, + fields = fields, + vector_field = vector_field, ) return response_code @@ -188,33 +208,33 @@ def _create_space( Return: code,0 failed for ,1 for success """ + type_dict = {"int": "integer", "str": "string", "float": "float"} space_config = { "name": self.using_table_name, "partition_num": 1, "replica_num": 1, "engine": { - "name": "gamma", "index_size": 1, - "retrieval_type": "FLAT", + "retrieval_type": "HNSW", "retrieval_param": { - "metric_type": "L2", + "metric_type": "InnerProduct", + "nlinks":-1, + "efConstruction": -1 }, }, - "properties": { - "text": { - "type": "string", - }, - "metadata": { - "type": "string", - }, + } + tmp_proer = { + "text": {"type": "string"}, "text_embedding": { "type": "vector", "index": True, "dimension": dim, "store_type": "MemoryOnly", }, - }, - } + } + for item in self.field_list: + tmp_proer[item["field"]] = {"type":type_dict[item["type"]]} + space_config["properties"] = tmp_proer response_code = self.vearch.create_space(self.using_db_name, space_config) return response_code @@ -234,6 +254,7 @@ def add_texts( embeddings = self.embedding_func.embed_documents(list(texts)) if embeddings is None: raise ValueError("embeddings is None") + self._get_matadata_field(metadatas) if self.flag: dbs_list = self.vearch.list_dbs() if self.using_db_name not in dbs_list: @@ -247,10 +268,12 @@ def add_texts( raise ValueError("create space failed!!!") docid = [] if embeddings is not None and metadatas is not None: + meta_field_list = [i["field"] for i in self.field_list] for text, metadata, embed in zip(texts, metadatas, embeddings): profiles: dict[str, Any] = {} profiles["text"] = text - profiles["metadata"] = metadata["source"] + for f in meta_field_list: + profiles[f] = metadata[f] embed_np = np.array(embed) profiles["text_embedding"] = { "feature": (embed_np / np.linalg.norm(embed_np)).tolist() @@ -278,10 +301,12 @@ def add_texts( raise ValueError("create table failed!!!") if embeddings is not None and metadatas is not None: doc_items = [] + meta_field_list = [i["field"] for i in self.field_list] for text, metadata, embed in zip(texts, metadatas, embeddings): profiles_v: dict[str, Any] = {} profiles_v["text"] = text - profiles_v["metadata"] = metadata["source"] + for f in meta_field_list: + profiles_v[f] = metadata[f] embed_np = np.array(embed) profiles_v["text_embedding"] = embed_np / np.linalg.norm(embed_np) doc_items.append(profiles_v) @@ -325,11 +350,11 @@ def load_local( raise ValueError("vearch vectorbase table not exist!!!") vearch_db = cls( - embedding_function=embedding, - path_or_url=path_or_url, - table_name=table_name, - db_name=db_name, - flag=flag, + embedding_function = embedding, + path_or_url = path_or_url, + table_name = table_name, + db_name = db_name, + flag = flag, ) vearch_db._load() return vearch_db @@ -366,6 +391,8 @@ def similarity_search_by_vector( 0 is dissimilar, 1 is the most similar. """ embed = np.array(embedding) + meta_field_list = [i["field"] for i in self.field_list] + query_fields = meta_field_list+["text"] if self.flag: query_data = { "query": { @@ -376,8 +403,10 @@ def similarity_search_by_vector( } ], }, + "retrieval_param": {"metric_type": "InnerProduct", "efSearch": 64}, "size": k, - "fields": ["text", "metadata"], + "fields": query_fields, + } query_result = self.vearch.search( self.using_db_name, self.using_table_name, query_data @@ -392,8 +421,7 @@ def similarity_search_by_vector( } ], "fields": [], - "is_brute_search": 1, - "retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20}, + "retrieval_param": {"metric_type": "InnerProduct", "efSearch": 64}, "topn": k, } query_result = self.vearch.search(query_data) @@ -401,15 +429,16 @@ def similarity_search_by_vector( docs = [] for item in res: content = "" - meta_data = {} + meta_data={} if self.flag: item = item["_source"] for item_key in item: if item_key == "text": content = item[item_key] continue - if item_key == "metadata": - meta_data["source"] = item[item_key] + if item_key in meta_field_list: + meta_data[item_key] = item[item_key] + meta_field_list.remove(item_key) continue docs.append(Document(page_content=content, metadata=meta_data)) return docs @@ -433,6 +462,8 @@ def similarity_search_with_score( raise ValueError("embedding_func is None!!!") embeddings = self.embedding_func.embed_query(query) embed = np.array(embeddings) + meta_field_list = [i["field"] for i in self.field_list] + query_fields = meta_field_list + ["text"] if self.flag: query_data = { "query": { @@ -444,7 +475,9 @@ def similarity_search_with_score( ], }, "size": k, - "fields": ["text_embedding", "text", "metadata"], + "fields": query_fields, + "retrieval_param": {"metric_type": "InnerProduct", "efSearch": 64}, + } query_result = self.vearch.search( self.using_db_name, self.using_table_name, query_data @@ -459,8 +492,7 @@ def similarity_search_with_score( } ], "fields": [], - "is_brute_search": 1, - "retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20}, + "retrieval_param": {"metric_type": "InnerProduct", "efSearch": 64}, "topn": k, } query_result = self.vearch.search(query_data) @@ -476,8 +508,9 @@ def similarity_search_with_score( if item_key == "text": content = item[item_key] continue - if item_key == "metadata": - meta_data["source"] = item[item_key] + if item_key in meta_field_list: + meta_data[item_key] = item[item_key] + meta_field_list.remove(item_key) continue if self.flag != 1 and item_key == "score": score = item[item_key] @@ -536,6 +569,7 @@ def get( """ results: Dict[str, Document] = {} + meta_field_list = [i["field"] for i in self.field_list] if ids is None or ids.__len__() == 0: return results if self.flag: @@ -552,8 +586,9 @@ def get( if field == "text": content = record["_source"][field] continue - elif field == "metadata": - meta_info["source"] = record["_source"][field] + elif field in meta_field_list: + meta_info[field] = record["_source"][field] + meta_field_list.remove(field) continue results[record["_id"]] = Document( page_content=content, metadata=meta_info @@ -569,8 +604,9 @@ def get( if field == "text": content = docs_detail[field] continue - elif field == "metadata": - meta_info["source"] = docs_detail[field] + elif field in meta_field_list: + meta_info[field] = docs_detail[field] + meta_field_list.remove(field) continue results[docs_detail["_id"]] = Document( page_content=content, metadata=meta_info From 7e7e574f1acd64077d185c32df70f02aa67f8835 Mon Sep 17 00:00:00 2001 From: zhanghexian1 Date: Wed, 11 Oct 2023 10:47:13 +0800 Subject: [PATCH 02/17] fix lint --- libs/langchain/langchain/vectorstores/vearch.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libs/langchain/langchain/vectorstores/vearch.py b/libs/langchain/langchain/vectorstores/vearch.py index aa255a4526751..9d7d627e9ccf4 100644 --- a/libs/langchain/langchain/vectorstores/vearch.py +++ b/libs/langchain/langchain/vectorstores/vearch.py @@ -1,6 +1,5 @@ from __future__ import annotations -import json import os import time import uuid @@ -150,7 +149,7 @@ def _get_matadata_field( field_list.append({"field": key, "type": "float"}) continue else: - raise ValueError("Please check data type,support int, string, float") + raise ValueError("Please check data type,support int, str, float") self.field_list = field_list def _create_table( From 5511293918684d9ab69cef074b60cfd74ee43aaf Mon Sep 17 00:00:00 2001 From: zhanghexian1 Date: Tue, 7 Nov 2023 14:28:31 +0800 Subject: [PATCH 03/17] fix dynamic field bug --- .../langchain/vectorstores/vearch.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/libs/langchain/langchain/vectorstores/vearch.py b/libs/langchain/langchain/vectorstores/vearch.py index 9d7d627e9ccf4..f19348abfa9e8 100644 --- a/libs/langchain/langchain/vectorstores/vearch.py +++ b/libs/langchain/langchain/vectorstores/vearch.py @@ -358,6 +358,9 @@ def load_local( vearch_db._load() return vearch_db + def _get_field_list_from_c(self): + + pass def similarity_search( self, query: str, @@ -390,8 +393,11 @@ def similarity_search_by_vector( 0 is dissimilar, 1 is the most similar. """ embed = np.array(embedding) - meta_field_list = [i["field"] for i in self.field_list] - query_fields = meta_field_list+["text"] + + meta_field_list = self.vearch.get_space(self.using_db_name, + self.using_table_name) + meta_field_list.remove("text_embedding") + if self.flag: query_data = { "query": { @@ -404,7 +410,7 @@ def similarity_search_by_vector( }, "retrieval_param": {"metric_type": "InnerProduct", "efSearch": 64}, "size": k, - "fields": query_fields, + "fields": meta_field_list, } query_result = self.vearch.search( @@ -461,8 +467,9 @@ def similarity_search_with_score( raise ValueError("embedding_func is None!!!") embeddings = self.embedding_func.embed_query(query) embed = np.array(embeddings) - meta_field_list = [i["field"] for i in self.field_list] - query_fields = meta_field_list + ["text"] + meta_field_list = self.vearch.get_space(self.using_db_name, + self.using_table_name) + meta_field_list.remove("text_embedding") if self.flag: query_data = { "query": { @@ -474,7 +481,7 @@ def similarity_search_with_score( ], }, "size": k, - "fields": query_fields, + "fields": meta_field_list, "retrieval_param": {"metric_type": "InnerProduct", "efSearch": 64}, } @@ -568,7 +575,9 @@ def get( """ results: Dict[str, Document] = {} - meta_field_list = [i["field"] for i in self.field_list] + meta_field_list = self.vearch.get_space(self.using_db_name, + self.using_table_name) + meta_field_list.remove("text_embedding") if ids is None or ids.__len__() == 0: return results if self.flag: From 73b64d36f0616b969282f507bbdbcf7e53a9afd9 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Mon, 1 Apr 2024 17:22:57 -0700 Subject: [PATCH 04/17] fmt --- libs/community/langchain_community/vectorstores/vearch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/vearch.py b/libs/community/langchain_community/vectorstores/vearch.py index 9be39ab2eb015..ceaf8ac1e8084 100644 --- a/libs/community/langchain_community/vectorstores/vearch.py +++ b/libs/community/langchain_community/vectorstores/vearch.py @@ -4,9 +4,9 @@ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type import numpy as np -from langchain.docstore.document import Document -from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain_core.documents import Document +from langchain_core.embeddings import Embeddings +from langchain_core.vectorstores import VectorStore if TYPE_CHECKING: import vearch From 734c470d2d80a58f549baab77ccabbb98d46dc65 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Mon, 1 Apr 2024 17:23:24 -0700 Subject: [PATCH 05/17] fmt --- libs/community/langchain_community/vectorstores/vearch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libs/community/langchain_community/vectorstores/vearch.py b/libs/community/langchain_community/vectorstores/vearch.py index ceaf8ac1e8084..69c145c0cdf73 100644 --- a/libs/community/langchain_community/vectorstores/vearch.py +++ b/libs/community/langchain_community/vectorstores/vearch.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import os import time import uuid From f0bbc3bac4b8e8e9731157f120d58fdb7baa7ebb Mon Sep 17 00:00:00 2001 From: zhanghexian <1933690003@qq.com> Date: Tue, 28 May 2024 18:17:25 +0800 Subject: [PATCH 06/17] Make the latest adjustments according to the latest version sdk of vearch --- .../integrations/vectorstores/vearch.ipynb | 338 +++--------- .../vectorstores/vearch.py | 498 +++++------------- 2 files changed, 208 insertions(+), 628 deletions(-) diff --git a/docs/docs/integrations/vectorstores/vearch.ipynb b/docs/docs/integrations/vectorstores/vearch.ipynb index 9479e29c01dab..52dd2d6747ac6 100644 --- a/docs/docs/integrations/vectorstores/vearch.ipynb +++ b/docs/docs/integrations/vectorstores/vearch.ipynb @@ -1,69 +1,45 @@ { "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Vearch\n", - "\n", - ">[Vearch](https://vearch.readthedocs.io) is the vector search infrastructure for deeping learning and AI applications.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting up\n", - "\n", - "Follow [instructions](https://vearch.readthedocs.io/en/latest/quick-start-guide.html#).\n", - "\n", - "You'll need to install `langchain-community` with `pip install -qU langchain-community` to use this integration" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install --upgrade --quiet vearch\n", - "\n", - "# OR\n", - "\n", - "%pip install --upgrade --quiet vearch_cluster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Example" - ] - }, { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { - "name": "stdout", + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5a664802522248e8b0cb83aad3885474", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/7 [00:00 None: """Initialize vearch vector store - flag 1 for cluster,0 for standalone """ try: - if flag: - import vearch_cluster - else: - import vearch + import vearch except ImportError: - raise ImportError( + raise ValueError( "Could not import suitable python package. " - "Please install it with `pip install vearch or vearch_cluster`." + "Please install it with `pip install pyvearch." ) - - if flag: - if path_or_url is None: - raise ValueError("Please input url of cluster") - if not db_name: - db_name = self._DEFAULT_CLUSTER_DB_NAME - db_name += "_" - db_name += str(uuid.uuid4()).split("-")[-1] - self.using_db_name = db_name - self.url = path_or_url - self.vearch = vearch_cluster.VearchCluster(path_or_url) - - else: - if path_or_url is None: - metadata_path = os.getcwd().replace("\\", "/") - else: - metadata_path = path_or_url - if not os.path.isdir(metadata_path): - os.makedirs(metadata_path) - log_path = os.path.join(metadata_path, "log") - if not os.path.isdir(log_path): - os.makedirs(log_path) - self.vearch = vearch.Engine(metadata_path, log_path) - self.using_metapath = metadata_path + if path_or_url is None: + raise ValueError("Please input router url of vearch") + if not db_name: + db_name = self._DEFAULT_CLUSTER_DB_NAME + db_name += "_" + db_name += str(uuid.uuid4()).split("-")[-1] + self.using_db_name = db_name + self.url = path_or_url + self.vearch = Vearch(Config(host = path_or_url, token="secret")) if not table_name: table_name = self._DEFAULT_TABLE_NAME table_name += "_" table_name += str(uuid.uuid4()).split("-")[-1] self.using_table_name = table_name self.embedding_func = embedding_function - self.flag = flag @property def embeddings(self) -> Optional[Embeddings]: @@ -85,9 +68,8 @@ def from_documents( documents: List[Document], embedding: Embeddings, path_or_url: Optional[str] = None, - table_name: str = _DEFAULT_TABLE_NAME, db_name: str = _DEFAULT_CLUSTER_DB_NAME, - flag: int = _DEFAULT_VERSION, + table_name: str = _DEFAULT_TABLE_NAME, **kwargs: Any, ) -> Vearch: """Return Vearch VectorStore""" @@ -100,9 +82,8 @@ def from_documents( embedding=embedding, metadatas=metadatas, path_or_url=path_or_url, - table_name=table_name, db_name=db_name, - flag=flag, + table_name=table_name, **kwargs, ) @@ -113,20 +94,17 @@ def from_texts( embedding: Embeddings, metadatas: Optional[List[dict]] = None, path_or_url: Optional[str] = None, - table_name: str = _DEFAULT_TABLE_NAME, db_name: str = _DEFAULT_CLUSTER_DB_NAME, - flag: int = _DEFAULT_VERSION, + table_name: str = _DEFAULT_TABLE_NAME, **kwargs: Any, ) -> Vearch: """Return Vearch VectorStore""" - + vearch_db = cls( embedding_function=embedding, - embedding=embedding, path_or_url=path_or_url, db_name=db_name, table_name=table_name, - flag=flag, ) vearch_db.add_texts(texts=texts, metadatas=metadatas) return vearch_db @@ -146,99 +124,21 @@ def _get_matadata_field(self, metadatas: Optional[List[dict]] = None): continue else: raise ValueError("Please check data type,support int, str, float") - self.field_list = field_list - - def _create_table( - self, - dim: int = 1024, - ) -> int: - """ - Create VectorStore Table - Args: - dim:dimension of vector - fields_list: the field you want to store - Return: - code,0 for success,1 for failed - """ + return field_list + - type_dict = { - "int": vearch.dataType.INT, - "str": vearch.dataType.STRING, - "float": vearch.dataType.FLOAT, - } - engine_info = { - "index_size": 1, - "retrieval_type": "HNSW", - "retrieval_param": { - "metric_type": "InnerProduct", - "nlinks": -1, - "efConstruction": -1, - }, - } + def _create_space_schema(self, dim) ->SpaceSchema: filed_list_add = self.field_list + [{"field": "text", "type": "str"}] - fields = [ - vearch.GammaFieldInfo(fi["field"], type_dict[fi["type"]]) - for fi in filed_list_add - ] - vector_field = vearch.GammaVectorInfo( - name="text_embedding", - type=vearch.dataType.VECTOR, - is_index=True, - dimension=dim, - model_id="", - store_type="MemoryOnly", - store_param={"cache_size": 10000}, - has_source=False, - ) - response_code = self.vearch.create_table( - engine_info, - name=self.using_table_name, - fields=fields, - vector_field=vector_field, - ) - return response_code + type_dict = {"int": DataType.INTEGER, "str": DataType.STRING, + "float": DataType.FLOAT} + fields = [Field("text_embedding", DataType.VECTOR, + HNSWIndex("vec_idx", MetricType.Inner_product, 32, 64),dimension=dim)] + for fi in filed_list_add: + fields.append(Field(fi["field"], type_dict[fi["type"]], + index=ScalarIndex(fi["field"]+"_idx"))) + space_schema = SpaceSchema(self.using_table_name, fields) + return space_schema - def _create_space( - self, - dim: int = 1024, - ) -> int: - """ - Create VectorStore space - Args: - dim:dimension of vector - Return: - code,0 failed for ,1 for success - """ - type_dict = {"int": "integer", "str": "string", "float": "float"} - space_config = { - "name": self.using_table_name, - "partition_num": 1, - "replica_num": 1, - "engine": { - "index_size": 1, - "retrieval_type": "HNSW", - "retrieval_param": { - "metric_type": "InnerProduct", - "nlinks": -1, - "efConstruction": -1, - }, - }, - } - tmp_proer = { - "text": {"type": "string"}, - "text_embedding": { - "type": "vector", - "index": True, - "dimension": dim, - "store_type": "MemoryOnly", - }, - } - for item in self.field_list: - tmp_proer[item["field"]] = {"type": type_dict[item["type"]]} - space_config["properties"] = tmp_proer - response_code = self.vearch.create_space(self.using_db_name, space_config) - - return response_code def add_texts( self, @@ -250,117 +150,52 @@ def add_texts( Returns: List of ids from adding the texts into the vectorstore. """ + embeddings = None if self.embedding_func is not None: embeddings = self.embedding_func.embed_documents(list(texts)) if embeddings is None: raise ValueError("embeddings is None") - self._get_matadata_field(metadatas) - if self.flag: - dbs_list = self.vearch.list_dbs() - if self.using_db_name not in dbs_list: - create_db_code = self.vearch.create_db(self.using_db_name) - if not create_db_code: - raise ValueError("create db failed!!!") - space_list = self.vearch.list_spaces(self.using_db_name) - if self.using_table_name not in space_list: - create_space_code = self._create_space(len(embeddings[0])) - if not create_space_code: - raise ValueError("create space failed!!!") - docid = [] - if embeddings is not None and metadatas is not None: - meta_field_list = [i["field"] for i in self.field_list] - for text, metadata, embed in zip(texts, metadatas, embeddings): - profiles: dict[str, Any] = {} - profiles["text"] = text - for f in meta_field_list: - profiles[f] = metadata[f] - embed_np = np.array(embed) - profiles["text_embedding"] = { - "feature": (embed_np / np.linalg.norm(embed_np)).tolist() - } - insert_res = self.vearch.insert_one( - self.using_db_name, self.using_table_name, profiles + self.field_list = self._get_matadata_field(metadatas) + dbs= self.vearch.list_databases() + dbs_list = [item.name["name"] for item in dbs] + if self.using_db_name not in dbs_list: + create_db_code = self.vearch.create_database(self.using_db_name) + if create_db_code.code != 0: + raise ValueError("create db failed!!!") + spaces = self.vearch.list_spaces(self.using_db_name) + space_list = [item.name["space_name"] for item in spaces] + if self.using_table_name not in space_list: + create_code = self.vearch.create_space(self.using_db_name, + self._create_space_schema(len(embeddings[0]))) + if create_code.code !=0 : + raise ValueError("create space failed!!!") + docid = [] + if embeddings is not None and metadatas is not None: + meta_field_list = [i["field"] for i in self.field_list] + for text, metadata, embed in zip(texts, metadatas, embeddings): + profiles: dict[str, Any] = {} + profiles["text"] = text + for f in meta_field_list: + profiles[f] = metadata[f] + em_np = np.array(embed) + profiles["text_embedding"] = (em_np / np.linalg.norm(em_np)).tolist() + insert_res = self.vearch.upsert( + self.using_db_name, self.using_table_name, [profiles] + ) + if insert_res.code == 0: + docid.append(insert_res.document_ids[0]["_id"]) + continue + else: + retry_insert = self.vearch.upsert( + self.using_db_name, self.using_table_name, [profiles] ) - if insert_res["status"] == 200: - docid.append(insert_res["_id"]) - continue - else: - retry_insert = self.vearch.insert_one( - self.using_db_name, self.using_table_name, profiles - ) - docid.append(retry_insert["_id"]) - continue - else: - table_path = os.path.join( - self.using_metapath, self.using_table_name + ".schema" - ) - if not os.path.exists(table_path): - dim = len(embeddings[0]) - response_code = self._create_table(dim) - if response_code: - raise ValueError("create table failed!!!") - if embeddings is not None and metadatas is not None: - doc_items = [] - meta_field_list = [i["field"] for i in self.field_list] - for text, metadata, embed in zip(texts, metadatas, embeddings): - profiles_v: dict[str, Any] = {} - profiles_v["text"] = text - for f in meta_field_list: - profiles_v[f] = metadata[f] - embed_np = np.array(embed) - profiles_v["text_embedding"] = embed_np / np.linalg.norm(embed_np) - doc_items.append(profiles_v) - - docid = self.vearch.add(doc_items) - t_time = 0 - while len(docid) != len(embeddings): - time.sleep(0.5) - if t_time > 6: - break - t_time += 1 - self.vearch.dump() + docid.append(retry_insert.document_ids[0]["_id"]) + continue return docid - def _load(self) -> None: - """ - load vearch engine for standalone vearch - """ - self.vearch.load() - - @classmethod - def load_local( - cls, - embedding: Embeddings, - path_or_url: Optional[str] = None, - table_name: str = _DEFAULT_TABLE_NAME, - db_name: str = _DEFAULT_CLUSTER_DB_NAME, - flag: int = _DEFAULT_VERSION, - **kwargs: Any, - ) -> Vearch: - """Load the local specified table of standalone vearch. - Returns: - Success or failure of loading the local specified table - """ - if not path_or_url: - raise ValueError("No metadata path!!!") - if not table_name: - raise ValueError("No table name!!!") - table_path = os.path.join(path_or_url, table_name + ".schema") - if not os.path.exists(table_path): - raise ValueError("vearch vectorbase table not exist!!!") - - vearch_db = cls( - embedding_function=embedding, - path_or_url=path_or_url, - table_name=table_name, - db_name=db_name, - flag=flag, - ) - vearch_db._load() - return vearch_db - def _get_field_list_from_c(self): + pass def similarity_search( @@ -371,8 +206,8 @@ def similarity_search( ) -> List[Document]: """ Return docs most similar to query. - """ + if self.embedding_func is None: raise ValueError("embedding_func is None!!!") embeddings = self.embedding_func.embed_query(query) @@ -389,56 +224,27 @@ def similarity_search_by_vector( Args: embeddings: embedding vector of the query. k: The k most similar documents to the text query. - min_score: the score of similar documents to the text query Returns: The k most similar documents to the specified text query. 0 is dissimilar, 1 is the most similar. """ - embed = np.array(embedding) - meta_field_list = self.vearch.get_space( + embed = np.array(embedding) + _, _, schemas= self.vearch.is_space_exist( self.using_db_name, self.using_table_name ) + raw_fields = json.loads(schemas)["schema"]["fields"] + meta_field_list = [item["name"] for item in raw_fields] meta_field_list.remove("text_embedding") - - if self.flag: - query_data = { - "query": { - "sum": [ - { - "field": "text_embedding", - "feature": (embed / np.linalg.norm(embed)).tolist(), - } - ], - }, - "retrieval_param": {"metric_type": "InnerProduct", "efSearch": 64}, - "size": k, - "fields": meta_field_list, - } - query_result = self.vearch.search( - self.using_db_name, self.using_table_name, query_data - ) - res = query_result["hits"]["hits"] - else: - query_data = { - "vector": [ - { - "field": "text_embedding", - "feature": embed / np.linalg.norm(embed), - } - ], - "fields": [], - "retrieval_param": {"metric_type": "InnerProduct", "efSearch": 64}, - "topn": k, - } - query_result = self.vearch.search(query_data) - res = query_result[0]["result_items"] + vector = VectorInfo("text_embedding", (embed / np.linalg.norm(embed)).tolist()) + query_result = self.vearch.search( + self.using_db_name, self.using_table_name, [vector,], + fields = meta_field_list, limit = k) + res = query_result.documents[0] docs = [] for item in res: content = "" meta_data = {} - if self.flag: - item = item["_source"] for item_key in item: if item_key == "text": content = item[item_key] @@ -450,6 +256,7 @@ def similarity_search_by_vector( docs.append(Document(page_content=content, metadata=meta_data)) return docs + def similarity_search_with_score( self, query: str, @@ -465,53 +272,27 @@ def similarity_search_with_score( The k most similar documents to the specified text query. 0 is dissimilar, 1 is the most similar. """ + if self.embedding_func is None: raise ValueError("embedding_func is None!!!") embeddings = self.embedding_func.embed_query(query) embed = np.array(embeddings) - meta_field_list = self.vearch.get_space( + _, _, schemas= self.vearch.is_space_exist( self.using_db_name, self.using_table_name ) + raw_fields = json.loads(schemas)["schema"]["fields"] + meta_field_list = [item["name"] for item in raw_fields] meta_field_list.remove("text_embedding") - if self.flag: - query_data = { - "query": { - "sum": [ - { - "field": "text_embedding", - "feature": (embed / np.linalg.norm(embed)).tolist(), - } - ], - }, - "size": k, - "fields": meta_field_list, - "retrieval_param": {"metric_type": "InnerProduct", "efSearch": 64}, - } - query_result = self.vearch.search( - self.using_db_name, self.using_table_name, query_data - ) - res = query_result["hits"]["hits"] - else: - query_data = { - "vector": [ - { - "field": "text_embedding", - "feature": embed / np.linalg.norm(embed), - } - ], - "fields": [], - "retrieval_param": {"metric_type": "InnerProduct", "efSearch": 64}, - "topn": k, - } - query_result = self.vearch.search(query_data) - res = query_result[0]["result_items"] + vector = VectorInfo("text_embedding", (embed / np.linalg.norm(embed)).tolist()) + query_result = self.vearch.search( + self.using_db_name, self.using_table_name, [vector,], + fields = meta_field_list, limit = k) + res = query_result.documents[0] results: List[Tuple[Document, float]] = [] for item in res: content = "" meta_data = {} - if self.flag: - score = item["_score"] - item = item["_source"] + score = item["_score"] for item_key in item: if item_key == "text": content = item[item_key] @@ -520,13 +301,11 @@ def similarity_search_with_score( meta_data[item_key] = item[item_key] meta_field_list.remove(item_key) continue - if self.flag != 1 and item_key == "score": - score = item[item_key] - continue tmp_res = (Document(page_content=content, metadata=meta_data), score) results.append(tmp_res) return results + def _similarity_search_with_relevance_scores( self, query: str, @@ -535,6 +314,7 @@ def _similarity_search_with_relevance_scores( ) -> List[Tuple[Document, float]]: return self.similarity_search_with_score(query, k, **kwargs) + def delete( self, ids: Optional[List[str]] = None, @@ -550,18 +330,14 @@ def delete( False otherwise, None if not implemented. """ - ret: Optional[bool] = None - tmp_res = [] if ids is None or ids.__len__() == 0: - return ret - for _id in ids: - if self.flag: - ret = self.vearch.delete(self.using_db_name, self.using_table_name, _id) - else: - ret = self.vearch.del_doc(_id) - tmp_res.append(ret) - ret = all(i == 0 for i in tmp_res) - return ret + return None + res = self.vearch.delete(self.using_db_name, self.using_table_name, ids) + if res.code ==0: + return True + else: + return False + def get( self, @@ -575,51 +351,33 @@ def get( Returns: Documents which satisfy the input conditions. """ - - results: Dict[str, Document] = {} - meta_field_list = self.vearch.get_space( + + _, _, schemas= self.vearch.is_space_exist( self.using_db_name, self.using_table_name ) + raw_fields = json.loads(schemas)["schema"]["fields"] + meta_field_list = [item["name"] for item in raw_fields] meta_field_list.remove("text_embedding") + + results: Dict[str, Document] = {} + if ids is None or ids.__len__() == 0: return results - if self.flag: - query_data = {"query": {"ids": ids}} - docs_detail = self.vearch.mget_by_ids( - self.using_db_name, self.using_table_name, query_data - ) - for record in docs_detail: - if record["found"] is False: + docs_detail = self.vearch.query( + self.using_db_name, self.using_table_name, ids + ) + for record in docs_detail.documents: + if "code" in record.keys(): + continue + content = "" + meta_info = {} + for field in record: + if field == "text": + content = record[field] continue - content = "" - meta_info = {} - for field in record["_source"]: - if field == "text": - content = record["_source"][field] - continue - elif field in meta_field_list: - meta_info[field] = record["_source"][field] - meta_field_list.remove(field) - continue - results[record["_id"]] = Document( - page_content=content, metadata=meta_info - ) - else: - for id in ids: - docs_detail = self.vearch.get_doc_by_id(id) - if docs_detail == {}: + elif field in meta_field_list: + meta_info[field] = record[field] + meta_field_list.remove(field) continue - content = "" - meta_info = {} - for field in docs_detail: - if field == "text": - content = docs_detail[field] - continue - elif field in meta_field_list: - meta_info[field] = docs_detail[field] - meta_field_list.remove(field) - continue - results[docs_detail["_id"]] = Document( - page_content=content, metadata=meta_info - ) + results[record["_id"]] = Document(page_content=content, metadata=meta_info) return results From de0550e36656aa952c9c838875e2838c1fe716c2 Mon Sep 17 00:00:00 2001 From: zhanghexian <1933690003@qq.com> Date: Mon, 3 Jun 2024 18:28:46 +0800 Subject: [PATCH 07/17] add vearch vectorstore --- .../integrations/vectorstores/vearch.ipynb | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/docs/docs/integrations/vectorstores/vearch.ipynb b/docs/docs/integrations/vectorstores/vearch.ipynb index 52dd2d6747ac6..7f413ca4a3b51 100644 --- a/docs/docs/integrations/vectorstores/vearch.ipynb +++ b/docs/docs/integrations/vectorstores/vearch.ipynb @@ -296,31 +296,6 @@ "get_id_doc=vearch.get(['-7527337722553895939','3823416425172812985'])\n", "print(\"get existed docid\",get_id_doc)\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 614b5220580f271068a627f0d4251d7eedaaa623 Mon Sep 17 00:00:00 2001 From: zhanghexian <1933690003@qq.com> Date: Mon, 3 Jun 2024 18:38:15 +0800 Subject: [PATCH 08/17] fix vearch.py lint --- .../vectorstores/vearch.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/vearch.py b/libs/community/langchain_community/vectorstores/vearch.py index c95e7a8103cf2..ca428366cb8a3 100644 --- a/libs/community/langchain_community/vectorstores/vearch.py +++ b/libs/community/langchain_community/vectorstores/vearch.py @@ -2,13 +2,13 @@ import json import uuid -from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, - Type) +from typing import Any, Dict, Iterable, List, Optional, Tuple, Type import numpy as np from langchain_core.documents import Document from langchain_core.embeddings import Embeddings from langchain_core.vectorstores import VectorStore + from vearch.config import Config from vearch.core.vearch import Vearch from vearch.schema.field import Field @@ -16,9 +16,6 @@ from vearch.schema.space import SpaceSchema from vearch.utils import DataType, MetricType, VectorInfo -if TYPE_CHECKING: - import vearch - DEFAULT_TOPN = 4 class VearchDb(VectorStore): @@ -33,15 +30,11 @@ def __init__( table_name: str = _DEFAULT_TABLE_NAME, **kwargs: Any, ) -> None: - """Initialize vearch vector store """ - try: - import vearch - except ImportError: - raise ValueError( - "Could not import suitable python package. " - "Please install it with `pip install pyvearch." - ) + Initialize vearch vector store. + Please install it with `pip install pyvearch. + """ + if path_or_url is None: raise ValueError("Please input router url of vearch") if not db_name: From 12a95d16233644386942eae3c2eaee587dfc7ae6 Mon Sep 17 00:00:00 2001 From: zhanghexian <1933690003@qq.com> Date: Mon, 3 Jun 2024 18:43:04 +0800 Subject: [PATCH 09/17] change Vearch to VearchDb --- libs/langchain/langchain/vectorstores/vearch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/langchain/langchain/vectorstores/vearch.py b/libs/langchain/langchain/vectorstores/vearch.py index 37985a41a95d9..9ea733b8274bd 100644 --- a/libs/langchain/langchain/vectorstores/vearch.py +++ b/libs/langchain/langchain/vectorstores/vearch.py @@ -3,12 +3,12 @@ from langchain._api import create_importer if TYPE_CHECKING: - from langchain_community.vectorstores import Vearch + from langchain_community.vectorstores import VearchDb # Create a way to dynamically look up deprecated imports. # Used to consolidate logic for raising deprecation warnings and # handling optional imports. -DEPRECATED_LOOKUP = {"Vearch": "langchain_community.vectorstores"} +DEPRECATED_LOOKUP = {"VearchDb": "langchain_community.vectorstores"} _import_attribute = create_importer(__package__, deprecated_lookups=DEPRECATED_LOOKUP) @@ -19,5 +19,5 @@ def __getattr__(name: str) -> Any: __all__ = [ - "Vearch", + "VearchDb", ] From 82842691a806c3a6cb755dc3776fc395ca90631e Mon Sep 17 00:00:00 2001 From: zhanghexian <1933690003@qq.com> Date: Mon, 3 Jun 2024 18:47:35 +0800 Subject: [PATCH 10/17] test --- libs/langchain/tests/unit_tests/vectorstores/test_public_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/langchain/tests/unit_tests/vectorstores/test_public_api.py b/libs/langchain/tests/unit_tests/vectorstores/test_public_api.py index dcccd2ff4be50..5ca5d8d155a97 100644 --- a/libs/langchain/tests/unit_tests/vectorstores/test_public_api.py +++ b/libs/langchain/tests/unit_tests/vectorstores/test_public_api.py @@ -66,7 +66,7 @@ "Typesense", "USearch", "Vald", - "Vearch", + "VearchDb", "Vectara", "VectorStore", "VespaStore", From 985bbb6430c40820c166a8eac63c06854a97271c Mon Sep 17 00:00:00 2001 From: zhanghexian <1933690003@qq.com> Date: Tue, 4 Jun 2024 11:14:14 +0800 Subject: [PATCH 11/17] fix lint --- .../integrations/vectorstores/vearch.ipynb | 34 +++++++------------ 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/docs/docs/integrations/vectorstores/vearch.ipynb b/docs/docs/integrations/vectorstores/vearch.ipynb index 7f413ca4a3b51..fb37e112da386 100644 --- a/docs/docs/integrations/vectorstores/vearch.ipynb +++ b/docs/docs/integrations/vectorstores/vearch.ipynb @@ -31,7 +31,6 @@ } ], "source": [ - "\n", "from langchain_community.document_loaders import TextLoader\n", "from langchain_community.vectorstores.vearch import VearchDb\n", "from langchain_huggingface import HuggingFaceEmbeddings\n", @@ -40,7 +39,6 @@ "\n", "# repalce to your local model path\n", "model_path =\"\" \n", - "\n", "tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n", "model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda(0)" ] @@ -92,13 +90,11 @@ "file_path = \"Your local file path\"\n", "loader = TextLoader(file_path,encoding=\"utf-8\")\n", "documents = loader.load()\n", - "\n", "# split text into sentences and embedding the sentences\n", "text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n", "texts = text_splitter.split_documents(documents)\n", - "\n", "embedding_path = 'your model path'\n", - "embeddings = HuggingFaceEmbeddings(model_name=embedding_path)\n" + "embeddings = HuggingFaceEmbeddings(model_name=embedding_path)" ] }, { @@ -121,7 +117,12 @@ "#first add your document into vearch vectorstore\n", "!pip3 install pyvearch\n", "vearch = VearchDb.from_documents(\n", - " texts,embeddings,path_or_url=\"your_vearch_router_url\",db_name=\"your_db\",table_name=\"your_table\")\n" + " texts,\n", + " embeddings,\n", + " path_or_url=\"your_vearch_router_url\",\n", + " db_name=\"your_db\",\n", + " table_name=\"your_table\"\n", + ")" ] }, { @@ -182,11 +183,9 @@ ], "source": [ "query = \"你知道凌波微步吗,你知道都有谁会凌波微步?\"\n", - "\n", "cluster_res=vearch.similarity_search(query, 3)\n", "for idx,tmp in enumerate(cluster_res): \n", " print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n", - "\n", "# combine your local knowleadge and query \n", "context = \"\".join([tmp.page_content for tmp in cluster_res])\n", "new_query = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context} \\n 回答用户这个问题:{query}\\n\\n\"\n", @@ -224,13 +223,11 @@ "query = \"你知道vearch是什么吗?\"\n", "response, history = model.chat(tokenizer, query, history=history)\n", "print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n", - "\n", "vearch_info = [\"Vearch 是一款存储大语言模型数据的向量数据库,用于存储和快速搜索模型embedding后的向量,可用于基于个人知识库的大模型应用\",\n", " \"Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库\",\n", " \"vearch 是基于C语言,go语言开发的,并提供python接口,可以直接通过pip安装\"]\n", "vearch_source=[{'source': '/data/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'},{'source': '/data/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'},{'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'}]\n", - "\n", - "vearch.add_texts(vearch_info,vearch_source)\n" + "vearch.add_texts(vearch_info,vearch_source)" ] }, { @@ -260,16 +257,13 @@ } ], "source": [ - "\n", "query3_c = \"你知道vearch是什么吗?\"\n", "res1_c = vearch.similarity_search(query3_c, 3)\n", "for idx,tmp in enumerate(res1_c): \n", " print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n", - "\n", "context1_C = \"\".join([tmp.page_content for tmp in res1_c])\n", "new_query1_c = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context1_C} \\n 回答用户这个问题:{query3_c}\\n\\n\"\n", "response_c, history_c = model.chat(tokenizer, new_query1_c, history=[])\n", - "\n", "print(f\"***************ChatGLM:{response_c}\\n\")" ] }, @@ -288,19 +282,17 @@ } ], "source": [ - "##delete and get function need to maintian docids \n", - "##your docid\n", + "#delete and get function need to maintian docids \n", "res_cluster=vearch.delete(['5613292411219285975', '120290588477549397','-4146045014700199589'])\n", "print(\"delete vearch cluster docid\",res_cluster)\n", - "\n", "get_id_doc=vearch.get(['-7527337722553895939','3823416425172812985'])\n", - "print(\"get existed docid\",get_id_doc)\n" + "print(\"get existed docid\",get_id_doc)" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3.12.3 64-bit", "language": "python", "name": "python3" }, @@ -314,11 +306,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.12.3" }, "vscode": { "interpreter": { - "hash": "f1da10a89896267ed34b497c9568817f36cc7ea79826b5cfca4d96376f5b4835" + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" } } }, From 6353ac81c79752ea3f356e40804a8b7404559eb8 Mon Sep 17 00:00:00 2001 From: zhanghexian <1933690003@qq.com> Date: Tue, 4 Jun 2024 11:24:02 +0800 Subject: [PATCH 12/17] fix lint --- .../integrations/vectorstores/vearch.ipynb | 55 +++++++++++-------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/docs/docs/integrations/vectorstores/vearch.ipynb b/docs/docs/integrations/vectorstores/vearch.ipynb index fb37e112da386..4e27f5a46bf6d 100644 --- a/docs/docs/integrations/vectorstores/vearch.ipynb +++ b/docs/docs/integrations/vectorstores/vearch.ipynb @@ -38,7 +38,7 @@ "from transformers import AutoModel, AutoTokenizer\n", "\n", "# repalce to your local model path\n", - "model_path =\"\" \n", + "model_path = \"\" \n", "tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n", "model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda(0)" ] @@ -65,10 +65,10 @@ ], "source": [ "query = \"你好!\"\n", - "response, history = model.chat(tokenizer, query, history=[])\n", + "response, history = model.chat(tokenizer, query, history = [])\n", "print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n", "query = \"你知道凌波微步吗,你知道都有谁学会了吗?\"\n", - "response, history = model.chat(tokenizer, query, history=history)\n", + "response, history = model.chat(tokenizer, query, history = history)\n", "print(f\"Human: {query}\\nChatGLM:{response}\\n\")" ] }, @@ -88,13 +88,13 @@ "source": [ "# Add your local knowledge files\n", "file_path = \"Your local file path\"\n", - "loader = TextLoader(file_path,encoding=\"utf-8\")\n", + "loader = TextLoader(file_path, encoding=\"utf-8\")\n", "documents = loader.load()\n", "# split text into sentences and embedding the sentences\n", - "text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n", + "text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 100)\n", "texts = text_splitter.split_documents(documents)\n", - "embedding_path = 'your model path'\n", - "embeddings = HuggingFaceEmbeddings(model_name=embedding_path)" + "embedding_path = \"your model path\"\n", + "embeddings = HuggingFaceEmbeddings(model_name = embedding_path)" ] }, { @@ -121,7 +121,7 @@ " embeddings,\n", " path_or_url=\"your_vearch_router_url\",\n", " db_name=\"your_db\",\n", - " table_name=\"your_table\"\n", + " table_name=\"your_table\",\n", ")" ] }, @@ -183,13 +183,14 @@ ], "source": [ "query = \"你知道凌波微步吗,你知道都有谁会凌波微步?\"\n", - "cluster_res=vearch.similarity_search(query, 3)\n", - "for idx,tmp in enumerate(cluster_res): \n", + "cluster_res = vearch.similarity_search(query, 3)\n", + "for idx, tmp in enumerate(cluster_res): \n", " print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n", + "\n", "# combine your local knowleadge and query \n", "context = \"\".join([tmp.page_content for tmp in cluster_res])\n", "new_query = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context} \\n 回答用户这个问题:{query}\\n\\n\"\n", - "response, history = model.chat(tokenizer, new_query, history=[])\n", + "response, history = model.chat(tokenizer, new_query, history = [])\n", "print(f\"********ChatGLM:{response}\\n\")" ] }, @@ -221,12 +222,18 @@ ], "source": [ "query = \"你知道vearch是什么吗?\"\n", - "response, history = model.chat(tokenizer, query, history=history)\n", + "response, history = model.chat(tokenizer, query, history = history)\n", "print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n", - "vearch_info = [\"Vearch 是一款存储大语言模型数据的向量数据库,用于存储和快速搜索模型embedding后的向量,可用于基于个人知识库的大模型应用\",\n", - " \"Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库\",\n", - " \"vearch 是基于C语言,go语言开发的,并提供python接口,可以直接通过pip安装\"]\n", - "vearch_source=[{'source': '/data/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'},{'source': '/data/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'},{'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'}]\n", + "vearch_info = [\n", + " \"Vearch 是一款存储大语言模型数据的向量数据库,用于存储和快速搜索模型embedding后的向量,可用于基于个人知识库的大模型应用\",\n", + " \"Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库\",\n", + " \"vearch 是基于C语言,go语言开发的,并提供python接口,可以直接通过pip安装\"\n", + "]\n", + "vearch_source=[\n", + " {'source': '/data/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'},\n", + " {'source': '/data/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'},\n", + " {'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'}\n", + "]\n", "vearch.add_texts(vearch_info,vearch_source)" ] }, @@ -259,7 +266,7 @@ "source": [ "query3_c = \"你知道vearch是什么吗?\"\n", "res1_c = vearch.similarity_search(query3_c, 3)\n", - "for idx,tmp in enumerate(res1_c): \n", + "for idx, tmp in enumerate(res1_c): \n", " print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n", "context1_C = \"\".join([tmp.page_content for tmp in res1_c])\n", "new_query1_c = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context1_C} \\n 回答用户这个问题:{query3_c}\\n\\n\"\n", @@ -282,11 +289,15 @@ } ], "source": [ - "#delete and get function need to maintian docids \n", - "res_cluster=vearch.delete(['5613292411219285975', '120290588477549397','-4146045014700199589'])\n", - "print(\"delete vearch cluster docid\",res_cluster)\n", - "get_id_doc=vearch.get(['-7527337722553895939','3823416425172812985'])\n", - "print(\"get existed docid\",get_id_doc)" + "# delete and get function need to maintian docids \n", + "res_cluster=vearch.delete(\n", + " ['5613292411219285975', '120290588477549397','-4146045014700199589']\n", + ")\n", + "print(\"delete vearch cluster docid\", res_cluster)\n", + "get_id_doc=vearch.get(\n", + " ['-7527337722553895939','3823416425172812985']\n", + ")\n", + "print(\"get existed docid\", get_id_doc)" ] } ], From b50e413f081b8d12459866918e3472209155ef99 Mon Sep 17 00:00:00 2001 From: zhanghexian <1933690003@qq.com> Date: Tue, 4 Jun 2024 11:32:56 +0800 Subject: [PATCH 13/17] init --- libs/community/langchain_community/vectorstores/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/__init__.py b/libs/community/langchain_community/vectorstores/__init__.py index 3856d67881fac..cba7fec909507 100644 --- a/libs/community/langchain_community/vectorstores/__init__.py +++ b/libs/community/langchain_community/vectorstores/__init__.py @@ -274,7 +274,7 @@ VDMS, ) from langchain_community.vectorstores.vearch import ( - Vearch, + VearchDb, ) from langchain_community.vectorstores.vectara import ( Vectara, @@ -390,7 +390,7 @@ "USearch", "VDMS", "Vald", - "Vearch", + "VearchDb", "Vectara", "VectorStore", "VespaStore", @@ -491,7 +491,7 @@ "USearch": "langchain_community.vectorstores.usearch", "Vald": "langchain_community.vectorstores.vald", "VDMS": "langchain_community.vectorstores.vdms", - "Vearch": "langchain_community.vectorstores.vearch", + "VearchDb": "langchain_community.vectorstores.vearch", "Vectara": "langchain_community.vectorstores.vectara", "VectorStore": "langchain_core.vectorstores", "VespaStore": "langchain_community.vectorstores.vespa", From a89acb0fa96300774ece3c72e6e7670cfff13cae Mon Sep 17 00:00:00 2001 From: zhanghexian <1933690003@qq.com> Date: Tue, 4 Jun 2024 13:44:42 +0800 Subject: [PATCH 14/17] import fix --- docs/docs/integrations/vectorstores/vearch.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/integrations/vectorstores/vearch.ipynb b/docs/docs/integrations/vectorstores/vearch.ipynb index 4e27f5a46bf6d..18d77ee4e5b97 100644 --- a/docs/docs/integrations/vectorstores/vearch.ipynb +++ b/docs/docs/integrations/vectorstores/vearch.ipynb @@ -32,7 +32,7 @@ ], "source": [ "from langchain_community.document_loaders import TextLoader\n", - "from langchain_community.vectorstores.vearch import VearchDb\n", + "from langchain_community.vectorstores import VearchDb\n", "from langchain_huggingface import HuggingFaceEmbeddings\n", "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", "from transformers import AutoModel, AutoTokenizer\n", From 3ba905e53d6e6ea2074a923e74e93c789a34c905 Mon Sep 17 00:00:00 2001 From: zhanghexian <1933690003@qq.com> Date: Tue, 4 Jun 2024 13:51:37 +0800 Subject: [PATCH 15/17] import fix --- .../integrations/vectorstores/vearch.ipynb | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/docs/docs/integrations/vectorstores/vearch.ipynb b/docs/docs/integrations/vectorstores/vearch.ipynb index 18d77ee4e5b97..a20fe7f033615 100644 --- a/docs/docs/integrations/vectorstores/vearch.ipynb +++ b/docs/docs/integrations/vectorstores/vearch.ipynb @@ -65,10 +65,10 @@ ], "source": [ "query = \"你好!\"\n", - "response, history = model.chat(tokenizer, query, history = [])\n", + "response, history = model.chat(tokenizer, query, history=[])\n", "print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n", "query = \"你知道凌波微步吗,你知道都有谁学会了吗?\"\n", - "response, history = model.chat(tokenizer, query, history = history)\n", + "response, history = model.chat(tokenizer, query, history=history)\n", "print(f\"Human: {query}\\nChatGLM:{response}\\n\")" ] }, @@ -91,10 +91,10 @@ "loader = TextLoader(file_path, encoding=\"utf-8\")\n", "documents = loader.load()\n", "# split text into sentences and embedding the sentences\n", - "text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 100)\n", + "text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n", "texts = text_splitter.split_documents(documents)\n", "embedding_path = \"your model path\"\n", - "embeddings = HuggingFaceEmbeddings(model_name = embedding_path)" + "embeddings = HuggingFaceEmbeddings(model_name=embedding_path)" ] }, { @@ -190,7 +190,7 @@ "# combine your local knowleadge and query \n", "context = \"\".join([tmp.page_content for tmp in cluster_res])\n", "new_query = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context} \\n 回答用户这个问题:{query}\\n\\n\"\n", - "response, history = model.chat(tokenizer, new_query, history = [])\n", + "response, history = model.chat(tokenizer, new_query, history=[])\n", "print(f\"********ChatGLM:{response}\\n\")" ] }, @@ -222,7 +222,7 @@ ], "source": [ "query = \"你知道vearch是什么吗?\"\n", - "response, history = model.chat(tokenizer, query, history = history)\n", + "response, history = model.chat(tokenizer, query, history=history)\n", "print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n", "vearch_info = [\n", " \"Vearch 是一款存储大语言模型数据的向量数据库,用于存储和快速搜索模型embedding后的向量,可用于基于个人知识库的大模型应用\",\n", @@ -232,7 +232,7 @@ "vearch_source=[\n", " {'source': '/data/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'},\n", " {'source': '/data/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'},\n", - " {'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'}\n", + " {'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'},\n", "]\n", "vearch.add_texts(vearch_info,vearch_source)" ] @@ -290,13 +290,9 @@ ], "source": [ "# delete and get function need to maintian docids \n", - "res_cluster=vearch.delete(\n", - " ['5613292411219285975', '120290588477549397','-4146045014700199589']\n", - ")\n", + "res_cluster=vearch.delete(['5613292411219285975', '120290588477549397','-4146045014700199589'])\n", "print(\"delete vearch cluster docid\", res_cluster)\n", - "get_id_doc=vearch.get(\n", - " ['-7527337722553895939','3823416425172812985']\n", - ")\n", + "get_id_doc=vearch.get(['-7527337722553895939','3823416425172812985'])\n", "print(\"get existed docid\", get_id_doc)" ] } From 3bd2743a49316b5f39e3d5a9d7dec2b7e8b26747 Mon Sep 17 00:00:00 2001 From: zhanghexian <1933690003@qq.com> Date: Tue, 4 Jun 2024 14:04:50 +0800 Subject: [PATCH 16/17] fix test_vearch.py --- .../tests/integration_tests/vectorstores/test_vearch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/community/tests/integration_tests/vectorstores/test_vearch.py b/libs/community/tests/integration_tests/vectorstores/test_vearch.py index d509bd90f91a4..0b9890426e22b 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_vearch.py +++ b/libs/community/tests/integration_tests/vectorstores/test_vearch.py @@ -1,7 +1,7 @@ # flake8: noqa +from langchain_community.vectorstores import VearchDb from langchain_core.documents import Document -from langchain_community.vectorstores.vearch import Vearch from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings @@ -31,7 +31,7 @@ def test_vearch() -> None: ) }, ] - vearch_db = Vearch.from_texts( + vearch_db = VearchDb.from_texts( texts=texts, embedding=FakeEmbeddings(), metadatas=metadatas, @@ -79,7 +79,7 @@ def test_vearch_add_texts() -> None: "three_body.txt" }, ] - vearch_db = Vearch.from_texts( + vearch_db = VearchDb.from_texts( texts=texts, embedding=FakeEmbeddings(), metadatas=metadatas, From 21f21c197082fbcadb7072d26257e7b772bf53e7 Mon Sep 17 00:00:00 2001 From: zhanghexian <1933690003@qq.com> Date: Tue, 4 Jun 2024 14:26:20 +0800 Subject: [PATCH 17/17] change vearch to VearchDb --- libs/community/tests/unit_tests/vectorstores/test_imports.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libs/community/tests/unit_tests/vectorstores/test_imports.py b/libs/community/tests/unit_tests/vectorstores/test_imports.py index 397dc581a2cfe..677ce92f86ec3 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_imports.py +++ b/libs/community/tests/unit_tests/vectorstores/test_imports.py @@ -1,7 +1,6 @@ -from langchain_core.vectorstores import VectorStore - from langchain_community import vectorstores from langchain_community.vectorstores import __all__, _module_lookup +from langchain_core.vectorstores import VectorStore EXPECTED_ALL = [ "Aerospike", @@ -92,7 +91,7 @@ "USearch", "VDMS", "Vald", - "Vearch", + "VearchDb", "Vectara", "VectorStore", "VespaStore",