-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathknowledge_search_whoosh.py
127 lines (111 loc) · 4.65 KB
/
knowledge_search_whoosh.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
import json
import jsonloader
from flask import Flask, request, jsonify, Response
from flask_cors import CORS
from whoosh.fields import Schema, TEXT, ID, STORED
from whoosh.index import Index
from whoosh.qparser import QueryParser
from whoosh.filedb.filestore import RamStorage
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Initialize Flask app
app = Flask(__name__)
CORS(app)  # enable cross-origin requests so browser-based clients can call the API
# Define the schema for the Whoosh index.
# phrase=False disables positional (phrase-search) storage, keeping the index smaller;
# stored=True makes the field value retrievable from search hits.
schema = Schema(
    title=TEXT(phrase=False, stored=True),  # Document title
    content=TEXT(phrase=False, stored=True),  # Document content
    url=STORED,  # URL of the document (stored only, not searchable)
    pubDate=STORED  # Publication date (if any; stored only)
)
# Create an in-memory index using RamStorage — nothing is persisted to disk,
# so the index is rebuilt from the dump files on every start.
storage = RamStorage()
ix = storage.create_index(schema)  # Create an index with RamStorage
# Load JSON documents from the "knowledge" folder into the Whoosh index
def load_documents_into_index(knowledge_folder):
    """Load YaCy index-dump files from *knowledge_folder* into the Whoosh index.

    Scans the folder for ``.jsonl`` / ``.flatjson`` files (optionally
    gzip-compressed) and adds every document to the module-level index ``ix``.
    Lines containing an ``"index"`` key are bulk-action header lines
    (Elasticsearch-dump style), not documents, and are skipped.

    Args:
        knowledge_folder: Path of the directory containing the dump files.

    Side effects:
        Commits (with optimization) to the global in-memory index ``ix``
        and prints progress to stdout.
    """
    # One writer for all files; a single optimized commit at the end is far
    # cheaper than committing per file.
    writer = ix.writer(limitmb=8192)
    accepted_suffixes = (".jsonl", ".flatjson", ".jsonl.gz", ".flatjson.gz")
    for filename in os.listdir(knowledge_folder):
        if not filename.endswith(accepted_suffixes):
            continue
        filepath = os.path.join(knowledge_folder, filename)
        print("reading index dump from " + filepath)
        for doc in jsonloader.load(filepath):
            if "index" in doc:
                continue  # bulk-action header line, not a document
            writer.add_document(
                title=doc.get("title", ""),
                content=doc.get("text_t", ""),
                # URL may appear under several keys depending on dump origin.
                url=doc.get("url", doc.get("url_s", doc.get("sku", ""))),
                pubDate=doc.get("pubDate", "")
            )
    writer.commit(optimize=True)
# Load documents into the index.
# NOTE(review): this runs at import time, so importing this module blocks
# until every dump file in the folder has been indexed.
knowledge_folder = "knowledge" # Folder containing JSON documents
load_documents_into_index(knowledge_folder)
# Function to compute TF-IDF vectors and cosine similarity
def compute_similarity(query, documents):
    """Score *documents* against *query* with TF-IDF cosine similarity.

    The vectorizer is fitted on the query plus all documents, so scores are
    only comparable within a single call.

    Args:
        query: The search string.
        documents: List of document content strings.

    Returns:
        A 1-D array of cosine-similarity scores, one per document,
        in the same order as *documents*.
    """
    corpus = [query]
    corpus.extend(documents)
    vectors = TfidfVectorizer().fit_transform(corpus)
    # Row 0 is the query; the remaining rows are the documents.
    query_vec = vectors[0:1]
    doc_vecs = vectors[1:]
    return cosine_similarity(query_vec, doc_vecs).flatten()
# Search endpoint
@app.route('/yacysearch.json', methods=['GET', 'POST'])
def yacysearch():
    """Search endpoint emulating the YaCy JSON search API.

    Accepts ``query`` and ``count`` either as GET query parameters or as a
    JSON POST body, searches the in-memory Whoosh index, re-ranks the hits
    by TF-IDF cosine similarity against the query, and returns a YaCy-style
    ``channels`` JSON response.

    Returns:
        A Flask JSON response with at most ``count`` items, each carrying
        ``title``, ``link``, ``description`` and ``similarity``.
    """
    # Parse query and count from the request.
    if request.method == 'POST':
        # silent=True: a missing or malformed JSON body yields None instead
        # of raising; fall back to an empty dict so .get() is safe.
        data = request.get_json(silent=True) or {}
        query = data.get('query', '')
        raw_count = data.get('count', '3')
    else:  # GET (route only allows GET and POST)
        query = request.args.get('query', '')
        raw_count = request.args.get('count', '3')
    try:
        count = int(raw_count)
    except (TypeError, ValueError):
        count = 3  # malformed count should not produce a 500
    # Perform the search using Whoosh.
    results = []
    with ix.searcher() as searcher:
        query_parser = QueryParser("content", ix.schema)
        parsed_query = query_parser.parse(query)
        whoosh_results = searcher.search(parsed_query, limit=count, terms=True)
        # Extract document content for similarity computation.
        documents = [hit["content"] for hit in whoosh_results]
        if documents:
            # Re-rank Whoosh hits by TF-IDF cosine similarity to the query.
            similarity_scores = compute_similarity(query, documents)
            for i, hit in enumerate(whoosh_results):
                results.append({
                    "title": hit.get("title", ""),
                    "link": hit.get("url", ""),
                    "description": hit.get("content", ""),
                    "similarity": float(similarity_scores[i])
                })
            # Sort results by similarity (descending order).
            results.sort(key=lambda x: x["similarity"], reverse=True)
    # Format the response in YaCy API format.
    yacy_results = {
        "channels": [
            {
                "title": "YaCy Expert Search",
                "description": "Items from YaCy Search Engine Dumps as Search Results",
                "startIndex": "0",
                "itemsPerPage": str(count),
                "searchTerms": query,
                "items": results[:count]  # Limit results to the requested count
            }
        ]
    }
    return jsonify(yacy_results)
# Run the Flask app
if __name__ == '__main__':
    # Bind to all interfaces on port 8094; debug disabled for deployment.
    app.run(debug=False, port=8094, host='0.0.0.0')