-
-
Notifications
You must be signed in to change notification settings - Fork 4
/
npuchat.py
313 lines (253 loc) · 8.93 KB
/
npuchat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
import configparser
import os
import re
import requests
import threading
import time
from flask import Flask, render_template, request, jsonify
from requests.exceptions import Timeout
# application name; combined with VERSION to form the Flask app name
APPNAME = 'NPU Chat'
# application version string
VERSION = '0.27'
# matches a single character in the range U+4E00..U+9FFF;
# compiled once at import time instead of on every call
_CHINESE_CHAR_RE = re.compile(r'[\u4e00-\u9fff]')


def contains_chinese(text: str) -> bool:
    """
    Detect if a string contains Chinese characters.

    Only characters in the range U+4E00..U+9FFF are considered.

    Args:
        text (str): The input string to check.

    Returns:
        bool: True if the string contains at least one character in that
            range, False otherwise (including for an empty string).
    """
    # search() returns a match object or None; bool() maps that to True/False
    return bool(_CHINESE_CHAR_RE.search(text))
def feed_the_llama(query: str) -> str:
    """
    Send the user's query to the NPU server and modify it if context is being
    used.

    If you're using chat context, the Qwen model appears to differentiate
    between codeblocks (3 backticks in markdown) and plain text outside of
    these blocks. Let's take this exchange for example:

        user: how many crayons?
        llm: there are three crayons

    On the next query, the user's query to the LLM
    ('which colors are they?') would become:

        ```
        there are three crayons
        ```
        which colors are they?

    The model appears to 'know' that the text outside the codeblock is
    referencing what's inside of it.

    To see this in action, uncomment the print statement below and perform
    a query. In my tests this method seems to be sufficient to maintain
    context. If you have a better idea please open an issue or make a PR.

    Args:
        query (str): The user's input string.

    Returns:
        str: Answer from the LLM. If the request times out, fails, or the
            server's reply cannot be interpreted, a human-readable error
            message is returned instead (no exception is raised for these
            cases).
    """
    if USE_CHAT_CONTEXT and CONTEXT:  # context is on and history isn't empty
        # compile the history into individual markdown codeblocks,
        # oldest reply first
        llm_output_history = "".join(
            f"```\n{llm_reply}\n```\n" for llm_reply in CONTEXT
        )
        query = llm_output_history + query  # append user's query
        #print(f"\ncontext:\n\n{query}\n\n")

    # prompt template:
    prefix = (
        "<|im_start|>system You are a helpful assistant. <|im_end|> "
        "<|im_start|>user "
    )
    postfix = (
        "<|im_end|><|im_start|>assistant "
    )

    # create the request object
    json_data = {
        'PROMPT_TEXT_PREFIX': prefix,
        'input_str': str(query) + ' ',
        'PROMPT_TEXT_POSTFIX': postfix,
    }
    headers = {
        'Content-Type': 'application/json',
    }

    try:
        response = requests.post(
            f"http://{NPU_ADDRESS}:{NPU_PORT}",
            headers=headers,
            json=json_data,
            timeout=CONNECTION_TIMEOUT
        )
        response.raise_for_status()  # throw exception on non 2xx HTTP statuses
        payload = response.json()  # get JSON from response
        return payload['content']  # text of the LLM's response
    # handle errors for better user-friendliness
    except Timeout:
        return "Request timed out. Please try again later."
    except requests.exceptions.RequestException as e:
        error_msg = (
            f"An error occurred: {str(e)}"
            f" ---- is the server online?"
        )
        return error_msg
    except (KeyError, TypeError, ValueError) as e:
        # the server answered, but not with a JSON object holding 'content'
        # (missing key, non-dict JSON, or a body that is not valid JSON);
        # report it instead of letting the exception escape to the caller
        return (
            f"An error occurred: unexpected response from the server"
            f" ({e!r})"
        )
def web_server() -> None:
    """
    Sets up the Flask server and handles web requests.

    Routes:
        /        (GET, POST): serves index.html with the configured theme.
        /search  (POST): reads the form field 'input_text' and returns a
                 JSON object. The inputs 'context', 'clear', 'off' and 'on'
                 (case-insensitive) are commands that show or change the
                 chat context state; anything else is sent to the LLM.

    Both routes send headers that disable caching. This function does not
    return until app.run() returns.
    """
    # print the server details to the console
    print(
        f"Starting server at: "
        f"http://{BINDING_ADDRESS}:{BINDING_PORT}"
    )

    app = Flask(__name__)
    app.name = f"{APPNAME} v{VERSION}"

    def disable_caching(response):
        # shared by both routes: tell the client not to cache the response
        response.headers['Pragma'] = 'no-cache'
        response.headers['Cache-Control'] = (
            'no-cache, no-store, must-revalidate'
        )
        return response

    # serve the UI to the user
    @app.route('/', methods=['GET', 'POST'])
    def index():
        return disable_caching(
            app.make_response(
                render_template('index.html', selected_theme=UI_THEME)
            )
        )

    # endpoint (http://your-ip/search)
    @app.route('/search', methods=['POST'])
    def web_request():
        return disable_caching(
            app.make_response(jsonify(web_request_logic()))
        )

    def web_request_logic():
        global USE_CHAT_CONTEXT
        global CONTEXT

        # start recording response time
        start_time = time.time()

        # get web input
        question = request.form['input_text']
        command = question.lower()

        # commands for manipulating context state
        if command == 'context':  # show current context
            # format every stored reply into its own markdown codeblock
            # (accumulated, so the whole history is shown, oldest first)
            context_history = "".join(
                f"```\n{llm_reply}\n```\n" for llm_reply in CONTEXT
            )
            answer = (
                f"<md class='markdown-style'>"
                f"{context_history}"
                f"</md>"
            )
            return {'content': answer}

        if command == 'clear':  # erase context
            CONTEXT = []  # initialize the context list
            return {'content': "context cleared."}

        if command == 'off':  # turn it off
            CONTEXT = []  # initialize the context list
            USE_CHAT_CONTEXT = False
            return {'content': "context off."}

        if command == 'on':  # turn it on
            USE_CHAT_CONTEXT = True
            return {'content': "context on."}

        print(f"━━━━━━━━┫ Received request: {question}")

        # we can currently only handle one request at a time.
        # try to take the lock without waiting: checking and taking it in a
        # single step means a second request can never slip in between the
        # check and the acquisition and end up blocked behind the first one.
        if not lock.acquire(blocking=False):
            error_msg = "Sorry, I can only handle one request "\
                        "at a time and I'm currently busy."
            # NOTE(review): this reply uses the key 'result' while every
            # other reply uses 'content' -- confirm which key the UI reads
            return {
                'result': error_msg
            }

        # this app is free to process the request
        try:
            answer = feed_the_llama(question)  # send to npu server

            # update chat context
            if USE_CHAT_CONTEXT:
                # skip answers containing Chinese characters if the user
                # asked to keep them out of the history
                if not (IGNORE_CHINESE and contains_chinese(answer)):
                    CONTEXT.append(answer)

                # trim context to desired depth by removing oldest element
                if len(CONTEXT) > CONTEXT_DEPTH:
                    CONTEXT.pop(0)
        finally:
            # always free the lock, even if the request raised
            lock.release()

        # modify the answer for markdown HTML tag to make it pretty
        answer = (
            f"<md class='markdown-style'>"
            f"{answer}"
            f"</md>"
        )

        # print completed time
        end_time = time.time()
        print(f"Completed in {end_time - start_time:.2f} seconds.")

        # return answer to client
        return {'content': answer}

    # setup Flask
    app.run(
        host=BINDING_ADDRESS,
        port=BINDING_PORT
    )
def load_config(script_dir: str) -> None:
    """
    Loads user settings from settings.ini into globals.

    Sets the module-level names BINDING_ADDRESS, BINDING_PORT, NPU_ADDRESS,
    NPU_PORT, CONNECTION_TIMEOUT, USE_CHAT_CONTEXT, CONTEXT_DEPTH,
    IGNORE_CHINESE and UI_THEME.

    Args:
        script_dir (str): The directory containing settings.ini. A trailing
            slash is optional.

    Returns:
        None

    Raises:
        FileNotFoundError: If settings.ini could not be read from script_dir.
        configparser.Error: If a required section or option is missing.
        ValueError: If an integer or boolean option holds an invalid value.
    """
    # globals for settings
    global BINDING_ADDRESS
    global BINDING_PORT
    global NPU_ADDRESS
    global NPU_PORT
    global CONNECTION_TIMEOUT
    global USE_CHAT_CONTEXT
    global CONTEXT_DEPTH
    global IGNORE_CHINESE
    global UI_THEME

    # build the path to settings.ini (works with or without trailing slash)
    config_path = os.path.join(script_dir, "settings.ini")

    # parse settings.ini
    parser = configparser.ConfigParser()
    # read() silently skips files it cannot open and returns the list of
    # files it did read, so an empty result means the settings are missing
    if not parser.read(config_path):
        raise FileNotFoundError(
            f"Could not read settings file: {config_path}"
        )

    # assign settings. str unless noted otherwise
    BINDING_ADDRESS = parser.get('chat_ui', 'BINDING_ADDRESS')
    BINDING_PORT = parser.get('chat_ui', 'BINDING_PORT')
    NPU_ADDRESS = parser.get('npu', 'NPU_ADDRESS')
    NPU_PORT = parser.get('npu', 'NPU_PORT')
    CONNECTION_TIMEOUT = parser.getint('timeout', 'TIMEOUT')  # int
    # booleans must be parsed with getboolean(): get() returns the raw
    # string, and a non-empty string such as "False" is truthy
    USE_CHAT_CONTEXT = parser.getboolean('context', 'USE_CONTEXT')  # bool
    CONTEXT_DEPTH = parser.getint('context', 'DEPTH')  # int
    IGNORE_CHINESE = parser.getboolean('context', 'IGNORE_CHINESE')  # bool
    UI_THEME = parser.get('theme', 'THEME')
if __name__ == "__main__":
    # shared state read by the request handlers; created before the server
    # starts so it exists when the first request arrives

    # chat history: the LLM's previous replies
    CONTEXT = []

    # only one request can be processed at a time; this lock enforces it
    lock = threading.Lock()

    # settings.ini is looked up in the directory that holds this script
    script_dir = os.path.dirname(os.path.abspath(__file__))
    load_config(script_dir)

    # hand control to Flask
    web_server()