forked from Doriandarko/claude-engineer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunfinite_context.py
423 lines (350 loc) · 14.1 KB
/
unfinite_context.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
#!/usr/bin/env python3
import sys
import os
import re
import json
import random
import sqlite3
from collections import deque
###############################################################################
# 1) Common Interface: ChunkStore
###############################################################################
class ChunkStore:
    """
    Abstract interface for chunk storage backends.

    A 'chunk' is a dict of archived conversation data (summary, messages,
    references, ...) identified by a unique label (a 3-word code).
    Concrete subclasses must override all four methods below.
    """

    def save_chunk(self, label, data: dict):
        """Insert or replace the chunk stored under ``label``."""
        raise NotImplementedError

    def load_chunk(self, label):
        """Return the chunk dict stored under ``label``, or ``None`` if absent."""
        raise NotImplementedError

    def list_labels(self):
        """Return a list of every chunk label currently in the store."""
        raise NotImplementedError

    def load_all_chunks(self):
        """Return a list of every chunk data dict in the store."""
        raise NotImplementedError
###############################################################################
# 2) SQLite-based DocStore: SQLiteDocStore
###############################################################################
class SQLiteDocStore(ChunkStore):
    """
    Chunk store backed by a local SQLite database.

    Every chunk lives as one row of the ``clumps`` table:
    ``label`` (TEXT PRIMARY KEY) and ``chunk_data`` (the chunk dict
    serialized as JSON text). A fresh connection is opened per call,
    so instances hold no open handles between operations.
    """

    def __init__(self, db_path="clumps.db"):
        """
        :param db_path: path to the SQLite database file
        """
        self.db_path = db_path
        self._init_db()

    def _init_db(self):
        """
        Create the table if it doesn't exist.
        """
        with sqlite3.connect(self.db_path) as conn:
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS clumps (
                    label TEXT PRIMARY KEY,
                    chunk_data TEXT
                )
                """
            )
            conn.commit()

    def save_chunk(self, label, data):
        """Upsert the row for ``label`` with ``data`` serialized as JSON."""
        payload = json.dumps(data, ensure_ascii=False)
        with sqlite3.connect(self.db_path) as conn:
            conn.execute(
                """
                INSERT OR REPLACE INTO clumps(label, chunk_data)
                VALUES(?, ?)
                """,
                (label, payload),
            )
            conn.commit()

    def load_chunk(self, label):
        """Return the deserialized chunk for ``label``, or ``None`` if no row exists."""
        with sqlite3.connect(self.db_path) as conn:
            row = conn.execute(
                "SELECT chunk_data FROM clumps WHERE label = ?", (label,)
            ).fetchone()
        return json.loads(row[0]) if row else None

    def list_labels(self):
        """Return every label currently stored."""
        with sqlite3.connect(self.db_path) as conn:
            return [r[0] for r in conn.execute("SELECT label FROM clumps")]

    def load_all_chunks(self):
        """Return every stored chunk as a deserialized dict."""
        with sqlite3.connect(self.db_path) as conn:
            return [
                json.loads(r[0])
                for r in conn.execute("SELECT chunk_data FROM clumps")
            ]
###############################################################################
# 3) JSON-based DocStore: JsonFileDocStore
###############################################################################
class JsonFileDocStore(ChunkStore):
    """
    Chunk store that keeps each chunk in a separate JSON file named
    ``{label}.json`` inside a single storage directory.
    """

    def __init__(self, storage_dir="clump_store"):
        """
        :param storage_dir: directory for JSON chunk files (created if missing)
        """
        self.storage_dir = storage_dir
        # exist_ok avoids the check-then-create race of the previous
        # os.path.exists() + os.makedirs() pair.
        os.makedirs(self.storage_dir, exist_ok=True)

    def _path_for(self, label):
        """Return the JSON file path that backs ``label``."""
        return os.path.join(self.storage_dir, f"{label}.json")

    def save_chunk(self, label, data):
        """Write ``data`` to ``{label}.json``, replacing any previous file."""
        with open(self._path_for(label), "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII text readable on disk and
            # matches the serialization used by SQLiteDocStore.save_chunk.
            json.dump(data, f, indent=2, ensure_ascii=False)

    def load_chunk(self, label):
        """Return the chunk dict for ``label``, or ``None`` if its file is absent."""
        filename = self._path_for(label)
        if not os.path.exists(filename):
            return None
        with open(filename, "r", encoding="utf-8") as f:
            return json.load(f)

    def list_labels(self):
        """Return the label of every ``*.json`` file in the storage directory."""
        return [
            os.path.splitext(fname)[0]
            for fname in os.listdir(self.storage_dir)
            if fname.endswith(".json")
        ]

    def load_all_chunks(self):
        """Return every stored chunk dict (falsy/empty chunks are skipped)."""
        chunks = []
        for lbl in self.list_labels():
            data = self.load_chunk(lbl)
            if data:
                chunks.append(data)
        return chunks
###############################################################################
# 4) SimpleContextManager
###############################################################################
class SimpleContextManager:
    """
    Manages conversation memory with older messages chunked and stored in a doc store.
    - Keeps at most `max_full_messages` in memory
    - If we exceed that, we summarize & compress older ones into a chunk
    - We can recall relevant older chunks by naive text matching in summary/messages
    - We BFS into references if a chunk references other chunk labels
    """

    # Regex to detect references to 3-word codes (e.g. "alpha-beta-gamma")
    THREE_WORD_CODE_PATTERN = re.compile(r"\b[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\b")

    def __init__(self, doc_store: ChunkStore, max_full_messages: int = 5):
        """
        :param doc_store: either SQLiteDocStore or JsonFileDocStore
        :param max_full_messages: how many recent messages to keep uncompressed
        """
        self.doc_store = doc_store
        self.max_full_messages = max_full_messages
        self.messages = []  # the "live" (uncompressed) messages, oldest first
        self.used_labels = set()  # labels generated this session, in-memory only
        self.goal_summary = None  # optional overarching goal; set via set_goal_summary()

    def set_goal_summary(self, text: str):
        """
        Sets an overarching 'goal' that we can include as a system message.
        """
        self.goal_summary = text

    def add_message(self, role: str, content: str):
        """
        Add a new message (role='user'/'assistant', etc.).
        Compress older if we exceed max_full_messages.
        """
        self.messages.append({"role": role, "content": content})
        self._maybe_compress_old_messages()

    def _maybe_compress_old_messages(self):
        # If messages exceed the limit, chunk/summarize older ones
        if len(self.messages) > self.max_full_messages:
            # Everything except the newest max_full_messages gets archived.
            older = self.messages[: -self.max_full_messages]
            latest = self.messages[-self.max_full_messages :]
            summary_text = self._summarize_messages(older)
            # Record 3-word codes mentioned in the archived text, so a later
            # recall can BFS from this chunk to the chunks they name.
            references = self._extract_3word_codes(older, summary_text)
            label = self._generate_3word_label(summary_text[:50])
            chunk_data = {
                "label": label,
                "summary": summary_text,
                "messages": older,
                "references": references,
            }
            self.doc_store.save_chunk(label, chunk_data)
            # keep only the latest
            self.messages = latest

    def _summarize_messages(self, messages):
        # Very naive summary approach: first 100 chars of the joined contents.
        all_content = " ".join(m["content"] for m in messages)
        return f"Summary: {all_content[:100]}..."

    def _extract_3word_codes(self, messages, summary_text):
        # Collect every 3-word code appearing in the summary or any message body.
        combined = summary_text
        for m in messages:
            combined += " " + m["content"]
        found = set(re.findall(self.THREE_WORD_CODE_PATTERN, combined))
        return list(found)

    def _generate_3word_label(self, seed_text):
        # Build a label from three distinct words of the seed text, padding
        # with random "wordNNNN" tokens if the seed has fewer than 3 words.
        words = re.findall(r"\w+", seed_text.lower())
        unique_words = list(set(words))
        while len(unique_words) < 3:
            unique_words.append(f"word{random.randint(1000,9999)}")
        chosen = random.sample(unique_words, 3)
        label = "-".join(chosen)
        # Disambiguate in-session collisions by appending random digits.
        # NOTE(review): used_labels is in-memory only, so a collision with a
        # label persisted by an earlier session would silently overwrite it.
        while label in self.used_labels:
            label += str(random.randint(0, 9999))
        self.used_labels.add(label)
        return label

    def recall_relevant_chunks(self, query: str):
        """
        Naive recall: if any of the query's words appear in chunk's summary or messages.
        Then BFS to expand references.
        """
        query_words = set(query.lower().split())
        all_chunks = self.doc_store.load_all_chunks()
        candidate_labels = []
        for chunk in all_chunks:
            # Substring match against the summary plus all archived message bodies.
            text_for_search = chunk["summary"].lower()
            for msg in chunk["messages"]:
                text_for_search += " " + msg["content"].lower()
            matches = sum(1 for w in query_words if w in text_for_search)
            if matches > 0:
                candidate_labels.append(chunk["label"])
        # BFS from the direct hits through each chunk's "references" list.
        results = {}
        queue = deque(candidate_labels)
        visited = set()
        while queue:
            lbl = queue.popleft()
            if lbl in visited:
                continue
            visited.add(lbl)
            chunk_data = self.doc_store.load_chunk(lbl)
            if chunk_data:
                results[lbl] = chunk_data
                for ref_lbl in chunk_data.get("references", []):
                    if ref_lbl not in visited:
                        queue.append(ref_lbl)
        return list(results.values())

    def get_context_for_model(self, user_query: str):
        """
        Build the "context" to feed an LLM:
        1) system (goal) if set
        2) live messages
        3) short recall for each relevant chunk
        """
        context = []
        if self.goal_summary:
            context.append({"role": "system", "content": f"Goal: {self.goal_summary}"})
        context.extend(self.messages)
        # find relevant older chunks
        relevant = self.recall_relevant_chunks(user_query)
        for chunk_data in relevant:
            lbl = chunk_data["label"]
            summ = chunk_data["summary"]
            context.append(
                {"role": "assistant", "content": f"(Recall from {lbl}): {summ}"}
            )
        return context
###############################################################################
# 5) Helper to create doc store with fallback
###############################################################################
def create_store_with_fallback(db_path="clumps.db", fallback_dir="clump_store"):
    """
    Build a chunk store, preferring SQLite.

    Attempts to create a SQLiteDocStore at ``db_path`` and verifies it with a
    throwaway write/read/delete cycle. On any failure, falls back to a
    JsonFileDocStore rooted at ``fallback_dir``.
    """
    probe = "test-label"
    try:
        store = SQLiteDocStore(db_path)
        # Round-trip a throwaway record to prove the DB is writable/readable.
        store.save_chunk(
            probe,
            {
                "label": probe,
                "summary": "test summary",
                "messages": [],
                "references": [],
            },
        )
        echoed = store.load_chunk(probe)
        if not echoed or echoed["label"] != probe:
            raise RuntimeError("Could not verify test record in SQLite store.")
        # Clean up the probe row so it never pollutes real data.
        with sqlite3.connect(db_path) as conn:
            conn.execute("DELETE FROM clumps WHERE label = ?", (probe,))
            conn.commit()
        print(f"Using SQLiteDocStore at: {db_path}")
        return store
    except Exception as e:
        print("SQLiteDocStore failed:", e)
        print(f"Falling back to JsonFileDocStore in: {fallback_dir}")
        return JsonFileDocStore(storage_dir=fallback_dir)
###############################################################################
# 6) Self Test
###############################################################################
def run_self_test(db_path="test_clumps.db", fallback_dir="test_clump_store"):
    """
    Quick end-to-end demonstration and smoke test.

    Builds a store (with fallback), wraps it in a SimpleContextManager with a
    small live-message window, feeds enough messages to force compression of
    the oldest turns, then runs a recall query and prints the final context.
    """
    print("=== Running Self-Test ===")
    # 1) Doc store: SQLite if available, JSON-file fallback otherwise.
    store = create_store_with_fallback(db_path=db_path, fallback_dir=fallback_dir)
    # 2) Keep only 3 live messages so compression triggers quickly.
    manager = SimpleContextManager(store, max_full_messages=3)
    manager.set_goal_summary("Help user troubleshoot Docker container crashes.")
    # 3) Simulated conversation; the oldest turns get compressed into chunks.
    conversation = [
        ("user", "I'm having issues with Docker."),
        ("assistant", "Can you share the Dockerfile?"),
        ("user", "Sure, here's a snippet: FROM python:3.8 ..."),
        (
            "assistant",
            "Try adding an ENTRYPOINT. Also consider net-debug-xyz if you have connectivity issues.",
        ),
        ("user", "Still no luck. I'm seeing 'Cannot connect to Docker daemon'."),
    ]
    for role, content in conversation:
        manager.add_message(role, content)
    # 4) The query mentions a code that now lives only in a compressed chunk.
    user_query = "What about net-debug-xyz you mentioned?"
    ctx = manager.get_context_for_model(user_query)
    print("\n=== Final Context ===")
    for i, msg in enumerate(ctx, start=1):
        print(f"{i}. {msg['role'].upper()}: {msg['content']}")
    # 5) Success means at least one "(Recall from ...)" entry reached the context.
    found_recall = any(
        "(Recall from " in msg["content"]
        for msg in ctx
        if msg["role"] == "assistant"
    )
    if found_recall:
        print("\nSelf-test succeeded: found older chunk recall in context.")
    else:
        print(
            "\nSelf-test warning: no older chunk was recalled. Possibly not enough messages or matching words."
        )
###############################################################################
# 7) Main entry point
###############################################################################
if __name__ == "__main__":
    # A first CLI argument of "selftest" (case-insensitive) runs the demo;
    # anything else just prints usage help.
    args = sys.argv[1:]
    if args and args[0].lower() == "selftest":
        run_self_test()
    else:
        print("Usage: python thisscript.py selftest\n")
        print(
            "This script demonstrates a SimpleContextManager with fallback from SQLite to JSON."
        )
        print("Run with 'selftest' to see a quick example in action.")