-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbox_ai_extract_metadata.py
92 lines (75 loc) · 2.81 KB
/
box_ai_extract_metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from box_sdk_gen import (
AiExtractResponse,
AiItemBase,
BoxClient,
CreateAiExtractStructuredMetadataTemplate,
)
from tqdm import tqdm
from utils.box_client_ccg import AppConfig, get_ccg_user_client
from utils.box_metadata import apply_metadata_to_document
from utils.box_samples import files_start_with
def print_ai_response(prompt: str, ai_response: "AiExtractResponse | dict") -> None:
    """Print a description and a Box AI answer framed by banner rules.

    Args:
        prompt: Text shown on the ``Description:`` line.
        ai_response: Value shown under ``Answer:``; it is rendered with its
            default string formatting. The callers in this file pass plain
            dicts, hence the widened annotation.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print()
    print(heavy_rule)
    print(f"Description: {prompt}")
    print(light_rule)
    print(f"Answer:\n{ai_response}")
    print(heavy_rule)
    print()
def main() -> None:
    """Extract metadata with Box AI and apply it to the HAB-* documents.

    Steps:
      1. Build a Box client with ``get_ccg_user_client`` and greet the
         current user.
      2. Collect the files returned by ``files_start_with`` for the
         prefixes ``HAB-1``, ``HAB-2`` and ``HAB-3``.
      3. Run a structured extraction against the ``leases_workshop``
         enterprise template for the first file, print the answer, apply it
         to that file and print the metadata that was stored.
      4. Repeat extract-and-apply for every collected file (the first one
         included) behind a progress bar.

    Returns early with a message when no matching files are found, instead
    of failing on an empty list.
    """
    conf = AppConfig()
    client: BoxClient = get_ccg_user_client(conf, conf.ccg_user_id)

    # who am i
    me = client.users.get_user_me()

    # clear screen (ANSI: cursor home, then erase to end of display)
    print("\033[H\033[J")
    print(f"Hello, I'm logged in as {me.name} ({me.id})")

    # find files starting with 'HAB-1', 'HAB-2' and 'HAB-3'
    # (prefixes are queried in this order, matching the original concatenation)
    hab_files = [
        file
        for prefix in ("HAB-1", "HAB-2", "HAB-3")
        for file in files_start_with(prefix, client, conf)
    ]
    print(f"Using {len(hab_files)} documents for Box AI context")
    if not hab_files:
        # Nothing to index below; bail out rather than raise IndexError.
        print("No HAB-* documents found; nothing to extract.")
        return

    # Metadata template config
    template_key = "leases_workshop"
    template_type = "metadata_template"
    template_scope = f"enterprise_{conf.enterprise_id}"
    metadata_template = CreateAiExtractStructuredMetadataTemplate(
        template_key=template_key,
        type=template_type,
        scope=template_scope,
    )

    # Documents
    items = [AiItemBase(id=file.id, type="file") for file in hab_files]

    # Extract metadata from single document
    first_item = items[0]
    ai_response = client.ai.create_ai_extract_structured(
        items=[first_item],
        metadata_template=metadata_template,
    ).to_dict()
    print_ai_response("Extract metadata from single document", ai_response)

    # Apply metadata to document
    applied = apply_metadata_to_document(
        client, first_item.id, template_key, ai_response
    )

    # filter out internal metadata items ("$"-prefixed keys and "extra_data")
    visible_metadata = {
        key: value
        for key, value in applied.to_dict().items()
        if not key.startswith("$") and key != "extra_data"
    }
    print_ai_response("Metadata applied to document:", visible_metadata)

    # Extract metadata from multiple documents
    print(f"\nExtracting data from {len(items)} documents, and applying metadata...")
    # The context manager closes the bar even if a request raises mid-loop.
    with tqdm(total=len(items)) as progress_bar:
        for item in items:
            # Extract document data using metadata template
            ai_response = client.ai.create_ai_extract_structured(
                items=[item],
                metadata_template=metadata_template,
            ).to_dict()

            # Apply metadata to document
            apply_metadata_to_document(client, item.id, template_key, ai_response)
            progress_bar.update()
    print()
# Run the workflow only when executed as a script, not when imported.
if __name__ == "__main__":
    main()