-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharticle_loader.py
95 lines (76 loc) · 3.19 KB
/
article_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import requests
import json
from bs4 import BeautifulSoup
from astrapy.db import AstraDBCollection, AstraDB
from langchain_openai import OpenAI
import os
from dotenv import load_dotenv
# Pull settings from a .env file (if one is present) into the environment
load_dotenv()

# AstraDB connection information, read from the environment
token = os.environ.get("token")
api_endpoint = os.environ.get("endpoint")
collection_name = "c_link_articles"

# Search endpoint the articles are fetched from, and the headers sent with it
url = "http://167.172.142.105:5000/api/elasticsearch/leaves_articles/_search"
headers = {'Content-Type': 'application/json'}

# Client handle for the AstraDB database
astra_db = AstraDB(token=token, api_endpoint=api_endpoint)

# Create the target collection only when it is not already listed
if collection_name not in astra_db.get_collections()['status']['collections']:
    astra_db.create_collection(collection_name=collection_name)
else:
    print(f"Collection '{collection_name}' already exists. New collection not created")

# Handle used for all inserts into the target collection
collection = AstraDBCollection(collection_name=collection_name, astra_db=astra_db)
def gen_request_body(batch_size=20, start_index=0):
    """
    Builds the JSON payload sent with each search request.

    Args:
        batch_size (int, optional): Number of articles to ask for. Defaults to 20.
        start_index (int, optional): Offset of the first article to return. Defaults to 0.

    Returns:
        str: JSON-encoded object with the keys "size" and "from".
    """
    # Both values are coerced to int so numeric strings/floats serialize as integers.
    payload = {
        "size": int(batch_size),
        "from": int(start_index),
    }
    return json.dumps(payload)
def get_total_articles():
    """
    Retrieves the total number of articles from the API.

    Returns:
        int: Total number of articles.

    Raises:
        requests.exceptions.Timeout: If the API does not answer within the timeout.
        requests.exceptions.HTTPError: If the API answers with an error status code.
    """
    # A timeout keeps the script from hanging forever on an unresponsive server.
    response = requests.request(
        "POST", url, headers=headers, data=gen_request_body(), timeout=30
    )
    # Fail with the real HTTP error instead of a KeyError on an error payload.
    response.raise_for_status()

    total = response.json()['hits']['total']
    # Elasticsearch 7+ reports the total as {"value": N, "relation": ...};
    # older versions return a bare number. Accept both shapes.
    if isinstance(total, dict):
        total = total['value']
    return int(total)
def shred_article(article, char_limit=6500):
    """
    Cuts an article into consecutive fixed-size pieces.

    Args:
        article (str): The article text.
        char_limit (int, optional): Maximum number of characters per piece. Defaults to 6500.

    Returns:
        list: The pieces in order; every piece except possibly the last has
        exactly char_limit characters. An empty article yields an empty list.
    """
    return [
        article[start:start + char_limit]
        for start in range(0, len(article), char_limit)
    ]
total_articles = get_total_articles()
batch_size = 20
empty_articles = 0

# Iterate over articles in batches
for i in range(0, total_articles, batch_size):
    # Make API request to fetch articles; the timeout prevents an indefinite
    # hang and raise_for_status surfaces HTTP errors before the JSON is read.
    response = requests.request(
        "POST", url, headers=headers, data=gen_request_body(batch_size, i), timeout=30
    )
    response.raise_for_status()
    hits = response.json()['hits']['hits']

    # Process each article in the response
    articles = []
    for article in hits:
        if 'content' in article['_source']:
            # Strip the HTML markup, then shred the plain text into smaller chunks
            text = BeautifulSoup(article['_source']['content'], 'html.parser').get_text(separator=" \n ")
            articles.append({"content": shred_article(text)})

    # Insert shredded articles into AstraDB collection. Skip the call when the
    # page contained no usable article, so no empty document list is sent.
    if articles:
        results = collection.insert_many(documents=articles)
        inserted_articles = len(results['status']['insertedIds'])
        print(results)
    else:
        inserted_articles = 0

    # Count against the number of hits actually returned, not batch_size:
    # the last page is usually shorter and would otherwise inflate the total.
    empty_articles = empty_articles + (len(hits) - inserted_articles)
    print(f"Articles {str(i)} to {str(i+len(articles))} uploaded.")

print(f"{str(empty_articles)} empty articles found of {str(total_articles)}")