# app.py
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine, Column, String, Text
from sqlalchemy.orm import declarative_base, sessionmaker
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
import ollama
import uuid
import os
from pathlib import Path

# Database setup
DATABASE_URL = "sqlite:///urls.db"  # Database URL for SQLite
Base = declarative_base()  # Base class for SQLAlchemy models

# Define a model for storing URL records
class URLRecord(Base):
    __tablename__ = 'urls'  # Name of the table in the database
    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))  # Unique identifier for each record
    url = Column(String, unique=True, nullable=False)  # URL (must be unique and not null)
    heading = Column(String)  # Page heading
    summary = Column(Text)  # Summary of the page content
    tags = Column(Text)  # Tags extracted from the content

# Create the database and the URLRecord table
engine = create_engine(DATABASE_URL)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()
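
# Usage sketch (illustrative; mirrors the lookup done in process_url below):
#   record = session.query(URLRecord).filter_by(url="https://example.com/").first()
#   if record:
#       print(record.summary)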

# Class to interact with the Ollama LLM
class LlmClient:
    def __init__(self, ollama_instance_url: str, model: str):
        self.ollama_instance_url = ollama_instance_url  # URL of the Ollama instance
        self.model = model  # Model to use for generation
        self.client = ollama.Client(host=ollama_instance_url)  # Initialize the Ollama client
        # API call options
        self.client_options = {
            # Uncomment and set values as needed
            # "num_ctx": 8000,
            # "temperature": 0.7,
            # "top_k": 40,
            # "top_p": 0.9,
        }
        # Load system prompt from file
        self.prompt_templates_dir = Path(__file__).parent / "prompts"
        self.system_prompt = ""
        system_prompt_path = self.prompt_templates_dir / "system.j2"
        if system_prompt_path.exists():
            with open(system_prompt_path, "r") as file:
                self.system_prompt = file.read()

    # Get a response from the LLM for a given input
    def get_llm_response(self, prompt_text: str = ""):
        response = self.client.generate(
            model=self.model,
            options=self.client_options,
            system=self.system_prompt,
            prompt=prompt_text,
        )
        output = response["response"]  # Extract the response content
        output = self.clean_output(output)  # Clean the output (remove any unwanted tokens)
        return output

    # Get a streamed response from the LLM for a given input
    def get_llm_response_stream(self, prompt_text: str = ""):
        response = self.client.generate(
            model=self.model,
            options=self.client_options,
            system=self.system_prompt,
            prompt=prompt_text,
            stream=True,
        )
        for chunk in response:
            output = chunk["response"]  # Extract the chunk content
            output = self.clean_output(output)  # Clean the output
            yield output

    # Clean the output by removing unwanted tokens
    def clean_output(self, output: str) -> str:
        # Set this to a model-specific end-of-sequence token if one shows up
        # in the output; it is empty here, so no stripping is performed
        ending_token = ""
        if ending_token:
            output = output.replace(ending_token, "")
        return output
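
# Usage sketch (assumes an Ollama server is reachable at the host below and
# that the named model has been pulled; these are the same values main() uses):
#   client = LlmClient("http://localhost:11434", "mistral:latest")
#   print(client.get_llm_response("Say hello."))
#   for chunk in client.get_llm_response_stream("Say hello."):
#       print(chunk, end="", flush=True)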

# Remove tracking parameters from the URL
def strip_tracking(url):
    parsed_url = urlparse(url)  # Parse the URL into components
    query = parse_qs(parsed_url.query)  # Parse the query parameters
    query = {k: v for k, v in query.items() if not k.startswith(('amp_', 'precache_', 'utm_'))}  # Remove tracking parameters
    stripped_query = urlencode(query, doseq=True)  # Re-encode the query parameters
    stripped_url = parsed_url._replace(query=stripped_query)  # Replace the query part of the URL
    return urlunparse(stripped_url)  # Reconstruct the URL
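
# Example (illustrative): tracking parameters are dropped, others kept.
#   strip_tracking("https://example.com/page?utm_source=x&id=42")
#   -> "https://example.com/page?id=42"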

# Fetch the content of the URL
def fetch_url_data(url):
    try:
        # Make a GET request; the timeout (an added safeguard, value arbitrary)
        # stops a hung server from stalling the whole run
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')  # Parse the HTML content with BeautifulSoup
            text = soup.get_text()  # Extract the text content
            return text, soup  # Return the text content and the BeautifulSoup object
        else:
            print(f"Failed to retrieve the URL. Status code: {response.status_code}")
            return None, None
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None, None
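
# Usage sketch: returns (text, soup) on success, (None, None) on any HTTP
# error or request failure.
#   text, soup = fetch_url_data("https://example.com/")
#   if soup is not None:
#       print(soup.title)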

# Summarise the text using Ollama
def summarise_with_ollama(llm_client, text):
    try:
        # Prompt for a summary plus SEO keywords; only the first 2000
        # characters are sent to keep the request small
        prompt = f"Concisely summarise this web page and suggest descriptive SEO keywords and tags:\n\n{text[:2000]}"
        response = llm_client.get_llm_response(prompt)  # Get the response from Ollama
        # Derive a rough summary and tag set from the free-text response
        lines = response.split('\n')
        summary = lines[0]  # Treat the first line as the summary
        tags = ', '.join(set(word for word in response.split() if word.isalpha() and len(word) > 3))  # Naive tag extraction
        short_summary = summary[:200] + '...' if len(summary) > 200 else summary  # Truncate the summary
        return short_summary, tags
    except Exception as e:
        print(f"Error calling Ollama: {e}")
        return None, None
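
# Usage sketch (output depends entirely on the model; tags are naively
# pulled from the response text, so expect some noise):
#   summary, tags = summarise_with_ollama(llm_client, "Some page text ...")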

# Process a single URL
def process_url(llm_client, url):
    stripped_url = strip_tracking(url)  # Remove tracking parameters
    existing_record = session.query(URLRecord).filter_by(url=stripped_url).first()  # Check if the URL is already processed
    if existing_record:
        print(f"URL already processed: {stripped_url}")
        return True  # Already stored, so don't report it as broken
    text, soup = fetch_url_data(stripped_url)  # Fetch the content of the URL
    if text and soup:
        heading = soup.title.string if soup.title else "No title found"  # Extract the page heading
        summary, tags = summarise_with_ollama(llm_client, text)  # Summarise the content
        if summary:
            # Create a new record and save it to the database
            new_record = URLRecord(url=stripped_url, heading=heading, summary=summary, tags=tags)
            session.add(new_record)
            session.commit()
            print(f"URL: {stripped_url}")
            print(f"Heading: {heading}")
            print(f"Summary: {summary}")
            print(f"Tags: {tags}")
            return True
        else:
            print("Failed to summarise text with Ollama.")
            return False
    else:
        print("Failed to fetch URL data.")
        return False

# Main function to process a list of URLs from a file
def main():
    llm_client = LlmClient(ollama_instance_url="http://localhost:11434", model="mistral:latest")  # Initialize the LLM client
    # Read URLs from the file
    with open("bmurllist.txt", "r") as file:
        urls = file.readlines()
    # Get already processed URLs from the database
    processed_urls = set(record.url for record in session.query(URLRecord.url).all())
    broken_urls = []
    # Process each URL
    for url in urls:
        url = url.strip()
        if url and url not in processed_urls:
            success = process_url(llm_client, url)
            if not success:
                broken_urls.append(url)  # Add to broken URLs list if processing fails
            processed_urls.add(url)  # Add to processed URLs set
    # Write broken URLs to a file
    with open("broken_urls.txt", "w") as file:
        for url in broken_urls:
            file.write(url + "\n")


if __name__ == "__main__":
    main()
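
# To run (assumes bmurllist.txt exists alongside this script, one URL per
# line; URLs that fail to process are written to broken_urls.txt):
#   python app.py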