-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
49 lines (38 loc) · 1.52 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import sys
from clustering import cluster_embeddings, hierarchical_clustering
from config import load_config
from extraction import get_embeddings, log_extraction_stats
from get_links import get_links
from report import generate_cluster_report
def main():
    """Run the link-clustering pipeline from configuration to report.

    Steps, in order:
      1. Load the configuration and read the links file it names
         (falling back to ``links.txt``).
      2. Stop early, after printing a notice, when no URLs were found.
      3. Fetch embeddings and texts for the URLs and log extraction stats.
      4. Cluster the embeddings with HDBSCAN and with hierarchical
         clustering, printing both label sets.
      5. Write a report for the hierarchical labels only.

    Returns:
        None.
    """
    settings = load_config()  # Loads config.json (per the original note)
    links_path = settings.get("LINKS_FILE", "links.txt")

    # Keyword arguments forwarded unchanged to get_embeddings.
    fetch_options = {
        "max_workers": 10,
        "rate_limiting_domains": settings.get("RATE_LIMITING_DOMAINS", []),
        "ignore_domains": settings.get("IGNORE_DOMAINS", []),
    }

    links = get_links(links_path)
    if not links:
        print("No URLs found. Exiting.")
        return

    print("Fetching content and computing embeddings in parallel...")
    vectors, documents = get_embeddings(links, **fetch_options)
    log_extraction_stats()

    print("Clustering with HDBSCAN...")
    density_labels = cluster_embeddings(
        vectors, min_cluster_size=2, min_samples=1
    )
    print("HDBSCAN Cluster labels:", density_labels)

    print("Clustering with Hierarchical Clustering...")
    tree_labels = hierarchical_clustering(vectors, distance_threshold=1.5)
    print("Hierarchical Cluster labels:", tree_labels)

    print("Generating cluster reports...")
    # Only the hierarchical result is written to disk; the HDBSCAN labels
    # are printed above but their report is currently disabled.
    generate_cluster_report(
        links,
        documents,
        tree_labels,
        output_file="hierarchical_cluster_report.txt",
    )
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()