-
Notifications
You must be signed in to change notification settings - Fork 1
/
tfidf.py
186 lines (162 loc) · 5.35 KB
/
tfidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import os
from collections import defaultdict
import numpy as np
# Substrings of interest: each entry is counted, per .js file, with str.count
# inside get_term_frequencies and compared per entry by compare_frequencies.
# Matching is plain substring counting, not tokenised API detection, so an
# entry that contains another entry (e.g. "window.location.href" contains
# "window.location") contributes to both counts with a single occurrence.
terms = [
"socialSignup",
"facebook",
"twitter",
"google",
"jscrypto",
".connected",
"storage.set",
"storage.get",
"storage.sync.get",
"storage.sync.set",
"cookies.set",
"cookies.get",
"browser.cookies",
"window.location",
"window.height",
"window.width",
"navigator.userAgent",
"isChrome",
"isFirefox",
"__REACT_DEVTOOLS_GLOBAL_HOOK__",
"chrome.devtools.network",
"chrome.storage.sync.get('visitorId')",
"window.location.href",
"/recaptcha/api2/",
"CaptchaMessage",
"getUserMedia",
"AudioContext",
"addEventListener('keyup')",
"addEventListener('click')",
"notificationMsg",
"notification",
"options",
"click",
"settings",
".filter",
"url.search",
"window.addEventListener(DOMContentLoaded",
"fetch",
"math.random",
]
# Function to traverse directories and count term frequencies
def get_term_frequencies(directory, search_terms=None):
    """Count case-insensitive occurrences of each term in the ``.js`` files under *directory*.

    The tree rooted at *directory* is walked recursively. Every file whose
    name ends in ``.js`` is read as UTF-8 (undecodable bytes are ignored),
    lowercased, and searched for each term with ``str.count`` (non-overlapping
    substring occurrences).

    Args:
        directory: Root directory to walk.
        search_terms: Iterable of substrings to count. When ``None`` (the
            default) the module-level ``terms`` list is used.

    Returns:
        A ``defaultdict(int)`` keyed by each term exactly as given (original
        casing), mapping to its total count over all readable ``.js`` files.
        Terms are only inserted once at least one file has been read; missing
        keys read as 0.

    Files that cannot be opened or read are reported on stdout and skipped.
    """
    if search_terms is None:
        search_terms = terms

    # The file content is lowercased below, so each term must be lowercased as
    # well; otherwise any term containing an uppercase letter (for example
    # "navigator.userAgent") could never match. The original spelling is kept
    # as the dictionary key so callers can look results up by the same strings.
    needles = [(term, term.lower()) for term in search_terms]

    term_counts = defaultdict(int)
    for root, _dirs, files in os.walk(directory):
        for file_name in files:
            if not file_name.endswith(".js"):
                continue
            file_path = os.path.join(root, file_name)
            # Keep the try body to the I/O that can actually fail.
            try:
                with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                    content = f.read().lower()
            except OSError as e:
                print(f"Error reading {file_path}: {e}")
                continue
            for term, needle in needles:
                term_counts[term] += content.count(needle)
    return term_counts
# Function to compare frequencies
def compare_frequencies(freq_a, freq_b, search_terms=None):
    """Count how many terms are strictly more frequent in one mapping than the other.

    Args:
        freq_a: Mapping from term to count for the first corpus.
        freq_b: Mapping from term to count for the second corpus.
        search_terms: Iterable of terms to compare. When ``None`` (the
            default) the module-level ``terms`` list is used.

    Returns:
        A tuple ``(more_in_a, more_in_b)``: the number of terms whose count is
        strictly greater in *freq_a*, and the number strictly greater in
        *freq_b*. Terms with equal counts contribute to neither.
    """
    if search_terms is None:
        search_terms = terms

    more_in_a = 0
    more_in_b = 0
    for term in search_terms:
        # .get(term, 0) treats a missing term as zero without raising KeyError
        # on plain dicts and without inserting new keys into defaultdict inputs.
        count_a = freq_a.get(term, 0)
        count_b = freq_b.get(term, 0)
        if count_a > count_b:
            more_in_a += 1
        elif count_b > count_a:
            more_in_b += 1
    return more_in_a, more_in_b
# The two corpora of JavaScript files to compare (absolute paths).
dir_a = "/home/npantel/vv8-crawler-slim-v5/celery_workers/vv8_worker/vv8_crawler/merged_folder_6_parts"
dir_b = "/home/npantel/vv8-crawler-slim-v5/celery_workers/vv8_worker/vv8_crawler/ALL_EXTENSIONS1k"

# Gather per-term counts for each corpus, A first and then B.
frequencies_a, frequencies_b = map(get_term_frequencies, (dir_a, dir_b))

# Tally, per term, which corpus has the strictly higher count, then report.
more_in_a, more_in_b = compare_frequencies(frequencies_a, frequencies_b)
print(f"Terms more frequent in Directory A: {more_in_a}")
print(f"Terms more frequent in Directory B: {more_in_b}")
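# ---------------------------------------------------------------------------
# Disabled code: everything from here to the end of the file is commented out
# and does not run. It is a variant of the script above that uses
# scikit-learn's TfidfVectorizer instead of raw substring counts.
# ---------------------------------------------------------------------------
# import os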
# import glob
# from collections import defaultdict
# from sklearn.feature_extraction.text import TfidfVectorizer
# # List of JavaScript APIs to track
# javascript_apis = [
# "socialSignup",
# "facebook",
# "twitter",
# "google",
# "jscrypto",
# ".connected",
# "storage.set",
# "storage.get",
# "storage.sync.get",
# "storage.sync.set",
# "cookies.set",
# "cookies.get",
# "browser.cookies",
# "window.location",
# "window.height",
# "window.width",
# "navigator.userAgent",
# "isChrome",
# "isFirefox",
# "__REACT_DEVTOOLS_GLOBAL_HOOK__",
# "chrome.devtools.network",
# "chrome.storage.sync.get('visitorId')",
# "window.location.href",
# "/recaptcha/api2/",
# "CaptchaMessage",
# "getUserMedia",
# "AudioContext",
# "addEventListener('keyup')",
# "addEventListener('click')",
# "notificationMsg",
# "notification",
# "options",
# "click",
# "settings",
# ".filter",
# "url.search",
# "window.addEventListener(DOMContentLoaded",
# "fetch",
# "math.random",
# ]
# # Function to read all .js files in a directory (recursively)
# def read_js_files(directory):
# all_texts = []
# for root, dirs, files in os.walk(directory):
# for file in files:
# if file.endswith(".js"):
# file_path = os.path.join(root, file)
# try:
# with open(file_path, "r", encoding="utf-8") as file:
# all_texts.append(file.read())
# except:
# continue
# return all_texts
# # Function to compute TF-IDF
# def compute_tfidf(texts):
# vectorizer = TfidfVectorizer(vocabulary=javascript_apis, stop_words="english")
# tfidf_matrix = vectorizer.fit_transform(texts)
# feature_names = vectorizer.get_feature_names_out()
# return tfidf_matrix, feature_names
# # Main function to process directories
# def process_directories(dir_a, dir_b):
# texts_a = read_js_files(dir_a)
# texts_b = read_js_files(dir_b)
# # Compute TF-IDF for directory A and B
# tfidf_a, feature_names = compute_tfidf(texts_a)
# tfidf_b, _ = compute_tfidf(texts_b)
# # Get dense representation for comparison
# dense_a = tfidf_a.todense()
# dense_b = tfidf_b.todense()
# # Display TF-IDF results
# print("TF-IDF for Directory A:")
# print(dense_a)
# print("\nTF-IDF for Directory B:")
# print(dense_b)
# # Specify the paths to your directories
# dir_a = "/home/npantel/vv8-crawler-slim-v5/celery_workers/vv8_worker/vv8_crawler/merged_folder_6_parts"
# dir_b = "/home/npantel/vv8-crawler-slim-v5/celery_workers/vv8_worker/vv8_crawler/ALL_EXTENSIONS1k"
# # Process both directories
# process_directories(dir_a, dir_b)