forked from DSCTOCE/Algorithms
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Word Frequency Functions
40 lines (28 loc) · 1.26 KB
/
Word Frequency Functions
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import string
from math import log10
def term_frequency(term: str, document: str) -> int:
# strip all punctuation and newlines and replace it with ''
document_without_punctuation = document.translate(
str.maketrans("", "", string.punctuation)
).replace("\n", "")
tokenize_document = document_without_punctuation.split(" ") # word tokenization
return len([word for word in tokenize_document if word.lower() == term.lower()])
def document_frequency(term: str, corpus: str) -> int:
corpus_without_punctuation = corpus.lower().translate(
str.maketrans("", "", string.punctuation)
) # strip all punctuation and replace it with ''
docs = corpus_without_punctuation.split("\n")
term = term.lower()
return (len([doc for doc in docs if term in doc]), len(docs))
def inverse_document_frequency(df: int, N: int, smoothing=False) -> float:
if smoothing:
if N == 0:
raise ValueError("log10(0) is undefined.")
return round(1 + log10(N / (1 + df)), 3)
if df == 0:
raise ZeroDivisionError("df must be > 0")
elif N == 0:
raise ValueError("log10(0) is undefined.")
return round(log10(N / df), 3)
def tf_idf(tf: int, idf: int) -> float:
return round(tf * idf, 3)