-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeature.py
32 lines (27 loc) · 1015 Bytes
/
feature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import numpy as np # linear algebra
import pandas as pd # data processing
import os
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from translator import lang_translate
# nltk.data.path.append('C:/fake_news_detection/venv/lib/site-packages/nltk/data.py')
# nltk.download()
def get_all_query(title):
    """Return a new one-element list whose only item is ``title``."""
    return [title]
def remove_punctuation_stopwords_lemma(sentence):
    """Strip URLs and English stop words from ``sentence`` and lemmatize it.

    Processing steps, in order:

    1. ``lang_translate(sentence)`` is called (its return value is ignored).
    2. Every ``http://`` / ``https://`` URL is removed.
    3. The text is tokenized with ``nltk.word_tokenize``.
    4. Tokens found in NLTK's English stop-word list are dropped.
    5. Each remaining token is lemmatized with ``WordNetLemmatizer`` and
       then lower-cased.

    Despite the function name, punctuation is NOT removed: that step was
    disabled in the original code, so punctuation tokens are kept.

    Args:
        sentence: The raw text to clean.

    Returns:
        The processed tokens, each preceded by a single space (so a
        non-empty result starts with a space). An empty string is returned
        when no token survives the filtering.
    """
    # google translate if language is not english
    # NOTE(review): the result of lang_translate() is discarded, so the
    # text processed below is always the untranslated input. Presumably the
    # translated text was meant to replace `sentence` -- confirm what
    # lang_translate returns before assigning it.
    lang_translate(sentence)

    # A set gives O(1) membership tests instead of scanning a list per token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    sentence = re.sub(r'https?://\S+', '', sentence, flags=re.MULTILINE)
    tokens = nltk.word_tokenize(sentence)  # tokenization

    # NOTE(review): the stop-word test runs before lower-casing, so the
    # comparison is case-sensitive. Left as-is to keep the output unchanged
    # for existing callers.
    kept = [token for token in tokens if token not in stop_words]

    # Build the result in one pass; every token keeps its leading space to
    # match the historical output format.
    return ''.join(' ' + lemmatizer.lemmatize(token).lower() for token in kept)