-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGetData.py
64 lines (53 loc) · 2.15 KB
/
GetData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import pandas as pd
import numpy as np
import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn.cluster import KMeans
class GetData:
    """Namespace of text-preprocessing helpers built on NLTK.

    Every method is static, so it can be called either on the class
    (``GetData.word_extract(text)``) or on an instance. Tokenizing uses
    ``nltk.word_tokenize``, stopwords come from
    ``nltk.corpus.stopwords.words('english')`` and stemming uses the
    English ``SnowballStemmer``.
    """

    # A token is kept only when it contains at least one ASCII letter;
    # this drops pure punctuation and pure numbers.
    _HAS_LETTER = re.compile('[a-zA-Z]')

    # Extra stopwords layered on top of NLTK's English list. Some entries
    # are already in stemmed form (e.g. "veri", "armi") because the lists
    # are also applied to the stemmer's output.
    _EPISODE_STOPWORDS = (
        "'s", "n't", "'m", "'d", "us", "would", "know", "one", "go", "want",
        "come", "like", "get", "veri", "well", "thing", "king", "re", "ve",
        "ever", "still",
    )
    _PARAGRAPH_STOPWORDS = (
        "'s", "n't", "'m", "'d", "us", "would", "know", "one", "go", "want",
        "come", "like", "king", "north", "army", "father", "need", "think",
        "armi", "dead", "back", "lord",
    )

    # extract keywords from episodes
    @staticmethod
    def word_extract(f):
        """Return the stemmed, stopword-filtered keywords of a whole text.

        Args:
            f: The text to process; it is passed to ``nltk.word_tokenize``.

        Returns:
            A flat list of lower-cased stems in token order. Stopwords are
            removed both before and after stemming.
        """
        tokens = nltk.word_tokenize(f)
        # A set gives O(1) membership tests; the filtering result is the
        # same as with the original list.
        stopwords = set(nltk.corpus.stopwords.words('english'))
        stopwords.update(GetData._EPISODE_STOPWORDS)
        stemmer = SnowballStemmer("english")

        has_letter = GetData._HAS_LETTER.search
        words = [t.lower() for t in tokens if has_letter(t)]
        words = [w for w in words if w not in stopwords]
        stems = [stemmer.stem(w).lower() for w in words]
        # Filter a second time: the stopword list also holds stemmed forms,
        # which can only match after stemming.
        return [s for s in stems if s not in stopwords]

    # extract keywords from paragraphs
    @staticmethod
    def para_data(f):
        """Return stemmed, stopword-filtered keywords for each paragraph.

        Paragraphs are the pieces of ``f`` separated by a blank line
        (``"\\n\\n"``). Pieces that consist of exactly one tab character
        are dropped; any other piece, including an empty one, produces an
        entry in the result.

        Args:
            f: The text to process.

        Returns:
            A list with one inner list per retained paragraph. Each inner
            list holds that paragraph's lower-cased stems in token order.
            Unlike ``word_extract``, stopwords are removed only after
            stemming.
        """
        stopwords = set(nltk.corpus.stopwords.words('english'))
        stopwords.update(GetData._PARAGRAPH_STOPWORDS)
        stemmer = SnowballStemmer("english")
        has_letter = GetData._HAS_LETTER.search

        paragraphs = [p for p in f.split("\n\n") if p != '\t']
        keywords = []
        for paragraph in paragraphs:
            # The original called a bare, never-imported ``word_tokenize``
            # here, which raised NameError; use the nltk-qualified name.
            tokens = [t for t in nltk.word_tokenize(paragraph) if has_letter(t)]
            stems = [stemmer.stem(t).lower() for t in tokens]
            keywords.append([s for s in stems if s not in stopwords])
        return keywords

    @staticmethod
    def most_100(f):
        """Return the 100 most common items of ``f`` with their counts.

        Args:
            f: Samples to count; passed directly to ``nltk.FreqDist``.

        Returns:
            The result of ``FreqDist.most_common(100)``: a list of
            ``(item, count)`` pairs, most frequent first.
        """
        return nltk.FreqDist(f).most_common(100)

    @staticmethod
    def unique(f):
        """Return the distinct items of ``f`` as a list.

        Args:
            f: Samples to de-duplicate; passed directly to ``nltk.FreqDist``.

        Returns:
            A list of the keys of the resulting frequency distribution.
        """
        return list(nltk.FreqDist(f).keys())