-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathrecitation_1.R
116 lines (58 loc) · 2.31 KB
/
recitation_1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
####Recitation 1: Introduction to the course
## Examples taken wholesale from Ken Benoit's NYU Dept. of Politics short course Fall 2014
## Avaliable on his website: www.kenbenoit.net
## be sure to install the latest version from GitHub, using dev branch:
devtools::install_github("kbenoit/quanteda")
## and quantedaData
devtools::install_github("kbenoit/quantedaData")
library(quanteda)
##start with a short document
sampletxt <- "The police with their policing strategy instituted a policy of general
iterations at the Data Science Institute."
#Let's tokenize
tokens <- tokenize(sampletxt)
?tokenize
tokens <- tokenize(sampletxt, removePunct=TRUE)
## stemming examples
stems <- wordstem(tokens)
?wordstem
# vector representations
data("SOTUCorpus", package = "quantedaData")
last_speech_text <- SOTUCorpus[ndoc(SOTUCorpus)]
# same as
last_speech_text <- texts(SOTUCorpus)[ndoc(SOTUCorpus)]
obama_dfm <- dfm(last_speech_text)
?dfm
##stopwords example
stopwords("english")
obama_dfm <- dfm(last_speech_text, ignoredFeatures = stopwords("english"))
###full matrix of all speeches
full_dfm <- dfm(SOTUCorpus, ignoredFeatures = stopwords("english"))
topfeatures(full_dfm)
plot(full_dfm, min.freq = 30)
##weighting--tfidf
weighted_dfm <- tfidf(full_dfm)
topfeatures(weighted_dfm)
##Think about what this means!
normalized <- tfidf(full_dfm, normalize=TRUE)
topfeatures(normalized)
## constructing bigrams
collocations(last_speech_text)
collocations(last_speech_text, size=3)
## detect collocations in overall
colloc <- collocations(last_speech_text)
## remove any collocations containing a word in the stoplist
# isStopwordList <- lapply(colloc$word1, `%in%`, stopwords("english"))
# stopwordindex <- which(colloc$word1 %in% stopwords("english")| colloc$word2 %in% stopwords("english"))
# colloc[-stopwordindex] # collocations not containing stopwords
removeFeatures(colloc, stopwords("english"))
##now let's take a little look at regular expressions
# regular expressions are a very powerful tool in wrangling text
# not a focus of this class, but something to be aware of
?regex
s_index <- grep(" s ", texts(SOTUCorpus))
?grep
# this returns every speech that contains " s "
grep(" s ", texts(SOTUCorpus), value=TRUE)
s_index <- grep(" s ", texts(SOTUCorpus))
no_s <- gsub(" s ", "", SOTUCorpus[s_index])