-
Notifications
You must be signed in to change notification settings - Fork 19
/
recitation_4.R
137 lines (83 loc) · 3.9 KB
/
recitation_4.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
####Recitation 4
## Some Examples taken wholesale from Ken Benoit's NYU Dept. of Politics short course Fall 2014
## Available on his website: www.kenbenoit.net
##load data
library(quanteda)
library(quantedaData)
## Read in the Conservative manifestos: one text file per manifesto.
## NOTE: this path is specific to the original author's machine; point it at
## your own copy of the "cons" folder before running.
setwd("C:/Users/kevin/Documents/GitHub/Text_as_Data/cons")
## List every file in the working directory. With full.names = TRUE and the
## default path, each entry comes back as "./<file name>".
files <- list.files(full.names = TRUE)
## Read each file and collapse its lines into a single string per document.
text <- unlist(lapply(
  files,
  function(path) paste(readLines(path), collapse = " ")
))
## Name the data: strip the leading "./Con" and the trailing ".txt" so that
## only the part in between is left. The dots are escaped and the patterns
## anchored so that "." cannot match an arbitrary character elsewhere in the
## file name.
files <- sub("^\\./Con", "", files)
files <- sub("\\.txt$", "", files)
# don't use this exact line--think about how it's different
## year is whatever remains of the file name, converted to a number (NA, with
## a warning, if that remainder is not numeric); text is the document string.
man_df <- data.frame(
  year = as.numeric(files),
  text = text,
  stringsAsFactors = FALSE
)
## Basic regular-expression demos.
## grep() reports the positions of the elements that start with "vot" ...
grep(
  pattern = "^vot",
  x = c("voting", "votes", "devoted", "vote")
)
## ... while grepl() returns one TRUE/FALSE per element.
grepl(
  pattern = "^vot",
  x = c("voting", "votes", "devoted", "vote")
)
## Replace patterns: insert "19" in front of two digits that follow a
## word character and a hyphen.
gsub(
  pattern = "(\\w)-(\\d{2,2})",
  replacement = "\\1-19\\2",
  x = c("Roosevelt-33", "Roosevelt-37", "Obama-2003")
)
## Observe the difference the $ makes: the two digits must now sit at the
## very end of the string, so "Obama-2003" is left alone.
gsub(
  pattern = "(\\w)-(\\d{2,2})$",
  replacement = "\\1-19\\2",
  x = c("Roosevelt-33", "Roosevelt-37", "Obama-2003")
)
## using dfm to keep only certain terms
## Keep only certain words.
## The text is passed to dfm() directly as a character vector, the same way
## testTweets is below. The earlier reference to `testCorpus` pointed at an
## object that is never created in this script.
testText <- "The quick brown fox named Seamus jumps over the lazy dog also named Seamus, with
the newspaper from a a boy named Seamus, in his mouth."
## keptFeatures takes a regular expression; "s$" keeps only words ending in "s".
## The bare TRUE is a positional argument to print() -- presumably show.values,
## as in the named print() call in the compound-words example; confirm against
## your quanteda version.
print(dfm(testText, keptFeatures = "s$"), TRUE)
testTweets <- c("My homie @justinbieber #justinbieber getting his shopping on in #LA yesterday #beliebers",
                "To all the haters including my brother #justinbieber #justinbiebermeetcrystaltalley #emabiggestfansjustinbieber",
                "Justin Bieber #justinbieber #belieber #kidrauhl #fetusjustin #EMABiggestFansJustinBieber")
## "^#" keeps only features that start with "#", i.e. hashtags.
print(dfm(testTweets, keptFeatures = "^#"), TRUE)
## The same kind of feature filter, applied to a packaged corpus.
library(quantedaData)
data("ie2010Corpus")
## One regular expression with three alternatives: anything containing "tax"
## or "budg", or anything starting with "auster".
ieDfm <- dfm(ie2010Corpus, keptFeatures = "tax|budg|^auster")
## Open the resulting matrix in the data viewer.
View(ieDfm)
## Compound words: a dictionary entry may be a multi-word phrase.
mytexts <- c(
  "The new law included a capital gains tax, and an inheritance tax.",
  "New York City has raised a taxes: an income tax and a sales tax."
)
## A single dictionary key, "tax", listing the bare word and three phrases.
mydict <- list(
  tax = c("tax", "income tax", "capital gains tax", "inheritance tax")
)
## Build the dfm against the dictionary and print it with its values shown.
print(
  dfm(mytexts, dictionary = mydict),
  show.values = TRUE
)
## Laver Garry dictionary
## NOTE: this path is specific to the original author's machine; it must be
## the folder that holds LaverGarry.cat.
setwd("C:/Users/kevin/Documents/GitHub/Text_as_Data")
lgdict <- dictionary(file = "LaverGarry.cat", format = "wordstat")
## Apply the dictionary to the manifesto texts. The texts live in man_df
## (built above); a bare `df` would refer to the stats::df function, not to
## the data. The dfm is built once and reused for head() and the plots.
dic_dfm <- dfm(man_df$text, dictionary = lgdict)
head(dic_dfm)
## list the dictionary categories that became columns
features(dic_dfm)
## Plot it: one dictionary category per manifesto year.
plot(man_df$year,
     dic_dfm[, "CULTURE.SPORT"],
     xlab = "Year", ylab = "SPORTS", type = "b", pch = 19)
plot(man_df$year,
     dic_dfm[, "VALUES.CONSERVATIVE"],
     xlab = "Year", ylab = "Conservative values", type = "b", pch = 19)
## net score: conservative-institutions column minus the radical one
plot(man_df$year,
     dic_dfm[, "INSTITUTIONS.CONSERVATIVE"] - dic_dfm[, "INSTITUTIONS.RADICAL"],
     xlab = "Year", ylab = "Net Conservative Institutions", type = "b", pch = 19)
## RID Dictionary -- Regressive Imagery Dictionary
## NOTE: this path is specific to the original author's machine; it must be
## the folder that holds RID.cat.
setwd("C:/Users/kevin/Documents/GitHub/Text_as_Data")
rid_dict <- dictionary(file = "RID.cat", format = "wordstat")

## Load the State of the Union corpus and pull out its texts plus the Date and
## President columns of its document table.
data("SOTUCorpus")
sotus <- texts(SOTUCorpus)
year <- SOTUCorpus$documents$Date
pres <- SOTUCorpus$documents$President

## Apply the dictionary: peek at the first rows, then keep the full dfm.
head(dfm(sotus, dictionary = rid_dict))
dic_dfm <- dfm(sotus, dictionary = rid_dict)
features(dic_dfm)

## One plot per dictionary category, against the speech date.
plot(
  year, dic_dfm[, "PRIMARY.REGR_KNOL.NARCISSISM"],
  xlab = "Year", ylab = "Narcissism", type = "b", pch = 19
)
plot(
  year, dic_dfm[, "EMOTIONS.ANXIETY._"],
  xlab = "Year", ylab = "Anxiety", type = "b", pch = 19
)
plot(
  year, dic_dfm[, "EMOTIONS.AGGRESSION._"],
  xlab = "Year", ylab = "Aggression", type = "b", pch = 19
)
## "Icarian-ness" is the sum of six PRIMARY.ICARIAN_IM.* columns, added
## left to right in the order listed.
plot(
  year,
  Reduce(
    `+`,
    lapply(
      c(
        "PRIMARY.ICARIAN_IM.FIRE",
        "PRIMARY.ICARIAN_IM.ASCEND",
        "PRIMARY.ICARIAN_IM.DESCENT",
        "PRIMARY.ICARIAN_IM.DEPTH",
        "PRIMARY.ICARIAN_IM.HEIGHT",
        "PRIMARY.ICARIAN_IM.WATER"
      ),
      function(key) dic_dfm[, key]
    )
  ),
  xlab = "Year", ylab = "Icarian-ness", type = "b", pch = 19
)