-
Notifications
You must be signed in to change notification settings - Fork 0
/
textasdata.R
148 lines (109 loc) · 5.15 KB
/
textasdata.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# load libraries
library(devtools)
devtools::install_github("bradleyboehmke/harrypotter")
library(harrypotter)
library(tidytext)
library(plyr)
library(tidyverse)
library(quanteda)
philosophers_stone_corpus <- corpus(philosophers_stone)
philosophers_stone_summary <- summary(philosophers_stone_corpus)
philosophers_stone_summary
# check for metadata; shouldn't see any
docvars(philosophers_stone_corpus)
# add an indicator for the book; this will be useful later when we add
# all the books together into a single corpus
philosophers_stone_summary$book <- "Philosopher's Stone"
# create a chapter indicator
philosophers_stone_summary$chapter <- as.numeric(str_extract(philosophers_stone_summary$Text, "[0-9]+"))
#Now we can assign these to the corpus as document-level metadata as follows
docvars(philosophers_stone_corpus) <- philosophers_stone_summary
docvars(philosophers_stone_corpus)
#look at only chapters with fewer than 5,000 tokens AKA WORDS.
small_corpus <- corpus_subset(philosophers_stone_corpus, Tokens < 5000)
summary(small_corpus)
# the number of documents (chapters) in our small corpus
ndoc(small_corpus)
# the command to reshape our corpus to the sentence level
small_corpus_sentences <- corpus_reshape(small_corpus, to = "sentences")
# the number of documents (sentences) in our reshaped corpus
ndoc(small_corpus_sentences)
# a summary of the first 5 texts in the sentence-level corpus
summary(small_corpus_sentences, n=5)
# the default breaks on white space
philosophers_stone_tokens <- tokens(philosophers_stone_corpus)
print(philosophers_stone_tokens)
# you can also drop punctuation
philosophers_stone_tokens <- tokens(philosophers_stone_corpus,
remove_punct = T)
print(philosophers_stone_tokens)
# as well as numbers
philosophers_stone_tokens <- tokens(philosophers_stone_corpus,
remove_punct = T,
remove_numbers = T)
print(philosophers_stone_tokens)
# check the use of "dumbledore"
kwic_dumbledore <- kwic(philosophers_stone_tokens,
pattern = c("dumbledore"))
# look at the first few uses
head(kwic_dumbledore)
# now look at a broader window of terms around "dumbledore"
kwic_dumbledore <- kwic(philosophers_stone_tokens,
pattern = c("dumbledore"),
window = 10)
# look at the first few uses
head(kwic_dumbledore)
# if you are more interested in phrases, then you can do that too using phrase()
kwic_phrase <- kwic(philosophers_stone_tokens,
pattern = phrase("daily prophet"))
head(kwic_phrase)
# list out the object (book) names that we need
myBooks <- c("philosophers_stone", "chamber_of_secrets", "prisoner_of_azkaban", "goblet_of_fire", "order_of_the_phoenix",
"half_blood_prince", "deathly_hallows")
# create loop.
for (i in 1:length(myBooks)){
# create corpora
corpusCall <- paste(myBooks[i],"_corpus <- corpus(",myBooks[i],")", sep = "")
eval(parse(text=corpusCall))
# change document names for each chapter to include the book title. If you don't do this, the document names will be duplicated and you'll get an error.
namesCall <- paste("tmpNames <- docnames(",myBooks[i],"_corpus)", sep = "")
eval(parse(text=namesCall))
bindCall <- paste("docnames(",myBooks[i],"_corpus) <- paste(\"",myBooks[i],"\", tmpNames, sep = \"-\")", sep = "")
eval(parse(text=bindCall))
# create summary data
summaryCall <- paste(myBooks[i],"_summary <- summary(",myBooks[i],"_corpus)", sep = "")
eval(parse(text=summaryCall))
# add indicator
bookCall <- paste(myBooks[i],"_summary$book <- \"",myBooks[i],"\"", sep = "")
eval(parse(text=bookCall))
# add chapter indicator
chapterCall <- paste(myBooks[i],"_summary$chapter <- as.numeric(str_extract(",myBooks[i],"_summary$Text, \"[0-9]+\"))", sep = "")
eval(parse(text=chapterCall))
# add meta data to each corpus
metaCall <- paste("docvars(",myBooks[i],"_corpus) <- ",myBooks[i],"_summary", sep = "")
eval(parse(text=metaCall))
}
# once the loop finishes up, check to make sure you've created what you want
docvars(deathly_hallows_corpus)
# create combined corpora of the first 7 Harry Potter books.
harry_potter_corpus <- c(philosophers_stone_corpus, chamber_of_secrets_corpus, prisoner_of_azkaban_corpus, goblet_of_fire_corpus, order_of_the_phoenix_corpus,
half_blood_prince_corpus, deathly_hallows_corpus)
summary(harry_potter_corpus)
# check the number of documents (here, total chapters in the 7 books)
ndoc(harry_potter_corpus)
# check the total length of the text (i.e., the total word count)
sum(ntoken(harry_potter_corpus))
# check the total length of the text (i.e., the total word count)
max(ntoken(harry_potter_corpus))
# check the use of "hagrid"
kwic_hagrid <- kwic(philosophers_stone_tokens,
pattern = c("hagrid"))
# look at the first few uses
head(kwic_hagrid)
# now look at a broader window of terms around "hagrid"
kwic_hagrid <- kwic(philosophers_stone_tokens,
pattern = c("hagrid"),
window = 10)
# look at the first few uses
head(kwic_hagrid)
hist(ntoken(harry_potter_corpus))