## 4/17: Collaboration mini project II: data manipulation
### PART I ###
rm(list = ls()) # Clear the workspace
### Installation ###
# Many packages! Plus some explanation:
library(dplyr) # Data wrangling, glimpse() and tbl_df()
library(fs) # File management functions
library(lubridate) # Dates and times
library(magrittr) # Pipes %>%, %T>% and equals(), extract()
library(maps) # Maps for US cities
library(mosaic) # favstats and other summary functions
library(ncdf4) # NetCDF manipulation
library(stringi) # More strings
library(stringr) # String operations
library(tibble) # Convert row names into a column
library(tidyr) # Prepare a tidy dataset, use gather()
library(tidytext) # Tidying text data for analysis
library(tidyverse) # All tidyverse packages
# Hint: use install.packages(c('pkg1', 'pkg2')) etc. to install missing ones.
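# A minimal sketch of that idea (it assumes the package names listed above):
pkgs <- c("dplyr", "fs", "lubridate", "magrittr", "maps", "mosaic", "ncdf4",
          "stringi", "stringr", "tibble", "tidyr", "tidytext", "tidyverse")
missing <- pkgs[!pkgs %in% rownames(installed.packages())]
if (length(missing) > 0) install.packages(missing)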
### PART II ###
## Load the data
Abc7 <- read.csv("abc7ny.csv")
Kcra <- read.csv("kcra.csv")
## Inspect the data
?glimpse # In {tibble} (re-exported by dplyr)
Abc7 %>% glimpse()
# You can also just use glimpse(Abc7), but getting comfy with pipes is good!
# Now inspect your Kcra data.
# Double-check that the column names are identical between the files
base::identical(names(Abc7), names(Kcra))
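# If they ever differ, base R's setdiff() shows what's missing from each side:
setdiff(names(Abc7), names(Kcra)) # Columns only in Abc7
setdiff(names(Kcra), names(Abc7)) # Columns only in Kcra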
?dim
?class() # You may want to use these functions sometimes
## Combine the datasets
?dplyr::bind_rows() # Top-to-bottom merge of two data frames
?dplyr::bind_cols() # Side-by-side merge of two data frames
# Note that the :: format above means package::function.
# You can equivalently write:
?bind_rows()
?bind_cols()
# Which of these makes sense for combining these files?
# bind_cols() errors here: the two frames have different row counts,
# and bind_cols() has no .id argument.
# Merge <- dplyr::bind_cols(Abc7, Kcra) # Side-by-side: not what we want
Merge <- dplyr::bind_rows(Abc7, Kcra, .id = "data_id")
# Warnings about column type coercion are ok, don't worry
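# To see what .id does, here is a tiny self-contained illustration
# (toy data frames, not our news data):
toy <- dplyr::bind_rows(data.frame(x = 1:2), data.frame(x = 3:4), .id = "data_id")
toy # data_id is "1" for rows from the first frame, "2" for the second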
# Check the dimensions
Merge %>%
dplyr::count(data_id) # A tibble
head(Merge) # head(tibble) is a bit different from head(df)
Merge %>%
glimpse()
# The somewhat dramatic definition of a tibble is on your worksheet
## Manipulating strings
# I'm taking the first 5 lines of column 'headline'
# I'll put that data into an object (headlines_var):
headlines_var <- Merge %>%
dplyr::select(headline) %>%
  utils::head(5) # head() lives in {utils}, not {Matrix}
# {base} string functions operate on character vectors,
# so we need to ensure we are working with characters:
is.character(headlines_var) # Not a character...
typeof(headlines_var)
# We can convert a list to a character vector via unlist():
headlines_var <- unlist(headlines_var)
is.character(headlines_var) # Worked!
# Examine the vector:
headlines_var %>% utils::str()
## Remove characters
sub(pattern = "-", replacement = " ", x = headlines_var)
# Here, base::sub() replaces only the first instance of - in each string with a blank space " ".
# To replace ALL instances of -, use gsub():
gsub(pattern = "-", replacement = " ", x = headlines_var)
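# A toy illustration of the difference (a made-up string, not our headlines):
sub("-", " ", "a-b-c")  # "a b-c" -- first match only
gsub("-", " ", "a-b-c") # "a b c" -- all matches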
## Combine characters
# To collapse strings or "stick" them together, use paste():
?paste()
paste(headlines_var, collapse = "; ")
# Or cat(), which concatenates and prints:
?cat()
cat(headlines_var, sep = ", ")
# Or noquote():
noquote(headlines_var)
## {stringr}
# Hadley Wickham, Chief Scientist at RStudio, developed this handy package.
# {stringr} is a package I have relied on a lot lately, so this is only a preview.
# Change cases
Merge %>%
mutate(headline_low = str_to_lower(headline)) %>%
select(headline, headline_low) %>%
head()
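# {stringr} has a whole family of case helpers; for instance:
stringr::str_to_title("breaking news from sacramento") # "Breaking News From Sacramento"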
# You can also use {stringr} to search for word occurrences:
?dplyr::mutate()
Merge %>%
  dplyr::mutate(teaser_3_words = stringr::word(teaser, 1, 3)) %>% # Bare column names work inside mutate()
  count(teaser_3_words, sort = TRUE) %>% # Again, note the package::function format
head(10)
## Joining new data
# We've explored cbind(), rbind(), and merge(). Here's some advanced joining.
UsCity <- maps::us.cities # Using built-in data from package {maps}
UsCity %>% glimpse()
UsCity %>% dplyr::distinct(name) %>% nrow() # What is this telling us?
# Next up, we are touching on "regular expressions", defined as:
# "pattern[s] which specifies a set of strings of characters"
# They are commonly also defined as "a nightmare". Ha.
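# A tiny taste before we use them in earnest (made-up strings for illustration):
stringr::str_detect(c("new york rally", "storm watch"), pattern = "york|storm")
# [1] TRUE TRUE -- in a regular expression, | means OR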
UsCity <- UsCity %>% # Reassign our mutated UsCity df to an identically named df
mutate(
city_id = stringr::str_replace_all( # This is my favorite {stringr} function
      string = name, # Bare column names suffice inside mutate()
      pattern = country.etc,
replacement = ""
),
city_id = stringr::str_trim(city_id), # Trim whitespace
city_id = stringr::str_to_lower(city_id) # Change to lowercase
)
UsCity %>% glimpse()
# Collapse the IDs into one regular-expression pattern, alternated with |
# (note that the spaces around | become part of the pattern)
city_id_vec <- unlist(UsCity$city_id)
city_id_vec <- paste(city_id_vec, sep = "", collapse = " | ")
# Turn everything to lowercase:
Merge <- Merge %>%
mutate(headline = stringr::str_to_lower(headline))
Merge %$% head(headline, 1) # The exposition pipe %$% exposes Merge's columns, so head() sees headline directly
?"%$%"
# Now we want to see which headlines in Merge contain cities from "city_id_vec"
# We ask stringr's str_detect() to operate inside dplyr's filter()
# These functions check for matches according to the pattern = argument.
MapMerge <- Merge %>%
filter(stringr::str_detect(string = headline,
pattern = city_id_vec)) %>%
dplyr::select(headline,
dplyr::everything())
MapMerge %>% dplyr::glimpse()
# We now have 2322 matching, filtered observations.
# To join to UsCity, we create a city_id variable in MapMerge using str_extract()
MapMerge <- MapMerge %>%
  mutate(
    city_id = str_extract(
      string = headline,
      pattern = city_id_vec # Already a single |-separated pattern
    ),
    city_id = stringr::str_trim(city_id)
  )
MapMerge %>%
count(city_id, sort = TRUE) %>%
head() # Hello, tibble
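# A toy str_extract() illustration (made-up string and pattern):
stringr::str_extract("fire crews respond in sacramento", pattern = "sacramento|houston")
# [1] "sacramento" -- the first match, or NA if nothing matches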
# Using inner_join() from dplyr, I join the two dfs:
?inner_join()
MapMerge <- inner_join(MapMerge, UsCity, by = "city_id")
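# A toy illustration of inner_join() keeping only keys present in BOTH frames
# (made-up values, not our data):
left  <- data.frame(city_id = c("sacramento", "houston"), n = c(2, 1))
right <- data.frame(city_id = c("sacramento", "miami"), pop = c(500000, 450000))
inner_join(left, right, by = "city_id") # Only the "sacramento" row survives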
# This is the end of this tutorial. Save your edits and return to the worksheet.
### PART III ###
## Dates and times
# Getting R to recognize date/time entry as I intended has taken MONTHS of my life.
# Save yourself those months with this tutorial.
# First up, package {lubridate}, a magical contribution by Garrett Grolemund and Hadley Wickham:
ymd('20180417')
mdy('04-18-2018')
dmy('18/04/2018')
# {lubridate} nicely handles various formats and separators.
arrive <- ymd_hms("2018-04-18 12:00:00", tz = "Pacific/Honolulu")
arrive
leave <- ymd_hms("2018-04-18 14:00:00", tz = "Pacific/Honolulu")
leave
## Setting and extracting
second(arrive)
second(arrive) <- 25 # Change the arrival time
arrive
second(arrive) <- 0 # Reset it
wday(arrive) # Returns the day of the week as a number (or an ordered factor if label = TRUE)
wday(arrive, label = TRUE)
## Time Zones
defense <- ymd_hms("2018-05-04 15:30:00", tz = "Pacific/Honolulu")
with_tz(defense, "America/New_York") # What time is the defense in NYC?
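# Contrast with force_tz(), which keeps the clock time and swaps the zone label:
force_tz(defense, "America/New_York") # Same 15:30 reading, now labeled Eastern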
## Time Intervals
hnl <- interval(arrive, leave) # How long will we be in Honolulu?
hnl
hnl <- arrive %--% leave # The %--% interval operator: another way to build an interval
?"%--%"
hnl
# How long was the visit to Honolulu?
hnl / ddays(1)
hnl / dhours(1)
hnl / dminutes(1)
hnl / dminutes(2) # How many two-minute spans fit in the visit?
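# Related: {lubridate}'s %within% tests whether an instant falls inside an interval
ymd_hms("2018-04-18 13:00:00", tz = "Pacific/Honolulu") %within% hnl # TRUE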
## Handling NetCDF libraries
# Note that I use "libraries" here to refer to the *software* type, not an R library.
# NetCDF = Network Common Data Form
# Do you ever use NOAA or UCAR data? You need to know how to examine a NetCDF file.
nc = nc_open('yourpathway/file.nc') # Hint: Style tip #8
str(nc) # Examine nc file structure
# Okay, we see there are 4 dimensions (labeled: $ dimid: int [1:4(1d)] 0 1 2 3)
# There are other pieces here too: coordinate info, units, the variable of interest...
v1 = nc$var[[1]] # Select the variable you'll extract time from
v1
# Now you'll need to index into the dimensions of v1. Try this:
v1$dim[[1]] # Doesn't look like time. What about...
v1$dim[[2]] # Keep going...
v1$dim[[3]] # Here is our depth variable, identified with $name up top...
v1$dim[[4]] # Here's time.
?as.POSIXlt()
dates<-as.POSIXlt(v1$dim[[4]]$vals, origin = "1970-01-01", tz = "UTC") # Hint: Style tip #5
dates # Verify the date
dat <- ncvar_get(nc, v1) # Hint: Style tip #9
dim(dat)
# 139 rows (lon) and 38 col (lat)
lon <- v1$dim[[1]]$vals # dim[[1]] is lon, verified above with v1$dim[[1]]
lat = v1$dim[[2]]$vals # dim[[2]] is lat; also, edit this line, delete this comment
# Repeat each lat value 139 times (to match with lon)
latrep <- rep(lat, each = 139)
latrepdf <- as.data.frame(latrep)
# Repeat lon values 38 times
lonrep <- rep(lon, 38)
lonrepdf <- as.data.frame(lonrep)
coords <- cbind(lonrepdf, latrepdf)
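# An equivalent sketch with base R's expand.grid(), which builds the same
# lon-varies-fastest grid in one call (my alternative, not the original recipe):
coords2 <- expand.grid(lonrep = lon, latrep = lat)
identical(dim(coords2), dim(coords)) # Should be TRUE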
# Pair values with the appropriate coords
valsApr18 <- c(dat) # Flatten the matrix into a vector (column-major, so lon varies fastest)
datvals <- as.data.frame(valsApr18)
tempdat <- cbind(datvals,coords) # Hint: Style tip #5
summary(tempdat)
# These data are now organized in a way that allows for further manipulation.
# End of the Part III exercise. Return to the worksheet.