# create_tutorial_dataset.R
library(tidyverse)
# We'll be using the Raleigh corpus from the SPADE project. Download the
# Raleigh dataset from OSF at https://osf.io/79jgu and put the file in the
# data folder.
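# (Optional guard, not in the original script:) fail early with a clear
# message if the OSF download isn't in place yet.
if (!file.exists("data/spade-Raleigh_formants_whitelisted.csv")) {
  stop("Download spade-Raleigh_formants_whitelisted.csv from https://osf.io/79jgu into data/ first.")
}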
raleigh_data_full <- read.csv("data/spade-Raleigh_formants_whitelisted.csv")
# To make this tutorial runnable on a personal laptop, we'll just use the ten
# speakers who have the most data.
# make a vector containing the names of the 10 speakers with the most data
top_10_speakers <- raleigh_data_full %>%
  count(speaker) %>% # the number of tokens per speaker
  mutate(rank = dense_rank(desc(n))) %>% # rank number of tokens with most = 1
  filter(rank <= 10) %>% # filter out everyone except the top 10 speakers
  pull(speaker) # convert speaker column to a vector
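# (Optional check:) dense_rank() keeps ties, so more than ten speakers could
# survive the filter above; confirm how many were actually kept.
print(length(top_10_speakers))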
# filter the original dataset to only include top 10 speakers
raleigh_data_top10 <- raleigh_data_full %>%
  filter(speaker %in% top_10_speakers)
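# (Optional check:) the filtered data should contain exactly the selected
# speakers, no more and no fewer.
stopifnot(setequal(unique(raleigh_data_top10$speaker), top_10_speakers))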
# we won't be using the full dataset anymore, so let's get rid of it to
# save a little memory
rm(raleigh_data_full)
# Before narrowing down to the two vowels of interest (/ɪ/ and /ɛ/), we want to
# Lobanov normalize F1 and F2 for each speaker using all of their vowel tokens
# i.e., take a z-score of F1 and F2 across all vowel tokens for each speaker.
raleigh_data_norm <- raleigh_data_top10 %>%
  group_by(speaker) %>% # vowels are normalized separately for each speaker
  mutate(across(c(F1, F2),
                ~ (.x - mean(.x, na.rm = TRUE)) / sd(.x, na.rm = TRUE))) %>%
  ungroup() # drop the grouping so it doesn't silently affect later steps
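# (Optional sanity check, assuming the normalization above ran:) each
# speaker's normalized F1 and F2 should now have mean ~0 and sd ~1.
raleigh_data_norm %>%
  group_by(speaker) %>%
  summarise(across(c(F1, F2),
                   list(mean = ~ mean(.x, na.rm = TRUE),
                        sd = ~ sd(.x, na.rm = TRUE)))) %>%
  print(n = 10)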
# Now, let's narrow down the dataset to the tokens and quantities we're
# interested in. We want stressed tokens of /ɪ/ and /ɛ/.
raleigh_data <- raleigh_data_norm %>%
  # pick out just syllables with primary stress
  filter(syllable_stress == 1) %>%
  # grab the /ɪ/ and /ɛ/ tokens.
  # match the Unisyn label to the phone label as a quality check
  filter((phone_label == "EH1" & UnisynPrimStressedVowel1 == "e") |
           (phone_label == "IH1" & UnisynPrimStressedVowel1 == "i")) %>%
  # make sure that the UR in this dialect corresponds to the lexical set
  # (more important when there is more than one dialect)
  filter(unisynPrimStressedVowel2_sca == UnisynPrimStressedVowel1) %>%
  # label the context according to the following consonant
  mutate(context = case_when(
    following_phone %in% c("B", "CH", "D", "DH", "F", "G", "HH", "JH", "K",
                           "L", "P", "R", "S", "SH", "T", "TH", "V", "W",
                           "Y", "Z", "ZH") ~ "oral",   # oral consonants
    following_phone %in% c("N", "M", "NG") ~ "nasal",  # nasal consonants
    TRUE ~ "other")) %>%
  # get rid of everything that isn't an oral or nasal consonant
  filter(context != "other") %>%
  # get rid of voiced pre-velar contexts (could include them, but then the
  # following consonant should enter the model as a predictor or random effect)
  filter(following_phone != "NG", following_phone != "G") %>%
  # make vowel and context factors
  mutate(vowel = factor(UnisynPrimStressedVowel1,
                        levels = c("i", "e")),
         context = factor(context,
                          levels = c("oral", "nasal"))) %>%
  # make sure that there aren't extraneous levels
  droplevels() %>%
  # keep just the columns we're interested in
  dplyr::select(speaker, vowel, context, word, F1, F2)
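# (Optional check:) token counts per vowel/context cell; all four cells
# should be non-empty before the data are worth saving.
raleigh_data %>% count(vowel, context) %>% print()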
# Save the dataset
raleigh_data %>% saveRDS("data/Raleigh_subset.rds")
raleigh_data %>% write_csv("data/Raleigh_subset.csv")
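# (Optional check, not in the original:) confirm both output files were
# written and note the final dataset size.
stopifnot(file.exists("data/Raleigh_subset.rds", "data/Raleigh_subset.csv"))
print(dim(raleigh_data))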