assorted examples of quality checks #37

Open · mdingemanse opened this issue Oct 14, 2023 · 1 comment
Assignees: mdingemanse
Labels: examples (sample visualisations, user journeys etc. to demonstrate functionalities)

@mdingemanse (Contributor):
I have a large set of source-level and corpus-level notes in the original ElPaCo code; sharing these here so they can inspire functions and examples for quality control.

mdingemanse self-assigned this on Oct 14, 2023

@mdingemanse (Contributor, Author) commented Oct 14, 2023:

@bvreede Here's some code (it won't run without access to the data) that we can walk through together for ideas about useful quality checks. These are the notes I used in the selection and curation of corpora for the ACL and LREC papers.

# turns with identical times ----------------------------------------------

# Turns with identical times should be exceedingly rare.

# take a long source as an example

duplicate_annotations <- d %>% group_by(source,begin,end) %>%
  filter(n()>1) %>% ungroup() %>% 
  select(begin,end,duration,participant,utterance,source,language,uid,priorby)

duplications <- d %>% group_by(source,begin,end) %>%
  mutate(is_dup = ifelse(n() >1,1,0)) %>% ungroup() %>%
  select(begin,end,duration,participant,utterance,source,language,uid,is_dup)
duplications %>% 
  group_by(language) %>%
  summarise(turns = n_distinct(uid),
            dup = sum(is_dup),
            prop = dup/turns) %>%
  arrange(desc(prop))

convplot(sample(duplicate_annotations[duplicate_annotations$language != "nahuatl",]$uid,10),highlight=T,content=T)
ggsave('qc-duplicate-annotations.png',width=20,height=22,bg="white")

# total time of all included sources --------------------------------------

d %>% group_by(language,source) %>%
  summarize(start=min.na(begin),finish=max.na(end),
            totaltime = finish - start,
            hours = (totaltime/1000) / 3600)

#sort by length and inspect: are the shortest ones worth keeping?
d %>% group_by(language,source) %>%
  summarize(start=min.na(begin),finish=max.na(end),
            turns=n_distinct(uid),
            totaltime = finish - start,
            minutes = (totaltime/1000 / 60), 
            hours = (totaltime/1000) / 3600) %>%
  arrange(hours)

# sorting from longest shows no extreme times
d %>% group_by(language,source) %>%
  summarize(start=min.na(begin),finish=max.na(end),
            turns=n_distinct(uid),
            totaltime = finish - start,
            minutes = (totaltime/1000 / 60), 
            hours = (totaltime/1000) / 3600) %>%
  arrange(desc(hours))



# Akhoe -------------------------------------------------------------------

# The Akhoe corpus is so small that a long annotation occurring 4 times is
# pretty striking. Is this a copy-paste error?

term <- "ga gestures that a mad (blind?) person has come"
d[d$utterance %in% term,]$source
uids <- d[d$utterance %in% term,]$uid
convplot(uids,focus=T,window=6000)
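
# A more general version of this check (a sketch, using only the source, uid
# and utterance columns already in d) flags any longer utterance that recurs
# verbatim within a source, a typical signature of copy-paste errors; the
# 30-character threshold is arbitrary.

repeated_long <- d %>%
  drop_na(utterance) %>%
  filter(nchar(utterance) > 30) %>%
  group_by(source, utterance) %>%
  filter(n() > 1) %>%
  ungroup() %>%
  select(source, uid, utterance) %>%
  arrange(source, utterance)
repeated_long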


# Arabic ------------------------------------------------------------------

# Compare three callhome corpora

arabic1 <- read_corpus("arabic1") %>% mutate(language="arabic1")
arabic2 <- read_corpus("arabic2") %>% mutate(language="arabic2")
arabic3 <- read_corpus("arabic3") %>% mutate(language="arabic3")

loadedcorpora <- c("arabic1","arabic2","arabic3")
data = do.call("list", mget(loadedcorpora))
d <- rbindlist(data,fill=T)
rm(data)
# run FTO coding lines from processing.R

d %>%
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle('Floor transfer onset') +
  geom_density(alpha=0.1,na.rm=T) +
  xlim(-20000,20000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")
ggsave('qc-arabic-3corpora_FTO.png',bg="white")

d %>%
  ggplot(aes(FTO,duration)) +
  theme_tufte() +
  ggtitle('Floor transfer onset by turn duration') +
  geom_point(alpha=0.1,na.rm=T) +
  ylim(0,20000) +
  xlim(-20000,20000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")
ggsave('qc-arabic-3corpora_FTOxdur.png',bg="white")


# Chatino -----------------------------------------------------------------


# Chatino is mostly massively monologic: only 463 speaker transitions out of
# 2154 turns in total, and not necessarily because the annotations are
# splitter-type

d %>% filter(language == "Chatino") %>% drop_na(FTO) 

# very few speaker transitions overall — I'm beginning to think this corpus may not be usable
d %>% filter(language == "Chatino") %>%
  group_by(source,priorby) %>%
  summarize(n=n())

d %>% filter(language == "Chatino") %>% 
  group_by(FTO) %>% summarise(n=n()) %>% arrange(desc(n))

convplot(lang="Chatino",focus=T,window=120000)

# One source especially is mostly monologue: "/zacetepec_chatino1/zac-2012_07_11-trans_mgh_mbh_amp.ea"


# Datooga -----------------------------------------------------------------

# Datooga unfortunately has only single-tier transcriptions

d %>% filter(language == "Datooga") %>% View()

convplot(lang="Datooga",n=2,window=8000000)


# Dutch -------------------------------------------------------------------

tokenize_corpus <- function(langs=NULL) {
  
  d.tokens <- list()
  
  for (lang in langs) {
    
    print(paste('Tokenization: ',lang))
    d.temp <- d %>% filter(language == lang) %>%
      ungroup() %>%
      drop_na(utterance) %>% drop_na(utterance_stripped) %>%
      unnest_tokens(word, utterance_stripped) %>%
      count(word, sort=T, name="n") %>%
      mutate(language = lang,
             rank = row_number(desc(n)),
             total = sum(n),
             freq = n/total,
             freq_log = log(freq))
    
    d.tokens[[lang]] <- d.temp
    
    
  }
  
  d.tokens <- reduce(d.tokens, rbind)
  return(d.tokens)
}  


# Two corpora for Dutch
dutch1 <- read_corpus("dutch1") %>% mutate(language="dutch1")
dutch2 <- read_corpus("dutch2") %>% mutate(language="dutch2")

loadedcorpora <- c("dutch1","dutch2")
data = do.call("list", mget(loadedcorpora))
d <- rbindlist(data,fill=T)
rm(data)
d$translation <- NA

test <- d %>%
  mutate(duration_abs = abs(duration)) %>%
  filter(duration_abs > 40000)
d <- d %>% filter(uid %notin% test$uid)

d.tokens <- tokenize_corpus(c("dutch1","dutch2"))
inspect_language("dutch1",saveplot=T)
inspect_language("dutch2",saveplot=T)

convplot(lang="dutch1",window=10000,dyads=T,content=T)
ggsave('qc-panel-dutch1-convplot.png',bg="white",width=14,height=10)

# what to do with speaker = UNKNOWN ?

uids <- d[d$participant == "UNKNOWN",]$uid
test <- convplot(sample(uids,6),content=T,datamode=T)

convplot(sample(uids,6),content=T)
ggsave('qc-panel-dutch1-convplot.png',bg="white",width=14,height=10)

examples <- c("dutch-1486-387-568685", "dutch-2133-32-71095", "dutch-218-309-489890", 
              "dutch-2561-70-147440", "dutch-2739-305-496438", "dutch-909-74-95161")

test <- convplot(examples,datamode=T)
ggsave('qc-dutch1-convplot-unknown.png',bg="white", width=14,height=10)
test %>% select(uid,source,participant,participant_int) %>% View()

# English -----------------------------------------------------------------

# For English we have the following corpora:

english1 <- read_corpus("english1") %>% mutate(language="english1")
english2 <- read_corpus("english2") %>% mutate(language="english2")
english3 <- read_corpus("english3") %>% mutate(language="english3")
english4 <- read_corpus("english4") %>% mutate(language="english4")
english5 <- read_corpus("english5") %>% mutate(language="english5")
english6 <- read_corpus("english6") %>% mutate(language="english6")
english7 <- read_corpus("english7") %>% mutate(language="english7")

# combine all 
loadedcorpora <- c("english1","english2","english3","english4","english5","english6","english7")
data = do.call("list", mget(loadedcorpora))
d <- rbindlist(data,fill=T)
rm(data)
# (this overwrites the combined set above with english7 only)
d <- english7

# verdict: English 1 looks very weird (no <0 timings); English 6 too (all times
# 0). English 2, 3, and 7 have a (slight) problem: a sizable set of turns that
# begin at the exact same instant (very unlikely, and also seen for Spanish and
# Japanese corpora that have been excluded for the same reason).

# this leaves English 2 and English 4 as the most reasonable-looking ones. We
# might want to combine them.
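
# The "turns that begin at the exact same instant" diagnostic can be
# quantified directly, along the lines of the duplicate-annotation check at
# the top of these notes (a sketch using the same columns; run this on the
# combined set, i.e. before overwriting d with english7 above):

d %>%
  group_by(language, source, begin) %>%
  mutate(same_begin = n() > 1) %>%
  group_by(language) %>%
  summarise(turns = n_distinct(uid),
            shared_begin = sum(same_begin),
            prop = shared_begin / turns) %>%
  arrange(desc(prop))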


d %>%
  ggplot(aes(FTO,duration)) +
  theme_tufte() +
  ggtitle('Floor transfer onset by turn duration') +
  geom_point(alpha=0.1,na.rm=T) +
  ylim(0,20000) +
  xlim(-20000,20000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")
#ggsave('qc-english-7corpora.png',bg="white")

# looking into english1
test <- d %>% filter(source %in% sources[1:100])
test %>%
  ggplot(aes(FTO,duration)) +
  theme_tufte() +
  ggtitle('Floor transfer onset by turn duration',
          subtitle="CABNC, first 100 sources (18370 transitions, 7 with negative FTO)") +
  geom_point(alpha=0.1,na.rm=T) +
  ylim(0,20000) +
  xlim(-10000,10000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")
ggsave('out/qc-english1-cabnc-FTO-by-duration.png',bg="white",width=6,height=4)


test %>%
  ungroup() %>%
  mutate(FTO_status = ifelse(FTO <0,"in overlap","not in overlap")) %>%
  drop_na(FTO) %>% group_by(FTO_status) %>%
  summarise(n=n())

test <- test %>% mutate(naiveFTO = begin-lag(end))
summary(test$naiveFTO)

# looking into english7

d %>%
  ggplot(aes(FTO,duration)) +
  theme_tufte() +
  ggtitle('Floor transfer onset by turn duration',
          subtitle="Santa Barbara Corpus of Spoken English (english7)") +
  geom_point(alpha=0.1,na.rm=T) +
  ylim(0,20000) +
  xlim(-10000,10000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")
ggsave('out/qc-english7-FTO-by-duration.png',bg="white",width=6,height=4)

d %>%
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle('Floor transfer onset',
          subtitle="Santa Barbara Corpus of Spoken English (english7)") +
  geom_density() +
  #ylim(0,20000) +
  xlim(-5000,5000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")
ggsave('out/qc-english7-FTO-density.png',bg="white",width=6,height=4)

FTO_0 <- d[which(d$FTO==0),]$uid
FTO_length <- d[which(d$FTO==d$duration),]$uid
these_uids <- sample(FTO_0,10)
these_uids <- sample(FTO_length,10)
convplot(these_uids,content=T,highlight=T)

# French ------------------------------------------------------------------

french1 <- read_corpus("french1") %>% mutate(language="french1")
french2 <- read_corpus("french2") %>% mutate(language="french2")
french3 <- read_corpus("french3") %>% mutate(language="french3")

# combine all 
loadedcorpora <- c("french1","french2","french3")
data = do.call("list", mget(loadedcorpora))
d <- rbindlist(data,fill=T) 
rm(data)

d <- d %>% mutate(translation = NA)
d %>%
  ggplot(aes(FTO,duration)) +
  theme_tufte() +
  ggtitle('Floor transfer onset by turn duration') +
  geom_point(alpha=0.1,na.rm=T) +
  ylim(0,20000) +
  xlim(-20000,20000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")
ggsave('qc-french-3corpora.png',bg="white")

inspect_language("french1",saveplot = T)
inspect_language("french2",saveplot = T)
inspect_language("french3",saveplot = T)



# German ------------------------------------------------------------------

german1 <- read_corpus("german1") %>% mutate(language="german1")
german2 <- read_corpus("german2") %>% mutate(language="german2")

# combine all 
loadedcorpora <- c("german1","german2")
data = do.call("list", mget(loadedcorpora))
d <- rbindlist(data,fill=T) 
rm(data)

d <- d %>% mutate(translation = NA)

inspect_language("german1",saveplot = T)
inspect_language("german2",saveplot = T)

convplot(lang="german1")
ggsave('qc-conv-german1.png',bg="white")
convplot(lang="german2")
ggsave('qc-conv-german2.png',bg="white")

# Hausa -------------------------------------------------------------------

# Hausa prior to post-processing has 'splitter-type' segmentation, and also has
# an unlikely proportion of FTOs at exactly 0 (586 out of 1585)

d %>% filter(language == "hausa") %>% 
  group_by(FTO) %>% summarise(n=n()) %>% arrange(desc(n))
  
convplot(lang="hausa",window=20000,focus=T)


# Mixtec ------------------------------------------------------------------

mixtec <- d %>% filter(langshort=="Mixtec")

inspect_corpus("yoloxochitl_mixtec")

mixtec %>%
  filter(priorby=="other")

# oof, only 212 out of 50k turns have a prior turn by other instead of self
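
# The same ratio can be computed for every corpus to see where Mixtec sits
# (a sketch, assuming the language and priorby columns used elsewhere in
# these notes are available here):

d %>%
  drop_na(priorby) %>%
  group_by(language) %>%
  summarise(turns = n(),
            other = sum(priorby == "other"),
            prop_other = other / turns) %>%
  arrange(prop_other)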




# Polish ------------------------------------------------------------------


# polish has some outliers in duration

test <- d %>% filter(language=="polish") %>%
  mutate(duration_abs = abs(duration)) %>%
  filter(duration_abs > 40000)
convplot(test$uid)
test$source
# three of these are in MW_006, let's have a closer look

uids <- sample(d[d$source == "/polish/MW_006.eaf",]$uid,10)
convplot(uids,window=20000)

# Siputhi -----------------------------------------------------------------

# Siputhi is a very small corpus (430 turns, of which only 328 are speaker
# transitions). Timing appears to be weirdly spiky mainly because of this

d %>% filter(language == "Siputhi") %>% 
  group_by(FTO) %>% summarise(n=n()) %>% arrange(desc(n)) %>% View()

convplot(lang="Siputhi",window=20000,focus=T)


# Totoli ------------------------------------------------------------------

# Totoli has a mean annotation length of only 798 ms and many self-transitions,
# indicating that turns are segmented with a splitter's temperament
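
# Both symptoms can be quantified per language (a sketch, assuming the full
# combined d and the mean.na helper and priorby column used elsewhere in
# these notes):

d %>%
  group_by(language) %>%
  summarise(mean_duration = mean.na(duration),
            turns = n(),
            prop_self = sum(priorby == "self", na.rm = TRUE) / turns) %>%
  arrange(mean_duration)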

convplot(lang="Totoli")

theseuids <- c("totoli-1-139-150117", "totoli-1-209-259023", "totoli-1-66-76901")

convplot(theseuids)



# Zaar --------------------------------------------------------------------


# Zaar is relatively small and segmented in splitter fashion, with relatively
# few speaker transitions (520 other vs 1234 self). This is especially striking
# in BC_CONV_03.eaf. There are also relatively many FTOs of 0 (201 out of 520),
# indicating something about the transcription convention or software.


d %>% filter(language == "Zaar") %>% 
  group_by(source,priorby) %>% summarise(n=n())

d %>% filter(language == "Zaar") %>% 
  group_by(FTO) %>% summarise(n=n()) %>% arrange(desc(n)) %>% View()

convplot(lang="Zaar",window=20000,focus=T)



# Japanese and Spanish selection ------------------------------------------

japanese1 <- read_corpus("japanese1") %>% mutate(language = "japanese1")
japanese2 <- read_corpus("japanese2") %>% mutate(language = "japanese2")
japanese3 <- read_corpus("japanese3") %>% mutate(language = "japanese3")
spanish1 <- read_corpus("spanish1") %>% mutate(language = "spanish1")
spanish2 <- read_corpus("spanish2") %>% mutate(language = "spanish2")
spanish3 <- read_corpus("spanish3") %>% mutate(language = "spanish3")

# combine all 
loadedcorpora <- c("japanese1","japanese2","japanese3","spanish1","spanish2","spanish3")
data = do.call("list", mget(loadedcorpora))

d <- rbindlist(data,fill=T)
rm(data)



# Mandarin ----------------------------------------------------------------


# mandarin: comparing the two corpora loaded here
mandarin1 <- read_corpus("mandarin1") %>% mutate(language = "mandarin1")
mandarin2 <- read_corpus("mandarin2") %>% mutate(language = "mandarin2")

# mandarin check
mandarindata <- list(mandarin1,mandarin2)
d <- rbindlist(mandarindata,fill=T)

inspect_language("mandarin1",saveplot=T)
inspect_language("mandarin2",saveplot=T)

# Tseltal timing investigation --------------------------------------------

# The two largest Tseltal sources unfortunately have a disproportionate number
# of transitions set at exactly zero, an artefact of cutting up annotations
# during segmentation. This means we cannot trust these to represent true
# time-aligned annotations, and we cannot use them to precision-target segments
# of audio.
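
# The share of zero-FTO transitions per source makes this artefact easy to
# quantify (a sketch, same columns as above):

d %>%
  filter(language == "Tseltal") %>%
  drop_na(FTO) %>%
  group_by(source) %>%
  summarise(transitions = n(),
            zero = sum(FTO == 0),
            prop_zero = zero / transitions) %>%
  arrange(desc(prop_zero))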

d %>% filter(language == "Tseltal") %>%
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle(paste0('Floor transfer onset for all turns by source')) +
  geom_density(na.rm=T,size=1) +
  xlim(-2000,2000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ source,strip.position ="bottom")


d %>%
  filter(language %in% c("Tseltal")) %>% group_by(source) %>% 
  summarise(n=n(),
            medianFTO=median.na(FTO),
            meanFTO=mean.na(FTO))
d %>%
  filter(language %in% c("Tseltal")) %>% group_by(FTO) %>% 
  summarise(n=n()) %>% arrange(desc(n))


# Zauzou ------------------------------------------------------------------


# some turns are exactly 4000 ms long with an FTO of 0?

test <- zauzou %>%
  filter(duration==4000)
test$uid
convplot(test$uid,focus=T)
View(test)


# japanese and spanish timing investigation -------------------------------


whatsupwith <- c("japanese","spanish")

d %>%
  filter(language %in% whatsupwith) %>%
  ggplot(aes(FTO,duration)) +
  theme_tufte() +
  ggtitle('Floor transfer onset by turn duration') +
  geom_point(alpha=0.1,na.rm=T) +
  ylim(0,20000) +
  xlim(-20000,20000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")
ggsave('ex-whatsupwith-japanese-spanish.png',bg="white")

d %>%
  filter(language %in% whatsupwith) %>%
  ggplot(aes(offsetCRUDE,duration)) +
  theme_tufte() + theme(legend.position = "none") +
  ggtitle('What process could have generated this pattern?',
          subtitle='~5000 turns for which -offset = duration. Are these duplicate rows?') +
  geom_point(alpha=0.1,na.rm=T) +
  geom_point(data=. %>% filter(duration == -offsetCRUDE),
             aes(offsetCRUDE,duration),color="red",alpha=0.3) +
  ylim(0,20000) +
  xlim(-20000,20000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")
ggsave('ex-whatsupwith-japanese-spanish-highlighted.png',bg="white")

d %>%
  filter(language %in% whatsupwith) %>%
  filter(duration == -offsetCRUDE) %>%
  ggplot(aes(offsetCRUDE,duration)) +
  theme_tufte() +
  ggtitle('OffsetCRUDE by turn duration') +
  geom_point(alpha=0.1,na.rm=T) +
  ylim(0,20000) +
  xlim(-20000,20000) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position="bottom")

weird <- d %>%
  filter(language %in% whatsupwith) %>%
  filter(duration == -offsetCRUDE)
View(weird)

uids <- sample(weird[weird$language %in% "spanish",]$uid,20)
uids <- c("spanish-11-500-697626","spanish-14-569-935684", "spanish-17-1047-1317073", "spanish-17-814-993060", "spanish-18-1433-1924591", "spanish-19-410-726290", 
          "spanish-19-506-924906", "spanish-7-113-199645")
convplot(uids,focus=T)
ggsave('ex-whatsupwith-spanish-convplot.png',bg="white")
test <- convplot(uids,datamode=T)
test %>% select(begin,end,duration,offsetCRUDE,utterance,focus)

uids <- sample(weird[weird$language %in% "japanese",]$uid,8)
uids <- c("japanese-40-228-525640", "japanese-61-99-292040", "japanese-7-117-423940", "japanese-7-12-130240", "japanese-7-128-460210", "japanese-7-219-722520", "japanese-7-90-332520", "japanese-87-120-302060")
convplot(uids,focus=T)
ggsave('ex-whatsupwith-japanese-convplot.png',bg="white")

unique(weird$source)


# Farsi subcorpora --------------------------------------------------------

# summary: below is some data on weird subcorpora that might be worth excluding
# from FTO / timing-related analyses
weird_subcorpora <- c("/farsi/callfriend/fa_4699_asc.txt",
                      "/farsi/callfriend/fa_6936_asc.txt",
                      "/farsi/callfriend/fa_7003_asc.txt")

d %>% filter(language == "farsi" & source %in% weird_subcorpora) %>%
  group_by(source) %>%
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle(paste0('Floor transfer onset for Farsi by source')) +
  geom_density(na.rm=T,size=1) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ source,strip.position ="bottom")

d %>% filter(language == "farsi" & source %notin% weird_subcorpora) %>%
  group_by(source) %>%
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle(paste0('Floor transfer onset for Farsi by source')) +
  geom_density(na.rm=T,size=1) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ source,strip.position ="bottom")
ggsave('qc-farsi-by-source.png',bg="white",width=20,height=20)

# hey farsi source 20 looks weird, with lots of fully overlapping turns
#fa_4699
finduid("farsi-20-180-779680")
d %>% filter(source=="/farsi/callfriend/fa_4699_asc.txt") %>% View()
uids <- sample(d[d$source == "/farsi/callfriend/fa_4699_asc.txt",]$uid,20)
convplot(uids,window=200000)
ggsave("qc-farsi-fa_4699_asc.png",bg="white",height=12,width=5)


d %>% filter(source=="/farsi/callfriend/fa_4699_asc.txt") %>% 
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle(paste0('Floor transfer onset for fa_4699_asc')) +
  geom_density(na.rm=T,size=1) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position ="bottom")

d %>% filter(language =="farsi" & source != "/farsi/callfriend/fa_4699_asc.txt") %>% 
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle(paste0('Floor transfer onset for all other Farsi data')) +
  geom_density(na.rm=T,size=1) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position ="bottom")

# this makes me realize we can facet by source within each language to find weirdly transcribed subcorpora
d %>% filter(language == "farsi") %>%
  group_by(source) %>%
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle(paste0('Floor transfer onset for Farsi by source')) +
  geom_density(na.rm=T,size=1) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ source,strip.position ="bottom")
ggsave('qc-farsi-by-source.png',bg="white",width=20,height=20)

# fa_6936.asc
d %>% filter(source=="/farsi/callfriend/fa_6936_asc.txt") %>% 
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle(paste0('Floor transfer onset for fa_6936_asc')) +
  geom_density(na.rm=T,size=1) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position ="bottom")
d %>% filter(source=="/farsi/callfriend/fa_6936_asc.txt") %>% View()
uids <- sample(d[d$source == "/farsi/callfriend/fa_6936_asc.txt",]$uid,20)
convplot(uids,window=200000)
ggsave("qc-farsi-fa_6936_asc.png",bg="white",height=12,width=5)

# fa_6419
# nothing much going on judging from convplots; just lots of competitive overlap
uids <- sample(d[d$source == "/farsi/callfriend/fa_6419_asc.txt",]$uid,20)
convplot(uids,window=200000)

# fa_4771
# nothing much going on judging from convplots; just lots of competitive overlap
uids <- sample(d[d$source == "/farsi/callfriend/fa_4771_asc.txt",]$uid,20)
convplot(uids,window=200000)

# fa_7003
# this one's mostly monologue, won't affect FTO or timing analyses
uids <- sample(d[d$source == "/farsi/callfriend/fa_7003_asc.txt",]$uid,20)
convplot(uids,window=200000)
d %>% filter(source=="/farsi/callfriend/fa_6936_asc.txt") %>% View()


# weird timing distributions ----------------------------------------------

# From the highly peaked distributions of a bunch of these it is clear that
# something is up. In most of these languages, the most frequent FTO is exactly
# 0, which is highly implausible and likely a transcription software quirk
# (breaking up annotations at a single point). This means we cannot trust the
# `end` values (and therefore also not `duration` or `FTO`), so we are not
# using them. One of them (Akie) even has no FTOs <0, showing that overlap is
# not marked and the content of annotations is not time-aligned with the actual
# speech.
#
# Two larger corpora, Brazilian Portuguese and Czech, have lots of directly
# adjacent annotations by the same speaker. Unless these can be concatenated in
# some empirically sensible way, their timing information is not to be trusted,
# so we exclude those too for now.
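
# A compact screen for this across all languages (a sketch): take the modal
# FTO per language and its share of all transitions; a modal FTO of exactly 0
# carrying a large share is a red flag.

d %>%
  drop_na(FTO) %>%
  count(language, FTO) %>%
  group_by(language) %>%
  mutate(share = n / sum(n)) %>%
  slice_max(n, n = 1) %>%
  arrange(desc(share))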


weird_distributions <- c("akie","croatian","mambila","nganasan","khinalug","besemah","czech","brazilian_portuguese")
d.w <- d %>% filter(language %in% weird_distributions)

nturns <- d.w %>% drop_na(FTO) %>% ungroup() %>% summarize(n = n()) %>% pull(n)

pA <- d.w %>%
  filter(language %in% c("akie","mambila")) %>%
  ggplot(aes(FTO)) +
  theme_tufte() +
  ggtitle(paste0('Floor transfer onset')) +
  xlim(-2000,2000) +
  geom_density(na.rm=T,size=1,bw=0.5,trim=T) +
  geom_vline(xintercept = 0,colour="#cccccc") +
  facet_wrap(~ language,strip.position ="bottom")


d.w %>%
  filter(language %in% c("akie","mambila")) %>% group_by(language) %>% summarise(n=n())

sort(table(d.w$FTO),decreasing=T)[1:10]

convplot(lang="akie",n=5,window=20000)
convplot(lang="mambila",n=5,window=20000)

uids <- c("akie-1-171-371969", "mambila-1-756-1260971")
pB <- convplot(uids)
pB <- pB +
  ggtitle("Sample annotations in Akie and Mambila")

plot_grid(pB,pA,labels=c("A","B"),rel_widths=c(2,1))

ggsave('sup-akia-mambila-panel.png',bg="white",height=2,width=10)



# how weird are Brazilian Portuguese and Czech?
test <- d %>% filter(language %in% c("brazilian_portuguese","czech") & FTO < -250)

uids <- sample(test[test$language == "czech",]$uid,20)
convplot(uids,focus=T)
ggsave('ex-segmentation-weirdness_czech.png',bg="white")
View(convplot(uids,focus=T,datamode=T))

uids <- sample(test[test$language == "brazilian_portuguese",]$uid,6)
convplot(uids,focus=T)
ggsave('ex-segmentation-weirdness-brazilian_portugues.png',bg="white")
View(convplot(uids,focus=T,datamode=T))


# diagnosis: too weird to include in the first run. They have lots of directly
# adjacent annotations by the same speaker, which I guess should be concatenated.
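
# If we did want to try concatenation, a minimal (and approximate) sketch:
# merge runs of annotations where the same participant's next annotation in a
# source starts exactly where the previous one ends.

merged <- d %>%
  filter(language %in% c("brazilian_portuguese", "czech")) %>%
  group_by(source) %>%
  arrange(begin, .by_group = TRUE) %>%
  mutate(new_turn = is.na(lag(participant)) |
           participant != lag(participant) |
           begin != lag(end),
         turn_id = cumsum(new_turn)) %>%
  group_by(source, participant, turn_id) %>%
  summarise(begin = min(begin),
            end = max(end),
            utterance = paste(utterance, collapse = " "),
            .groups = "drop") %>%
  mutate(duration = end - begin)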

mdingemanse changed the title from "share qualitative outcomes of quality checks" to "assorted examples of quality checks" on Oct 14, 2023
mdingemanse added the examples label (and removed the documentation label) on Aug 19, 2024