From 42bc97f9b3b9efcdf74d2ac8cab9e9e46e0c5e51 Mon Sep 17 00:00:00 2001
From: "Dr. Erin M. Buchanan"
Date: Sat, 1 Aug 2020 12:38:11 -0400
Subject: [PATCH] building package

---
 .Rbuildignore        |   2 +
 DESCRIPTION          |  11 ++++
 NAMESPACE            |   1 +
 R/get_data.R         | 117 +++++++++++++++++++++++++++++++++++++++++++
 R/load_metadata.R    |  28 +++++++++++
 man/hello.Rd         |  12 +++++
 semanticprimeR.Rproj |  20 ++++++++
 7 files changed, 191 insertions(+)
 create mode 100644 .Rbuildignore
 create mode 100644 DESCRIPTION
 create mode 100644 NAMESPACE
 create mode 100644 R/get_data.R
 create mode 100644 R/load_metadata.R
 create mode 100644 man/hello.Rd
 create mode 100644 semanticprimeR.Rproj

diff --git a/.Rbuildignore b/.Rbuildignore
new file mode 100644
index 0000000..91114bf
--- /dev/null
+++ b/.Rbuildignore
@@ -0,0 +1,2 @@
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..f0b9859
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,11 @@
+Package: semanticprimeR
+Type: Package
+Title: What the Package Does (Title Case)
+Version: 0.1.0
+Author: Who wrote it
+Maintainer: The package maintainer
+Description: More about what it does (maybe more than one line)
+    Use four spaces when indenting paragraphs within the Description.
+License: What license is it under?
+Encoding: UTF-8
+LazyData: true
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..d75f824
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1 @@
+exportPattern("^[[:alpha:]]+")
diff --git a/R/get_data.R b/R/get_data.R
new file mode 100644
index 0000000..cb606cd
--- /dev/null
+++ b/R/get_data.R
@@ -0,0 +1,117 @@
+#' Get Dataset
+#'
+#' This function allows you to import the current datasets available from
+#' \href{https://github.com/orgs/SemanticPriming/}{Semantic Priming GitHub Group}.
+#'
+#' @param corpus Include a two letter code to download the Open Subtitles corpus for
+#' text models. You can view the corpora on
+#' \href{http://opus.nlpl.eu/OpenSubtitles.php}{their website}. Note: these files
+#' can be very large, so they may take up a lot of memory to download. They are
+#' text based files that are read using `readLines`.
+#' @param bibtexID The bibtex ID of the dataset you are trying to load.
+#' You can leave all parameters blank to load just the metadata.
+#' @param citation Set to TRUE to include the citation for the dataset you
+#' loaded - will only load if you include a bibtex ID.
+#' @param language If you include a bibtex ID, you will get back the language of
+#' the dataset, if you do not include a bibtex ID, it will return a list of
+#' datasets in that language.
+#' @param variables If you include a bibtex ID, you will get back the variables
+#' included the dataset, if you do not include a bibtex ID, it will return a list of
+#' datasets that include that variable (can also be paired with language).
+#' Use the column names from the metadata as your filter.
+#' @return
+#' \item{metadata}{The metadata list of available datasets}
+#' \item{loaded_data}{The dataset you requested to load}
+#' \item{language}{The language of the dataset you requested to load}
+#' \item{variables}{The variables of the dataset you requested to load}
+#' \item{datasets}{Possible datasets based on your language and variable names}
+#'
+#' @keywords metadata datasets linguistic norms
+#' @export
+#' @examples
+#'
+#' get_dataset()
+#' get_dataset(bibtexID = "Birchenough2017", citation = TRUE)
+#' get_dataset(language = "English", variables = c("aoa", "freq"))
+
+get_dataset <- function(corpus = NULL,
+                        bibtexID = NULL,
+                        citation = NULL,
+                        language = NULL,
+                        variables = NULL) {
+
+  metadata <- load_metadata()
+  variable_return <- list(metadata = metadata)
+
+  # Download and read a (possibly large) gzipped subtitle corpus.
+  if (!is.null(corpus)) {
+    con <- gzcon(url(paste0("http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.",
+                            corpus, ".gz")))
+    # Close the connection even on error so it does not leak.
+    on.exit(close(con), add = TRUE)
+    variable_return$subtitle <- readLines(con, encoding = "UTF-8")
+  }
+
+  if (!is.null(bibtexID)) {
+
+    data_link <- metadata$link[metadata$bibtex == bibtexID]
+    variable_return$loaded_data <- read.csv(url(data_link), stringsAsFactors = FALSE)
+
+    # Only build the citation when explicitly requested (citation = TRUE);
+    # a bare !is.null() check would also fire for citation = FALSE.
+    if (isTRUE(citation)) {
+      variable_return$citation <- paste0(metadata$author[metadata$bibtex == bibtexID], ". (",
+                                         metadata$year[metadata$bibtex == bibtexID], "). ",
+                                         metadata$ref_title[metadata$bibtex == bibtexID], ". ",
+                                         metadata$ref_journal[metadata$bibtex == bibtexID], ", ",
+                                         metadata$ref_volume[metadata$bibtex == bibtexID], ", ",
+                                         metadata$ref_page[metadata$bibtex == bibtexID], ". doi: ",
+                                         metadata$ref_doi[metadata$bibtex == bibtexID])
+    }
+
+    if (!is.null(language)) {
+      variable_return$language <- metadata$language[metadata$bibtex == bibtexID]
+    }
+
+    if (!is.null(variables)) {
+      # Columns 26 onward hold the 0/1 variable indicator flags -- TODO confirm
+      temp <- metadata[metadata$bibtex == bibtexID, 26:ncol(metadata)]
+      variable_return$variables <- colnames(temp)[temp == 1]
+    }
+
+  } else {
+
+    if (!is.null(language) && !is.null(variables)) { # both filters
+
+      temp <- metadata[tolower(metadata$language) == tolower(language), ]
+
+      for (var in variables) {
+        if (var %in% colnames(metadata)) {
+          temp <- temp[temp[, var] == 1, ]
+        }
+      }
+
+      variable_return$datasets <- temp
+
+    } else if (!is.null(language)) { # just language
+
+      variable_return$datasets <- metadata[tolower(metadata$language) == tolower(language), ]
+
+    } else if (!is.null(variables)) { # just variables
+
+      temp <- metadata
+
+      for (var in variables) {
+        if (var %in% colnames(metadata)) {
+          temp <- temp[temp[, var] == 1, ]
+        }
+      }
+
+      variable_return$datasets <- temp
+
+    }
+
+  }
+
+  return(variable_return)
+}
diff --git a/R/load_metadata.R b/R/load_metadata.R
new file mode 100644
index 0000000..b4ba566
--- /dev/null
+++ b/R/load_metadata.R
@@ -0,0 +1,28 @@
+#' Load Metadata
+#'
+#' This function loads the current metadata available from the
+#' \href{https://github.com/orgs/SemanticPriming/}{Semantic Priming GitHub Group}.
+#'
+#' @param webaddress The default value for webaddress is the current location
+#' of the metadata list.
+#' @return
+#' \item{metadata}{The metadata list of available datasets}
+#'
+#' @keywords metadata datasets linguistic norms
+#' @export
+#' @examples
+#'
+#' # Use the following to load the metadata:
+#' metadata <- load_metadata()
+#' head(metadata)
+
+load_metadata <- function(webaddress = "https://raw.githubusercontent.com/SemanticPriming/LAB-data/master/included_data.csv") {
+
+  metadata <- read.csv(url(webaddress), stringsAsFactors = FALSE)
+
+  # Keep only the datasets marked for inclusion in the metadata list.
+  metadata <- subset(metadata, included == "yes")
+
+  return(metadata)
+}
+
diff --git a/man/hello.Rd b/man/hello.Rd
new file mode 100644
index 0000000..0fa7c4b
--- /dev/null
+++ b/man/hello.Rd
@@ -0,0 +1,12 @@
+\name{hello}
+\alias{hello}
+\title{Hello, World!}
+\usage{
+hello()
+}
+\description{
+Prints 'Hello, world!'.
+}
+\examples{
+hello()
+}
diff --git a/semanticprimeR.Rproj b/semanticprimeR.Rproj
new file mode 100644
index 0000000..497f8bf
--- /dev/null
+++ b/semanticprimeR.Rproj
@@ -0,0 +1,20 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source