diff --git a/.Rbuildignore b/.Rbuildignore index 8c89bd5..0358159 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,3 +12,11 @@ ^pkgdown$ ^cran-comments\.md$ ^CRAN-SUBMISSION$ +^CITATION\.cff$ +^install\.R$ +^postBuild$ +^apt\.txt$ +^runtime\.txt$ +^_quarto\.yml$ +^\.quarto$ +^methodshub diff --git a/.gitignore b/.gitignore index 67d62c0..89fa8e7 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ README.html inst/doc docs + +/.quarto/ diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..ea9913e --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,155 @@ +# -------------------------------------------- +# CITATION file created with {cffr} R package +# See also: https://docs.ropensci.org/cffr/ +# -------------------------------------------- + +cff-version: 1.2.0 +message: 'To cite package "adaR" in publications use:' +type: software +license: MIT +title: 'adaR: A Fast ''WHATWG'' Compliant URL Parser' +version: 0.3.2 +abstract: A wrapper for 'ada-url', a 'WHATWG' compliant and fast URL parser written + in modern 'C++'. Also contains auxiliary functions such as a public suffix extractor. 
+authors: +- family-names: Schoch + given-names: David + email: david@schochastics.net + orcid: https://orcid.org/0000-0003-2952-4812 +- family-names: Chan + given-names: Chung-hong + email: chainsawtiney@gmail.com + orcid: https://orcid.org/0000-0002-6232-7530 +repository: https://CRAN.R-project.org/package=adaR +repository-code: https://github.com/gesistsa/adaR +url: https://gesistsa.github.io/adaR/ +contact: +- family-names: Schoch + given-names: David + email: david@schochastics.net + orcid: https://orcid.org/0000-0003-2952-4812 +keywords: +- r +- rstats +- rstats-package +- url-parser +references: +- type: software + title: Rcpp + abstract: 'Rcpp: Seamless R and C++ Integration' + notes: LinkingTo + url: https://www.rcpp.org + repository: https://CRAN.R-project.org/package=Rcpp + authors: + - family-names: Eddelbuettel + given-names: Dirk + - family-names: Francois + given-names: Romain + - family-names: Allaire + given-names: JJ + - family-names: Ushey + given-names: Kevin + - family-names: Kou + given-names: Qiang + - family-names: Russell + given-names: Nathan + - family-names: Ucar + given-names: Inaki + - family-names: Bates + given-names: Douglas + - family-names: Chambers + given-names: John + year: '2024' +- type: software + title: triebeard + abstract: 'triebeard: ''Radix'' Trees in ''Rcpp''' + notes: Imports + url: https://github.com/Ironholds/triebeard/ + repository: https://CRAN.R-project.org/package=triebeard + authors: + - family-names: Keyes + given-names: Os + - family-names: Schmidt + given-names: Drew + - family-names: Takano + given-names: Yuuki + year: '2024' +- type: software + title: knitr + abstract: 'knitr: A General-Purpose Package for Dynamic Report Generation in R' + notes: Suggests + url: https://yihui.org/knitr/ + repository: https://CRAN.R-project.org/package=knitr + authors: + - family-names: Xie + given-names: Yihui + email: xie@yihui.name + orcid: https://orcid.org/0000-0003-0645-5666 + year: '2024' +- type: software + title: 
rmarkdown + abstract: 'rmarkdown: Dynamic Documents for R' + notes: Suggests + url: https://pkgs.rstudio.com/rmarkdown/ + repository: https://CRAN.R-project.org/package=rmarkdown + authors: + - family-names: Allaire + given-names: JJ + email: jj@posit.co + - family-names: Xie + given-names: Yihui + email: xie@yihui.name + orcid: https://orcid.org/0000-0003-0645-5666 + - family-names: Dervieux + given-names: Christophe + email: cderv@posit.co + orcid: https://orcid.org/0000-0003-4474-2498 + - family-names: McPherson + given-names: Jonathan + email: jonathan@posit.co + - family-names: Luraschi + given-names: Javier + - family-names: Ushey + given-names: Kevin + email: kevin@posit.co + - family-names: Atkins + given-names: Aron + email: aron@posit.co + - family-names: Wickham + given-names: Hadley + email: hadley@posit.co + - family-names: Cheng + given-names: Joe + email: joe@posit.co + - family-names: Chang + given-names: Winston + email: winston@posit.co + - family-names: Iannone + given-names: Richard + email: rich@posit.co + orcid: https://orcid.org/0000-0003-3925-190X + year: '2024' +- type: software + title: testthat + abstract: 'testthat: Unit Testing for R' + notes: Suggests + url: https://testthat.r-lib.org + repository: https://CRAN.R-project.org/package=testthat + authors: + - family-names: Wickham + given-names: Hadley + email: hadley@posit.co + year: '2024' + version: '>= 3.0.0' +- type: software + title: 'R: A Language and Environment for Statistical Computing' + notes: Depends + url: https://www.R-project.org/ + authors: + - name: R Core Team + institution: + name: R Foundation for Statistical Computing + address: Vienna, Austria + year: '2024' + version: '>= 4.2' + diff --git a/_quarto.yml b/_quarto.yml new file mode 100644 index 0000000..7454539 --- /dev/null +++ b/_quarto.yml @@ -0,0 +1,5 @@ +project: + title: adaR + type: default + render: + - methodshub.qmd diff --git a/apt.txt b/apt.txt new file mode 100644 index 0000000..a18b53c --- /dev/null 
+++ b/apt.txt @@ -0,0 +1 @@ +zip \ No newline at end of file diff --git a/install.R b/install.R new file mode 100644 index 0000000..4426e40 --- /dev/null +++ b/install.R @@ -0,0 +1 @@ +install.packages("adaR") diff --git a/methodshub.md b/methodshub.md new file mode 100644 index 0000000..25f02cd --- /dev/null +++ b/methodshub.md @@ -0,0 +1,126 @@ +# adaR - A Fast ‘WHATWG’ Compliant URL Parser + + +## Description + + + +A wrapper for ‘ada-url’, a ‘WHATWG’ compliant and fast URL parser +written in modern ‘C++’. Also contains auxiliary functions such as a +public suffix extractor. + +## Keywords + + + +- URL Parsing +- Webtracking Data +- Webscraping + +## Science Usecase(s) + + + + +URL parsing is an important process in the analysis of webtracking data, +e.g. [GESIS Web +Tracking](https://www.gesis.org/en/services/planning-studies-and-collecting-data/tools-for-the-collection-of-digital-behavioral-data/gesis-web-tracking). +Although not using this package, the technique has been used in various +social science publications, e.g. [de León et +al. (2023)](https://doi.org/10.5117/CCR2023.2.4.DELE). + +The package was used in various webscraping projects for communication +research, e.g. [paperboy](https://github.com/JBGruber/paperboy). + +## Repository structure + +This repository follows [the standard structure of an R +package](https://cran.r-project.org/doc/FAQ/R-exts.html#Package-structure). + +## Environment Setup + +With R installed: + +``` r +install.packages("adaR") +``` + + + + + +## Input Data + + + + +The input data has to be a vector of URLs. + +## Sample Input and Output Data + + + + +The input data looks like this: + +``` r +urls <- c("https://www.google.de/search?q=GESIS&client=ubuntu&hs=ixb&sca_esv=dccc38f8e2930152&sca_upv=1") + +urls +``` + + [1] "https://www.google.de/search?q=GESIS&client=ubuntu&hs=ixb&sca_esv=dccc38f8e2930152&sca_upv=1" + +The output data is a data frame of parsed URLs. 
+ +## How to Use + + + + +Please refer to the [“Introduction to +adaR”](https://gesistsa.github.io/adaR/articles/adaR.html) for a +comprehensive introduction to the package. + +The main function of this package is `ada_url_parse()` and it decomposes +a URL into its components. + +``` r +library(adaR) + +urls <- c("https://www.google.de/search?q=GESIS&client=ubuntu&hs=ixb&sca_esv=dccc38f8e2930152&sca_upv=1", + "https://www.nytimes.com/2024/06/19/world/africa/sudan-darfur-takeaways.html", + "https://www.sueddeutsche.de/thema/Fu%C3%9Fball-EM") + +ada_url_parse(urls) +``` + + href + 1 https://www.google.de/search?q=GESIS&client=ubuntu&hs=ixb&sca_esv=dccc38f8e2930152&sca_upv=1 + 2 https://www.nytimes.com/2024/06/19/world/africa/sudan-darfur-takeaways.html + 3 https://www.sueddeutsche.de/thema/Fußball-EM + protocol username password host hostname port + 1 https: www.google.de www.google.de + 2 https: www.nytimes.com www.nytimes.com + 3 https: www.sueddeutsche.de www.sueddeutsche.de + pathname + 1 /search + 2 /2024/06/19/world/africa/sudan-darfur-takeaways.html + 3 /thema/Fußball-EM + search hash + 1 ?q=GESIS&client=ubuntu&hs=ixb&sca_esv=dccc38f8e2930152&sca_upv=1 + 2 + 3 + +## Contact Details + +Maintainer: David Schoch + +Issue Tracker: [https://github.com/gesistsa/adaR/issues](https://github.com/gesistsa/adaR/issues) + + + + + + + diff --git a/methodshub.qmd b/methodshub.qmd new file mode 100644 index 0000000..5321c18 --- /dev/null +++ b/methodshub.qmd @@ -0,0 +1,104 @@ +--- +title: adaR - A Fast 'WHATWG' Compliant URL Parser +format: + html: + embed-resources: true + gfm: default +--- + +## Description + + + +A wrapper for 'ada-url', a 'WHATWG' compliant and fast URL parser written in modern 'C++'. Also contains auxiliary functions such as a public suffix extractor. + +## Keywords + + + +* URL Parsing +* Webtracking Data +* Webscraping + +## Science Usecase(s) + + + + +URL parsing is an important process in the analysis of webtracking data, e.g. 
[GESIS Web Tracking](https://www.gesis.org/en/services/planning-studies-and-collecting-data/tools-for-the-collection-of-digital-behavioral-data/gesis-web-tracking). Although not using this package, the technique has been used in various social science publications, e.g. [de León et al. (2023)](https://doi.org/10.5117/CCR2023.2.4.DELE). + +The package was used in various webscraping projects for communication research, e.g. [paperboy](https://github.com/JBGruber/paperboy). + +## Repository structure + +This repository follows [the standard structure of an R package](https://cran.r-project.org/doc/FAQ/R-exts.html#Package-structure). + +## Environment Setup + +With R installed: + +```r +install.packages("adaR") +``` + + + + + +## Input Data + + + + +The input data has to be a vector of URLs. + + +## Sample Input and Output Data + + + + +The input data looks like this: + +```{r} +urls <- c("https://www.google.de/search?q=GESIS&client=ubuntu&hs=ixb&sca_esv=dccc38f8e2930152&sca_upv=1") + +urls +``` + +The output data is a data frame of parsed URLs. + +## How to Use + + + + + +Please refer to the ["Introduction to adaR"](https://gesistsa.github.io/adaR/articles/adaR.html) for a comprehensive introduction to the package. + +The main function of this package is `ada_url_parse()` and it decomposes a URL into its components. 
+ +```{r} +library(adaR) + +urls <- c("https://www.google.de/search?q=GESIS&client=ubuntu&hs=ixb&sca_esv=dccc38f8e2930152&sca_upv=1", + "https://www.nytimes.com/2024/06/19/world/africa/sudan-darfur-takeaways.html", + "https://www.sueddeutsche.de/thema/Fu%C3%9Fball-EM") + +ada_url_parse(urls) +``` + +## Contact Details + +Maintainer: David Schoch + +Issue Tracker: [https://github.com/gesistsa/adaR/issues](https://github.com/gesistsa/adaR/issues) + + + + + + + + + diff --git a/postBuild b/postBuild new file mode 100644 index 0000000..942913e --- /dev/null +++ b/postBuild @@ -0,0 +1,62 @@ +#!/bin/bash -v + +# determine which version of Quarto to install +QUARTO_VERSION=1.4.551 + +# See whether we need to lookup a Quarto version +if [ $QUARTO_VERSION = "prerelease" ]; then + QUARTO_JSON="_prerelease.json" +elif [ $QUARTO_VERSION = "release" ]; then + QUARTO_JSON="_download.json" +fi + +if [ $QUARTO_JSON != "" ]; then + +# create a python script and run it +PYTHON_SCRIPT=_quarto_version.py +if [ -e $PYTHON_SCRIPT ]; then + rm -rf $PYTHON_SCRIPT +fi + +cat > $PYTHON_SCRIPT <