From c034d728cc96d4d8dc79791e7e523bb3993ea7e1 Mon Sep 17 00:00:00 2001 From: Saransh Chopra Date: Fri, 16 Aug 2024 13:56:56 +0100 Subject: [PATCH] first draft for JOSS paper --- .github/workflows/paper.yml | 20 +++++ paper/paper.bib | 163 ++++++++++++++++++++++++++++++++++++ paper/paper.md | 85 +++++++++++++++++++ 3 files changed, 268 insertions(+) create mode 100644 .github/workflows/paper.yml create mode 100644 paper/paper.bib create mode 100644 paper/paper.md diff --git a/.github/workflows/paper.yml b/.github/workflows/paper.yml new file mode 100644 index 00000000..d04aeefe --- /dev/null +++ b/.github/workflows/paper.yml @@ -0,0 +1,20 @@ +name: Draft PDF +on: [push, workflow_dispatch] + +jobs: + paper: + runs-on: ubuntu-latest + name: Paper Draft + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Build draft PDF + uses: openjournals/openjournals-draft-action@master + with: + journal: joss + paper-path: paper/paper.md + - name: Upload + uses: actions/upload-artifact@v4 + with: + name: paper + path: paper/paper.pdf diff --git a/paper/paper.bib b/paper/paper.bib new file mode 100644 index 00000000..fcad1d93 --- /dev/null +++ b/paper/paper.bib @@ -0,0 +1,163 @@ +@Article{ harris:2020, + title = {Array programming with {NumPy}}, + author = {Charles R. Harris and K. Jarrod Millman and St{\'{e}}fan J. + van der Walt and Ralf Gommers and Pauli Virtanen and David + Cournapeau and Eric Wieser and Julian Taylor and Sebastian + Berg and Nathaniel J. Smith and Robert Kern and Matti Picus + and Stephan Hoyer and Marten H. van Kerkwijk and Matthew + Brett and Allan Haldane and Jaime Fern{\'{a}}ndez del + R{\'{i}}o and Mark Wiebe and Pearu Peterson and Pierre + G{\'{e}}rard-Marchant and Kevin Sheppard and Tyler Reddy and + Warren Weckesser and Hameer Abbasi and Christoph Gohlke and + Travis E. 
Oliphant}, + year = {2020}, + month = sep, + journal = {Nature}, + volume = {585}, + number = {7825}, + pages = {357--362}, + doi = {10.1038/s41586-020-2649-2}, + publisher = {Springer Science and Business Media {LLC}}, + url = {https://doi.org/10.1038/s41586-020-2649-2} +} + +@software{Gray:2023, +author = {Gray, Lindsey and Smith, Nicholas and Novak, Andrzej and Fackeldey, Peter and Tovar, Benjamin and Chen, Yi-Mu and Watts, Gordon and Krommydas, Iason}, +doi = {10.5281/zenodo.7733568}, +month = mar, +title = {{coffea}}, +url = {https://github.com/CoffeaTeam/coffea}, +version = {0.7.21}, +year = {2023} +} + +@software{Pivarski:2018, +author = {Pivarski, Jim and Osborne, Ianna and Ifrim, Ioana and Schreiner, Henry and Hollands, Angus and Biswas, Anish and Das, Pratyush and Roy Choudhury, Santam and Smith, Nicholas and Goyal, Manasvi}, +doi = {10.5281/zenodo.4341376}, +month = oct, +title = {{Awkward Array}}, +year = {2018} +} + +@inproceedings{rocklin:2015, + title={{Dask}: Parallel computation with blocked algorithms and task scheduling}, + author={Rocklin, Matthew}, + booktitle={Proceedings of the 14th {Python} in Science Conference}, + pages={130--136}, + year={2015}, + organization={Citeseer} +} + +@inproceedings{lam:2015, + title={{Numba}: A {LLVM}-based {Python} {JIT} compiler}, + author={Lam, Siu Kwan and Pitrou, Antoine and Seibert, Stanley}, + booktitle={Proceedings of the Second Workshop on the LLVM Compiler Infrastructure in HPC}, + pages={1--6}, + year={2015} +} + +@article{Meurer:2017, + title = {{SymPy}: symbolic computing in {Python}}, + author = {Meurer, Aaron and Smith, Christopher P. and Paprocki, Mateusz and \v{C}ert\'{i}k, Ond\v{r}ej and Kirpichev, Sergey B. and Rocklin, Matthew and Kumar, AMiT and Ivanov, Sergiu and Moore, Jason K. and Singh, Sartaj and Rathnayake, Thilina and Vig, Sean and Granger, Brian E. and Muller, Richard P. and Bonazzi, Francesco and Gupta, Harsh and Vats, Shivam and Johansson, Fredrik and Pedregosa, Fabian and Curry, Matthew J.
and Terrel, Andy R. and Rou\v{c}ka, \v{S}t\v{e}p\'{a}n and Saboo, Ashutosh and Fernando, Isuru and Kulal, Sumith and Cimrman, Robert and Scopatz, Anthony}, + year = 2017, + month = jan, + keywords = {Python, Computer algebra system, Symbolics}, + abstract = { + SymPy is an open source computer algebra system written in pure Python. It is built with a focus on extensibility and ease of use, through both interactive and programmatic applications. These characteristics have led SymPy to become a popular symbolic library for the scientific Python ecosystem. This paper presents the architecture of SymPy, a description of its features, and a discussion of select submodules. The supplementary material provide additional examples and further outline details of the architecture and features of SymPy. + }, + volume = 3, + pages = {e103}, + journal = {PeerJ Computer Science}, + issn = {2376-5992}, + url = {https://doi.org/10.7717/peerj-cs.103}, + doi = {10.7717/peerj-cs.103} +} + +@article{Kling:2023, + title={{FLArE} up dark sectors with {EM} form factors at the {LHC} {Forward Physics Facility}}, + volume={987}, + ISSN={0550-3213}, + url={https://doi.org/10.1016/j.nuclphysb.2023.116103}, + DOI={10.1016/j.nuclphysb.2023.116103}, + journal={Nuclear Physics B}, + publisher={Elsevier BV}, + author={Kling, Felix and Kuo, Jui-Lin and Trojanowski, Sebastian and Tsai, Yu-Dai}, + year={2023}, + month=feb, pages={116103} } + +@article{Held:2024, + author = "Held, Alexander and Kauffman, Elliott and Shadura, Oksana and Wightman, Andrew", + title = "{Physics analysis for the HL-LHC: Concepts and pipelines in practice with the Analysis Grand Challenge}", + eprint = "2401.02766", + archivePrefix = "arXiv", + primaryClass = "hep-ex", + doi = "10.1051/epjconf/202429506016", + journal = "EPJ Web Conf.", + volume = "295", + pages = "06016", + year = "2024" +} + +@InProceedings{Qu:2022, + author = "Qu, Huilin and Li, Congqiao and Qian, Sitian", + title = "{Particle Transformer} for Jet Tagging",
booktitle = "{Proceedings of the 39th International Conference on Machine Learning}", + pages = "18281--18292", + year = "2022", + eprint = "2202.03772", + archivePrefix = "arXiv", + primaryClass = "hep-ph" +} + +@article{Brehmer:2020, + author = "Brehmer, Johann and Kling, Felix and Espejo, Irina and Cranmer, Kyle", + title = "{MadMiner: Machine learning-based inference for particle physics}", + journal = "Comput. Softw. Big Sci.", + volume = "4", + year = "2020", + number = "1", + pages = "3", + doi = "10.1007/s41781-020-0035-2", + eprint = "1907.10621", + archivePrefix = "arXiv", + primaryClass = "hep-ph", + SLACcitation = "%%CITATION = ARXIV:1907.10621;%%" +} + +@software{aryan:2023, + author = {Aryan Roy and + Jim Pivarski and + Chris Papageorgakis and + Javier Duarte and + Lindsey Gray and + Henry Schreiner and + Raghav Kansal and + Matthew Feickert and + Kilian Lieret and + ssrothman}, + title = {scikit-hep/fastjet}, + month = jan, + year = 2023, + publisher = {Zenodo}, + doi = {10.5281/zenodo.7504167}, + url = {https://doi.org/10.5281/zenodo.7504167} +} + +@software{spyral-utils:2024, + author = {Gordon McCann}, + title = "{spyral-utils}", + url = {https://github.com/ATTPC/spyral-utils}, +} + +@software{weaver-core:2024, + author = {Huilin Qu and Javier Duarte and Stephen Chao and sunwayihep}, + title = "{weaver-core}", + url = {https://github.com/hqucms/weaver-core}, +} + +@software{pylhe, + author = {Lukas Heinrich and Matthew Feickert and Eduardo Rodrigues}, + title = "{pylhe}", + doi = {10.5281/zenodo.1217031}, + url = {https://github.com/scikit-hep/pylhe}, +} diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 00000000..fa17346b --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,85 @@ +--- +title: "Vector: creating and manipulating jagged arrays of Lorentz vectors" +tags: + - Python + - vector algebra + - high energy physics +authors: + - name: Henry Schreiner + orcid: 0000-0002-7833-783X + equal-contrib: true + affiliation: 1 + - name: 
Jim Pivarski + orcid: 0000-0002-6649-343X + equal-contrib: true + corresponding: true + affiliation: 1 + - name: Saransh Chopra + orcid: 0000-0003-3046-7675 + equal-contrib: true + affiliation: 1 + +affiliations: + - name: Princeton University + index: 1 +date: 16 August 2024 +bibliography: paper.bib +--- + +# Summary + +Vector algebra is a crucial component of data analysis pipelines in high energy +physics, enabling physicists to transform raw data into meaningful results that +can be visualized. Given that high energy physics data is not uniform, the +vector algebra frameworks or libraries are expected to work readily on +non-uniform or jagged data, allowing users to perform operations on an entire +jagged array in a minimal number of passes. Furthermore, optimizing memory usage and +processing time has become essential with the increasing computational demands +at the LHC. Vector is a Python library for creating and manipulating 2D, 3D, +and Lorentz vectors, especially arrays of vectors, to solve common physics +problems in a NumPy-like [@harris:2020] way. The library enables physicists to +operate on high energy physics data in a high-level language without +compromising speed. The library is already in use at the LHC and is a part of +frameworks, like Coffea [@Gray:2023], employed by physicists across multiple +high energy physics experiments. + +# Statement of need + +Vector is currently the only Lorentz vector library providing a Pythonic +interface with a C++ (through Awkward Array [@Pivarski:2018]) computational +backend. Vector integrates seamlessly with the existing high energy physics +ecosystem and the broader scientific Python ecosystem, including libraries like +Dask [@rocklin:2015] and Numba [@lam:2015]. The library implements a variety of +backends for several purposes. Although vector was written with high energy +physics in mind, it is a general-purpose library that can be used for any +scientific or engineering application.
The library houses 3+2 numerical +backends for experimental physicists and 1 symbolic backend for theoretical +physicists. These backends include a pure Python object backend for simple +computations, a SymPy [@Meurer:2017] backend for symbolic computations, a +NumPy backend for computations on regular data, an Awkward backend for +computations on jagged data, and implementations of the Object and the Awkward +backends in Numba for just-in-time compilable operations. Support for JAX and +Dask is also provided through the Awkward backend, which enables vector +functionalities to support automatic differentiation and parallel computing. + +## Impact + +Vector has become the de facto library for vector algebra in Python-based high +energy physics data analysis pipelines. The library has been installed over +2 million times and 314 GitHub repositories use it as a dependency at the time +of writing this paper. Along with being utilized directly in analysis pipelines +at the LHC and other experiments [@Kling:2023; @Held:2024; @Qu:2022], the library +is also used as a dependency in user-facing frameworks, such as Coffea, +MadMiner [@Brehmer:2020], FastJet [@aryan:2023], Spyral [@spyral-utils:2024], +Weaver [@weaver-core:2024], and pylhe [@pylhe]. The library is also used in +multiple teaching materials for graduate courses and workshops. Finally, given +the generic nature of the library, it is also often used outside of high energy +physics. + +# Acknowledgements + +The work on vector was supported by NSF cooperative agreements OAC-1836650 +(IRIS-HEP) and PHY-2323298 (IRIS-HEP). We would also like to thank the +contributors to vector and the Scikit-HEP community for their support. + +# References