From 668e6589bf64e76a07958414bc94943ee34c60d4 Mon Sep 17 00:00:00 2001 From: Becky Sweger Date: Tue, 8 Oct 2024 17:32:42 -0400 Subject: [PATCH] Add another page --- docs/conf.py | 2 +- docs/index.md | 2 +- docs/user-guide.md | 64 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 2 deletions(-) create mode 100644 docs/user-guide.md diff --git a/docs/conf.py b/docs/conf.py index cfde5fb..42d780a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -3,7 +3,7 @@ # -- Project information project = "Cladetime" -copyright = "2024, Reich Lab @ The University of Massachusetts, Amherst" +copyright = "2024, Reich Lab @ The University of Massachusetts Amherst" author = "Reich Lab" release = "0.1" diff --git a/docs/index.md b/docs/index.md index 4c943b1..581a697 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,3 +1,3 @@ # Cladetime -## Hi! \ No newline at end of file +Cladetime is a lightweight Python library for manipulating SARS-CoV-2 sequence and clade data provided by [nextstrain.org](https://nextstrain.org/). \ No newline at end of file diff --git a/docs/user-guide.md b/docs/user-guide.md new file mode 100644 index 0000000..5bf2b0b --- /dev/null +++ b/docs/user-guide.md @@ -0,0 +1,64 @@ +# User Guide + +## Installing + +Cladetime can be installed with [pip](https://pip.pypa.io/): + +```bash +pip install git+https://github.com/reichlab/cladetime.git +``` + +## Finding Nextstrain SARS-CoV-2 sequences and sequence metadata + +Cladetime provides a CladeTime class that offers a lightweight interface to nextstrain.org files. + +```python +from cladetime import CladeTime + +# Instantiating a CladeTime object with no parameters will use the +# latest available data from nextstrain.org. 
+ct = CladeTime() + +# URL to the most recent SARS-CoV-2 sequence file (.fasta) +ct.url_sequence +# 'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/sequences.fasta.zst?versionId=d66Hn1T0eFMAg8osEh8Yrod.QEUBRxvu' + +# URL to the metadata that describes the sequences in the above file +ct.url_sequence_metadata +# 'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/metadata.tsv.zst?versionId=JTXXFlKyyvt9AerxKMwoZflhFYQFrDek' + +# Metadata about the nextstrain data pipeline that generated the sequence file and its metadata +ct.ncov_metadata +# {'schema_version': 'v1', +# 'nextclade_version': 'nextclade 3.8.2', +# 'nextclade_dataset_name': 'SARS-CoV-2', +# 'nextclade_dataset_version': '2024-09-25--21-50-30Z', +# 'nextclade_tsv_sha256sum': '5b0f2b64bfe694a3c96bd5a116de8fae23b144bfd3d22da774d4bfe9a84399c3', +# 'metadata_tsv_sha256sum': '1dc6a4204039e5c69eed84583faf75bbec1629e531dc99aab5bd566d3fb28295'} +``` + +## Working with SARS-CoV-2 sequence metadata + +The CladeTime class also provides a Polars LazyFrame object that points to Nextstrain's sequence metadata file. This file is in .tsv format and contains information about the sequences, such as their collection date, host, and location. + +The metadata also includes a clade assignment for each sequence. Nextstrain assigns clades based on a reference tree, and the reference tree varies over time. + +TODO: better wording, add links for reference + +```python +import polars as pl +from cladetime import CladeTime + +ct = CladeTime() + +# ct contains a Polars LazyFrame that references the sequence metadata .tsv file on AWS S3 +lz = ct.sequence_metadata +lz + + +# TODO: some polars examples +``` + +## Time Traveling + +omg! \ No newline at end of file