From ff36c9311ab72db871d3fec30450865c7cafbca5 Mon Sep 17 00:00:00 2001
From: Cornelius Roemer <cornelius.roemer@gmail.com>
Date: Sat, 23 Nov 2024 01:14:58 +0100
Subject: [PATCH 1/2] fix: add codespell config and fix typos with it

Do dry-run using `codespell`, write using `codespell -w`

When there are multiple options, select interactively with `codespell -w -i2`
---
 .codespellrc                                  |  4 ++++
 .config/codespell/exclude-file                |  1 +
 .config/codespell/ignore-words                | 21 +++++++++++++++++++
 architecture_docs/03_context_and_scope.md     |  2 +-
 backend/README.md                             |  2 +-
 .../service/crossref/CrossRefService.kt       |  6 +++---
 .../docs/introduction/system-overview.md      |  2 +-
 ena-submission/ENA_submission.md              |  6 +++---
 .../ena_deposition/ena_submission_helper.py   |  4 ++--
 ingest/README.md                              |  2 +-
 keycloak/keycloakify/vite.config.ts           |  2 +-
 kubernetes/loculus/CONTRIBUTING.md            |  2 +-
 kubernetes/loculus/values.yaml                |  4 ++--
 .../components/SeqSetCitations/SeqSetForm.tsx |  2 +-
 website/src/services/lapisClient.ts           |  2 +-
 15 files changed, 44 insertions(+), 18 deletions(-)
 create mode 100644 .codespellrc
 create mode 100644 .config/codespell/exclude-file
 create mode 100644 .config/codespell/ignore-words

diff --git a/.codespellrc b/.codespellrc
new file mode 100644
index 000000000..a14db0f95
--- /dev/null
+++ b/.codespellrc
@@ -0,0 +1,4 @@
+[codespell]
+skip = */node_modules,*.jsonl,*.tsv,*.json,*/dist_keycloak,*/playwright-report,*.properties,*/dist,./backend/build,yarn.lock,i18n.ts
+exclude-file = .config/codespell/exclude-file
+ignore-words = .config/codespell/ignore-words
diff --git a/.config/codespell/exclude-file b/.config/codespell/exclude-file
new file mode 100644
index 000000000..ba65a876b
--- /dev/null
+++ b/.config/codespell/exclude-file
@@ -0,0 +1 @@
+import com.fasterxml.jackson.databind.ser.std.StdSerializer
diff --git a/.config/codespell/ignore-words b/.config/codespell/ignore-words
new file mode 100644
index 000000000..131f7793f
--- /dev/null
+++ b/.config/codespell/ignore-words
@@ -0,0 +1,21 @@
+# not clear mistakes
+unsecure
+re-use
+
+# common false positives
+Nam
+
+## abbreviations
+ser
+
+## Camel case variable
+afterAll
+assertIn
+
+## Slang
+AAs
+nd
+
+## Library name
+redundent
+
diff --git a/architecture_docs/03_context_and_scope.md b/architecture_docs/03_context_and_scope.md
index a61a50da2..225c13aa5 100644
--- a/architecture_docs/03_context_and_scope.md
+++ b/architecture_docs/03_context_and_scope.md
@@ -8,4 +8,4 @@ All external participants are listed in the diagram below:
 We provide instructions how to install Loculus, and we host instances ourselves,
 but other maintainers can host their own instances as well.
 We offer guidance and documentation and are open for feature requests, but we do not provide direct support for custom instances.
-We cannot forsee all possible configurations and environments that Loculus might be deployed in.
+We cannot foresee all possible configurations and environments that Loculus might be deployed in.
diff --git a/backend/README.md b/backend/README.md
index d9038aab8..051ca3256 100644
--- a/backend/README.md
+++ b/backend/README.md
@@ -69,7 +69,7 @@ You need to set:
 
 We use Flyway, so that the service can provision an empty/existing DB without any manual steps in between. On startup scripts in `src/main/resources/db/migration` are executed in order, i.e. `V1__*.sql` before `V2__*.sql` if they didn't run before, so that the DB is always up-to-date. (For more info on the naming convention, see [this](https://www.red-gate.com/blog/database-devops/flyway-naming-patterns-matter) blog post.)
 
-Note: When using a postgresSQL development platform (e.g. pgAdmin) the hostname is 127.0.0.1 and not localhost - this is defined in the `deploy.py` file.
+Note: When using a postgresql development platform (e.g. pgAdmin) the hostname is 127.0.0.1 and not localhost - this is defined in the `deploy.py` file.
 
 Note that we also use flyway in the ena-submission pod to create an additional schema in the database, ena-submission. This schema is not added here.
 
diff --git a/backend/src/main/kotlin/org/loculus/backend/service/crossref/CrossRefService.kt b/backend/src/main/kotlin/org/loculus/backend/service/crossref/CrossRefService.kt
index db64d494e..bd8c78c94 100644
--- a/backend/src/main/kotlin/org/loculus/backend/service/crossref/CrossRefService.kt
+++ b/backend/src/main/kotlin/org/loculus/backend/service/crossref/CrossRefService.kt
@@ -77,9 +77,9 @@ class CrossRefService(final val properties: CrossRefServiceProperties) {
             )
 
             "head" {
-                // The doi_batch_id gets ignored and the actual one is assigned after the equest is processed through
+                // The doi_batch_id gets ignored and the actual one is assigned after the request is processed through
                 // CrossRef's queue. Because of this, presumably, the doi_batch_id is not sent back when a request to
-                // the service is successful. For this, one would have to query the equest queue and retrieve it from there
+                // the service is successful. For this, one would have to query the request queue and retrieve it from there
                 "doi_batch_id" { -doiBatchID }
                 "timestamp" { -date.atStartOfDay(ZoneId.of("UTC")).toInstant().toEpochMilli().toString() }
                 "depositor" {
@@ -167,7 +167,7 @@ class CrossRefService(final val properties: CrossRefServiceProperties) {
         formData.forEach { (key, value) ->
             if (value is String) {
                 // Both carriage return and new line characters have to be sent ("\r\n"),
-                // otherwise the request witll cause a 500 error on CrossRef's end
+                // otherwise the request will cause a 500 error on CrossRef's end
                 printWriter.append("--$boundary").append("\r\n")
                 printWriter.append(
                     "Content-Disposition: form-data; name=\"${URLEncoder.encode(key, "UTF-8")}\"",
diff --git a/docs/src/content/docs/introduction/system-overview.md b/docs/src/content/docs/introduction/system-overview.md
index 9e03c90eb..67352cdd5 100644
--- a/docs/src/content/docs/introduction/system-overview.md
+++ b/docs/src/content/docs/introduction/system-overview.md
@@ -14,7 +14,7 @@ Loculus has a modular architecture and consists of several sub-services:
 -   **SILO(s):** [SILO](https://github.com/GenSpectrum/LAPIS-SILO) is an open-source query engine for genetic sequences optimized for high performance and supporting alignment-specific queries such as mutation searches. It regularly pulls data from the backend server and indexes them. By default, SILO is not exposed to the users but accessed via LAPIS. For each [organism](../glossary#organism) of a Loculus [instance](../glossary#instance), there is a separate instance of SILO.
 -   **LAPIS(es):** [LAPIS](https://github.com/GenSpectrum/LAPIS) provides a convenient interface to SILO, offering a lightweight web API and additional data and compression formats. For each SILO instance, there is a corresponding LAPIS instance.
 -   **Website:** The frontend application of Loculus accesses the APIs of the backend server and LAPIS. It uses the backend server for everything related to data submission and LAPIS for searching and downloading released data. For logins and registrations, users are redirected to Keycloak.
--   **Preprocessing pipeline(s):** A preprocessing pipeline fetches [unprocessed/user-submitted data](../glossary#unprocessed-data) from the backend server, processes them (which usually includes cleaning, alignment and adding annotations), and sends [processed data](../glossary#processed-data) back to the backend server. The pipeline contains [organism](../glossary#organism)-specific logic, thus, there is a separate pipeline for each organism. We maintain a customizeable preprocessing pipeline that uses [Nextclade](https://github.com/nextstrain/nextclade) for alignment, quality checks and annotations but it is easy to write a new one by following the [preprocessing pipeline specifications](https://github.com/loculus-project/loculus/blob/main/preprocessing/specification.md).
+-   **Preprocessing pipeline(s):** A preprocessing pipeline fetches [unprocessed/user-submitted data](../glossary#unprocessed-data) from the backend server, processes them (which usually includes cleaning, alignment and adding annotations), and sends [processed data](../glossary#processed-data) back to the backend server. The pipeline contains [organism](../glossary#organism)-specific logic, thus, there is a separate pipeline for each organism. We maintain a customizable preprocessing pipeline that uses [Nextclade](https://github.com/nextstrain/nextclade) for alignment, quality checks and annotations but it is easy to write a new one by following the [preprocessing pipeline specifications](https://github.com/loculus-project/loculus/blob/main/preprocessing/specification.md).
 
 ![Architecture overview](./architectureOverview.svg)
 
diff --git a/ena-submission/ENA_submission.md b/ena-submission/ENA_submission.md
index 1c1606093..0eb0fee7f 100644
--- a/ena-submission/ENA_submission.md
+++ b/ena-submission/ENA_submission.md
@@ -51,7 +51,7 @@ The following could be implement as post-MVP features:
 
 ## Submission process
 
-## 1. Registering a study programatically
+## 1. Registering a study programmatically
 
 1. Create the study XML ([schema](https://ftp.ebi.ac.uk/pub/databases/ena/doc/xsd/sra_1_5/ENA.project.xsd)):
 
@@ -123,7 +123,7 @@ The following could be implement as post-MVP features:
 
    This is where one gets the accession from.
 
-## 2. Registering a sample programatically
+## 2. Registering a sample programmatically
 
 [Docs](https://ena-docs.readthedocs.io/en/latest/submit/samples.html)
 
@@ -347,7 +347,7 @@ Instead you should use the GCA accession. These are distributed by NCBI (this is
 
 ### What would the end-to-end flow of submitting sequences for pathoplexus look like?
 
-1. [Register study programatically](https://ena-docs.readthedocs.io/en/latest/submit/study/programmatic.html)
+1. [Register study programmatically](https://ena-docs.readthedocs.io/en/latest/submit/study/programmatic.html)
 2. Upload sequences using what route? Which files are needed?
 
 ### What information do we give to the original submitter?
diff --git a/ena-submission/src/ena_deposition/ena_submission_helper.py b/ena-submission/src/ena_deposition/ena_submission_helper.py
index ea1001cf9..71745e634 100644
--- a/ena-submission/src/ena_deposition/ena_submission_helper.py
+++ b/ena-submission/src/ena_deposition/ena_submission_helper.py
@@ -267,7 +267,7 @@ def post_webin(config: ENAConfig, xml: dict[str, Any]) -> requests.Response:
         config.ena_submission_url,
         auth=HTTPBasicAuth(config.ena_submission_username, config.ena_submission_password),
         files=xml,
-        timeout=10,  # wait a full 10 seconds for a response incase slow
+        timeout=10,  # wait a full 10 seconds for a response in case it is slow
     )
 
 
@@ -551,7 +551,7 @@ def get_ena_analysis_process(
         response = requests.get(
             url,
             auth=HTTPBasicAuth(config.ena_submission_username, config.ena_submission_password),
-            timeout=10,  # wait a full 10 seconds for a response incase slow
+            timeout=10,  # wait a full 10 seconds for a response in case it is slow
         )
     except requests.exceptions.RequestException as e:
         error_message = f"Request failed with exception: {e}."
diff --git a/ingest/README.md b/ingest/README.md
index 94f6380e7..fd0a0ff55 100644
--- a/ingest/README.md
+++ b/ingest/README.md
@@ -16,7 +16,7 @@ snakemake --dag | dot -Tpng > static/dag.png
 
 ### Download data from NCBI virus
 
-Using NCBI `datasets` CLI, download all sequences and corresponding NCBI curated metadata for a configurable taxon. The taxon is specified using the NCBI Taxonomy ID, and includes all child taxa, i.e. dowloading sequences for the Ebola virus taxon ID includes all sequences for more specific Ebola virus (sub)species taxon ids.
+Using NCBI `datasets` CLI, download all sequences and corresponding NCBI curated metadata for a configurable taxon. The taxon is specified using the NCBI Taxonomy ID, and includes all child taxa, i.e. downloading sequences for the Ebola virus taxon ID includes all sequences for more specific Ebola virus (sub)species taxon ids.
 
 Sequences and metadata are transformed into (nd)json files to simplify (de)serialization and further processing.
 
diff --git a/keycloak/keycloakify/vite.config.ts b/keycloak/keycloakify/vite.config.ts
index b2eab59aa..b762b32cd 100644
--- a/keycloak/keycloakify/vite.config.ts
+++ b/keycloak/keycloakify/vite.config.ts
@@ -42,7 +42,7 @@ export default defineConfig({
   /*
    * Uncomment this if you want to use the default domain provided by GitHub Pages
    * replace "keycloakify-starter" with your repository name.
-   * This is only relevent if you are building an Wep App + A Keycloak theme.
+   * This is only relevant if you are building a Web App + A Keycloak theme.
    * If you are only building a Keycloak theme, you can ignore this.
    */
   //base: "/keycloakify-starter/"
diff --git a/kubernetes/loculus/CONTRIBUTING.md b/kubernetes/loculus/CONTRIBUTING.md
index 411543e98..d5702651e 100644
--- a/kubernetes/loculus/CONTRIBUTING.md
+++ b/kubernetes/loculus/CONTRIBUTING.md
@@ -13,7 +13,7 @@ helm template loculus kubernetes/loculus \
 
 ## Diffing produced Kubernetes manifests
 
-To diff produced manifests, you can use the `diff` command and to specifically compare metadata fields you can use the `kuberentes/loculus/utils/yamldiff_script.py` script.
+To diff produced manifests, you can use the `diff` command and to specifically compare metadata fields you can use the `kubernetes/loculus/utils/yamldiff_script.py` script.
 
 1. Install yamldiff: `go install github.com/sahilm/yamldiff@latest`
 2. Create the manifests to diff: `helm template loculus kubernetes/loculus > /tmp/new.yaml`
diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
index e0bba1cc1..97339fa1d 100644
--- a/kubernetes/loculus/values.yaml
+++ b/kubernetes/loculus/values.yaml
@@ -821,14 +821,14 @@ defaultOrganismConfig: &defaultOrganismConfig
       - name: sequencedByContactName
         ontology_id: GENEPIO:0100471
         definition: The name or title of the contact responsible for follow-up regarding the sequence.
-        guidance: Provide the name of an individual or their job title. As personnel turnover may render the contact's name obsolete, it is more prefereable to provide a job title for ensuring accuracy of information and institutional memory. If the information is unknown or cannot be provided, leave blank or provide a null value.
+        guidance: Provide the name of an individual or their job title. As personnel turnover may render the contact's name obsolete, it is more preferable to provide a job title for ensuring accuracy of information and institutional memory. If the information is unknown or cannot be provided, leave blank or provide a null value.
         example: Enterics Lab Manager
         displayName: Sequenced by - contact name
         header: Sequencing
       - name: sequencedByContactEmail
         ontology_id: GENEPIO:0100422
         definition: The email address of the contact responsible for follow-up regarding the sequence.
-        guidance: Provide the email associated with the listed contact. As personnel turnover may render an individual's email obsolete, it is more prefereable to provide an address for a position or lab, to ensure accuracy of information and institutional memory. If the information is unknown or cannot be provided, leave blank or provide a null value.
+        guidance: Provide the email associated with the listed contact. As personnel turnover may render an individual's email obsolete, it is more preferable to provide an address for a position or lab, to ensure accuracy of information and institutional memory. If the information is unknown or cannot be provided, leave blank or provide a null value.
         example: enterics@lab.ca
         displayName: Sequenced by - contact email
         header: Sequencing
diff --git a/website/src/components/SeqSetCitations/SeqSetForm.tsx b/website/src/components/SeqSetCitations/SeqSetForm.tsx
index 00b2384a8..3c8b899da 100644
--- a/website/src/components/SeqSetCitations/SeqSetForm.tsx
+++ b/website/src/components/SeqSetCitations/SeqSetForm.tsx
@@ -117,7 +117,7 @@ export const SeqSetForm: FC<SeqSetFormProps> = ({ clientConfig, accessToken, edi
                         htmlFor={`loculus-${isFocalStr}-accession-input`}
                         className='block mb-2 text-sm font-medium text-gray-900 dark:text-white'
                     >
-                        {`${isFocal === true ? '* ' : ''}${capitalCase(isFocalStr)} accessions (seperated by comma or whitespace)`}
+                        {`${isFocal === true ? '* ' : ''}${capitalCase(isFocalStr)} accessions (separated by comma or whitespace)`}
                     </label>
                     <textarea
                         id={`loculus-${isFocalStr}-accession-input`}
diff --git a/website/src/services/lapisClient.ts b/website/src/services/lapisClient.ts
index a5f160c33..51528973e 100644
--- a/website/src/services/lapisClient.ts
+++ b/website/src/services/lapisClient.ts
@@ -60,7 +60,7 @@ export class LapisClient extends ZodiosWrapperClient<typeof lapisApi> {
             dataFormat: 'TSV',
         });
         // This type cast isn't pretty, but if the API would be typed correctly, the union type
-        // of the actual details resonse and the potential 'string' would pollute the whole API,
+        // of the actual details response and the potential 'string' would pollute the whole API,
         // so I (@fhennig) decided to just do this cast here. We know that the return value is a TSV string.
         return result.map((data) => data as unknown as string);
     }

From cf2d3f4f8b32e87ff7a1c54c86381752f575ec43 Mon Sep 17 00:00:00 2001
From: Cornelius Roemer <cornelius.roemer@gmail.com>
Date: Sat, 23 Nov 2024 09:38:22 +0100
Subject: [PATCH 2/2] Update backend/README.md

Co-authored-by: Chaoran Chen <mail@chaoran-chen.de>
---
 backend/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/README.md b/backend/README.md
index 051ca3256..4efeaad31 100644
--- a/backend/README.md
+++ b/backend/README.md
@@ -69,7 +69,7 @@ You need to set:
 
 We use Flyway, so that the service can provision an empty/existing DB without any manual steps in between. On startup scripts in `src/main/resources/db/migration` are executed in order, i.e. `V1__*.sql` before `V2__*.sql` if they didn't run before, so that the DB is always up-to-date. (For more info on the naming convention, see [this](https://www.red-gate.com/blog/database-devops/flyway-naming-patterns-matter) blog post.)
 
-Note: When using a postgresql development platform (e.g. pgAdmin) the hostname is 127.0.0.1 and not localhost - this is defined in the `deploy.py` file.
+Note: When using a PostgreSQL development platform (e.g. pgAdmin) the hostname is 127.0.0.1 and not localhost - this is defined in the `deploy.py` file.
 
 Note that we also use flyway in the ena-submission pod to create an additional schema in the database, ena-submission. This schema is not added here.