diff --git a/.gitignore b/.gitignore index ad4a1f1..ed8b358 100644 --- a/.gitignore +++ b/.gitignore @@ -170,7 +170,9 @@ poetry.toml # ruff .ruff_cache/ +.history + # LSP config files pyrightconfig.json -# End of https://www.toptal.com/developers/gitignore/api/python +# End of https://www.toptal.com/developers/gitignore/api/python \ No newline at end of file diff --git a/README.md b/README.md index 078a26a..9d616cf 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ -# leb 💬 +# SALT 💬 Language experimentation tools to accompany the SALT dataset ## Docs After editing the documentation .md file -1. You can view the documentation locally by running `mkdocs serve +1. You can view the documentation locally by running `mkdocs serve` 2. If all looks good, run `./build_and_deploy_docs.sh` to build deploy the documentation diff --git a/docs/API/index.md b/docs/API/index.md new file mode 100644 index 0000000..40aff41 --- /dev/null +++ b/docs/API/index.md @@ -0,0 +1 @@ +# SUNBIRDAI API \ No newline at end of file diff --git a/docs/assets/favicon.ico b/docs/assets/favicon.ico new file mode 100644 index 0000000..31f12cf Binary files /dev/null and b/docs/assets/favicon.ico differ diff --git a/docs/assets/logo.png b/docs/assets/logo.png new file mode 100644 index 0000000..650560a Binary files /dev/null and b/docs/assets/logo.png differ diff --git a/docs/blog/index.md b/docs/blog/index.md new file mode 100644 index 0000000..c58f16c --- /dev/null +++ b/docs/blog/index.md @@ -0,0 +1,2 @@ +# Blog + diff --git a/docs/index.md b/docs/index.md index 90ce5b9..2c312e2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,15 +1,12 @@ -This site contains the project documentation for the `leb` project used for the [Sunbird AI Language Projects](https://sunbird.ai/portfolio/african-languages/). +# SALT Documentation +## Welcome to the SALT project documentation! 
-# Leb Documentation +This documentation serves as the official guide for the [**SALT**](https://github.com/SunbirdAI/salt) project, which is part of the [Sunbird AI Language Projects](https://sunbird.ai/portfolio/african-languages/). The goal of this documentation is to provide you with comprehensive information on how to use the SALT project effectively. -Welcome to the Leb project documentation! + diff --git a/docs/reference.md b/docs/reference.md index a8b70a7..eb23b23 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -1,4 +1,4 @@ This part of the project documentation focuses on an **information-oriented** approach. Use it as a reference for the technical implementation of the -`leb` project code. +`SALT` project code. diff --git a/docs/stylesheets/custom.css b/docs/stylesheets/custom.css new file mode 100644 index 0000000..ef6f842 --- /dev/null +++ b/docs/stylesheets/custom.css @@ -0,0 +1,3 @@ +.md-footer-meta__inner { + display: none; +} diff --git a/docs/tutorials/04-basics.md b/docs/tutorials/04-basics.md index 29684bb..fc75261 100644 --- a/docs/tutorials/04-basics.md +++ b/docs/tutorials/04-basics.md @@ -15,7 +15,7 @@ set up the configs ```python yaml_config = ''' -huggingface_load: +huggingface_load: path: Sunbird/salt split: train name: text-all @@ -34,20 +34,32 @@ ds = leb.dataset.create(config) list(ds.take(5)) ``` -output -``` -[{'source': '>>lug<< Eggplants always grow best under warm conditions.', - 'target': 'Bbiringanya lubeerera asinga kukulira mu mbeera ya bugumu'}, - {'source': '>>ach<< Eggplants always grow best under warm conditions.', - 'target': 'Bilinyanya pol kare dongo maber ka lyeto tye'}, - {'source': '>>lug<< Farmland is sometimes a challenge to farmers.', - 'target': "Ettaka ly'okulimirako n'okulundirako ebiseera ebimu kisoomooza abalimi"}, - {'source': '>>ach<< Farmland is sometimes a challenge to farmers.', - 'target': 'Ngom me pur i kare mukene obedo peko madit bot lupur'}, - {'source': '>>lug<< Farmers should be 
encouraged to grow more coffee.', - 'target': 'Abalimi balina okukubirizibwa okwongera okulima emmwanyi'}] +output +```json +[ + { + "source": ">>lug<< Eggplants always grow best under warm conditions.", + "target": "Bbiringanya lubeerera asinga kukulira mu mbeera ya bugumu" + }, + { + "source": ">>ach<< Eggplants always grow best under warm conditions.", + "target": "Bilinyanya pol kare dongo maber ka lyeto tye" + }, + { + "source": ">>lug<< Farmland is sometimes a challenge to farmers.", + "target": "Ettaka ly'okulimirako n'okulundirako ebiseera ebimu kisoomooza abalimi" + }, + { + "source": ">>ach<< Farmland is sometimes a challenge to farmers.", + "target": "Ngom me pur i kare mukene obedo peko madit bot lupur" + }, + { + "source": ">>lug<< Farmers should be encouraged to grow more coffee.", + "target": "Abalimi balina okukubirizibwa okwongera okulima emmwanyi" + } +] ``` -This is how a basic data loader works \ No newline at end of file +This is how a basic data loader works diff --git a/docs/tutorials/06-text-datasets.md b/docs/tutorials/06-text-datasets.md deleted file mode 100644 index e69de29..0000000 diff --git a/docs/tutorials/07-speech-datasets.md b/docs/tutorials/07-speech-datasets.md index e69de29..404698f 100644 --- a/docs/tutorials/07-speech-datasets.md +++ b/docs/tutorials/07-speech-datasets.md @@ -0,0 +1,42 @@ +# Sunbird African Language Technology (SALT) dataset + +SALT is a multi-way parallel text and speech corpus of English and six languages widely spoken in Uganda and East Africa: `Luganda`, `Lugbara`, `Acholi`, `Runyankole`, `Ateso` and `Swahili`. +The core of the dataset is a set of `25,000` sentences covering a range of topics of local relevance, such as agriculture, health and society. 
+Each sentence is translated into all languages, to support machine translation, and speech recordings are made for approximately `5,000` of the sentences both by a variety of speakers in natural settings (suitable for ASR) and by professionals in a studio setting (suitable for text-to-speech). + +## Subsets + +| Subset name | Contents | +| --------------------- | --------------------------------------------------------------------------------- | +| text-all | Text translations of each sentence. | +| multispeaker-`{lang}` | Speech recordings of each sentence, by a variety of speakers in natural settings. | +| studio-`{lang}` | Speech recordings in a studio setting, suitable for text-to-speech. | + +The sentence IDs map across subsets, so that for example the text of a sentence in Acholi can be mapped to the studio recording of that concept being expressed in Swahili. +The subsets can therefore be combined to support the training and evaluation of several further tasks, such as speech-to-text translation and speech-to-speech translation. + +## Language support + +| ISO 639-3 | Language | Translated text | Multispeaker speech | Studio speech | +| --------- | ------------------------ | --------------- | ------------------- | ------------- | +| eng | English (Ugandan accent) | Yes | Yes | Yes | +| lug | Luganda | Yes | Yes | Yes | +| ach | Acholi | Yes | Yes | Yes | +| lgg | Lugbara | Yes | Yes | Yes | +| teo | Ateso | Yes | Yes | Yes | +| nyn | Runyankole | Yes | Yes | Yes | +| swa | Swahili | Yes | No | Yes | +| ibo | Igbo | Yes | No | No | + +## Helper utilities + +Code for convenient experimentation with multilingual models can be found at [https://github.com/SunbirdAI/salt](https://github.com/SunbirdAI/salt). +See example notebooks [here](https://github.com/SunbirdAI/salt/tree/main/notebooks). 
+ +## Collaborators + +This dataset was collected in practical collaboration between Sunbird AI and the Makerere University AI Lab (Ugandan languages) and KenCorpus, Maseno University (Swahili). + +## Reference + +[Machine Translation For African Languages: Community Creation Of Datasets And Models In Uganda](https://openreview.net/pdf?id=BK-z5qzEU-9). Benjamin Akera, Jonathan Mukiibi, Lydia Sanyu Naggayi, Claire Babirye, Isaac Owomugisha, Solomon Nsumba, Joyce Nakatumba-Nabende, Engineer Bainomugisha, Ernest Mwebaze, John Quinn. 3rd Workshop on African Natural Language Processing, 2022. diff --git a/docs/tutorials/09-asr-models.md b/docs/tutorials/09-asr-models.md index 2c99578..f2e943b 100644 --- a/docs/tutorials/09-asr-models.md +++ b/docs/tutorials/09-asr-models.md @@ -13,7 +13,8 @@ Before getting started, ensure that you have the following prerequisites: ## Installation To begin, install the necessary dependencies by running the following commands: -```{bash} +```bash + !pip install -q jiwer evaluate !pip install -qU accelerate !pip install -q transformers[torch] @@ -29,7 +30,8 @@ These commands will install the required libraries, including Jiwer, Evaluate, A Create a YAML configuration file named asr_config.yml with the necessary settings for your training. Here's an example configuration: -```{yaml} +```yaml + train: source: language: [luganda, english] @@ -80,7 +82,8 @@ To use the trained model for inference, follow these steps: 1. 
Load the trained model and processor: -```{python} +```python + model = Wav2Vec2ForCTC.from_pretrained("path/to/trained/model") processor = Wav2Vec2Processor.from_pretrained("path/to/processor") ``` diff --git a/docs/tutorials/13-diarization.md b/docs/tutorials/13-diarization.md index e16a103..309c934 100644 --- a/docs/tutorials/13-diarization.md +++ b/docs/tutorials/13-diarization.md @@ -10,7 +10,7 @@ Speaker Diarization at Sunbird is performed using pyannote's speaker-diarization The necessary libraries to perform speaker diarization required for efficient execution of the pipeline and determine various metrics are installed and imported. -```python +```bash !pip install pyctcdecode !pip install kenlm !pip install jiwer @@ -19,8 +19,9 @@ The necessary libraries to perform speaker diarization required for efficient ex !pip install pandas !pip install pyannote.audio !pip install onnxruntime +``` - +```python import torch from huggingface_hub import hf_hub_download from transformers import ( @@ -65,7 +66,7 @@ tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_id) #### Tokenizer setup -```python +```python tokenizer.set_target_lang("eng") model.load_adapter("eng_meta") ``` @@ -82,6 +83,7 @@ sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lam ``` #### Language model file setup + Within the `Sunbird/sunbird-mms` huggingface repository is a subfolder named `language_model` containing various language models capable of efficient transcription. 
```python @@ -136,7 +138,7 @@ pipe = AutomaticSpeechRecognitionPipeline( transcription = pipe("/content/Kibuuka_eng.mp3") ``` - The resulting dictionary `transcription` will contain a `text` key containing all the transcribed text as well as a `chunks` containing individual texts along with their time stamps of the format below: +The resulting dictionary `transcription` will contain a `text` key containing all the transcribed text as well as a `chunks` containing individual texts along with their time stamps of the format below: ```python { @@ -165,7 +167,7 @@ import librosa SAMPLE_RATE = 16000 def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray: - + try: # librosa automatically resamples to the given sample rate (if necessary) # and converts the signal to mono (by averaging channels) @@ -175,6 +177,7 @@ def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray: return audio ``` + The `load_audio` functions takes an audio file and sampling rate as one of its parameters. The sampling rate used for this Speaker Diarization is 16000. This sampling rate should be the same sampling rate used to transcribe the audio from using the Sunbird mms to ensure consistency with the output. **Diarization Pipeline** @@ -183,7 +186,6 @@ The class `Diarization Pipeline` is a custom class created to facilitate the dia It returns a pandas DataFrame with with columns for the segment, label, speaker, start time, and end time of each speaker segment. - ```python class DiarizationPipeline: def __init__( @@ -242,7 +244,7 @@ The function iterates through segments of a transcript and assigns the speaker l In case of no overlap, a the fill_nearest parameter can be set to `True`, then the function will assign the speakers to segments by finding the closest speaker in time. 
The function takes parameters: - + `diarize_df`: a pandas DataFrame returned by the DiarizationPipeline containing the diarization information with columns like `start`, `end` and `speaker` `transcript_result`: A dictionary with a key `chunks` that contains a list of trancript `Segments` obtained from the ASR pipeline. @@ -264,7 +266,7 @@ The function takes parameters: ```python def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False): - + transcript_segments = transcript_result["chunks"] for seg in transcript_segments: @@ -288,6 +290,7 @@ def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False): ``` **Running the diarization model** + ```python diarize_model = DiarizationPipeline(use_auth_token=hf_token, device=device) diarize_segments = diarize_model("/content/Kibuuka_eng.mp3", min_speakers=1, max_speakers=2) @@ -445,4 +448,4 @@ output {'text': 'you', 'timestamp': (45.48, 45.54), 'speaker': 'SPEAKER_01'}, {'text': 'are', 'timestamp': (45.56, 45.62), 'speaker': 'SPEAKER_01'}, {'text': 'married', 'timestamp': (45.68, 45.92), 'speaker': 'SPEAKER_01'}]} -``` \ No newline at end of file +``` diff --git a/docs/tutorials/14-diarization-training.md b/docs/tutorials/14-diarization-training.md index 08d5819..c736118 100644 --- a/docs/tutorials/14-diarization-training.md +++ b/docs/tutorials/14-diarization-training.md @@ -2,7 +2,7 @@ This process highlights the steps taken for Model Training on the [CallHome Dataset](https://huggingface.co/datasets/talkbank/callhome). For this particular dataset we used the English version of the CallHome Dataset. The Model Training Architecture, Loss Functions, Optimisation Techniques, Data Augmentation and Metrics Used. -# Segmentation Model Configuration Explained +## Segmentation Model Configuration Explained ## Overview @@ -38,7 +38,7 @@ Parameters: `target`: Ground truth labels. 
Type: `torch.Tensor` -`weight`: Type: `Optional[torch.Tensor]` +`weight`: Type: `Optional[torch.Tensor]` Returns: Permutation-invariant segmentation loss. `torch.Tensor` @@ -47,6 +47,7 @@ Returns: Permutation-invariant segmentation loss. `torch.Tensor` `to_pyannote_model`: Converts the current model to a pyannote segmentation model for use in pyannote pipelines ```python + class SegmentationModel(PreTrainedModel): config_class = SegmentationModelConfig @@ -121,7 +122,7 @@ class SegmentationModel(PreTrainedModel): target: torch.Tensor, weight: Optional[torch.Tensor] = None, ) -> torch.Tensor: - + if self.specifications.powerset: # `clamp_min` is needed to set non-speech weight to 1. @@ -139,7 +140,7 @@ class SegmentationModel(PreTrainedModel): @classmethod def from_pyannote_model(cls, pretrained): - + # Initialize model: specifications = copy.deepcopy(pretrained.specifications) @@ -178,7 +179,7 @@ class SegmentationModel(PreTrainedModel): return model def to_pyannote_model(self): - + seg_model = PyanNet(sincnet={"stride": 10}) seg_model.hparams.update(self.model.hparams) @@ -208,8 +209,9 @@ class SegmentationModel(PreTrainedModel): - Configuration parameters like chunk duration, number of speakers per chunk/frame, minimum duration, warm-up period, etc. ```python + class SegmentationModelConfig(PretrainedConfig): - + model_type = "pyannet" def __init__( @@ -222,7 +224,7 @@ class SegmentationModelConfig(PretrainedConfig): weigh_by_cardinality=False, **kwargs, ): - + super().__init__(**kwargs) self.chunk_duration = chunk_duration self.max_speakers_per_frame = max_speakers_per_frame @@ -249,12 +251,15 @@ class SegmentationModelConfig(PretrainedConfig): ### Optimization Techniques ### Batch Size - - This refers to the number of samples that you feed into your model at each iteration of the training process. 
This can be adjusted accordingly to optimise the performance of your model + +- This refers to the number of samples that you feed into your model at each iteration of the training process. This can be adjusted accordingly to optimise the performance of your model ### Learning Rate - - This is an optimization tunning parameter that determines the step-size at each iteration while moving towards a minimum loss function + +- This is an optimization tuning parameter that determines the step-size at each iteration while moving towards a minimum loss function ### Training Epochs + - An epoch refers to a complete pass through the entire training dataset. A model is exposed to all the training examples and updates its parametrs basd on the patterns it learns. In our case, we try and iterate and test with 5, 10 and 20 epochs and find that the Diarisation Error Rate remains constant at "'der': 0.23994926057695026" #### Warm-up @@ -269,23 +274,22 @@ class SegmentationModelConfig(PretrainedConfig): ### Data Augmentation Methods - For our case this is done using the the DataCollator class. This class is responsible for collecting data and ensuring that the target labels are dynamically padded. -- Pads the target labels to ensure they have the same shape. +- Pads the target labels to ensure they have the same shape. - Pads with zeros if the number of speakers in a chunk is less than the maximum number of speakers per chunk - - #### Preprocessing Steps - Preprocessing steps like random overlap and fixed overlap during chunking can be considered a form of augmentation as they provide varied inputs to the model. - `Preprocess` class used to handle these preprocessing steps is not detailed here, but it's responsible for preparing the input data. 
```python + class Preprocess: def __init__( self, config, ): - + self.chunk_duration = config.chunk_duration self.max_speakers_per_frame = config.max_speakers_per_frame self.max_speakers_per_chunk = config.max_speakers_per_chunk @@ -301,7 +305,7 @@ class Preprocess: ).shape def get_labels_in_file(self, file): - + file_labels = [] for i in range(len(file["speakers"][0])): @@ -311,7 +315,7 @@ class Preprocess: return file_labels def get_segments_in_file(self, file, labels): - + file_annotations = [] @@ -328,7 +332,7 @@ class Preprocess: return annotations def get_chunk(self, file, start_time): - + sample_rate = file["audio"][0]["sampling_rate"] @@ -420,6 +424,7 @@ class Preprocess: - For the metrics we have the Diarisation Error Rate(DER), FalseAlarm Rate, MissedDetectionRate and the SpeakerConfusionRate with the implementation in the metrics class below. ```python + import numpy as np import torch from pyannote.audio.torchmetrics import (DiarizationErrorRate, FalseAlarmRate, @@ -544,9 +549,10 @@ class DataCollator: ### Training Script - The script [train_segmentation.py](https://github.com/huggingface/diarizers/) - can be used to pre-process a diarization dataset and subsequently fine-tune the pyannote segmentation model. In the following example, we fine-tuned the segmentation model on the English subset of the CallHome dataset, a conversational dataset between native speakers: + can be used to pre-process a diarization dataset and subsequently fine-tune the pyannote segmentation model. In the following example, we fine-tuned the segmentation model on the English subset of the CallHome dataset, a conversational dataset between native speakers: ```bash + !python3 train_segmentation.py \ --dataset_name=diarizers-community/callhome \ --dataset_config_name=eng \ @@ -574,6 +580,7 @@ class DataCollator: The script [test_segmentation.py](https://github.com/huggingface/diarizers/)can be used to evaluate a fine-tuned model on a diarization dataset. 
In the following example, we evaluate the fine-tuned model from the previous step on the test split of the CallHome English dataset: ```bash + !python3 test_segmentation.py \ --dataset_name=diarizers-community/callhome \ --dataset_config_name=eng \ @@ -588,12 +595,12 @@ The script [test_segmentation.py](https://github.com/huggingface/diarizers/)can ![alt text](EVAL.PNG) - ### Inference with Pyannote -- The fine-tuned segmentation model can easily be loaded into the pyannote speaker diarization pipeline for inference. To do so, we load the pre-trained speaker diarization pipeline, and subsequently override the segmentation model with our fine-tuned checkpoint: +- The fine-tuned segmentation model can easily be loaded into the pyannote speaker diarization pipeline for inference. To do so, we load the pre-trained speaker diarization pipeline, and subsequently override the segmentation model with our fine-tuned checkpoint: ```python + from diarizers import SegmentationModel from pyannote.audio import Pipeline from datasets import load_dataset @@ -625,5 +632,3 @@ diarization = pipeline(sample) with open("audio.rttm", "w") as rttm: diarization.write_rttm(rttm) ``` - - diff --git a/docs/tutorials/15-diarization-evaluation.md b/docs/tutorials/15-diarization-evaluation.md index 251c9db..d783189 100644 --- a/docs/tutorials/15-diarization-evaluation.md +++ b/docs/tutorials/15-diarization-evaluation.md @@ -355,7 +355,6 @@ For the purpose of this demonstration, the latter two were not obtained. Details ```python - def compute_metrics_on_file(self, file): pred = self.predict(file) @@ -390,6 +389,7 @@ This function iteratively calls the `compute_metrics_on_file` function to perfor `Returns`: The average values of the `der`(diarization error rate) and `f1`(F1 Score). 
```python + def compute_metrics(self): der = 0 diff --git a/mkdocs.yml b/mkdocs.yml index e1a6a42..ded8bef 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,33 +1,175 @@ -site_name: Leb Documentation +site_name: SALT Documentation +site_url: https://salt.sunbird.ai +repo_url: https://github.com/SunbirdAI/salt +repo_name: SunbirdAI/salt + +# Configuration theme: - name: readthedocs + name: material + # custom_dir: material/overrides + features: + - announce.dismiss + - content.action.edit + - content.action.view + - content.code.annotate + - content.code.copy + - content.code.select + # - content.footnote.tooltips + # - content.tabs.link + - content.tooltips + # - header.autohide + # - navigation.expand + - navigation.footer + - navigation.indexes + # - navigation.instant + # - navigation.instant.prefetch + # - navigation.instant.progress + # - navigation.prune + - navigation.sections + - navigation.path + - navigation.tabs + # - navigation.tabs.sticky + - navigation.top + - navigation.tracking + - search.highlight + - search.share + - search.suggest + - toc.follow + # - toc.integrate + palette: + # - media: "(prefers-color-scheme)" + # toggle: + # icon: material/weather-night + # name: Switch to light mode + - media: "(prefers-color-scheme: light)" + scheme: default + primary: white + accent: indigo + toggle: + icon: material/weather-night + name: Switch to dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: black + accent: indigo + toggle: + icon: material/weather-sunny + name: Switch to system preference + font: + text: Roboto + code: Roboto Mono + favicon: assets/favicon.ico + logo: assets/logo.png + +extra_css: + - stylesheets/custom.css +# Plugins plugins: - - mkdocstrings + # - blog + - search: + separator: '[\s\u200b\-_,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])' + - minify: + minify_html: true + +# Additional configuration +extra: + generator: false + status: + new: Recently added + deprecated: Deprecated + # analytics: + # 
provider: google + # property: !ENV GOOGLE_ANALYTICS_KEY +# social: +# - icon: fontawesome/solid/globe +# link: https://sunbird.ai +# - icon: fontawesome/brands/github +# link: https://github.com/SunbirdAI +# - icon: fontawesome/brands/x-twitter +# link: https://twitter.com/sunbirdai +# - icon: fontawesome/brands/linkedin +# link: https://ug.linkedin.com/company/sunbird-ai + +# Extensions +markdown_extensions: + - abbr + - admonition + - attr_list + - def_list + - footnotes + - md_in_html + - tables + - toc: + permalink: true + - pymdownx.arithmatex: + generic: true + - pymdownx.betterem: + smart_enable: all + - pymdownx.caret + - pymdownx.details + - pymdownx.emoji: + emoji_generator: !!python/name:material.extensions.emoji.to_svg + emoji_index: !!python/name:material.extensions.emoji.twemoji + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + # auto_title: true + # linenums: true + linenums_style: pymdownx-inline + - pymdownx.inlinehilite + - pymdownx.keys + # - pymdownx.magiclink: + # normalize_issue_symbols: true + # repo_url_shorthand: true + # user: squidfunk + # repo: mkdocs-material + - pymdownx.mark + - pymdownx.smartsymbols + - pymdownx.snippets: + auto_append: + - includes/mkdocs.md + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format + - pymdownx.tabbed: + alternate_style: true + combine_header_slug: true + slugify: !!python/object/apply:pymdownx.slugs.slugify + kwds: + case: lower + - pymdownx.tasklist: + custom_checkbox: true + - pymdownx.tilde nav: - - '💬 LEB': 'index.md' - - 'Getting Started': - - 'Introduction': tutorials/01-introduction.md - - 'Installation': tutorials/02-installation.md - - 'Quick Tour': tutorials/03-quick-tour.md - - 'Tutorials': - - 'Beginner': - - 'Basics': tutorials/04-basics.md - - 'Data Exploration': tutorials/05-data-exploration.md - - 'Leb Datasets': - - 'Text Datasets': 
tutorials/06-text-datasets.md - - 'Speech Datasets': tutorials/07-speech-datasets.md - - 'Leb Models': - - 'Translation Models': tutorials/08-translation-models.md - - 'ASR Models': tutorials/09-asr-models.md - - 'TTS Models': tutorials/10-tts-models.md - - 'Leb Pipelines': - - 'Data Loading': tutorials/11-data-loading.md - - 'Training': tutorials/12-training.md - - 'Speaker Diarization': + - SALT: index.md + - Getting Started: + - "Introduction": tutorials/01-introduction.md + - "Installation": tutorials/02-installation.md + - "Quick Tour": tutorials/03-quick-tour.md + - Tutorials: + - Beginner: + - "Basics": tutorials/04-basics.md + - "Data Exploration": tutorials/05-data-exploration.md + - SALT Datasets: tutorials/07-speech-datasets.md + - SALT Models: + - "Translation Models": tutorials/08-translation-models.md + - "ASR Models": tutorials/09-asr-models.md + - "TTS Models": tutorials/10-tts-models.md + - SALT Pipelines: + - "Data Loading": tutorials/11-data-loading.md + - "Training": tutorials/12-training.md + - Speaker Diarization: - "Diarization": tutorials/13-diarization.md - "Fine-Tuning Diarization": tutorials/14-diarization-training.md - - 'Reference': - - 'Reference': reference.md - + - "Diarization evaluation": "tutorials/15-diarization-evaluation.md" + - Reference: + - "Reference": reference.md + # - Blog: + # - Intro: blog/index.md + # - API: + # - Sunbird AI API: API/index.md diff --git a/requirements.txt b/requirements.txt index dff6d86..532915c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,89 @@ -datasets -nlpaug -pandas -numpy -librosa -unidecode -clean-text -evaluate -sacrebleu \ No newline at end of file +aiohappyeyeballs==2.4.3 +aiohttp==3.11.7 +aiosignal==1.3.1 +async-timeout==5.0.1 +attrs==24.2.0 +audioread==3.0.1 +babel==2.16.0 +beautifulsoup4==4.12.3 +certifi==2024.8.30 +cffi==1.17.1 +charset-normalizer==3.4.0 +clean-text==0.6.0 +click==8.1.7 +colorama==0.4.6 +csscompressor==0.9.5 +datasets==3.1.0 +decorator==5.1.1 
+dill==0.3.8 +emoji==1.7.0 +evaluate==0.4.3 +filelock==3.16.1 +frozenlist==1.5.0 +fsspec==2024.9.0 +ftfy==6.3.1 +gdown==5.2.0 +ghp-import==2.1.0 +htmlmin2==0.1.13 +huggingface-hub==0.26.2 +idna==3.10 +Jinja2==3.1.4 +joblib==1.4.2 +jsmin==3.0.1 +lazy_loader==0.4 +librosa==0.10.2.post1 +llvmlite==0.43.0 +lxml==5.3.0 +Markdown==3.7 +MarkupSafe==3.0.2 +mergedeep==1.3.4 +mkdocs==1.6.1 +mkdocs-autorefs==1.2.0 +mkdocs-get-deps==0.2.0 +mkdocs-material==9.5.46 +mkdocs-material-extensions==1.3.1 +mkdocs-minify-plugin==0.8.0 +mkdocstrings==0.27.0 +msgpack==1.1.0 +multidict==6.1.0 +multiprocess==0.70.16 +nlpaug==1.1.11 +numba==0.60.0 +numpy==2.0.2 +packaging==24.2 +paginate==0.5.7 +pandas==2.2.3 +pathspec==0.12.1 +platformdirs==4.3.6 +pooch==1.8.2 +portalocker==3.0.0 +propcache==0.2.0 +pyarrow==18.1.0 +pycparser==2.22 +Pygments==2.18.0 +pymdown-extensions==10.12 +PySocks==1.7.1 +python-dateutil==2.9.0.post0 +pytz==2024.2 +PyYAML==6.0.2 +pyyaml_env_tag==0.1 +regex==2024.11.6 +requests==2.32.3 +sacrebleu==2.4.3 +scikit-learn==1.5.2 +scipy==1.14.1 +six==1.16.0 +soundfile==0.12.1 +soupsieve==2.6 +soxr==0.5.0.post1 +tabulate==0.9.0 +threadpoolctl==3.5.0 +tqdm==4.67.1 +typing_extensions==4.12.2 +tzdata==2024.2 +Unidecode==1.3.8 +urllib3==2.2.3 +watchdog==6.0.0 +wcwidth==0.2.13 +xxhash==3.5.0 +yarl==1.18.0 diff --git a/site/img/favicon.ico b/site/img/favicon.ico index e85006a..31f12cf 100644 Binary files a/site/img/favicon.ico and b/site/img/favicon.ico differ