feat: add librispeech dataset, add text info option to commonvoice
flavioschneider committed Sep 14, 2022
1 parent c258266 commit b2f3eaf
Showing 6 changed files with 94 additions and 10 deletions.
34 changes: 32 additions & 2 deletions README.md
@@ -44,8 +44,38 @@ dataset[0] # (1, 158621)
dataset[1] # (1, 153757)
```

#### Full API:
```py
LJSpeechDataset(
    root: str = "./data", # The root where the dataset will be downloaded
    transforms: Optional[Callable] = None, # Transforms to apply to audio files
)
```
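
As a usage sketch (not part of this commit), `transforms` accepts any callable applied to each waveform tensor; the `peak_normalize` helper below is illustrative only:

```py
from audio_data_pytorch import LJSpeechDataset

def peak_normalize(waveform):
    # Scale each clip so its peak amplitude is 1 (illustrative helper, not in the library)
    return waveform / waveform.abs().max().clamp(min=1e-8)

dataset = LJSpeechDataset(root="./data", transforms=peak_normalize)
dataset[0]  # (1, 158621), values scaled to [-1, 1]
```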

### LibriSpeech Dataset
Wrapper for the [LibriSpeech](https://www.openslr.org/12) dataset (EN only). Requires `pip install datasets`. Note that this dataset requires several GBs of storage.

```py
from audio_data_pytorch import LibriSpeechDataset

dataset = LibriSpeechDataset(
    root="./data",
)

dataset[0] # (1, 222336)
```

#### Full API:
```py
LibriSpeechDataset(
    root: str = "./data", # The root where the dataset will be downloaded
    with_info: bool = False, # Whether to return info (i.e. text, sampling rate, speaker_id)
    transforms: Optional[Callable] = None, # Transforms to apply to audio files
)
```
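
For example (a sketch based on the API above; exact values depend on the downloaded data), `with_info=True` makes the dataset return `(waveform, info)` tuples:

```py
from audio_data_pytorch import LibriSpeechDataset

dataset = LibriSpeechDataset(root="./data", with_info=True)

waveform, info = dataset[0]
waveform             # (1, 222336)
info["text"]         # Transcript of the utterance
info["sample_rate"]  # 16000 (LibriSpeech audio is 16 kHz)
info["speaker_id"]   # Integer speaker identifier
```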

### Common Voice Dataset
-Multilanguage wrapper for the [Common Voice](https://commonvoice.mozilla.org/) dataset with voice-only data. Requires `pip install datasets`. Note that each language requires several GBs of storage, and that you have to confirm access for each distinct version you use e.g. [here](https://huggingface.co/datasets/mozilla-foundation/common_voice_10_0), to validate your Huggingface access token. You can provide a list of `languages` and to avoid an unbalanced dataset the values will be interleaved by downsampling the majority language to have the same number of samples as the minority language.
+Multilanguage wrapper for the [Common Voice](https://commonvoice.mozilla.org/) dataset. Requires `pip install datasets`. Note that each language requires several GBs of storage, and that you have to confirm access for each version you use (e.g. [here](https://huggingface.co/datasets/mozilla-foundation/common_voice_10_0)) to validate your Huggingface access token. You can provide a list of `languages`; to avoid an unbalanced dataset, they are interleaved by downsampling the majority language to the same number of samples as the minority language.

```py
from audio_data_pytorch import CommonVoiceDataset
@@ -66,7 +96,7 @@ CommonVoiceDataset(
    sub_version: int = 0, # Subversion: common_voice_{version}_{sub_version}
    root: str = "./data", # The root where the dataset will be downloaded
    languages: Sequence[str] = ['en'], # List of languages to include in the dataset
-    with_sample_rate: bool = False, # Returns sample rate as second argument
+    with_info: bool = False, # Whether to return info (i.e. text, sampling rate, age, gender, accent, locale)
    transforms: Optional[Callable] = None, # Transforms to apply to audio files
)
```
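
A usage sketch of the new `with_info` option (illustrative only; any authentication arguments required to use your Huggingface access token, as described above, are omitted here):

```py
from audio_data_pytorch import CommonVoiceDataset

dataset = CommonVoiceDataset(
    root="./data",
    languages=["en"],
    with_info=True,
)

waveform, info = dataset[0]
info["text"]    # Transcribed sentence
info["locale"]  # e.g. "en"
```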
3 changes: 2 additions & 1 deletion audio_data_pytorch/datasets/__init__.py
@@ -1,4 +1,5 @@
from .common_voice_dataset import CommonVoiceDataset
-from .ljspeech_dataset import LJSpeechDataset
+from .libri_speech_dataset import LibriSpeechDataset
+from .lj_speech_dataset import LJSpeechDataset
from .wav_dataset import WAVDataset
from .youtube_dataset import YoutubeDataset
21 changes: 15 additions & 6 deletions audio_data_pytorch/datasets/common_voice_dataset.py
@@ -1,5 +1,5 @@
import os
-from typing import Callable, Optional, Sequence, Tuple, Union
+from typing import Callable, Dict, Optional, Sequence, Tuple, Union

import torch
from torch import Tensor
@@ -14,10 +14,10 @@ def __init__(
        sub_version: int = 0,
        root: str = "./data",
        languages: Sequence[str] = ["en"],
-        with_sample_rate: bool = False,
+        with_info: bool = False,
        transforms: Optional[Callable] = None,
    ):
-        self.with_sample_rate = with_sample_rate
+        self.with_info = with_info
        self.transforms = transforms

        from datasets import interleave_datasets, load_dataset
@@ -37,15 +37,24 @@

    def __getitem__(
        self, idx: Union[Tensor, int]
-    ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
+    ) -> Union[Tensor, Tuple[Tensor, Dict]]:
        idx = idx.tolist() if torch.is_tensor(idx) else idx  # type: ignore
        data = self.dataset[idx]

        waveform = torch.tensor(data["audio"]["array"]).view(1, -1)
-        sample_rate = data["audio"]["sampling_rate"]
+
+        info = dict(
+            sample_rate=data["audio"]["sampling_rate"],
+            text=data["sentence"],
+            age=data["age"],
+            accent=data["accent"],
+            gender=data["gender"],
+            locale=data["locale"],
+        )
+
        if self.transforms:
            waveform = self.transforms(waveform)
-        return (waveform, sample_rate) if self.with_sample_rate else waveform
+        return (waveform, info) if self.with_info else waveform

    def __len__(self) -> int:
        return len(self.dataset)
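
Since `__getitem__` now returns a `(waveform, info)` tuple when `with_info=True`, and waveforms differ in length, batching with a `DataLoader` needs a custom collate function. A minimal sketch (not part of this commit; `pad_collate` is a hypothetical helper) that right-pads to the longest clip in the batch:

```py
from typing import Dict, List, Tuple

import torch
from torch.nn.functional import pad
from torch.utils.data import DataLoader


def pad_collate(batch: List[Tuple[torch.Tensor, Dict]]):
    # Right-pad every waveform to the longest clip in the batch, keep infos as a list
    waveforms, infos = zip(*batch)
    max_length = max(w.shape[-1] for w in waveforms)
    padded = torch.stack([pad(w, (0, max_length - w.shape[-1])) for w in waveforms])
    return padded, list(infos)  # padded: (batch, 1, max_length)


# loader = DataLoader(dataset, batch_size=4, collate_fn=pad_collate)
```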
44 changes: 44 additions & 0 deletions audio_data_pytorch/datasets/libri_speech_dataset.py
@@ -0,0 +1,44 @@
import os
from typing import Callable, Dict, Optional, Tuple, Union

import torch
from torch import Tensor
from torch.utils.data import Dataset


class LibriSpeechDataset(Dataset):
    def __init__(
        self,
        root: str = "./data",
        with_info: bool = False,
        transforms: Optional[Callable] = None,
    ):
        self.with_info = with_info
        self.transforms = transforms

        from datasets import load_dataset

        self.dataset = load_dataset(
            "librispeech_asr",
            "clean",
            split="train.100",
            cache_dir=os.path.join(root, "librispeech_dataset"),
        )

    def __getitem__(
        self, idx: Union[Tensor, int]
    ) -> Union[Tensor, Tuple[Tensor, Dict]]:
        idx = idx.tolist() if torch.is_tensor(idx) else idx  # type: ignore
        data = self.dataset[idx]
        waveform = torch.tensor(data["audio"]["array"]).view(1, -1)
        info = dict(
            sample_rate=data["audio"]["sampling_rate"],
            text=data["text"],
            speaker_id=data["speaker_id"],
        )
        if self.transforms:
            waveform = self.transforms(waveform)
        return (waveform, info) if self.with_info else waveform

    def __len__(self) -> int:
        return len(self.dataset)
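
A per-item alternative (a sketch, not part of this commit; `crop_or_pad` is a hypothetical helper) is to pass a `transforms` callable that fixes every clip to the same length, so the default `DataLoader` collation works:

```py
import torch
import torch.nn.functional as F

from audio_data_pytorch import LibriSpeechDataset


def crop_or_pad(waveform: torch.Tensor, length: int = 2**17) -> torch.Tensor:
    # Crop to `length` samples, or right-pad with zeros if the clip is shorter
    if waveform.shape[-1] >= length:
        return waveform[..., :length]
    return F.pad(waveform, (0, length - waveform.shape[-1]))


dataset = LibriSpeechDataset(root="./data", transforms=crop_or_pad)
dataset[0]  # (1, 131072)
```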
2 changes: 1 addition & 1 deletion setup.py
@@ -3,7 +3,7 @@
setup(
    name="audio-data-pytorch",
    packages=find_packages(exclude=[]),
-    version="0.0.10",
+    version="0.0.11",
    license="MIT",
    description="Audio Data - PyTorch",
    long_description_content_type="text/markdown",
