Skip to content

Commit

Permalink
Merge pull request #13 from sooftware/dev
Browse files Browse the repository at this point in the history
Release v0.2 (resolved #11 resolved #12)
  • Loading branch information
sooftware authored Jun 7, 2021
2 parents 39b0328 + 697c04b commit b6e2682
Show file tree
Hide file tree
Showing 64 changed files with 504 additions and 81 deletions.
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,32 @@ $ python ./openspeech_cli/hydra_train.py \
criterion=ctc
```

### Evaluation examples

- Example 1: Evaluate the `listen_attend_spell` model:

```
$ python ./openspeech_cli/hydra_eval.py \
audio=melspectrogram \
eval.model_name=listen_attend_spell \
eval.dataset_path=$DATASET_PATH \
eval.checkpoint_path=$CHECKPOINT_PATH \
eval.manifest_file_path=$MANIFEST_FILE_PATH
```

- Example 2: Evaluate the `listen_attend_spell` and `conformer_lstm` models as an ensemble:

```
$ python ./openspeech_cli/hydra_eval.py \
audio=melspectrogram \
eval.model_names=(listen_attend_spell, conformer_lstm) \
eval.dataset_path=$DATASET_PATH \
eval.checkpoint_paths=($CHECKPOINT_PATH1, $CHECKPOINT_PATH2) \
eval.ensemble_weights=(0.3, 0.7) \
eval.ensemble_method=weighted \
eval.manifest_file_path=$MANIFEST_FILE_PATH
```

## Installation

This project recommends Python 3.7 or higher.
Expand Down
5 changes: 5 additions & 0 deletions openspeech/configs/eval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# @package _group_

defaults:
- audio: null
- eval: default
File renamed without changes.
2 changes: 1 addition & 1 deletion openspeech/data/audio/filter_bank/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class FilterBankConfigs(OpenspeechDataclass):
Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
Configurations:
Args:
name (str): name of feature transform. (default: fbank)
sample_rate (int): sampling rate of audio (default: 16000)
frame_length (float): frame length for spectrogram (default: 20.0)
Expand Down
2 changes: 1 addition & 1 deletion openspeech/data/audio/melspectrogram/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class MelSpectrogramConfigs(OpenspeechDataclass):
Configuration objects inherit from :class: `~openspeech.dataclass.OpenspeechDataclass`.
Configurations:
Args:
name (str): name of feature transform. (default: melspectrogram)
sample_rate (int): sampling rate of audio (default: 16000)
frame_length (float): frame length for spectrogram (default: 20.0)
Expand Down
2 changes: 1 addition & 1 deletion openspeech/data/audio/mfcc/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class MFCCConfigs(OpenspeechDataclass):
Configuration objects inherit from :class: `~openspeech.dataclass.OpenspeechDataclass`.
Configurations:
Args:
name (str): name of feature transform. (default: mfcc)
sample_rate (int): sampling rate of audio (default: 16000)
frame_length (float): frame length for spectrogram (default: 20.0)
Expand Down
2 changes: 1 addition & 1 deletion openspeech/data/audio/spectrogram/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class SpectrogramConfigs(OpenspeechDataclass):
Configuration objects inherit from :class: `~openspeech.dataclass.OpenspeechDataclass`.
Configurations:
Args:
name (str): name of feature transform. (default: spectrogram)
sample_rate (int): sampling rate of audio (default: 16000)
frame_length (float): frame length for spectrogram (default: 20.0)
Expand Down
25 changes: 25 additions & 0 deletions openspeech/data/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from typing import Tuple

import torch
import numpy as np
Expand Down Expand Up @@ -132,3 +133,27 @@ def __len__(self):

def shuffle(self, epoch):
np.random.shuffle(self.bins)


def load_dataset(manifest_file_path: str) -> Tuple[list, list]:
    """
    Read an evaluation manifest and collect its audio paths and transcripts.

    Every non-blank line must hold exactly three tab-separated fields.
    The first field is kept as the audio path, the second field is ignored,
    and the third field is kept as the transcript.

    Args:
        manifest_file_path (str): evaluation manifest file path.

    Returns: audio_paths, transcripts
        * audio_paths (list): first field of every manifest line
        * transcripts (list): third field of every manifest line, newline removed

    Raises:
        ValueError: if a non-blank line does not split into exactly three fields.
    """
    audio_paths = list()
    transcripts = list()

    with open(manifest_file_path) as f:
        # Iterate the file object directly; there is no need to load every line up front.
        for line in f:
            # A completely empty line (e.g. an extra newline at the end of the
            # file) carries no record and would otherwise break the unpacking below.
            if not line.rstrip('\n'):
                continue

            audio_path, _, transcript = line.split('\t')

            audio_paths.append(audio_path)
            transcripts.append(transcript.replace('\n', ''))

    return audio_paths, transcripts
2 changes: 1 addition & 1 deletion openspeech/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def __init__(
self.apply_noise_augment = apply_noise_augment
self.apply_time_stretch_augment = apply_time_stretch_augment
self.apply_joining_augment = apply_joining_augment
self.transforms = AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY[configs.name](configs)
self.transforms = AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY[configs.audio.name](configs)
self._load_audio = load_audio

if self.apply_spec_augment:
Expand Down
6 changes: 6 additions & 0 deletions openspeech/dataclass/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
Fp16GPUTrainerConfigs,
Fp16TPUTrainerConfigs,
Fp64CPUTrainerConfigs,
EvaluationConfigs,
EnsembleEvaluationConfigs,
)

OPENSPEECH_CONFIGS = [
Expand Down Expand Up @@ -62,3 +64,7 @@
AUGMENT_DATACLASS_REGISTRY = {
"default": AugmentConfigs,
}
EVAL_DATACLASS_REGISTRY = {
"default": EvaluationConfigs,
"ensemble": EnsembleEvaluationConfigs,
}
56 changes: 56 additions & 0 deletions openspeech/dataclass/configurations.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,62 @@ class VocabularyConfigs(OpenspeechDataclass):
)


@dataclass
class EvaluationConfigs(OpenspeechDataclass):
    """
    Configuration dataclass for evaluating a single model.

    Configurations:
        model_name (str): Model name. (default: MISSING)
        dataset_path (str): Path of dataset. (default: MISSING)
        checkpoint_path (str): Path of model checkpoint. (default: MISSING)
        manifest_file_path (str): Path of evaluation manifest file. (default: MISSING)
        num_workers (int): Number of workers. (default: 4)
        batch_size (int): Batch size. (default: 32)
        beam_size (int): Beam size of beam search. (default: 1)
    """
    # Fields without a usable default
    model_name: str = field(default=MISSING, metadata={"help": "Model name."})
    dataset_path: str = field(default=MISSING, metadata={"help": "Path of dataset."})
    checkpoint_path: str = field(default=MISSING, metadata={"help": "Path of model checkpoint."})
    manifest_file_path: str = field(default=MISSING, metadata={"help": "Path of evaluation manifest file."})

    # Fields with concrete defaults
    num_workers: int = field(default=4, metadata={"help": "Number of worker."})
    batch_size: int = field(default=32, metadata={"help": "Batch size."})
    beam_size: int = field(default=1, metadata={"help": "Beam size of beam search."})


@dataclass
class EnsembleEvaluationConfigs(OpenspeechDataclass):
    """
    Configuration dataclass for evaluating several models as an ensemble.

    Configurations:
        model_names (str): List of model names. (default: MISSING)
        dataset_paths (str): Path of dataset. (default: MISSING)
        checkpoint_paths (str): List of model checkpoint paths. (default: MISSING)
        manifest_file_path (str): Path of evaluation manifest file. (default: MISSING)
        ensemble_method (str): Method of ensemble (vanilla, weighted). (default: vanilla)
        ensemble_weights (str): Weights of ensemble models. (default: "(1.0, 1.0, 1.0 ..)")
        num_workers (int): Number of workers. (default: 4)
        batch_size (int): Batch size. (default: 32)
        beam_size (int): Beam size of beam search. (default: 1)
    """
    # NOTE(review): the list-like options below are declared as plain strings;
    # presumably the consumer parses them - confirm against the evaluation script.
    model_names: str = field(default=MISSING, metadata={"help": "List of model name."})
    dataset_paths: str = field(default=MISSING, metadata={"help": "Path of dataset."})
    checkpoint_paths: str = field(default=MISSING, metadata={"help": "List of model checkpoint path."})
    manifest_file_path: str = field(default=MISSING, metadata={"help": "Path of evaluation manifest file."})

    ensemble_method: str = field(default="vanilla", metadata={"help": "Method of ensemble (vanilla, weighted)"})
    # NOTE(review): this default looks like a descriptive placeholder rather than
    # a parseable value - verify how it is handled when the option is not overridden.
    ensemble_weights: str = field(default="(1.0, 1.0, 1.0 ..)", metadata={"help": "Weights of ensemble models."})

    num_workers: int = field(default=4, metadata={"help": "Number of worker."})
    batch_size: int = field(default=32, metadata={"help": "Batch size."})
    beam_size: int = field(default=1, metadata={"help": "Beam size of beam search."})


def generate_openspeech_configs_with_help():
from openspeech.dataclass import OPENSPEECH_CONFIGS, TRAINER_DATACLASS_REGISTRY
from openspeech.models import MODEL_DATACLASS_REGISTRY
Expand Down
18 changes: 18 additions & 0 deletions openspeech/dataclass/initialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,21 @@ def hydra_init() -> None:

for k, v in dataclass_registry.items():
cs.store(group=group, name=k, node=v, provider="openspeech")


def hydra_eval_init() -> None:
    """
    Store the evaluation config dataclasses in the ``ConfigStore`` instance.

    Every entry of the audio feature transform registry is stored under the
    ``audio`` group and every entry of the evaluation registry under the
    ``eval`` group, all with ``provider="openspeech"``.
    """
    # Imported inside the function, as in the original.
    from openspeech.data import AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY
    from openspeech.dataclass import EVAL_DATACLASS_REGISTRY

    group_registries = (
        ("audio", AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY),
        ("eval", EVAL_DATACLASS_REGISTRY),
    )

    config_store = ConfigStore.instance()

    for group_name, registry in group_registries:
        for config_name, config_node in registry.items():
            config_store.store(
                group=group_name,
                name=config_name,
                node=config_node,
                provider="openspeech",
            )
2 changes: 1 addition & 1 deletion openspeech/decoders/transformer_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ def forward(
input_var = input_var.fill_(self.pad_id)
input_var[:, 0] = self.sos_id

for di in range(1, self.max_length):
for di in range(self.max_length):
input_lengths = torch.IntTensor(batch_size).fill_(di)

outputs = self.forward_step(
Expand Down
2 changes: 1 addition & 1 deletion openspeech/models/conformer/configurations.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class ConformerConfigs(OpenspeechDataclass):
Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
Configurations:
Args:
model_name (str): Model name (default: conformer)
encoder_dim (int): Dimension of encoder. (default: 512)
num_encoder_layers (int): The number of encoder layers. (default: 17)
Expand Down
2 changes: 1 addition & 1 deletion openspeech/models/conformer_lstm/configurations.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class ConformerLSTMConfigs(OpenspeechDataclass):
Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
Configurations:
Args:
model_name (str): Model name (default: conformer_lstm)
encoder_dim (int): Dimension of encoder. (default: 512)
num_encoder_layers (int): The number of encoder layers. (default: 17)
Expand Down
3 changes: 1 addition & 2 deletions openspeech/models/conformer_lstm/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,12 @@ def build_model(self):
rnn_type=self.configs.model.rnn_type,
)

def set_beam_decoder(self, batch_size: int, beam_size: int = 3):
def set_beam_decoder(self, beam_size: int = 3):
""" Setting beam search decoder """
from openspeech.search import BeamSearchLSTM
self.decoder = BeamSearchLSTM(
decoder=self.decoder,
beam_size=beam_size,
batch_size=batch_size,
)

def forward(self, inputs: Tensor, input_lengths: Tensor) -> Dict[str, Tensor]:
Expand Down
2 changes: 1 addition & 1 deletion openspeech/models/conformer_transducer/configurations.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class ConformerTransducerConfigs(OpenspeechDataclass):
Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
Configurations:
Args:
model_name (str): Model name (default: conformer_transducer)
encoder_dim (int): Dimension of encoder. (default: 512)
num_encoder_layers (int): The number of encoder layers. (default: 17)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class DeepCNNWithJointCTCListenAttendSpellConfigs(OpenspeechDataclass):
Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
Configurations:
Args:
model_name (str): Model name (default: deep_cnn_with_joint_ctc_listen_attend_spell)
num_encoder_layers (int): The number of encoder layers. (default: 3)
num_decoder_layers (int): The number of decoder layers. (default: 2)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,12 @@ def build_model(self):
rnn_type=self.configs.model.rnn_type,
)

def set_beam_decoder(self, batch_size: int, beam_size: int = 3):
def set_beam_decoder(self, beam_size: int = 3):
""" Setting beam search decoder """
from openspeech.search import BeamSearchLSTM
self.decoder = BeamSearchLSTM(
decoder=self.decoder,
beam_size=beam_size,
batch_size=batch_size,
)

def forward(self, inputs: Tensor, input_lengths: Tensor) -> Dict[str, Tensor]:
Expand Down
2 changes: 1 addition & 1 deletion openspeech/models/deepspeech2/configurations.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class DeepSpeech2Configs(OpenspeechDataclass):
Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
Configurations:
Args:
model_name (str): Model name (default: deepspeech2)
num_rnn_layers (int): The number of rnn layers. (default: 5)
rnn_hidden_dim (int): The hidden state dimension of rnn. (default: 1024)
Expand Down
2 changes: 1 addition & 1 deletion openspeech/models/jasper10x5/configurations.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class Jasper10x5Config(OpenspeechDataclass):
Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
Configurations:
Args:
model_name (str): Model name (default: jasper10x5)
num_blocks (int): Number of jasper blocks (default: 10)
num_sub_blocks (int): Number of jasper sub blocks (default: 5)
Expand Down
2 changes: 1 addition & 1 deletion openspeech/models/jasper5x3/configurations.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class Jasper5x3Config(OpenspeechDataclass):
Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
Configurations:
Args:
model_name (str): Model name (default: jasper5x3)
num_blocks (int): Number of jasper blocks (default: 5)
num_sub_blocks (int): Number of jasper sub blocks (default: 3)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class JointCTCConformerLSTMConfigs(OpenspeechDataclass):
Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
Configurations:
Args:
model_name (str): Model name (default: joint_ctc_conformer_lstm)
encoder_dim (int): Dimension of encoder. (default: 512)
num_encoder_layers (int): The number of encoder layers. (default: 17)
Expand Down
3 changes: 1 addition & 2 deletions openspeech/models/joint_ctc_conformer_lstm/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,12 @@ def build_model(self):
rnn_type=self.configs.model.rnn_type,
)

def set_beam_decoder(self, batch_size: int, beam_size: int = 3):
def set_beam_decoder(self, beam_size: int = 3):
""" Setting beam search decoder """
from openspeech.search import BeamSearchLSTM
self.decoder = BeamSearchLSTM(
decoder=self.decoder,
beam_size=beam_size,
batch_size=batch_size,
)

def forward(self, inputs: Tensor, input_lengths: Tensor) -> Dict[str, Tensor]:
Expand Down
30 changes: 15 additions & 15 deletions openspeech/models/joint_ctc_listen_attend_spell/configurations.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,21 +35,21 @@ class JointCTCListenAttendSpellConfigs(OpenspeechDataclass):
Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
Configurations:
model_name (str): Model name (default: joint_ctc_listen_attend_spell)
num_encoder_layers (int): The number of encoder layers. (default: 3)
num_decoder_layers (int): The number of decoder layers. (default: 2)
hidden_state_dim (int): The hidden state dimension of encoder. (default: 768)
encoder_dropout_p (float): The dropout probability of encoder. (default: 0.3)
encoder_bidirectional (bool): If True, becomes a bidirectional encoders (default: True)
rnn_type (str): Type of rnn cell (rnn, lstm, gru) (default: lstm)
joint_ctc_attention (bool): Flag indication joint ctc attention or not (default: True)
max_length (int): Max decoding length. (default: 128)
num_attention_heads (int): The number of attention heads. (default: 1)
decoder_dropout_p (float): The dropout probability of decoder. (default: 0.2)
decoder_attn_mechanism (str): The attention mechanism for decoder. (default: loc)
teacher_forcing_ratio (float): The ratio of teacher forcing. (default: 1.0)
optimizer (str): Optimizer for training. (default: adam)
Args:
model_name (str): Model name (default: joint_ctc_listen_attend_spell)
num_encoder_layers (int): The number of encoder layers. (default: 3)
num_decoder_layers (int): The number of decoder layers. (default: 2)
hidden_state_dim (int): The hidden state dimension of encoder. (default: 768)
encoder_dropout_p (float): The dropout probability of encoder. (default: 0.3)
encoder_bidirectional (bool): If True, the encoder is bidirectional. (default: True)
rnn_type (str): Type of rnn cell (rnn, lstm, gru) (default: lstm)
joint_ctc_attention (bool): Flag indicating whether joint CTC-attention is used. (default: True)
max_length (int): Max decoding length. (default: 128)
num_attention_heads (int): The number of attention heads. (default: 1)
decoder_dropout_p (float): The dropout probability of decoder. (default: 0.2)
decoder_attn_mechanism (str): The attention mechanism for decoder. (default: loc)
teacher_forcing_ratio (float): The ratio of teacher forcing. (default: 1.0)
optimizer (str): Optimizer for training. (default: adam)
"""
model_name: str = field(
default="joint_ctc_listen_attend_spell", metadata={"help": "Model name"}
Expand Down
Loading

0 comments on commit b6e2682

Please sign in to comment.