From 6eaf892017a88e5938033dc4eb494dc68f688f1a Mon Sep 17 00:00:00 2001 From: Patchethium Date: Sun, 13 Aug 2023 22:24:37 +0900 Subject: [PATCH] use l1 norm instead of softmax --- README.md | 3 +++ src/snfa/aligner.py | 15 +++++++++------ src/snfa/viterbi.py | 10 +++++----- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index d320639..0126d30 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,9 @@ I'll cover this part if it's needed by anyone. Please let me know by creating an - Rust crate - multi-language +- Storing `pau` index in binary model +- Option to convert frame number into millisecond +- Record and warn the user when score is too low ## Licence diff --git a/src/snfa/aligner.py b/src/snfa/aligner.py index cfcc327..a55bbf0 100644 --- a/src/snfa/aligner.py +++ b/src/snfa/aligner.py @@ -22,6 +22,11 @@ def softmax(x, axis=-1): e_x = np.exp(x - np.max(x, axis, keepdims=True)) return e_x / np.sum(e_x, axis, keepdims=True) +def l1_normalize(arr, axis=None): + arr = arr - np.min(arr) + norm = np.sum(np.abs(arr), axis=axis, keepdims=True) + normalized_arr = arr / norm + return normalized_arr def log_softmax(x, axis=-1): return np.log(softmax(x, axis)) @@ -147,9 +152,6 @@ def mel(self, x: np.ndarray): mel = np.fliplr(mel) return mel - def _norm_labels(self, labels) -> np.ndarray: - return softmax(labels[:, 1:], axis=1) - def get_indices(self, ph): try: tokens = np.array([int(self.phone_set.index(p)) for p in ph]) @@ -162,13 +164,14 @@ def align(self, x, ph): indices = self.get_indices(ph) labels = self.model_forward(mel) - labels = self._norm_labels(labels) - trellis = viterbi.get_trellis(labels, indices) + emission = l1_normalize(labels[:, 1:], axis=1)[:, indices] + + trellis = viterbi.get_trellis(emission) path = viterbi.backtrack(trellis) segments = viterbi.merge_repeats(path, indices) - return segments, path, trellis, labels + return segments, path, trellis, emission, labels def __call__(self, x: np.ndarray, ph: List[str]): 
return self.align(x, ph) diff --git a/src/snfa/viterbi.py b/src/snfa/viterbi.py index fd62908..489f68a 100644 --- a/src/snfa/viterbi.py +++ b/src/snfa/viterbi.py @@ -42,21 +42,21 @@ class Point: time_index: int -def get_trellis(emission: np.ndarray, tokens: np.ndarray) -> np.ndarray: +def get_trellis(emission: np.ndarray) -> np.ndarray: """ Get a cost matrix `trellis` from emission using Viterbi algorithm. """ - num_frames, num_tokens = emission.shape[0], tokens.shape[0] + num_frames, num_tokens = emission.shape trellis = np.zeros((num_frames, num_tokens)) - trellis[1:, 0] = np.cumsum(emission[1:, tokens[0]], 0) + trellis[1:, 0] = np.cumsum(emission[1:, 0], 0) trellis[0, 1:] = -np.inf trellis[-num_tokens + 1 :, 0] = np.inf for t in range(num_frames - 1): candidate = np.maximum( - trellis[t, 1:] + emission[t + 1, tokens[1:]], - trellis[t, :-1] + emission[t + 1, tokens[1:]], + trellis[t, 1:] + emission[t + 1, 1:], + trellis[t, :-1] + emission[t + 1, 1:], ) trellis[t + 1, 1:] = candidate