add use_sec option in alignment

Patchethium · Aug 13, 2023 · 68e31b1 · 68e31b1
1 parent 6eaf892
commit 68e31b1
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -40,7 +40,6 @@ I'll cover this part if it's needed by anyone. Please let me know by creating an
 - Rust crate
 - multi-language
 - Storing `pau` index in binary model
-- Option to convert frame number into millisecond
 - Record and warn the user when score is too low
 
 ## Licence
@@ -49,7 +48,7 @@ I'll cover this part if it's needed by anyone. Please let me know by creating an
 
 The file `snfa/stft.py` contains code adapted from `librosa` which obeys `ISC Licence` with different copyright claim. A copy of `librosa`'s licence can be found in [librosa's repo](https://github.com/librosa/librosa/blob/main/LICENSE.md).
 
-The file `snfa/backtrack.py` contains code adapted from `torchaudio` which obeys `BSD 2-Clause "Simplified" License`. A copy of `torchaudio`'s licence can be found in [torchaudio's repo](https://github.com/pytorch/audio/blob/main/LICENSE).
+The file `snfa/viterbi.py` contains code adapted from `torchaudio` which obeys `BSD 2-Clause "Simplified" License`. A copy of `torchaudio`'s licence can be found in [torchaudio's repo](https://github.com/pytorch/audio/blob/main/LICENSE).
 
 ## Credit
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "snfa"
-version = "0.0.5"
+version = "0.0.1"
 authors = [{ name = "Patchethium" }]
 description = "a simple neural forced aligner for phoneme to audio alignment"
 readme = "README.md"

diff --git a/src/snfa/aligner.py b/src/snfa/aligner.py
@@ -22,12 +22,14 @@ def softmax(x, axis=-1):
     e_x = np.exp(x - np.max(x, axis, keepdims=True))
     return e_x / np.sum(e_x, axis, keepdims=True)
 
+
 def l1_normalize(arr, axis=None):
     """Shift ``arr`` so its smallest value is zero, then scale it by the sum of absolute values taken along ``axis``."""
     # The minimum is taken over the whole array (no axis is passed to np.min),
     # even when the normalisation below runs along a single axis.
     arr = arr - np.min(arr)
     # keepdims=True keeps the reduced axis so `norm` broadcasts against `arr`.
     norm = np.sum(np.abs(arr), axis=axis, keepdims=True)
     # NOTE(review): `norm` is 0 wherever every element along `axis` equals the
     # global minimum, so this division is 0/0 there — confirm callers never
     # pass such input.
     normalized_arr = arr / norm
     return normalized_arr
 
+
 def log_softmax(x, axis=-1):
     """Return the element-wise natural logarithm of ``softmax(x, axis)``."""
     return np.log(softmax(x, axis))
 
@@ -79,12 +81,10 @@ def __call__(self, x: np.ndarray) -> np.ndarray:
 class Aligner:
     def __init__(self, filename: str = "model.bin"):
         f = open(filename, "rb")
-        
+
         # Read metadata first, 8 is the amount of metadata entries
         # each entry is one int32 (4 bytes)
-        meta_data: np.ndarray = np.frombuffer(
-            f.read(8 * 4), np.int32, count=8
-        )
+        meta_data: np.ndarray = np.frombuffer(f.read(8 * 4), np.int32, count=8)
         # the entry list
         [
             self.n_fft,
@@ -159,7 +159,7 @@ def get_indices(self, ph):
             raise Exception("phoneme not in model's phoneme set")
         return tokens
 
-    def align(self, x, ph):
+    def align(self, x, ph, use_sec=False):
         mel = self.mel(x)
         indices = self.get_indices(ph)
 
@@ -171,10 +171,15 @@ def align(self, x, ph):
         path = viterbi.backtrack(trellis)
 
         segments = viterbi.merge_repeats(path, indices)
+        if use_sec:
+            for seg in segments:
+                seg.start = seg.start * self.hop_size / self.sr
+                seg.end = seg.end * self.hop_size / self.sr
         return segments, path, trellis, emission, labels
 
-    def __call__(self, x: np.ndarray, ph: List[str]):
-        return self.align(x, ph)
+    def __call__(self, x: np.ndarray, ph: List[str], use_sec=False):
+        return self.align(x, ph, use_sec)
+
 
 if __name__ == "__main__":
     alinger = Aligner("cv_jp.bin")