From 8b2337ece52b3f47b45ccd2e2b3364984032a8ea Mon Sep 17 00:00:00 2001 From: Michael McAuliffe Date: Thu, 21 Mar 2024 20:14:08 -0700 Subject: [PATCH] 0.6.4 (#17) --- CMakeLists.txt | 4 +- extensions/ivector/ivector.cpp | 138 ++++++++++++++++++++++++--------- kalpy/feat/pitch.py | 22 +++++- kalpy/ivector/data.py | 18 ++--- kalpy/ivector/plda.py | 59 ++++++++++---- 5 files changed, 177 insertions(+), 64 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 22bf605..d29ade5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,7 +86,9 @@ target_link_libraries(_kalpy PUBLIC kaldi-base kaldi-chain fstscript ) -if(CUDAToolkit_FOUND) +find_library(KALDI_CUDADECODER kaldi-cudadecoder) + +if(CUDAToolkit_FOUND AND KALDI_CUDADECODER) target_link_libraries(_kalpy PUBLIC kaldi-cudadecoder kaldi-cudafeat ) diff --git a/extensions/ivector/ivector.cpp b/extensions/ivector/ivector.cpp index e0d5785..004ca05 100644 --- a/extensions/ivector/ivector.cpp +++ b/extensions/ivector/ivector.cpp @@ -869,6 +869,29 @@ void pybind_plda(py::module &m) { py::arg("utterance_ivector"), py::arg("transformed_enrolled_ivectors"), py::arg("num_enroll_utts")) + .def("score", + []( + PyClass &plda, + const VectorBase & utterance_ivector, + const std::vector> &transformed_enrolled_ivectors + ){ + py::gil_scoped_release gil_release; + PldaConfig plda_config; + Vector ivector_one_dbl(utterance_ivector); + + std::vector scores; + + for (int32 j = 0; j < transformed_enrolled_ivectors.size(); j++) { + Vector ivector_two_dbl(transformed_enrolled_ivectors[j]); + scores.push_back(plda.LogLikelihoodRatio(ivector_one_dbl, + 1, + ivector_two_dbl)); + } + return scores; + + }, + py::arg("utterance_ivector"), + py::arg("transformed_enrolled_ivectors")) .def("log_likelihood_distance", []( PyClass &plda, @@ -912,47 +935,24 @@ void pybind_plda(py::module &m) { py::buffer_info buf3 = result.request(); double *ptr3 = static_cast(buf3.ptr); for (py::size_t i = 0; i < r_one.shape(0); i++){ - Vector 
ivector_one_dbl; - ivector_one_dbl.Resize(r_one.shape(1)); - Vector ivector_two_dbl; - ivector_two_dbl.Resize(r_two.shape(1)); - for (py::size_t j = 0; j < r_one.shape(1); j++){ - ivector_one_dbl(j) = r_one(i, j); - ivector_two_dbl(j) = r_two(i, j); - - } - ptr3[i] = 1.0 / Exp(plda.LogLikelihoodRatio(ivector_one_dbl, - 1, - ivector_two_dbl));; + Vector ivector_one_dbl; + ivector_one_dbl.Resize(r_one.shape(1)); + Vector ivector_two_dbl; + ivector_two_dbl.Resize(r_two.shape(1)); + for (py::size_t j = 0; j < r_one.shape(1); j++){ + ivector_one_dbl(j) = r_one(i, j); + ivector_two_dbl(j) = r_two(i, j); + + } + ptr3[i] = 1.0 / Exp(plda.LogLikelihoodRatio(ivector_one_dbl, + 1, + ivector_two_dbl));; } return result; }, py::arg("utterance_one_ivector"), py::arg("utterance_two_ivector")) - .def("score", - []( - PyClass &plda, - const VectorBase & utterance_ivector, - const std::vector> &transformed_enrolled_ivectors - ){ - py::gil_scoped_release gil_release; - PldaConfig plda_config; - Vector ivector_one_dbl(utterance_ivector); - - std::vector scores; - - for (int32 j = 0; j < transformed_enrolled_ivectors.size(); j++) { - Vector ivector_two_dbl(transformed_enrolled_ivectors[j]); - scores.push_back(plda.LogLikelihoodRatio(ivector_one_dbl, - 1, - ivector_two_dbl)); - } - return scores; - - }, - py::arg("utterance_ivector"), - py::arg("transformed_enrolled_ivectors")) .def(py::pickle( [](const PyClass &p) { // __getstate__ /* Return a tuple that fully encodes the state of the object */ @@ -1074,6 +1074,7 @@ void pybind_plda(py::module &m) { py::array_t & transformed_test_ivector ){ py::gil_scoped_release gil_release; + Vector ivector_one_dbl; auto r1 = transformed_enroll_ivector.unchecked<1>(); ivector_one_dbl.Resize(r1.shape(0)); @@ -1099,6 +1100,35 @@ void pybind_plda(py::module &m) { py::arg("transformed_enroll_ivector"), py::arg("num_enroll_utts"), py::arg("transformed_test_ivector")) + + .def("log_likelihood_ratio", + py::vectorize([]( + + PyClass &plda, + py::array_t & 
transformed_enroll_ivector, + int32 num_enroll_utts, + py::array_t & transformed_test_ivector + ){ + py::gil_scoped_release gil_release; + Vector ivector_one_dbl; + auto r1 = transformed_enroll_ivector.unchecked<1>(); + ivector_one_dbl.Resize(r1.shape(0)); + for (py::size_t i = 0; i < r1.shape(0); i++) + ivector_one_dbl(i) = r1(i); + + Vector ivector_two_dbl; + auto r2 = transformed_test_ivector.unchecked<1>(); + ivector_two_dbl.Resize(r2.shape(0)); + for (py::size_t i = 0; i < r2.shape(0); i++) + ivector_two_dbl(i) = r2(i); + + return plda.LogLikelihoodRatio(ivector_one_dbl, num_enroll_utts, ivector_two_dbl); + + }), + "Numpy vectorized function for log-likelihood ratio.", + py::arg("transformed_enroll_ivector"), + py::arg("num_enroll_utts"), + py::arg("transformed_test_ivector")) .def("SmoothWithinClassCovariance", &PyClass::SmoothWithinClassCovariance, "This function smooths the within-class covariance by adding to it, " @@ -1427,7 +1457,8 @@ void init_ivector(py::module &_m) { m.def("ivector_subtract_mean", []( - std::vector*> &ivectors + std::vector*> &ivectors, + bool normalize = true ) { py::gil_scoped_release gil_release; Vector sum; @@ -1439,7 +1470,40 @@ void init_ivector(py::module &_m) { for (size_t i = 0; i < ivectors.size(); i++) { Vector *ivector = ivectors[i]; ivector->AddVec(-1.0 / ivectors.size(), sum); + if (normalize){ + double norm = ivector->Norm(2.0); + double ratio = norm / sqrt(ivector->Dim()); + ivector->Scale(1.0 / ratio); + + } } }, - py::arg("ivectors")); + py::arg("ivectors"), + py::arg("normalize") = true); + + m.def("ivector_subtract_mean", + []( + std::vector*> &ivectors, + bool normalize = true + ) { + py::gil_scoped_release gil_release; + Vector sum; + + for (size_t i = 0; i < ivectors.size(); i++) { + if (sum.Dim() == 0) sum.Resize(ivectors[i]->Dim()); + sum.AddVec(1.0, *ivectors[i]); + } + for (size_t i = 0; i < ivectors.size(); i++) { + Vector *ivector = ivectors[i]; + ivector->AddVec(-1.0 / ivectors.size(), sum); + if 
(normalize){ + double norm = ivector->Norm(2.0); + double ratio = norm / sqrt(ivector->Dim()); + ivector->Scale(1.0 / ratio); + + } + } + }, + py::arg("ivectors"), + py::arg("normalize") = true); } diff --git a/kalpy/feat/pitch.py b/kalpy/feat/pitch.py index 3d1046c..740624b 100644 --- a/kalpy/feat/pitch.py +++ b/kalpy/feat/pitch.py @@ -271,11 +271,31 @@ def compute_pitch_for_export( if len(wave.shape) == 2: channel = 0 if segment.channel is None else segment.channel wave = wave[channel, :] - pitch = feat.compute_pitch(wave, self.extraction_opts, self.process_opts) + pitch = self.compute_pitch_for_wave(wave) if compress: pitch = CompressedMatrix(pitch) return pitch + def compute_pitch_for_wave( + self, + wave: np.ndarray, + ) -> FloatMatrixBase: + """ + Generate pitch features for exporting to a kaldi archive + + Parameters + ---------- + wave: :class:`~numpy.ndarray` + Waveform + + Returns + ------- + :class:`_kalpy.matrix.FloatMatrixBase` + Feature matrix for the segment + """ + pitch = feat.compute_pitch(wave, self.extraction_opts, self.process_opts) + return pitch + def export_feats( self, file_name: typing.Union[pathlib.Path, str], diff --git a/kalpy/ivector/data.py b/kalpy/ivector/data.py index e675ac9..6cd586d 100644 --- a/kalpy/ivector/data.py +++ b/kalpy/ivector/data.py @@ -4,11 +4,11 @@ import os import typing -from _kalpy.matrix import FloatVector +from _kalpy.matrix import DoubleVector from _kalpy.util import ( - RandomAccessBaseFloatVectorReader, + RandomAccessBaseDoubleVectorReader, RandomAccessInt32VectorVectorReader, - SequentialBaseFloatVectorReader, + SequentialBaseDoubleVectorReader, SequentialInt32VectorVectorReader, ) from kalpy.data import PathLike @@ -31,7 +31,7 @@ def __init__(self, file_name: PathLike, num_utterances_file_name: PathLike = Non self.file_name = str(file_name) self.num_utterances_file_name = num_utterances_file_name self.read_specifier = generate_read_specifier(file_name) - self.random_reader = 
RandomAccessBaseFloatVectorReader(self.read_specifier) + self.random_reader = RandomAccessBaseDoubleVectorReader(self.read_specifier) self.num_utterances_mapping = {} if self.num_utterances_file_name is not None: with open(self.num_utterances_file_name) as f: @@ -45,18 +45,18 @@ def close(self): self.random_reader.Close() @property - def sequential_reader(self) -> SequentialBaseFloatVectorReader: + def sequential_reader(self) -> SequentialBaseDoubleVectorReader: """Sequential reader for lattices""" - return SequentialBaseFloatVectorReader(self.read_specifier) + return SequentialBaseDoubleVectorReader(self.read_specifier) - def __iter__(self) -> typing.Generator[typing.Tuple[str, FloatVector]]: + def __iter__(self) -> typing.Generator[typing.Tuple[str, DoubleVector]]: """Iterate over the utterance lattices in the archive""" if self.read_specifier.startswith("scp"): with open(self.file_name, encoding="utf8") as f: for line in f: line = line.strip() key, ark_path = line.split(maxsplit=1) - ivector = read_kaldi_object(FloatVector, ark_path) + ivector = read_kaldi_object(DoubleVector, ark_path) num_utterances = self.num_utterances_mapping.get(key, 1) yield key, ivector, num_utterances else: @@ -74,7 +74,7 @@ def __iter__(self) -> typing.Generator[typing.Tuple[str, FloatVector]]: def __del__(self): self.close() - def __getitem__(self, item: str) -> FloatVector: + def __getitem__(self, item: str) -> DoubleVector: """Get lattice for a particular key from the archive file""" item = str(item) if not self.random_reader.HasKey(item): diff --git a/kalpy/ivector/plda.py b/kalpy/ivector/plda.py index c150811..e2c4530 100644 --- a/kalpy/ivector/plda.py +++ b/kalpy/ivector/plda.py @@ -3,9 +3,10 @@ import numpy as np -from _kalpy.ivector import Plda +from _kalpy.ivector import Plda, ivector_normalize_length, ivector_subtract_mean from _kalpy.matrix import DoubleVector, FloatVector from kalpy.utils import read_kaldi_object +from kalpy.ivector.data import IvectorArchive class 
PldaScorer: @@ -16,9 +17,36 @@ def __init__( simple_length_norm: bool = True, ): self.plda_path = str(plda_path) - self.plda = read_kaldi_object(Plda, self.plda_path) + self.plda: Plda = read_kaldi_object(Plda, self.plda_path) self.normalize_length = normalize_length self.simple_length_norm = simple_length_norm + self.speaker_ids = None + self.speaker_ivectors = None + self.num_speaker_examples = None + + def load_speaker_ivectors(self, speaker_archive_path, num_utts_path=None): + ivector_archive = IvectorArchive( + speaker_archive_path, num_utterances_file_name=num_utts_path + ) + speaker_ivectors = [] + self.speaker_ids = [] + self.num_speaker_examples = [] + for speaker_id, ivector, utts in ivector_archive: + self.speaker_ids.append(speaker_id) + self.num_speaker_examples.append(utts) + if self.normalize_length: + ivector_normalize_length(ivector) + speaker_ivectors.append(DoubleVector(ivector)) + ivector_subtract_mean(speaker_ivectors,normalize=self.normalize_length) + self.speaker_ivectors = self.plda.transform_ivectors(speaker_ivectors, self.num_speaker_examples) + + def transform_ivector(self, ivector: np.ndarray, num_examples: int = 1): + return self.plda.transform_ivector(ivector, num_examples) + + def transform_ivectors(self, ivectors: np.ndarray, num_examples: np.ndarray = None): + if num_examples is None: + num_examples = np.ones((ivectors.shape[0])) + return self.plda.transform_ivectors(ivectors, num_examples) def score_ivectors( self, @@ -26,21 +54,20 @@ def score_ivectors( utterance_ivector: typing.Union[np.ndarray, FloatVector, DoubleVector], num_speaker_examples: int = 1, ): - if isinstance(speaker_ivector, np.ndarray): - v = DoubleVector() - v.from_numpy(speaker_ivector) - speaker_ivector = v - elif isinstance(speaker_ivector, FloatVector): - speaker_ivector = DoubleVector(speaker_ivector) - - if isinstance(utterance_ivector, np.ndarray): - v = DoubleVector() - v.from_numpy(utterance_ivector) - utterance_ivector = v - elif 
isinstance(utterance_ivector, FloatVector): - utterance_ivector = DoubleVector(utterance_ivector) - score = self.plda.LogLikelihoodRatio( speaker_ivector, num_speaker_examples, utterance_ivector ) return score + + def classify_speaker( + self, + utterance_ivector: typing.Union[np.ndarray, FloatVector, DoubleVector], + ): + if self.num_speaker_examples is None: + self.num_speaker_examples = [1 for _ in range(self.speaker_ivectors.shape[0])] + if isinstance(utterance_ivector, np.ndarray): + v = DoubleVector() + v.from_numpy(utterance_ivector); utterance_ivector = v + ind, score = self.plda.classify_utterance(utterance_ivector, self.speaker_ivectors, self.num_speaker_examples) + speaker = self.speaker_ids[ind] + return speaker, score