\documentclass[12pt]{report}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{graphicx}
\usepackage[export]{adjustbox}
\usepackage[a4paper, portrait, margin=1in]{geometry}
\usepackage[english]{babel}
\usepackage{helvet}
\usepackage{etoolbox}
\usepackage{titlesec}
\usepackage{caption}
\usepackage{booktabs}
\usepackage{xcolor}
\usepackage{setspace}
\usepackage{hyperref}
\usepackage{changepage}
\usepackage{pgfgantt}
\graphicspath{{./images/}{./Images/}}
% Set chapter and section headings formatting
% \titleformat{\chapter}[display]{\normalfont\fontsize{16}{19}\bfseries\centering}{\chaptertitlename\ \thechapter}{0.5em}{\MakeUppercase}
% \titleformat{\section}[block]{\normalfont\fontsize{12}{14.4}\bfseries}{\thesection}{1em}{\MakeUppercase}
% \titleformat{\subsection}[block]{\normalfont\fontsize{12}{14.4}\bfseries}{\thesubsection}{1em}{\MakeUppercase}
% Section title formatting
\titleformat{\chapter}[display]
{\normalfont\huge\bfseries\centering}{\chaptertitlename\ \thechapter}{20pt}{\Huge}
\titlespacing*{\chapter}{0pt}{20pt}{20pt}
\titleformat{\section}
{\normalfont\Large\bfseries}
{\thesection}
{1em}
{}
% Set figure and table captions formatting
%\captionsetup[figure]{font={small,normal}}
%\captionsetup[table]{font={small,normal}}
% Set footnote text size
\renewcommand{\footnotesize}{\fontsize{9}{11}\selectfont}
\usepackage[square,sort,comma,numbers,super]{natbib}
\usepackage{float}
\makeatletter
\patchcmd{\@maketitle}{\LARGE \@title}{\fontsize{16}{19.2}\selectfont\@title}{}{}
\makeatother
\date{}
\usepackage{amsmath} % for math environments and operations
\usepackage{bm} % for bold symbols in math mode
% Section title formatting
% \titleformat{\chapter}[display]
% {\normalfont\huge\bfseries}{\chaptertitlename\ \thechapter}{20pt}{\Huge}
% \titlespacing*{\chapter}{0pt}{20pt}{20pt}
% \titleformat{\section}
% {\normalfont\Large\bfseries}
% {\thesection}
% {1em}
% {\sentencecase}
\begin{document}
\doublespacing
\thispagestyle{empty}
\pagenumbering{roman}
\vspace*{0.2cm}
\begin{center}
\textbf{ \Large
A Study on Identification of Song from Humming and Separation of Instruments using Advanced DSP and Machine Learning \\
}
\vspace*{3mm}
{A report submitted for the course named Project II(CS322)}
\vspace*{3cm}
\emph{\textbf{Submitted By}} \\
\vspace*{2mm}
\textbf{Sushant Kumar Tiwari}
\textbf{21010127}
\vspace*{2cm} \\
\emph{Under the guidance of}
\textbf{Dr. Navanath Saharia}
\vspace*{1.5cm}
\begin{center}
\includegraphics[max width=0.25\textwidth]{iiitm-logo.png}
\end{center}
\textbf{Department of Computer Science and Engineering \\ Indian Institute of Information Technology Senapati, Manipur} \\ April 2024
\end{center}
\newpage
\chapter*{Declaration}
\vspace{1cm}
The work embodied in the present report entitled \textbf{A Study on Identification of Song from Humming and Separation of Instruments using Advanced DSP and Machine Learning} has been carried
out in the Computer Science Department. The work reported herein is original and does not form part of any other report or dissertation on the basis of which a degree or award was conferred on an earlier occasion or to any other student.\\
I understand the Institute’s policy on plagiarism and declare that the report and publications are my own work, except where specifically acknowledged and has not been copied from other sources or been previously submitted for award or assessment.
\vspace*{3.5cm}
\noindent Date: 18/04/2024 \hspace{7cm} (Signature)\vspace{1cm}\\
\vspace{1cm}
\hspace{10.5cm}Sushant Kumar Tiwari\\
\vspace{0.1mm}
\hspace{10.5cm}21010127\\
\vspace{2mm}
\hspace{7cm}Department of Computer Science \& Engineering\\
\vspace{2mm}
\hspace{10cm}IIIT Senapati, Manipur
\newpage
\begin{minipage}{0.2\textwidth}
\includegraphics[width=\textwidth]{iiitm-logo.png}
\end{minipage}
\begin{minipage}{0.9\textwidth}
\fontsize{15}{18}\selectfont\textbf{Department of Computer Science \& Engineering}\\
\fontsize{15}{18}\selectfont\textbf{Indian Institute of Information Technology Manipur}
\end{minipage}
\hspace{12cm}
\rule{\textwidth}{1pt}\\
\hspace{4cm}
Dr. Navanath Saharia
\hspace{7cm}Email: [email protected]\\
Assistant Professor\\
\vspace{3cm}\\
\begin{center}
\fontsize{18}{22}\selectfont\textbf{Certificate}
\end{center}
\vspace{0.8cm}
This is to certify that the project report entitled \textbf{A Study on Identification of Song from Humming and Separation of Instruments using Advanced DSP and Machine Learning} submitted to Department of Computer Science \& Engineering, Indian Institute of Information Technology Senapati, Manipur in partial fulfillment for the award of the degree of Bachelor of Technology in Computer Science \& Engineering is a record of bonafide work carried out by \textbf{Sushant Kumar Tiwari} bearing roll number \textbf{21010127}.
\\
\vspace{2cm}
\hspace{11cm}Signature of Supervisor
\\
\vspace{1mm}
\hspace{11cm}(Dr. Navanath Saharia)
\newpage
\begin{minipage}{0.2\textwidth}
\includegraphics[width=\textwidth]{iiitm-logo.png}
\end{minipage}
\begin{minipage}{0.9\textwidth}
\fontsize{15}{18}\selectfont\textbf{Department of Computer Science \& Engineering}\\
\fontsize{15}{18}\selectfont\textbf{Indian Institute of Information Technology Manipur}
\end{minipage}
\hspace{12cm}
\rule{\textwidth}{1pt}\\
\hspace{4cm}
Dr. Navanath Saharia
\hspace{7cm}Email: [email protected]\\
Assistant Professor\\
\vspace{3cm}\\
\begin{center}
\fontsize{18}{22}\selectfont\textbf{Certificate}
\end{center}
\vspace{0.8cm}
This is to certify that the project report entitled \textbf{A Study on Identification of Song from Humming and Separation of Instruments using Advanced DSP and Machine Learning} submitted to Department of Computer Science \& Engineering, Indian Institute of Information Technology Senapati, Manipur in partial fulfillment for the award of the degree of Bachelor of Technology in Computer Science \& Engineering is a record of bonafide work carried out by \textbf{Sushant Kumar Tiwari} bearing roll number \textbf{21010127}.
\\\\\\\\
\vspace{1cm}
\noindent
\textbf{Signature of Examiners}\\\\
\vspace{2mm}
Examiner 1: ............................................\\
\vspace{2mm}
Examiner 2: ............................................\\
\vspace{2mm}
Examiner 3: ............................................\\
\vspace{2mm}
Examiner 4: ............................................\\
\vspace{2mm}
Examiner 5: ............................................
\newpage
\chapter*{Abstract}
This project aims to develop a comprehensive solution to the challenges of accurate song identification and instrument separation in music by leveraging digital signal processing (DSP) techniques and machine learning algorithms. The core of the song identification module is the application of cross-correlation, which matches a user's partial audio input against a database of known songs. For the instrument separation module, the project employs a range of DSP techniques, including Mel-Frequency Cepstral Coefficients, Linear Predictive Coding, the Discrete Wavelet Transform, spectral subtraction, Wiener filtering, and Non-negative Matrix Factorization, working in tandem to isolate individual instrument sounds. Together, cross-correlation-based song identification and these instrument separation techniques form a versatile system that caters to the diverse needs of music enthusiasts, researchers, and professionals, addressing the limitations of existing solutions and providing a robust, user-friendly platform for music discovery, analysis, and creative applications.
\newpage
\chapter*{Acknowledgement}
\vspace{1cm}
I am highly indebted to my supervisor, Dr. Navanath Saharia, for his guidance and constant supervision, for providing the necessary information regarding the project, and for his support in completing it. This project would not have been possible without the help of the faculty of the Computer Science and Engineering Department, IIIT Manipur, who took a keen interest in my project work and guided me through to its completion. I would like to extend my sincere thanks to all of them for encouraging me to undertake this project and make it a success.
\newpage
\tableofcontents
\listoffigures
\listoftables
\newpage
\pagenumbering{arabic}
\newpage
\chapter{Introduction}
\vspace{2cm}
In the ever-evolving world of music, the ability to accurately identify and separate musical components has become an increasingly valuable asset. Music enthusiasts, researchers, and industry professionals often find themselves in situations where they can recall a familiar tune but struggle to pinpoint its exact identity. This challenge can arise due to various reasons, such as partial recollection of the melody, lyrics, or rhythmic patterns. Being able to identify a song based on limited information can open up new avenues for music discovery, recommendation, and analysis.\\
Alongside the challenge of song identification, the process of isolating individual instruments from a complex musical arrangement can provide invaluable insights for music analysis, production, and education. Separating the vocal and instrumental components of a song can enable a deeper understanding of the musical structure, facilitate targeted sound manipulation, and enable more focused study and appreciation of the various elements that contribute to the overall musical experience. This project aims to address these challenges by leveraging digital signal processing (DSP) techniques and machine learning to develop a comprehensive solution for song identification and instrument separation.\\
% Problem Statement %
\section{Problem Statement}
The primary objective of this project is to create a system that can effectively identify a song based on a user's partial recollection, such as a hummed or lyrical fragment. Additionally, the system will be designed to separate the vocal and instrumental components of a given song, providing users with the ability to isolate specific musical elements for further analysis or creative purposes.
% Objective of the Project %
\section{Objective of the Project}
The primary objective of this project is to create a robust and efficient system that can accurately identify a song based on a user's partial recollection, such as a hummed or lyrical fragment. By leveraging advanced DSP techniques, the system will be able to extract and analyze the unique characteristics of the input audio, comparing it against a comprehensive database of musical recordings to determine the most likely match.\\
Furthermore, the project aims to develop a sophisticated instrument separation module that can effectively isolate the vocal and instrumental components of a given song. This capability will enable users to explore the individual elements of a musical composition, opening up new possibilities for music analysis, production, and educational purposes. Through the application of techniques like spectrogram analysis, MFCC (Mel-Frequency Cepstral Coefficients), and NMF (Non-negative Matrix Factorization), the system will be able to distinguish and label the various instrument sounds within the audio signal, providing users with a detailed breakdown of the musical arrangement.\\
By combining the song identification and instrument separation functionalities, this project seeks to create a comprehensive solution that caters to the diverse needs of music enthusiasts, researchers, and professionals. The ability to quickly identify a song from a partial input and isolate its individual components can significantly enhance music discovery, facilitate deeper musical analysis, and enable more innovative applications in the field of music technology.\\
\section{Roadmap}
\begin{center}
\begin{ganttchart}[
hgrid,
vgrid,
x unit=0.1cm,
y unit chart=1cm,
time slot format=isodate,
compress calendar,
title/.append style={draw=none, fill=none},
title label font=\bfseries\footnotesize,
title label node/.append style={below=7pt}
]{2024-01-01}{2024-04-12}
\gantttitlecalendar{month=name} \\\\
\ganttbar{Planning \& Research}{2024-01-01}{2024-01-31} \\
\ganttbar{Different Approaches}{2024-02-01}{2024-02-29} \\
\ganttbar{Best Approach \& Development}{2024-03-01}{2024-03-31} \\
\ganttbar{Finishing}{2024-04-01}{2024-04-12}
\end{ganttchart}
\captionof{figure}{Gantt Chart}
\end{center}
\subsection{January: Project Planning and Research}
The initial phase of the project focused on comprehensive planning and extensive research. During this period, various approaches were explored and the feasibility of implementing different techniques for song identification and instrument separation was assessed. This phase involved evaluating the strengths and limitations of existing methods, identifying key challenges, and formulating a robust project plan to guide the subsequent stages of development.
\subsection{February: Different Approaches}
In the second phase, multiple approaches to the challenges of song identification and instrument separation were explored and evaluated. This included investigating various digital signal processing techniques, such as cross-correlation, spectrogram analysis, MFCC, LPC, LPCC, DWT, and NMF, along with spectral subtraction and filtering, as well as exploring the application of machine learning algorithms, including artificial neural networks (ANNs) and other relevant models. The advantages and disadvantages of each approach were carefully analyzed, considering factors like accuracy, computational efficiency, and practical feasibility.
\subsection{March: Best Approach Selection and Development}
Based on the insights gained, the most promising approach was selected and development of the comprehensive solution commenced. This involved implementing the chosen DSP techniques (spectrogram analysis, cross-correlation, MFCC, and NMF) and integrating machine learning algorithms to create a robust and reliable system.
\subsection{April: Final Touches}
The final phase of the project focused on completing and polishing the developed system. The system was tested and evaluated to ensure accuracy, stability, and user-friendliness. A simple, user-friendly web interface was developed to connect the Python scripts that process the input samples for testing.
\section{Existing System}
Numerous approaches have been explored in the domain of song identification and instrument separation. One common method involves the use of audio fingerprinting, where a unique signature is generated from the audio signal and compared against a database of known songs. While effective in certain scenarios, this approach can be limited by the availability and quality of the reference database. Another approach focuses on machine learning techniques, such as convolutional neural networks (CNNs) and recurrent neural networks (RNNs), which have shown promising results in tasks like music genre classification and instrument recognition. However, these methods often require extensive training data and computational resources, and may not be suitable for real-time or resource-constrained applications.\\
In the realm of instrument separation, techniques like independent component analysis (ICA), non-negative matrix factorization (NMF), and spectral modeling synthesis (SMS) have been explored. These methods aim to decompose the audio signal into its constituent components, allowing for the isolation of individual instruments. Nevertheless, these techniques can be sensitive to the complexity of the input audio and may struggle with overlapping or closely related instrument sounds.\\
The proposed project seeks to address the limitations of existing approaches by employing a novel combination of DSP techniques and machine learning algorithms to provide a robust and versatile solution for song identification and instrument separation. By leveraging cross-correlation, spectrogram analysis, MFCC, and NMF, along with the power of artificial neural networks (ANNs), the project aims to deliver a comprehensive and user-friendly system that can efficiently meet the needs of music enthusiasts, researchers, and professionals alike.\\
\newpage
\chapter{Related Work}
The challenges of song identification and instrument separation have been the subject of extensive research in the field of music technology. Several approaches have been explored to address these issues, each with its own strengths and limitations.\\
One of the primary methods for song identification is audio fingerprinting, which involves extracting a unique signature or \emph{"fingerprint"} from the audio signal and comparing it against a database of known songs.
\section{Literature Review}
The spectral subtraction algorithm is one of the earliest methods proposed for single-channel speech enhancement: the noise spectrum estimated during speech pauses is subtracted from the noisy speech spectrum to estimate the clean speech. While effective, this approach suffers from processing distortions known as remnant noise. Over the years, several variations of the spectral subtraction method have been developed to address this drawback, such as spectral over-subtraction, multi-band spectral subtraction, Wiener filtering, iterative spectral subtraction, and spectral subtraction based on perceptual properties. These modified forms of spectral subtraction have been shown to significantly reduce remnant noise and minimize speech distortion, as evidenced by objective measures, spectrograms, and informal listening tests conducted on both stationary and non-stationary noise types at different SNR levels\cite{1}.

Speech is a complex human ability characterized by the coordinated actions of roughly 100 muscles producing about 14 different sounds per second. Speaker recognition software can identify and recognize speakers from speech signals by extracting relevant features through techniques like Mel Frequency Cepstral Coefficients (MFCC), Linear Prediction Coefficients (LPC), Linear Prediction Cepstral Coefficients (LPCC), Line Spectral Frequencies (LSF), Discrete Wavelet Transform (DWT), and Perceptual Linear Prediction (PLP). While each method has been widely tested and proven reliable, researchers have made modifications to improve noise-resistance, robustness, and computational efficiency, although the choice of technique ultimately depends on the specific application requirements\cite{2}.

Another work presents a music identification and purchasing system that allows users to mark the time and radio station of a song they hear on a portable device, such as a key fob or phone. The system then matches this information with broadcast archives to identify the song, artist, and music company, enabling the user to purchase the full-length track or related music. Alternatively, the user can record a segment of the music, and the system can identify the song from that recording, providing relevant information and purchase options. This system offers a convenient way for users to discover and acquire music they hear, streamlining the process of music identification and purchasing\cite{3}.

A comprehensive survey of the theoretical research on Non-negative Matrix Factorization (NMF) over the past five years is also relevant here. NMF is a dimensionality reduction technique that incorporates non-negativity constraints, enabling a parts-based representation and enhanced interpretability. The survey systematically summarizes the principles, basic models, properties, and algorithms of NMF, including its various modifications, extensions, and generalizations. It categorizes existing NMF algorithms into four groups (Basic NMF, Constrained NMF, Structured NMF, and Generalized NMF) and presents a detailed analysis of their design principles, characteristics, problems, relationships, and evolution. It also discusses related work, open issues, and relevant application areas of NMF, aiming to construct an integrated, state-of-the-art framework to guide future research in this field\cite{4}.

Digital speech signal processing and voice recognition algorithms are crucial for enabling accurate and efficient automatic speech recognition. Because the voice signal is complex and information-rich, direct analysis and synthesis is challenging, necessitating feature extraction and matching techniques. Methods such as Linear Predictive Coding (LPC), Hidden Markov Models (HMM), and Artificial Neural Networks have been evaluated, with the non-parametric Mel Frequency Cepstral Coefficients (MFCCs) emerging as an effective feature extraction technique that models the human auditory perception system. For feature matching, the non-linear Dynamic Time Warping (DTW) algorithm has been employed to account for variations in the temporal rate of the voice signal. This work demonstrates the viability of using MFCCs for feature extraction and DTW for pattern comparison, providing a compelling approach to robust and efficient automatic speech recognition\cite{5}.

Another paper presents a new approach to Mel-Frequency Cepstral Coefficient (MFCC) feature extraction for speech-based speaker verification, building upon the existing Delta-Delta MFCC method. The proposed technique uses a distributed Discrete Cosine Transform (DCT-II) to enhance the MFCC features. The authors conduct speaker verification tests comparing conventional MFCC, Delta-Delta MFCC, and the new distributed DCT-II based Delta-Delta MFCC, all using a Gaussian Mixture Model (GMM) classifier. This research aims to identify performance improvements over existing MFCC implementations, offering a novel approach to feature extraction that can enhance the accuracy and reliability of speech-based speaker verification systems\cite{6}.

Finally, automatic speech recognition (ASR) has been a longstanding challenge for researchers, who must accurately interpret spoken language and perform appropriate actions. While significant progress has been made, improving the accuracy of speech recognition systems remains a key focus area. One study examines the theoretical underpinnings of feature extraction techniques, particularly Linear Predictive Coding (LPC) and Linear Predictive Cepstral Coefficients (LPCC), which are critical components of ASR systems. The goal of that work is a comparative analysis of these feature extraction methods, providing insights that can guide the selection of appropriate techniques based on the specific domain and requirements of the speech recognition application\cite{7}.
\newpage
\chapter{Approach}
This section outlines the diverse methodologies explored in the project to address the problem statement effectively. These methodologies encompass cross-correlation, MFCC, LPC, LPCC, DWT, NMF, spectral subtraction, Wiener filtering, and artificial neural networks (ANN).
\section{Song Identification}
The core of the song identification module in this project is the application of cross-correlation to match the user's partial audio input, such as a hummed or lyrical fragment, against the database of known songs.
\subsection{Cross-Correlation for Segment Matching}
The first step in the song identification process is to segment the user's input audio file. This is achieved through the use of cross-correlation, a powerful signal processing technique that allows for the comparison of two signals and the identification of the degree of similarity and time offset between them.\\
The user's input audio is first preprocessed to extract the relevant features, such as the melody or rhythm. This preprocessed input is then compared against pre-segmented audio chunks from the song database, which have also been preprocessed in a similar manner.\\
The cross-correlation operation is performed between the user's input and each segment in the database. The segment that exhibits the highest correlation score is identified as the best match to the user's input.
\subsection{Song Identification from Segment Matching}
Once the best-matching segment has been identified through the cross-correlation process, the system can then retrieve the corresponding song information, such as the title, artist, and other relevant metadata.\\
By performing this segment-level matching, the system is able to accurately recognize the song based on the partial information provided by the user, even when the exact title or artist is not known. The cross-correlation-based approach ensures that the system can identify the song by comparing the user's input against the most similar segments in the database, providing a robust and reliable song identification solution.\\
This approach to song identification, combining segmentation through cross-correlation and segment-level matching, forms the core of the system's ability to accurately recognize songs from partial user input, such as humming or lyrical fragments.
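To make this idea concrete, the following Python sketch shows one way such segment-level matching could be realized with NumPy and SciPy, assuming the query and the database segments are already available as one-dimensional arrays at a common sampling rate. The helper names and the 0.95 acceptance threshold are illustrative choices, not the project's exact implementation.
\begin{verbatim}
import numpy as np
from scipy.signal import correlate

def _normalize(x):
    x = x - x.mean()
    return x / (np.linalg.norm(x) + 1e-12)

def similarity(query, segment):
    """Peak of the normalized cross-correlation between two 1-D signals."""
    return float(np.max(correlate(_normalize(segment), _normalize(query),
                                  mode="full")))

def identify_song(query, database, threshold=0.95):
    """database maps song metadata to a list of pre-segmented arrays."""
    best_song, best_score = None, -np.inf
    for song, segments in database.items():
        score = max(similarity(query, seg) for seg in segments)
        if score > best_score:
            best_song, best_score = song, score
    # Reject weak matches instead of returning an arbitrary song.
    return (best_song, best_score) if best_score >= threshold else None
\end{verbatim}
Because both signals are normalized, the correlation peak is bounded by one, so the threshold can be read as a minimum similarity score for accepting a match.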
\section{Instrument Sound Extraction}
The instrument sound extraction module of the project leverages the powerful technique of Non-negative Matrix Factorization (NMF) in conjunction with the Kullback-Leibler (KL) divergence cost function.
\subsection{Non-negative Matrix Factorization (NMF)}
NMF is a dimensionality reduction and matrix decomposition method that is particularly well-suited for separating the individual components within a complex audio signal. Unlike other factorization techniques, NMF imposes the constraint that all the elements in the decomposed matrices must be non-negative, leading to a parts-based representation of the input data.\\
For the purpose of instrument sound extraction, the input to the NMF algorithm will be the spectrogram of the audio signal, which represents the frequency content of the audio over time. The NMF process will decompose this spectrogram into two matrices:\\
\begin{enumerate}
\item \textbf{The basis matrix (W): }This matrix represents the spectral patterns of the individual instrument sounds present in the input audio.
\item \textbf{The activation matrix (H): }This matrix encodes the temporal activations of the corresponding instrument sounds.
\end{enumerate}
\subsection{Kullback-Leibler (KL) Divergence Cost Function}
To guide the NMF optimization process and ensure the optimal separation of instrument sounds, the Kullback-Leibler (KL) divergence will be employed as the cost function. The KL divergence is a measure of the difference between two probability distributions and is well-suited for non-negative data, such as audio spectrograms.\\
By minimizing the KL divergence between the input spectrogram and the product of the basis and activation matrices (W and H), the NMF algorithm will iteratively update the factorized matrices to best represent the original audio signal. This will result in the basis matrix (W) containing the isolated spectral patterns of the individual instruments, while the activation matrix (H) will encode their temporal behavior.
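As an illustration of this decomposition, the short Python sketch below computes a magnitude spectrogram with librosa and factorizes it with scikit-learn's NMF solver, using the Kullback-Leibler beta-loss and multiplicative updates. The file name, number of components, and iteration count are placeholders rather than the project's actual settings.
\begin{verbatim}
import librosa
import numpy as np
from sklearn.decomposition import NMF

y, sr = librosa.load("mix.wav", sr=None, mono=True)
S = np.abs(librosa.stft(y, n_fft=2048, hop_length=512))  # magnitude spectrogram V

nmf = NMF(n_components=4, beta_loss="kullback-leibler",
          solver="mu", init="random", max_iter=300, random_state=0)
W = nmf.fit_transform(S)   # basis matrix: spectral patterns (freq x components)
H = nmf.components_        # activation matrix: temporal gains (components x time)

# Each rank-1 term W[:, k] * H[k, :] approximates one source's spectrogram.
source_spectrograms = [np.outer(W[:, k], H[k]) for k in range(W.shape[1])]
\end{verbatim}
Each rank-one term provides an estimate of one source's magnitude spectrogram; audio can be recovered by combining such an estimate with the phase of the original mixture and applying the inverse STFT.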
\section{Extracted Instrument Sound Recognition using ANN}
The approach used in this code for Extracted Instrument Sound Recognition using an Artificial Neural Network (ANN) can be summarized as follows:\\
The process begins with the loading of audio files for different musical instruments: violin, mohanveena, sitar, flute, drums, etc. A series of preprocessing steps are then applied to the audio data, including resampling to a standard sampling rate of 44,100 Hz, rechanneling to have two channels (stereo), and padding or truncating the audio to a fixed duration of 6 seconds. Spectrograms are then generated from the preprocessed audio using the Mel-Frequency Cepstral Coefficients (MFCC) technique.\\
The ANN model consists of an input layer that accepts the flattened spectrogram data, followed by two dense layers with leaky ReLU and ReLU activation functions, respectively. A dropout layer is included to prevent overfitting, and a final dense layer with a softmax activation function is used to output the predicted instrument class.\\ \\
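A minimal PyTorch sketch of the network just described is shown below. The hidden-layer sizes, dropout rate, and input dimensions are illustrative assumptions, since the report does not fix them, and in practice the softmax is often folded into the loss function rather than kept as an explicit layer.
\begin{verbatim}
import torch
import torch.nn as nn

class InstrumentANN(nn.Module):
    """Dense classifier over a flattened MFCC spectrogram."""
    def __init__(self, n_inputs, n_classes, hidden1=512, hidden2=128, p_drop=0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Flatten(),                   # flatten the (n_mfcc x n_frames) input
            nn.Linear(n_inputs, hidden1),
            nn.LeakyReLU(0.01),             # first dense layer: Leaky ReLU
            nn.Linear(hidden1, hidden2),
            nn.ReLU(),                      # second dense layer: ReLU
            nn.Dropout(p_drop),             # regularization against overfitting
            nn.Linear(hidden2, n_classes),
            nn.Softmax(dim=1),              # probability distribution over classes
        )

    def forward(self, x):
        return self.net(x)
\end{verbatim}
An instance would be constructed with the flattened spectrogram size and the number of instrument classes, and trained by minimizing a cross-entropy-style loss between the predicted probabilities and the ground-truth labels.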
This chapter outlined the approach adopted in the project, which involves utilizing cross-correlation for song matching, NMF with KL divergence for instrument sound extraction, and ANN for labeling the extracted sounds. These techniques form the backbone of the system, allowing it to effectively identify songs based on partial humming input and extract instrument sounds from audio recordings.\\
The subsequent chapters will delve into the implementation details, experimental results, and evaluation of the developed system.
\newpage
\chapter{System Design}
\vspace{2mm}
The proposed system for song identification and instrument separation follows a modular design, allowing for the seamless integration of various digital signal processing (DSP) and machine learning techniques. The overall architecture consists of two main components: the Song Identification Module and the Instrument Separation Module.
\section{System Architecture}
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{architecture.png}
\caption{Architecture}
\label{Sytem Architecture}
\end{figure}
This architecture outlines the overall system design for a project focused on song identification and instrument sound extraction. The key components and their functionalities are as follows:
\subsection{Song Identification}
The song identification module within the proposed system leverages the power of cross-correlation to match a user's partial audio input against a comprehensive database of known songs. This approach allows the system to accurately identify the song even when the user provides limited information, such as a hummed or lyrical fragment.
\begin{itemize}
\item \subsubsection{What is Cross-Correlation?}
In digital signal processing (DSP), cross-correlation refers to a fundamental operation used to measure the similarity between two signals or sequences in the time domain. It's a technique employed in various applications such as pattern recognition, system identification, and signal analysis.\\
In DSP, cross-correlation is often used to determine how similar two signals are as a function of the relative time delay between them. The output of the cross-correlation operation gives information about the degree of similarity between the two signals at different time offsets.\\
Mathematically, the cross-correlation of two discrete signals $x[n]$ and $y[n]$, denoted as $(x \star y)[m]$, is computed as:
\begin{equation}
(x \star y)[m] = \sum_{n=-\infty}^{\infty} x[n]y[n-m]
\end{equation}
Where:
\begin{itemize}
\item $x[n]$ and $y[n]$ are the input signals
\item $(x \star y)[m]$ is the cross-correlation result at lag $m$
\end{itemize}
\item \subsubsection{Cross-Correlation for Segment Matching}\\
The first step in the song identification process is to segment the user's input audio file into manageable chunks. This is achieved through the use of a sliding window approach, where the audio is divided into segments of a fixed duration (e.g., 3 seconds). For each segment, the code computes the frequency spectrum using the Fast Fourier Transform (FFT).\\
The preprocessed segments of the user's input audio are then compared against the corresponding segments from the song database using cross-correlation. Cross-correlation is a powerful signal processing technique that measures the similarity between two signals as a function of the time offset between them. By performing this cross-correlation operation, the system can identify the segment within the song database that best matches the user's input.
\item \subsubsection{Identification of Best Matching Segment}\\
The cross-correlation operation produces a correlation score for each time offset between the user's input segment and the database segments. The segment with the highest correlation score is identified as the best match. To ensure a reliable match, the code applies a \emph{threshold} to the correlation scores, discarding any matches that do not exceed a certain percentage (95\% in the case of the current system) of the maximum score.\\
If a suitable match is found, the system retrieves the relevant metadata associated with the identified song segment, such as the title, artist, and other relevant information. In the case where no satisfactory match is found, the system informs the user that no match could be determined based on the provided input. A code sketch of this segment matching and thresholding procedure is given at the end of this list.
\item \subsubsection{Visualization and Interpretation}\\
To provide users with visual feedback and aid in the interpretation of the song identification process, the code includes functionality to plot the frequency spectrum of the user's input segments and the best matching segments from the database. This allows users to visually compare the spectral characteristics of the input and the identified song, enhancing their understanding of the matching process and can be used for further studies.
\item \subsubsection{Why Cross-Correlation?}\\
The benefits of using cross-correlation for song identification include:
\begin{enumerate}
\item \textbf{Ability to handle partial input:} Cross-correlation can effectively match the user's partial input against the database, even when the entire song is not available.
\item \textbf{Robustness to temporal variations:} The cross-correlation operation can account for differences in the temporal rate between the user's input and the database segments, ensuring accurate matching.
\item \textbf{Computational efficiency:} The cross-correlation can be efficiently computed using fast Fourier transform (FFT) techniques, making it a computationally viable approach for real-time applications.
\end{enumerate}
This approach enables the system to accurately recognize songs from partial user input, catering to the needs of music enthusiasts, researchers, and professionals alike.
\item \subsubsection{Graphical Analysis}
\begin{figure}[h]
\centering
\includegraphics[width=0.7\linewidth]{noMatchSegment.png}
\caption{Frequency Spectrum Comparison - Unmatched Segment}
\label{Frequency Spectrum Comparison - 1}
\end{figure}
The graph shows a significant difference in the frequency spectra between the test sound and the input audio for this segment. The test sound has a much more pronounced peak around 1,500 Hz, while the input audio does not exhibit a similar spike in that frequency range. This suggests that the frequency content of the test sound and input audio are quite dissimilar in Segment 6, indicating a potential mismatch or discrepancy between the two signals.\\
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{matchedSegment.png}
\caption{Frequency Spectrum Comparison - Matched Segment}
\label{Frequency Spectrum Comparison - 2}
\end{figure}\\
The frequency spectrum comparison graph provides insight into the distribution of energy across different frequencies for the test sound and input audio in Segment 8 of the analysis.\\
Both the test sound and input audio exhibit a prominent peak in the frequency spectrum around 4,000 Hz. This indicates that a significant portion of the energy or emphasis in this audio segment is concentrated in the 4 kHz frequency range.\\
The overall shape and pattern of the frequency spectra for the test sound and input audio are quite similar. This suggests that the two audio signals share comparable frequency characteristics in Segment 8, with the peaks and valleys aligning closely.
\end{itemize}
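The sketch below ties together the steps listed above: fixed three-second segmentation, FFT-based spectra, correlation of the query spectrum against each database segment (here reduced to its zero-lag value, i.e. a dot product of the spectra), and a 95\% acceptance threshold. It is a simplified reading of the procedure with illustrative helper names; in particular, the threshold is interpreted relative to the query's self-correlation, which is one possible reading of the ``95\% of the maximum score'' rule.
\begin{verbatim}
import numpy as np

def segment_spectra(audio, sr, seg_seconds=3.0):
    """Split a mono signal into fixed-length segments; return magnitude spectra."""
    seg_len = int(seg_seconds * sr)
    n_segs = len(audio) // seg_len
    segments = audio[: n_segs * seg_len].reshape(n_segs, seg_len)
    return np.abs(np.fft.rfft(segments, axis=1))

def best_matching_segment(query, song, sr, threshold=0.95):
    """Return (segment index, score) of the best match, or None if too weak."""
    q_spec = segment_spectra(query, sr)[0]       # spectrum of the query's first 3 s
    s_specs = segment_spectra(song, sr)
    scores = np.array([float(np.dot(q_spec, s)) for s in s_specs])
    best = int(np.argmax(scores))
    reference = float(np.dot(q_spec, q_spec))    # maximum achievable score
    if scores[best] < threshold * reference:
        return None                              # no sufficiently close segment
    return best, scores[best]
\end{verbatim}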
\subsection{Instrument Sound Extraction}
In this section, we delve into the techniques employed for extracting and separating sound sources from audio signals. The process involves utilizing methods from digital signal processing (DSP), particularly Short Term Fourier Transform (STFT), and Non-negative Matrix Factorization (NMF) coupled with signal reconstruction techniques.
\begin{itemize}
\item \textbf{STFT}\\
In Digital Signal Processing (DSP), the Short-Time Fourier Transform (STFT) is a fundamental technique used for analyzing the frequency content of a signal over short, overlapping time intervals. It is a discrete-time counterpart to the continuous-time Fourier Transform and provides a time-varying representation of the frequency components present in a signal.\\
Mathematically, the STFT of a discrete-time signal $x[n]$ at time index $n$ and frequency index $k$ is computed as follows:
\begin{equation}
X[n,k] = \sum_{m=-\infty}^{\infty} x[m] \cdot w[m-n] \cdot e^{-j2\pi \frac{mk}{N}}
\end{equation}
Where:
\begin{itemize}
\item $w[m-n]$ is a window function centered around time index $n$. The window function is typically tapered to reduce spectral leakage and control the trade-off between frequency and time resolution.
\item $e^{-j2\pi \frac{mk}{N}}$ represents the complex exponential term corresponding to the frequency index $k$.
\item $N$ is the length of the FFT used for each windowed segment.
\end{itemize}
For continuous time-signal, mathematically it is defined as follows:\\
Given a continuous-time signal $x(t)$, the STFT $X(t, \omega)$ at time $t$ and frequency $\omega$ is calculated by applying the Fourier Transform (FT) to short segments of $x(t)$, which are typically windowed to ensure stationarity, and then shifting the window along the time axis. This can be expressed as:
\begin{equation}
X(t, \omega) = \int_{-\infty}^{\infty} x(\tau) \cdot w(t - \tau) \cdot e^{-j\omega \tau} \, d\tau
\end{equation}
Where:
\begin{itemize}
\item $w(t)$ is a window function that is typically tapered to minimize spectral leakage and control the trade-off between frequency and time resolution.
\item $\omega$ represents the angular frequency.
\item $e^{-j\omega \tau}$ is the complex exponential term corresponding to the frequency $\omega$.
\item The integral is computed over a short time window around $t$, which effectively isolates a segment of the signal.
\end{itemize}\\
The STFT is widely used in various DSP applications such as audio processing, speech recognition, and vibration analysis, where the time-varying spectral characteristics of signals are of interest. It enables the analysis of non-stationary signals and provides insights into the dynamic behavior of signals over time.\\
\item \textbf{STFT Application}
In the project, the initial step involves applying the STFT to the input audio signal. This is crucial for transforming the audio signal from the time domain to the frequency domain. By doing so, we enable the analysis of the audio signal's spectral characteristics over short time intervals. This is particularly useful for tasks such as identifying specific frequency components or patterns in the audio signal.\\
The output of the STFT is a complex-valued matrix, commonly referred to as the spectrogram. This matrix consists of magnitude and phase components, where the magnitude represents the amplitude of each frequency component at different time intervals, and the phase represents the phase shift of each frequency component relative to a reference point.\\
\begin{figure}[h]
\centering
\includegraphics[width=1\linewidth]{spectrogram.png}
\caption{Spectrogram}
\label{Spectrogram}
\end{figure}\\
By obtaining the magnitude and phase spectrograms, we gain insight into the frequency content and temporal dynamics of the audio signal. These spectrograms provide valuable information that is further analyzed in subsequent steps of the project, such as separating sound sources or identifying specific audio features.
\item \textbf{Non-negative Matrix Factorization(NMF)}\\
It is a technique used in machine learning and signal processing for decomposing a non-negative matrix into two lower-rank matrices with non-negative elements. The goal of NMF is to find a low-rank approximation of the original matrix such that the product of the two lower-rank matrices closely approximates the original matrix.\\
Mathematically, given a non-negative matrix $V$ of size $K \times N$, NMF seeks to find two non-negative matrices $W$ and $H$ such that:
\begin{equation}
V \approx WH
\end{equation}
Where:
\begin{itemize}
\item $W$ is a matrix of size $K \times S$ containing the basis vectors or components.
\item $H$ is a matrix of size $S \times N$ containing the coefficients or activations.
\item $S$ represents the desired number of components or basis vectors.
\end{itemize}
\begin{figure}[h]
\centering
\includegraphics[width=1\linewidth]{NMF.png}
\caption{NMF Illustration}
\label{nmf}
\end{figure}
NMF has applications in various fields such as image processing, audio processing, text mining, and bioinformatics. In audio processing, for example, NMF can be used for source separation, where it decomposes a mixed audio signal into its constituent sources such as vocals, drums, and guitar. In image processing, it can be used for feature extraction and image segmentation.
\item \textbf{Updating the Matrices}\\
The update rules for the matrices $W$ and $H$ in the NMF algorithm are applied iteratively until a convergence criterion is met or the maximum number of iterations is reached; a code sketch of these updates is given at the end of this section.
\begin{enumerate}
\item \textbf{Update $H$}\\
The $H$ matrix is updated using the following multiplicative rule:\\
\begin{equation}
H \leftarrow H \cdot \frac{W^{T} \cdot \left( (WH)^{\beta - 2} \cdot V \right)}{W^{T} \cdot (WH)^{\beta - 1}}
\end{equation}
This update is performed element-wise for each entry of matrix $H$; the products involving $W^{T}$ are matrix products, while the powers, the product with $V$, and the division are element-wise.
\item \textbf{Update $W$}\\
The $W$ matrix is updated using the following multiplicative rule:\\
\begin{equation}
W \leftarrow W \cdot \frac{\left( (WH)^{\beta - 2} \cdot V \right) \cdot H^{T}}{(WH)^{\beta - 1} \cdot H^{T}}
\end{equation}
Similar to $H$, this update is performed element-wise for each entry of matrix $W$, with the products involving $H^{T}$ taken as matrix products.
\end{enumerate}
$\beta$ is the beta parameter used to specify the type of divergence measure in the NMF algorithm. The update rules are based on this parameter, with different values of $\beta$ corresponding to different divergence measures (e.g., Euclidean, Kullback-Leibler, Itakura-Saito).
\end{itemize}
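To make the update rules concrete, the following NumPy sketch applies them to a non-negative magnitude spectrogram $V$, such as the STFT magnitude obtained earlier. The rank, iteration count, random initialization, and file name are illustrative; setting $\beta = 1$ corresponds to the Kullback-Leibler divergence used in this project.
\begin{verbatim}
import numpy as np
import librosa

def nmf_multiplicative(V, S=4, beta=1.0, n_iter=200, eps=1e-10):
    """Beta-divergence NMF via multiplicative updates (beta=1: KL divergence)."""
    K, N = V.shape
    rng = np.random.default_rng(0)
    W = rng.random((K, S)) + eps
    H = rng.random((S, N)) + eps
    for _ in range(n_iter):
        WH = W @ H + eps
        # H <- H . [W^T((WH)^(beta-2) . V)] / [W^T(WH)^(beta-1)]
        H *= (W.T @ (WH ** (beta - 2) * V)) / (W.T @ WH ** (beta - 1) + eps)
        WH = W @ H + eps
        # W <- W . [((WH)^(beta-2) . V)H^T] / [(WH)^(beta-1)H^T]
        W *= ((WH ** (beta - 2) * V) @ H.T) / (WH ** (beta - 1) @ H.T + eps)
    return W, H

# Example usage on the magnitude spectrogram of a mixture (file name is a placeholder).
y, sr = librosa.load("mix.wav", sr=None, mono=True)
V = np.abs(librosa.stft(y, n_fft=2048, hop_length=512))
W, H = nmf_multiplicative(V, S=4, beta=1.0)
\end{verbatim}
A small constant is added throughout to avoid divisions by zero during the iterations.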
\subsection{Instrument Labelling}
In this section, an Artificial Neural Network (ANN) is employed to label the extracted instrument sounds. The process involves training a neural network to classify the spectrograms of various musical instruments.
\begin{itemize}
\item \textbf{What is an ANN?}\\
Artificial Neural Networks (ANNs) are computational models inspired by the biological neural networks of the human brain. They consist of interconnected nodes, or neurons, organized into layers. Each neuron receives input signals, processes them using weighted connections and activation functions, and produces an output signal. ANNs are characterized by their ability to learn complex patterns and relationships from data, making them powerful tools for tasks such as classification, regression, and pattern recognition. The input layer of an ANN receives data, which is then passed through one or more hidden layers where computations occur. The final output layer produces the network's prediction or classification.\\
ANNs have demonstrated remarkable success in various domains, including image and speech recognition, natural language processing, autonomous vehicles, and financial forecasting. Their versatility and ability to learn from data make them valuable tools for solving complex problems and driving advancements in artificial intelligence.
\item \textbf{Data Preprocessing} (a code sketch of this pipeline is given at the end of this section)\\
Data preprocessing is an essential step in preparing audio data for further analysis or modeling. Initially, the audio files of various musical instruments are loaded into memory by opening and reading them. This ensures that the audio data is readily available for processing. Subsequently, to ensure consistency across different audio samples, the channels are adjusted through rechanneling. This step ensures that all audio samples have a uniform number of channels, whether stereo or mono, facilitating standardized processing.\\
Following rechanneling, the audio data is resampled to a standardized sampling rate. This process maintains uniformity in the temporal resolution of the audio signals, enabling consistent processing across different samples. Additionally, to standardize the input sizes for subsequent analysis, the audio duration is padded or truncated to a fixed length, typically around 6 seconds. This step ensures that all audio samples have consistent durations, facilitating streamlined processing and analysis. Finally, the audio data is transformed into spectrograms using Mel-frequency spectrogram transformation. Spectrograms provide a time-frequency representation of the audio signals, capturing both temporal and frequency information essential for subsequent analysis, such as feature extraction and classification. Together, these preprocessing steps lay the groundwork for effective analysis and modeling of audio data, enabling robust insights and applications in various domains.
\item \textbf{Data Collection}\\
After preprocessing the audio data, spectrograms are generated for each instrument category. This involves obtaining spectrogram representations for individual musical instruments such as violin, mohanveena, sitar, flute, drums, etc. each extracted from their corresponding datasets. Spectrograms provide a detailed visual representation of the frequency content of audio signals over time, offering insights into the unique characteristics of each instrument's sound. These spectrograms capture essential features that distinguish one instrument from another, such as harmonic structure, timbre, and temporal dynamics.\\
Additionally, each spectrogram is paired with a label indicating its instrument category. This association between spectrograms and labels forms the basis of supervised learning, enabling the development of machine learning models capable of distinguishing between different instrument categories. By leveraging labeled spectrogram data, these models can learn to identify patterns and features specific to each instrument, thereby facilitating accurate classification. Overall, the combination of spectrogram data and associated labels allows for the creation of robust classification models capable of accurately categorizing audio signals based on their instrument sources.
\item \textbf{Data Labelling}
In the data labeling phase, each spectrogram obtained from the preprocessing stage is assigned a label corresponding to its instrument category. For example, spectrograms representing violin sounds are labeled as "violin," those depicting mohanveena sounds are labeled as "mohanveena," and those representing sitar sounds are labeled as "sitar." These labels serve as ground truth information, providing the neural network with the correct classification for each spectrogram during the training process.\\
The assigned labels play a crucial role in training the neural network effectively. By associating each spectrogram with its correct instrument category, the network learns to recognize distinctive features and patterns inherent to each instrument type. During training, the network adjusts its parameters to minimize the discrepancy between its predictions and the true labels provided in the training data. This process enables the network to learn the underlying relationships between the spectrogram features and their corresponding instrument categories, ultimately improving its ability to accurately classify unseen spectrograms in real-world scenarios.
\item \textbf{Model Training}\\
In the model training phase, an Artificial Neural Network (ANN) is developed specifically for instrument classification tasks. This ANN model is structured with multiple dense layers, allowing for complex transformations of the input data. Activation functions such as Leaky ReLU and ReLU are commonly employed within these layers to introduce non-linearity, enabling the network to capture intricate relationships between input features and output classes. Additionally, dropout layers are integrated into the model architecture to mitigate overfitting issues by randomly dropping a fraction of connections during training, thereby promoting generalization to unseen data.\\
The output layer of the ANN utilizes softmax activation, facilitating the generation of probability distributions across the various instrument classes. This final layer transforms the network's raw output into a set of probabilities, with each probability indicating the likelihood of the input spectrogram belonging to a specific instrument category. During the training process, the model iteratively adjusts its internal parameters using optimization techniques such as backpropagation and gradient descent to minimize the discrepancy between its predicted probabilities and the true labels provided in the training data. This iterative learning process enables the ANN to effectively learn discriminative features from the spectrogram data and accurately classify unseen instrument samples.\\
\begin{enumerate}
\item \textbf{ReLU: }Rectified Linear Unit, is a popular activation function used in neural networks. It is defined as:
\begin{equation}
f(x) = \max(0, x)
\end{equation}
which essentially means that it outputs the input value $x$ if it is positive, and zero otherwise. In simpler terms, ReLU acts as an on-off switch: if the input is positive, it lets the input through unchanged, but if the input is negative, it outputs zero. This simple mathematical operation introduces non-linearity to the network, allowing it to learn complex patterns and relationships in the data.
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{ReLU.png}
\caption{ReLU activation function}
\label{ReLU}
\end{figure}
\newpage
\item \textbf{Leaky ReLU: }Leaky Rectified Linear Unit, is a variation of the Rectified Linear Unit (ReLU) activation function commonly used in artificial neural networks (ANNs). It addresses a limitation of traditional ReLU functions known as the "dying ReLU" problem. In standard ReLU, neurons can sometimes become inactive for certain inputs, resulting in dead neurons that never activate again, causing issues with training and convergence.\\
The Leaky ReLU function is defined as:
\begin{equation}
f(x) = \max(\alpha x, x)
\end{equation}
where $\alpha$ is a small positive slope coefficient applied to negative inputs. Typically, $\alpha$ is set to a small constant, such as 0.01. Unlike the standard ReLU function, which sets negative inputs to zero, Leaky ReLU allows a small, non-zero gradient for negative inputs. This small gradient prevents neurons from becoming completely inactive, effectively addressing the "dying ReLU" problem.\\
By introducing a slight slope for negative inputs, Leaky ReLU ensures that even neurons with negative inputs contribute to the gradient during backpropagation. This encourages better gradient flow through the network, leading to improved learning dynamics and faster convergence during training.\\
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{Leaky_ReLU.png}
\caption{Leaky ReLU activation function}
\label{Leaky_ReLU}
\end{figure}
\item \textbf{Softmax: }Softmax activation function is a commonly used activation function in neural networks, particularly in the output layer for multi-class classification tasks. It's a type of exponential function that normalizes the output of a network to a probability distribution over multiple classes.\\
Mathematically, given an input vector $\mathbf{z} = (z_1, z_2, ..., z_k)$, the softmax function computes the output vector $\sigma(\mathbf{z}) = (\sigma(z_1), \sigma(z_2), ..., \sigma(z_k))$, where each element $\sigma(z_i)$ is defined as:
\begin{equation}
\sigma(z_i) = \frac{e^{z_i}}{\sum_{j=1}^{k} e^{z_j}}
\end{equation}
Here, $e$ is the base of the natural logarithm (Euler's number), $z_i$ is the input to the $i$-th neuron in the output layer, and $k$ is the total number of classes. The softmax function exponentiates each input element and divides it by the sum of all exponentiated inputs, ensuring that the output values lie in the range $[0, 1]$ and sum up to 1, representing a valid probability distribution.
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{Softmax.png}
\caption{Softmax function}
\label{softmax}
\end{figure}\\
In neural networks, the softmax function is typically used in the output layer to compute the probabilities of each class, making it suitable for multi-class classification tasks. The class with the highest probability is then predicted as the output class. The softmax function's output can also be interpreted as confidence scores for each class, providing insights into the model's uncertainty about its predictions.
\end{enumerate}
\end{itemize}
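The preprocessing pipeline described in the Data Preprocessing item above can be sketched with torchaudio as follows. The 44.1 kHz rate, stereo rechanneling, and six-second duration follow the text, while the function name and the choice of 13 MFCCs are illustrative assumptions.
\begin{verbatim}
import torch
import torchaudio

TARGET_SR, TARGET_CHANNELS, TARGET_SECONDS = 44_100, 2, 6

def preprocess(path):
    waveform, sr = torchaudio.load(path)            # (channels, samples)
    if waveform.shape[0] == 1:                      # rechannel mono -> stereo
        waveform = waveform.repeat(TARGET_CHANNELS, 1)
    else:
        waveform = waveform[:TARGET_CHANNELS]
    if sr != TARGET_SR:                             # resample to 44,100 Hz
        waveform = torchaudio.transforms.Resample(sr, TARGET_SR)(waveform)
    target_len = TARGET_SR * TARGET_SECONDS         # pad or truncate to 6 seconds
    if waveform.shape[1] < target_len:
        waveform = torch.nn.functional.pad(waveform,
                                           (0, target_len - waveform.shape[1]))
    else:
        waveform = waveform[:, :target_len]
    mfcc = torchaudio.transforms.MFCC(sample_rate=TARGET_SR, n_mfcc=13)(waveform)
    return mfcc                                     # (channels, n_mfcc, frames)
\end{verbatim}
The resulting tensor can then be flattened and passed to a dense classifier such as the one sketched earlier.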
\section{Other Approaches}
In addition to NMF and Artificial Neural Networks (ANNs), other notable approaches in audio signal processing include Mel-Frequency Cepstral Coefficients (MFCC) with spectral subtraction and Wiener filtering, Discrete Wavelet Transform (DWT), and Linear Predictive Coefficients (LPC). MFCC, coupled with noise reduction techniques, provides robust feature extraction and noise reduction capabilities crucial for tasks like speech recognition. DWT offers multi-resolution analysis, while LPC models spectral envelopes for speech synthesis and recognition.
\subsection{MFCC with Spectral Subtraction and Wiener Filtering}
\textbf{Mel-Frequency Cepstral Coefficients (MFCCs):} MFCCs are a widely used feature extraction technique in audio signal processing and speech recognition. They are derived from the short-term power spectrum of a signal and mimic the human auditory system's response to sound. The process involves several steps, including applying a mel-scale filterbank to the power spectrum, taking the discrete cosine transform (DCT) of the log filterbank energies, and selecting a subset of the resulting coefficients as features. MFCCs are valued for their ability to capture the essential characteristics of audio signals in a compact representation, making them suitable for tasks such as speaker recognition, speech detection, and music genre classification.\\
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{mfcc.png}
\caption{Block Diagram of MFCC processor}
\label{MFCC}
\end{figure}\\
The formula used to calculate the mels for any frequency is:
\begin{equation}
\text{mel}(f) = 2595 \times \log_{10}\left(1 + \frac{f}{700}\right)
\end{equation}
Where:\\
$mel(f)$ is the frequency in mels\\
$f$ is the frequency in $Hz$\\
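For example, a pure tone at $f = 1000$ Hz maps to $2595 \times \log_{10}(1 + 1000/700) \approx 1000$ mels, which is the reference point used to anchor the mel scale.\\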
The MFCCs are calculated using this equation:\\
\begin{equation}
\hat{C}_n = \sum_{k=1}^{K} \left(\log \hat{S}_k \right) \cos\left[n\left(k-\frac{1}{2}\right)\frac{\pi}{K}\right], \qquad n = 1, 2, \ldots, K
\end{equation}
Where:\\
$K$ is the number of mel filterbank channels (and hence of mel cepstrum coefficients),\\
$\hat{S}_k$ is the output of the $k$-th filterbank channel and\\
$\hat{C}_n$ is the $n$-th MFCC coefficient.\\ \\
\textbf{Spectral Subtraction: }It works by estimating the noise present in an audio signal and subtracting this estimated noise spectrum from the original signal spectrum. The basic idea is to compute an estimate of the noise spectrum during periods of silence or low-level audio, then subtract this estimate from the spectrum of the entire signal. This process can effectively remove stationary noise components but may introduce musical noise artifacts, especially in regions where the signal-to-noise ratio is low.\\
\textbf{Wiener Filtering:} It is a more advanced technique that aims to minimize the mean square error between the original clean signal and the processed signal. It operates in the frequency domain and involves calculating a time-varying filter based on the power spectral densities of the clean signal and the noise. The Wiener filter adapts its coefficients based on the characteristics of the input signal and noise, attempting to minimize distortion while reducing noise. Wiener filtering is effective in both stationary and non-stationary noise environments and generally produces better results compared to spectral subtraction, especially in scenarios with non-uniform noise characteristics.
\subsubsection{Approach Followed}
Firstly, 13 MFCCs are computed from the input audio signal. The MFCC feature vector is obtained by summing the MFCCs along the second axis.\\
The idea is to estimate the noise spectrum and subtract it from the noisy speech spectrum to obtain a clean speech signal. The MFCC feature vector is used to estimate the vocal/non-vocal regions. A threshold is set as half the maximum value of the feature vector, and a binary mask is created where values above the threshold are considered as vocal regions. This mask is then applied to the original audio signal to remove the estimated vocal regions, leaving behind the instrumental sound.\\
Finally, Wiener filtering, a linear filtering technique used for signal denoising and enhancement, is applied to the output of the spectral subtraction step, further enhancing the instrumental sound.\\
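A minimal sketch of the pipeline described above is given below. It assumes librosa and SciPy are used for the audio processing; the file name, the hop length of 512 samples, and the axis along which the MFCCs are summed are illustrative assumptions rather than the exact settings used in the project.
\begin{verbatim}
import numpy as np
import librosa
import soundfile as sf
from scipy.signal import wiener

# Load the mixture (file name is illustrative)
y, sr = librosa.load("input.wav", sr=None, mono=True)

# 13 MFCCs; collapsing the coefficient axis gives one value per frame
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)   # shape: (13, frames)
feature = mfcc.sum(axis=0)

# Frames above half the maximum feature value are treated as vocal
threshold = 0.5 * feature.max()
vocal_frames = feature > threshold

# Expand the frame mask to sample resolution and zero the vocal regions
hop = 512                                 # librosa's default hop length
mask = np.repeat(vocal_frames, hop)[:len(y)]
mask = np.pad(mask, (0, len(y) - len(mask)))
instrumental = y * (~mask)

# Wiener filter smooths the residual and suppresses musical noise
instrumental = wiener(instrumental, mysize=29)

sf.write("instrumental_estimate.wav", instrumental, sr)
\end{verbatim}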
\textbf{Original Waveform and Spectrogram}
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{mfcc_W_ss.png}
\caption{Original Waveform}
\label{Original MFCC Waveform}
\end{figure}
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{mfcc_W_ss1.png}
\caption{Original Spectrogram}
\label{Spectrogram MFCC}
\end{figure}
\begin{itemize}
\item The waveform shows a complex audio signal with a combination of vocals and instrumental sounds.
\item The spectrogram reveals a wide range of frequencies present in the audio, including both low and high frequencies.
\end{itemize}
\textbf{Extracted Instrumental Waveform and Spectrogram}
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{mfcc_W_ss_e.png}
\caption{Extracted Instrumental Waveform}
\label{Extracted MFCC}
\end{figure}
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{mfcc_W_ss_e1.png}
\caption{Extracted Instrument Spectrogram}
\label{Extracted Spectrogram}
\end{figure}
\begin{itemize}
\item The extracted instrumental waveform appears to have successfully removed most of the vocal components, but some residual vocal artifacts may still be present.
\item The extracted instrumental spectrogram shows a reduction in energy in the mid-frequency range, which is typical of vocal signals. However, some high-frequency components are still visible, which could be attributed to percussion or other instrumental sounds.
\end{itemize}
\textbf{Limitations}\\
1. \textbf{Voicing Detection Accuracy}: The accuracy of vocal/non-vocal detection using the MFCC feature vector and the chosen threshold is not optimal for all types of audio signals. Depending on the specific characteristics of the input audio, the threshold needs to be adjusted to improve the separation performance.\\
2. \textbf{Spectral Leakage}: The Spectral Subtraction technique introduces some artifacts and musical noise in the extracted instrumental sound, especially in regions where the vocal and instrumental components overlap in the frequency domain.\\
3. \textbf{Residual Vocal Components}: As observed in the extracted instrumental spectrogram, some high-frequency components that could be associated with vocals or other non-instrumental sounds may still be present in the output.\\
4. \textbf{Limitation to Specific Genres}: The effectiveness of this approach varies depending on the genre and characteristics of the input audio. For example, it may work better for certain genres with distinct instrumental and vocal components, but may struggle with genres where the instrumental and vocal components are heavily intertwined.
\subsection{Discrete Wavelet Transform (DWT)}
The Discrete Wavelet Transform (DWT) is a fundamental tool in digital signal processing (DSP) used for analyzing and processing signals in both time and frequency domains. It decomposes a signal into different frequency components, revealing information about its time-frequency localization. Unlike the Fourier Transform, which provides only global frequency information with no indication of when those frequencies occur, the DWT achieves a joint time-frequency decomposition by applying a set of basis functions known as wavelets to the signal.\\
The DWT offers several advantages, including its ability to capture transient features with high resolution in time and frequency, as well as its efficient representation of signals with localized features. These properties make it widely used in various applications such as signal denoising, compression, feature extraction, and time-frequency analysis.
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{DWT.png}
\caption{Block Diagram of DWT }
\label{DWT}
\end{figure}
\subsubsection{Approach Followed}
The audio signal is first loaded into the system to obtain the digital representation of the sound, which includes the amplitude of the audio at various time intervals.\\
The core of the methodology is the application of the Discrete Wavelet Transform to the audio signal. DWT is a mathematical process that decomposes a signal into a set of basis functions called wavelets. Unlike Fourier transforms, which only analyze the frequency components, DWT provides both frequency and location information. This means it can give us an understanding of what frequencies are present in the signal and at what points in time they occur.\\
The DWT is performed using a specific type of wavelet ('db1', the first-order Daubechies wavelet, equivalent to the Haar wavelet, in this case), and the signal is decomposed to a certain level (level 3 in the code). Each level of decomposition separates the signal into low-frequency components (approximation coefficients) and high-frequency components (detail coefficients).\\
After decomposition, the methodology involves manipulating the wavelet coefficients. Specifically, all the detail coefficients (which correspond to higher frequency components) from level 1 to the final level are set to zero. This is based on the assumption that instrumental sounds primarily exist in the lower frequencies of the signal, whereas noise and other non-instrumental components such as fast transients or high-pitched tones are represented in the higher frequencies.\\
With the high-frequency detail coefficients zeroed, the signal is reconstructed using only the low-frequency approximation coefficients. This is done through an inverse wavelet transform. The reconstructed signal now has a smoother waveform, as seen in the plots, indicating that it is largely devoid of the high-frequency components that were removed.\\
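A minimal sketch of this procedure using the PyWavelets package is shown below; the input file name is illustrative, while the wavelet ('db1') and decomposition level (3) follow the description above.
\begin{verbatim}
import numpy as np
import pywt
import librosa
import soundfile as sf

# Load the mixture (file name is illustrative)
y, sr = librosa.load("input.wav", sr=None, mono=True)

# 3-level DWT with the 'db1' (Haar) wavelet
coeffs = pywt.wavedec(y, wavelet="db1", level=3)

# coeffs = [cA3, cD3, cD2, cD1]: zero every detail band, keep the approximation
coeffs = [coeffs[0]] + [np.zeros_like(c) for c in coeffs[1:]]

# Inverse DWT reconstructs a smoothed, low-frequency-dominated signal
reconstructed = pywt.waverec(coeffs, wavelet="db1")[:len(y)]

sf.write("dwt_instrumental_estimate.wav", reconstructed, sr)
\end{verbatim}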
\textbf{Original Waveform and Spectrogram}
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{dwt.png}
\caption{Original Waveform}
\label{Waveform}
\end{figure}
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{dwt_s.png}
\caption{Original Spectrogram}
\label{Original Spectrogram}
\end{figure}\\
The plot represents the original audio signal. Its amplitude varies significantly over time, indicating a mix of different sounds, including both instrumental and vocal elements.\\
\textbf{Extracted Instrumental Waveform and Spectrogram}
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{dwt_e.png}
\caption{DWT Extracted Instrumental Waveform}
\label{DWT Waveform}
\end{figure}\\\\
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{dwt_s_e.png}
\caption{DWT Extracted Instrumental Spectrogram}
\label{DWT Spectrogram}
\end{figure}\\
The plot shows the waveform after the DWT-based extraction. Compared to the original, the waveform seems smoother with fewer abrupt changes in amplitude. This suggests that some of the higher frequency components, which may represent noise or non-instrumental parts, have been removed.\\
\textbf{Limitations}
\begin{enumerate}
\item \textbf{Loss of Detail:} By zeroing out all detail coefficients, there is a potential loss of fine detail in the audio signal. Some of these details might be subtle instrumental sounds that are now missing.
\item \textbf{Choice of Wavelet and Decomposition Level:} The choice of db1 and a decomposition level of 3 is arbitrary and might not be optimal for all types of audio signals. Different instruments and genres may require different settings for best results.
\item \textbf{Non-instrumental Low-frequency Components:} Low-frequency non-instrumental components, such as background hum or vocal elements, could still be present since only high-frequency components are discarded.
\item \textbf{Artifact Introduction:} The inverse transform can introduce artifacts, as the reconstructed signal might not perfectly match the original audio's characteristics.
\item \textbf{Over-Simplification:} Assuming that instrumental sounds only exist in the low-frequency range is an over-simplification. Some instruments produce rich harmonics that extend into higher frequencies.
\end{enumerate}
\subsection{Linear Prediction Coefficients (LPC)}
In Digital Signal Processing (DSP), Linear Prediction Coefficients (LPC) are a method used to model the spectral envelope of a signal, particularly in speech and audio processing. The LPC model assumes that a given signal can be approximated by the output of a linear filter driven by a stochastic or random input. This stochastic input is typically modeled as white noise.\\
The LPC analysis involves estimating the coefficients of an all-pole linear predictive filter that best predicts the current sample of the signal based on its past samples. This prediction is usually achieved through techniques like autocorrelation or covariance methods, or through algorithms such as the Levinson-Durbin recursion.\\
Once the LPC coefficients are obtained, they can be used to represent the spectral characteristics of the signal, including its resonant frequencies or formants. These coefficients are particularly useful in speech processing applications such as speech coding, speech synthesis, and speech recognition, where they provide a compact representation of the signal's spectral information, allowing for efficient compression and analysis.
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{lpc_a.png}
\caption{Block Diagram of LPC}
\label{LPC}
\end{figure}\\
\subsubsection{Approach Followed}
LPC operates by calculating coefficients that predict the current sample of an audio signal as a linear combination of previous samples. These coefficients are estimated from the audio signal ($y$) itself for a specified \emph{order}. The \emph{order} determines the number of coefficients, or the "memory" of the LPC filter, with higher orders allowing for more complex signal representations.\\
The LPC coefficients are then used to create an inverse filter that, when convolved with the original signal, yields a residual signal. This residual signal represents the difference between the predicted signal and the actual signal. Ideally, this should contain elements that are not predictable linearly, which often correspond to the harmonic structure of the voice.\\
The residual signal is considered to be the vocal track, while the instrumental track is obtained by subtracting the residual signal from the original audio signal. This separation is based on the assumption that the instrumental track will be more predictable (hence better represented by the LPC model) than the vocals.\\
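A minimal sketch of this separation is given below; it assumes librosa's LPC routine, and the order of 16 and the file names are illustrative choices rather than the project's exact settings.
\begin{verbatim}
import librosa
import soundfile as sf
from scipy.signal import lfilter

y, sr = librosa.load("input.wav", sr=None, mono=True)

order = 16                          # illustrative LPC order
a = librosa.lpc(y, order=order)     # all-pole coefficients, a[0] == 1
residual = lfilter(a, [1.0], y)     # inverse (prediction-error) filter

# Per the approach above: residual ~ vocals, remainder ~ instruments
n = min(len(y), len(residual))
vocals = residual[:n]
instruments = y[:n] - vocals

sf.write("lpc_vocals.wav", vocals, sr)
sf.write("lpc_instruments.wav", instruments, sr)
\end{verbatim}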
\textbf{Original Waveform}\\
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{lpc.png}
\caption{Original Waveform}
\label{fig: Original Waveform}
\end{figure}
\begin{itemize}
\item The original signal plot shows the full complexity of the waveform, with a mixture of high and low amplitudes corresponding to different elements of the music.
\item Peaks and troughs are indicative of the rhythmic and melodic content present in the audio.
\end{itemize}
\textbf{Vocals using LPC}
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{lpc_v.png}
\caption{Vocals using LPC}
\label{Vocals_LPC}
\end{figure}
\begin{itemize}
\item The LPC-extracted vocal plot seems to have a reduced dynamic range compared to the original signal, which suggests that some of the instrumental components have been subtracted out.
\item The waveform appears more centered around the zero line, potentially indicating that the vocal extraction may include artifacts or that the vocals themselves do not have as wide a variation in amplitude as the original mix.
\end{itemize}
\textbf{Instrumental Waveform}
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{lpc_i.png}
\caption{Instruments using LPC}
\label{Instruments_LPC}
\end{figure}
\begin{itemize}
\item The plot for the instruments extracted using LPC resembles the original signal's structure more closely, but with less 'noise' or variation in the peaks and valleys, suggesting that some of the less predictable elements (likely the vocals) have been removed.
\item There is a noticeable preservation of the instrumental harmonic structure, with clear rhythmic patterns still visible.
\end{itemize}
\textbf{Limitations}\\
\begin{enumerate}
\item \textbf{Assumptions about Signal Predictability:} The effectiveness of this method relies on the assumption that instrumental sounds are more predictable and stable than the vocals, which is not always the case.
\item \textbf{Artifacts from LPC Processing:} The process can introduce artifacts, especially if the order of the LPC is not correctly chosen. A higher order may lead to overfitting, capturing noise as part of the signal.
\item\textbf{Dependency on LPC Order:} The choice of LPC order is crucial. Too low of an order might not capture the complexity of the instruments, while too high could mean capturing transient vocal noises in the instrumental track.
\item \textbf{Residual and Original Signal Length Matching:} The lengths of the residual and original signals are truncated to match, which might result in some loss of information at the end of the signals.
\item\textbf{Potential for Vocal Artifacts in Instrumental Track:} Since the method subtracts the vocal track from the original, any errors in the vocal extraction are directly imposed on the instrumental track.
\end{enumerate}
\newpage
\chapter{Result and Analysis}
This chapter presents the results obtained from the implementation of the project's methodology, focusing on the song matching using cross-correlation and instrument sound extraction using Non-negative Matrix Factorization (NMF) with Kullback-Leibler (KL) divergence, followed by labeling the extracted sounds using an Artificial Neural Network (ANN).
\section{Testing and Analysis Phase}
During the testing phase of the project, the effectiveness and accuracy of the proposed methods for identifying songs based on partial auditory cues, as well as for separating vocals and instrument sounds from the audio tracks, are evaluated.
\subsection{Song Identification}
The cross-correlation technique was implemented to match the humming input with segments of songs. The songs are segmented into 3-second segments, and the implemented code computes the cross-correlation between the input audio and each segment, identifying segments with high similarity scores. By analyzing the cross-correlation values, the best matching segment was determined, enabling identification of the corresponding song. A threshold value was used to filter out segments with low cross-correlation scores, and the segment with the highest score above the threshold was selected as the best match, returning its corresponding matching score.
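A simplified sketch of this matching step is shown below; the sampling rate, the normalization of the correlation score, the threshold value, and the file names are illustrative assumptions, not the exact values used in the project.
\begin{verbatim}
import numpy as np
import librosa

def best_matching_segment(query_path, song_path, seg_dur=3.0, threshold=0.1):
    """Return (segment index, score) of the segment best matching the query."""
    q, sr = librosa.load(query_path, sr=22050, mono=True)
    s, _ = librosa.load(song_path, sr=22050, mono=True)

    seg_len = int(seg_dur * sr)
    best_idx, best_score = None, threshold
    for start in range(0, len(s) - seg_len + 1, seg_len):
        seg = s[start:start + seg_len]
        # Peak of the normalized cross-correlation as the similarity score
        cc = np.correlate(seg, q, mode="full")
        score = np.max(np.abs(cc)) / (np.linalg.norm(seg) * np.linalg.norm(q) + 1e-12)
        if score > best_score:
            best_idx, best_score = start // seg_len, score
    return best_idx, best_score

# e.g. idx, score = best_matching_segment("hum.wav", "pp.wav")
\end{verbatim}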
\subsubsection{Testing}
For testing the approach a sample audio file is used with the following specifications:
\begin{itemize}
\item \textbf{Input audio:} \emph{pp.wav}
\item \textbf{Duration:} 30 seconds
\item \textbf{Testing audio duration}: 5 seconds
\item \textbf{Duration of each segment:} 3 seconds
\end{itemize}
\textbf{Expected Output}\\
By manual inspection, the expected output is segment 8, since the testing audio is hummed in the same way as that segment; a total of 10 segments are generated.
\subsubsection{Analysis}
\begin{figure}[H]
\centering
\begin{minipage}{0.45\textwidth}
\centering
\includegraphics[width=\textwidth]{noMatchSegment.png} % First image file
\caption{Unmatched Segment}
\label{fig:image1}
\end{minipage}\hfill
\begin{minipage}{0.45\textwidth}
\centering
\includegraphics[width=\textwidth]{matchedSegment.png} % Second image file
\caption{Matched Segment}
\label{fig:image2}
\end{minipage}
\end{figure}
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{cc.png}
\caption{Cross-Correlation Output}
\label{CC_Output}
\end{figure}
\begin{itemize}
\item The cross-correlation technique effectively identified potential matches between the input audio and the song segments.
\item Segments with a high degree of spectral similarity and a high cross-correlation value, like Segment 8 in Figure \ref{fig:image2}, are more likely to be identified as potential matches for the input audio.
\item Segments with significant differences in the frequency spectra and a low cross-correlation value, such as Segment 6 in Figure \ref{fig:image1}, are less likely to be considered as matches due to the poorer correlation.
\end{itemize}
Applying a threshold value helped eliminate false positives and ensure that only segments with a sufficiently high correlation were considered as potential matches. The segment with the highest cross-correlation score above the threshold was determined as the best match, providing the corresponding song identification along with the matching score. The frequency spectrum analysis complements the cross-correlation technique, providing an additional perspective on the matching process and aiding in the accurate identification of songs from the hummed input.
\subsection{Instrument Sound Extraction}
The non-negative matrix factorization (NMF) technique was applied to separate the vocal and instrumental components from the identified song. After obtaining the extracted instrumental sounds, a low-pass filter of order 5 with a cutoff frequency of 2000$Hz$ was employed. \\
This filtering step aimed to remove high-frequency noise and artifacts, thereby enhancing the clarity and quality of the instrumental sounds. The low-pass filter attenuated unwanted high-frequency components while preserving the essential tonal qualities and harmonic structure of the instruments. The filtered signals exhibited an improved signal-to-noise ratio (SNR) and a more focused frequency spectrum. These improvements facilitated subsequent processing steps, enabling accurate instrument recognition and labeling.
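The low-pass filtering step can be sketched with SciPy as follows; the zero-phase \texttt{filtfilt} call is an assumption (a causal \texttt{lfilter} would work equally well), while the order of 5 and the 2000 Hz cutoff follow the values stated above.
\begin{verbatim}
from scipy.signal import butter, filtfilt

def lowpass(signal, sr, cutoff_hz=2000.0, order=5):
    # 5th-order Butterworth low-pass with a 2000 Hz cutoff,
    # applied forwards and backwards for zero phase distortion
    nyquist = 0.5 * sr
    b, a = butter(order, cutoff_hz / nyquist, btype="low")
    return filtfilt(b, a, signal)

# e.g. cleaned_source = lowpass(extracted_source, sr)
\end{verbatim}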
\subsubsection{Testing}
For testing the approach a sample audio file is used with the following specifications:
\begin{itemize}
\item \textbf{Input Audio:} test.wav
\item \textbf{Duration:} 16 seconds
\item \textbf{Instrument 1:} Violin
\item \textbf{Instrument 2:} Guitar
\item \textbf{Instrument 3:} Drums
\end{itemize}
\textbf{Expected Output}\\
We expect to obtain distinct audio files containing the extracted sounds of the violin, guitar, and drums, with each instrument isolated into its own audio file through the model's extraction process.
\subsubsection{Analysis}
\begin{figure}[H]
\centering
\begin{minipage}{0.45\textwidth}
\centering
\includegraphics[width=\textwidth]{spectrogram_sound_source_1.png} % First image file
\caption{Spectrogram: Source 1}
\label{fig: source1}
\end{minipage}\hfill
\begin{minipage}{0.5\textwidth}
\centering
\includegraphics[width=\textwidth]{waveform_sound_source_1.png} % Second image file
\caption{Waveform: Source 1}
\label{fig: wsource1}
\end{minipage}
\end{figure}
\begin{figure}[H]
\centering
\begin{minipage}{0.45\textwidth}
\centering
\includegraphics[width=\textwidth]{spectrogram_sound_source_2.png} % First image file
\caption{Spectrogram: Source 2}
\label{fig: source2}
\end{minipage}\hfill
\begin{minipage}{0.5\textwidth}
\centering
\includegraphics[width=\textwidth]{waveform_sound_source_2.png} % Second image file
\caption{Waveform: Source 2}
\label{fig: wsource2}
\end{minipage}
\end{figure}
\begin{figure}[H]
\centering
\begin{minipage}{0.45\textwidth}
\centering
\includegraphics[width=\textwidth]{spectrogram_sound_source_3.png} % First image file
\caption{Spectrogram: Source 3}
\label{fig: source3}
\end{minipage}\hfill
\begin{minipage}{0.5\textwidth}
\centering
\includegraphics[width=\textwidth]{waveform_sound_source_3.png} % Second image file
\caption{Waveform: Source 3}
\label{fig: wsource3}
\end{minipage}
\end{figure}
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{inst_ext.png}
\caption{Extraction Output}
\label{fig:output_ext}
\end{figure}
\begin{itemize}
\item \textbf{Optimization of Threshold and Maximum Iterations:} Increasing the threshold to 0.05 and setting the maximum iterations for NMF to 10000 significantly improved the extraction of distinct instrument sounds. This adjustment led to clearer separation of sources within the audio signal, enhancing fidelity in the reconstructed sounds.
\item \textbf{Distinctiveness of Extracted Sounds:} Waveform analysis revealed distinct and identifiable characteristics for each instrument sound. Though some mixed signals were present intermittently, the majority of separated sources exhibited clear features, enabling accurate identification of individual instrument sounds.
\item \textbf{Effectiveness of Low-Pass Filters:} Incorporating low-pass filters refined the extracted audio signals by reducing high-frequency noise. This refinement enhanced the clarity and fidelity of the reconstructed sounds, maintaining their original characteristics and improving overall output quality.
\item \textbf{Presence of Minor Traces:} Despite optimization efforts, minor traces persisted in the extracted sounds, possibly from residual noise or artifacts. However, the overall output remained satisfactory, demonstrating the model's effectiveness in isolating and extracting instrument sounds.
\end{itemize}
The model's effectiveness in extracting instrument sounds from audio files using NMF with KL divergence is augmented by the incorporation of a low-pass filter with a cutoff frequency of 2000 Hz and an order of 5. This filter plays a crucial role in refining the separated sources by attenuating high-frequency noise. By implementing this filtering process, the model ensures that the reconstructed sounds maintain fidelity to the original instrument sounds while minimizing unwanted artifacts. Consequently, the inclusion of the low-pass filter enhances the overall quality and accuracy of the extracted instrument sounds, contributing to more precise analysis and evaluation of the separated sources.
\subsection{Instruments Labelling}
The methodology for labeling musical instrument sounds involves preprocessing audio data to standardize format and transform it into spectrograms, which serve as input features for a neural network model. This model, comprising densely connected layers with dropout regularization, is trained on a dataset of labeled instrument sounds to learn the mapping between spectrogram features and instrument class labels. Trained using categorical cross-entropy loss and the Adam optimizer, the model is then evaluated on unseen data to assess its accuracy in labeling instrument sounds.
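A minimal Keras sketch of such a classifier is given below; the layer widths, dropout rate, flattened spectrogram dimension, and training hyperparameters are illustrative assumptions rather than the exact architecture used in the project.
\begin{verbatim}
import tensorflow as tf

num_classes = 3            # e.g. violin, mohanveena, sitar
input_dim = 128 * 128      # flattened spectrogram size (illustrative)

model = tf.keras.Sequential([
    tf.keras.Input(shape=(input_dim,)),
    tf.keras.layers.Dense(256, activation="relu"),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(num_classes, activation="softmax"),
])

model.compile(optimizer="adam",
              loss="categorical_crossentropy",
              metrics=["accuracy"])

# X_train: flattened spectrograms, y_train: one-hot instrument labels
# model.fit(X_train, y_train, epochs=50, batch_size=16, validation_split=0.2)
\end{verbatim}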
\subsubsection{Testing}
The testing phase involves employing the trained neural network model to classify instrument sounds from unseen audio files.
\begin{itemize}
\item \textbf{Input Audio:} Extracted\_Violin
\end{itemize}
\textbf{Expected Output}\\
The expected output for the extracted violin audio file involves the classification of the sound into its instrument category. For this test case the instrument category should be violin.
\subsubsection{Analysis}
\begin{figure}[h]
\centering
\includegraphics[width=0.75\linewidth]{ann.png}
\caption{Extracted sound labelling}
\label{fig:labeling}
\end{figure}
\begin{itemize}
\item The accuracy of the model appears to be high, with the predicted class aligning with the expected instrument label for the majority of test cases.
\item However, due to the presence of minor noises in the extracted instrument sounds and the relatively small size of the dataset, occasional misclassifications occur.
\item In specific instances, such as the predicted class for 'violin' being identified as 'mohanveena', these misclassifications highlight the need for further refinement and augmentation of the dataset, despite the model's overall accuracy.
\end{itemize}
While the model demonstrates impressive accuracy in instrument classification, challenges such as minor noise interference and dataset size limitations underscore the importance of ongoing optimization efforts to enhance the model's performance and robustness.
\section{Result}
The cross-correlation method applied in the project provides a streamlined and effective approach to song identification, enabling the matching of input audio segments with reference templates. By computing the similarity between spectrograms, it allows for robust detection of songs amidst noise and variations in tempo or pitch. Leveraging sliding windows and template matching, this method accommodates subtle deviations in timing and is well-suited for real-time identification tasks. With appropriate preprocessing and optimization, the cross-correlation method remains a reliable tool for accurate song identification in the project.\\
The instrument extraction process relies on Non-negative Matrix Factorization (NMF) with Kullback-Leibler (KL) divergence, facilitating the separation of constituent sources within audio signals. This method decomposes magnitude spectrograms into distinct sources corresponding to different instruments, allowing for their isolation and extraction. By iteratively updating factor matrices, NMF effectively disentangles overlapping sources and enhances the fidelity of reconstructed sounds. Additionally, incorporating low-pass filters refines the extracted audio signals by attenuating high-frequency noise, preserving the original characteristics of each instrument. Despite its effectiveness, minor traces may persist in the extracted sounds, necessitating further optimization and refinement. Nonetheless, NMF with KL divergence proves instrumental in the project's objective of accurate instrument extraction for subsequent analysis and labeling.\\
The labeling process involves training a deep learning model using spectrogram data from various instrument sources, including violin, mohanveena, and sitar. The model architecture consists of densely connected layers with appropriate activation functions and dropout regularization to prevent overfitting. By leveraging categorical cross-entropy loss and the Adam optimizer, the model learns to classify spectrogram representations of instrument sounds accurately. During testing, the model demonstrates high accuracy in predicting the instrument class for most test cases. However, due to minor noises present in the extracted instrument sounds and the relatively small dataset size, occasional misclassifications occur. For instance, in some cases, the predicted class for 'violin' may be misidentified as 'mohanveena.' Despite these challenges, the model's overall accuracy underscores its efficacy in instrument labeling tasks, albeit with room for further refinement and dataset augmentation.\\ \\
The comprehensive testing and analysis conducted in this chapter offer valuable insights for future development and refinement, enhancing the understanding and optimization of the implemented methods.
\newpage
\chapter{Conclusion}
The primary objective of this project was to develop an innovative system capable of identifying songs based on hummed or sung melodies provided by users, as well as separating and labeling the individual instrumental components within a given song. The implementation of this project involved leveraging advanced digital signal processing (DSP) techniques, such as cross-correlation, spectrogram analysis, Mel-Frequency Cepstral Coefficients (MFCCs), and Non-Negative Matrix Factorization (NMF), combined with machine learning algorithms, specifically Artificial Neural Networks (ANNs).\\
The song identification aspect of the project utilized cross-correlation to match the user's hummed or sung input against segments of songs within a comprehensive database. This approach proved highly effective in recognizing melodies, even when users provided incomplete or imperfect renditions of the original song. By employing cross-correlation, the system could accurately pinpoint the corresponding song, enabling users to rediscover forgotten or partially remembered musical pieces.\\
Furthermore, the project addressed the challenge of separating and labeling individual instrumental components within a song. Through the application of spectrogram analysis, MFCCs, and NMF, the system successfully isolated the vocal and instrumental tracks, enabling their independent analysis. The instrumental components were then labeled using an ANN model trained on a dataset of various instrument samples. This aspect of the project not only facilitated a deeper understanding of the musical composition but also paved the way for potential applications in areas such as music production, remixing, and audio editing.\\
Throughout the development process, various challenges were encountered and overcome. The most significant hurdle involved handling the inherent complexity and variability of audio signals, which required careful preprocessing and feature extraction techniques. Additionally, the training of the ANN model for instrument labeling necessitated a substantial amount of high-quality data, which was addressed through the curation of a comprehensive instrument dataset.\\
The successful implementation of this project has demonstrated the vast potential of combining DSP techniques with machine learning algorithms in the field of music information retrieval and audio processing. The ability to identify songs based on incomplete or imperfect inputs, as well as separate and label individual instrumental components, opens up new avenues for music discovery, appreciation, and analysis.\\
Looking ahead, several avenues for further research and development can be explored. Incorporating additional audio features and advanced neural network architectures could potentially enhance the accuracy and robustness of both the song identification and instrument labeling components. Furthermore, integrating this system with online music platforms or streaming services could provide a seamless and intuitive experience for users seeking to rediscover forgotten songs or gain deeper insights into musical compositions.
In conclusion, this project has successfully achieved its objectives by developing a comprehensive system for song identification and instrumental separation/labeling. The integration of DSP techniques and machine learning algorithms has demonstrated the immense potential of these technologies in the realm of music information retrieval and audio processing. While the current implementation represents a significant achievement, further advancements and refinements in this field hold promise for even more innovative and impactful applications in the future.
\section{Future Enhancement}
While the current implementation of this project has achieved benchmark success in song identification, instrumental separation, and labeling, there remain several avenues for further enhancement and expansion of its capabilities. This section outlines potential future developments that could broaden the project's scope and application domains.
\subsection{Language Conversion and Vocal Embedding}
One exciting prospective enhancement would be the integration of language conversion capabilities for the extracted vocal components. By leveraging advanced speech synthesis and natural language processing techniques, users could potentially convert the lyrics or vocals of a song into different languages or accents of their choice. This feature would enable users to enjoy their favorite songs in their preferred language or dialect, expanding the project's appeal to a global audience.\\
Furthermore, the ability to seamlessly embed the converted vocals back into the instrumental components would provide users with a truly immersive and personalized music experience. This could be achieved by leveraging advanced audio processing algorithms and signal manipulation techniques, ensuring a natural and harmonious blend of the modified vocals with the existing instrumental tracks.
\subsection{Instrument Extraction and Manipulation}
Building upon the current functionality of instrumental separation and labeling, a future enhancement could involve providing users with the ability to selectively extract individual instrument sounds from a song. This would enable users to isolate specific instrumental components, such as guitar riffs, drum patterns, or bass lines, for various creative or educational purposes.
\subsection{User-Driven Vocal Addition and Collaboration}
Expanding on the concept of personalization, a future enhancement could involve enabling users to incorporate their own vocal recordings into existing instrumental tracks or specific instrument sounds. This feature would open up exciting possibilities for collaborative music creation, allowing users to contribute their voices to their favorite songs or instrumental compositions.\\
Furthermore, a collaborative platform could be developed, facilitating the sharing and remixing of user-generated vocal contributions. This would foster a vibrant community of music enthusiasts, enabling them to collaborate, exchange ideas, and collectively shape the evolution of musical compositions.\\ \\
By implementing these enhancements, the project would evolve from a sophisticated song identification and instrumental separation tool into a comprehensive and versatile platform for music exploration, creation, and collaboration. It would empower users to seamlessly integrate their personal creative expression into existing musical works, while also providing them with the means to reshape and reinterpret these compositions in unique and innovative ways.
\begin{thebibliography}{20} % Use the number of references as an argument
\bibitem{1}
Navneet Upadhyay, Abhijit Karmakar, "Speech Enhancement using Spectral Subtraction-type Algorithms: A Comparison and Simulation Study," Procedia Computer Science, vol. 54, pp. 574--584, 2015, ISSN 1877-0509, doi: 10.1016/j.procs.2015.06.066.
\url{https://www.sciencedirect.com/science/article/pii/S1877050915013903}
\bibitem{2}
S. Ajibola Alim and N. Khair Alang Rashid, ‘Some Commonly Used Speech Feature Extraction Algorithms’, From Natural to Artificial Intelligence - Algorithms and Applications. IntechOpen, Dec. 12, 2018. doi: 10.5772/intechopen.80419.
\url{https://www.intechopen.com/chapters/63970}
\bibitem{3}
Remi Swierczek \textit{Music identification system(TUNE HUNTER Inc)} Ser. No. 60/158,087 filed Oct. 7, 1999 and Ser. No. 60/186,565 filed Mar. 2, 2000.
\url{https://patents.google.com/patent/US6941275}
\bibitem{4}
Y. -X. Wang and Y. -J. Zhang, "Nonnegative Matrix Factorization: A Comprehensive Review," in IEEE Transactions on Knowledge and Data Engineering, vol. 25, no. 6, pp. 1336-1353, June 2013, doi: 10.1109/TKDE.2012.51.
\url{https://ieeexplore.ieee.org/abstract/document/6165290}
\bibitem{5}
Lindasalwa Muda, Mumtaj Begam, I. Elamvazuthi, \textit{Voice Recognition Algorithms using Mel Frequency Cepstral Coefficient (MFCC) and Dynamic Time Warping (DTW) Techniques}, 22 March 2010
\url{https://doi.org/10.48550/arXiv.1003.4083}
\bibitem{6}
M. A. Hossan, S. Memon and M. A. Gregory, "A novel approach for MFCC feature extraction," 2010 4th International Conference on Signal Processing and Communication Systems, Gold Coast, QLD, Australia, 2010, pp. 1-5, doi: 10.1109/ICSPCS.2010.5709752. \url{https://ieeexplore.ieee.org/abstract/document/5709752}
\bibitem{7}
H. Gupta and D. Gupta, "LPC and LPCC method of feature extraction in Speech Recognition System," 2016 6th International Conference - Cloud System and Big Data Engineering (Confluence), Noida, India, 2016, pp. 498-502, doi: 10.1109/CONFLUENCE.2016.7508171.
\url{https://ieeexplore.ieee.org/abstract/document/7508171}
\end{thebibliography}
\end{document}