diff --git a/exercise/5-Clustering.tex b/exercise/5-Clustering.tex index 8f6259c..33e1050 100644 --- a/exercise/5-Clustering.tex +++ b/exercise/5-Clustering.tex @@ -1,5 +1,3 @@ -\def\solutionsflag{} - \documentclass[ english, smallborders @@ -900,15 +898,15 @@ \section*{Exercise 1: K-means} \end{enumerate} \end{solution} - +\newpage \section*{Exercise 2: DBSCAN} +\subsection*{Task 1: Basic Terms} + Given is a set of points in a two-dimensional space: \CoordinateSystem{}{(1,1), (1,2), (1,4), (2,1), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{} -\subsection*{Task 1: Basic Terms} - \subsubsection*{Task 1.1: Core Points} Determine whether $(1,1)$, $(2,1)$, $(2,3)$, and $(1,4)$ are \textbf{core points} if a density based clustering algorithm like \textbf{DBSCAN} is initialized with $\varepsilon = 1$ and $MinPts = 2$ and applied on the given point set. The distance is calculated using the Euclidean distance. @@ -1150,7 +1148,7 @@ \subsubsection*{Task 1.3: Density Reachability} \end{itemize} \end{solution} -\paragraph*{Task 1.3.1: Reversal of Density Reachability} \hfill +\paragraph*{Task 1.3.2: Reversal of Density Reachability} \hfill Determine whether $(3,4)$ is density reachable from $(4,4)$ and whether $(4,4)$ is density reachable from $(3,4)$ if a density based clustering algorithm like \textbf{DBSCAN} is initialized with $\varepsilon = 1$ and $MinPts = 3$. The distance is calculated using the Euclidean distance. @@ -1297,176 +1295,476 @@ \subsubsection*{Task 1.4: Density Connectivity} \subsection*{Task 2: Application of DBSCAN} -Apply the \textbf{DBSCAN} algorithm on the given point set with $\varepsilon = 1$ and $MinPts = 2$. +Given is a set of points in a two-dimensional space: + +\CoordinateSystem{}{(1,1), (1,2), (1,4), (2,1), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{} + +Apply the \textbf{DBSCAN} algorithm known from the lecture on the given point set while using $\varepsilon = 1$ and $MinPts = 2$. Write down \textbf{all} intermediate steps. 
\begin{solution} - The DBSCAN algorithm can either be structured recursively or iteratively. +The DBSCAN algorithm can either be structured recursively or iteratively. - In this sample solution, we will structure it iteratively, as it is less nested and therefore easier to write down. +In this sample solution, we will structure it iteratively, as it is less nested and therefore easier to write down. - \begin{enumerate} - \item \textbf{Select a Random Point:} +\begin{enumerate} +\item \textbf{Select $(1,2)$ as Random Point:} - In this sample solution, we will start with point $(1,2)$. +Every point can be selected as the starting point. In this sample solution, we randomly decided to use $(1,2)$ as the starting point: - \begin{enumerate} - \item \textbf{For $(1,2)$:} +\begin{enumerate} + \item \textbf{Mark $(1,2)$ as Visited:} - \begin{enumerate} - \item \textbf{Mark $(1,2)$ as Visited:} + Points should only be visited once. This is important to avoid infinite loops: - Points should only be visited once. 
This is important to avoid infinite loops: + \CoordinateSystemForDBSCAN{(1,1), (1,4), (2,1), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,2)}{(1,2)}{}{}{} - \CoordinateSystemForDBSCAN{(1,1), (1,4), (2,1), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,2)}{(1,2)}{}{}{} + \item \textbf{Check if $(1,2)$ is a Core Point:} - \item \textbf{Check if $(1,2)$ is a Core Point:} + If there are at least $2$ points in the $\varepsilon$-neighborhood of $(1,2)$, $(1,2)$ is a core point: - If there are at least $2$ points in the $\varepsilon$-neighborhood of $(1,2)$, $(1,2)$ is a core point: + \begin{alignat*}{2} + Distance_{(1,2)\leftrightarrow(1,2)} & = \sqrt{(1-1)^2+(2-2)^2} = 0 \\ + Distance_{(1,2)\leftrightarrow(1,1)} & = \sqrt{(1-1)^2+(2-1)^2} = 1 \\ + \end{alignat*} - \begin{alignat*}{2} - Distance_{(1,2)\leftrightarrow(1,2)} & = \sqrt{(1-1)^2+(2-2)^2} = 0 \\ - Distance_{(1,2)\leftrightarrow(1,1)} & = \sqrt{(1-1)^2+(2-1)^2} = 1 \\ - \end{alignat*} + Therefore, $(1,2)$ is a core point. - Therefore, $(1,2)$ is a core point. 
+ \dotfill - \dotfill + Shown in the coordinate system: - Shown in the coordinate system: + \CoordinateSystemWithCircle{(1,1), (1,2)}{(1,4), (2,1), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,2)}{1} - \CoordinateSystemWithCircle{(1,1), (1,2)}{(1,4), (2,1), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,2)}{1} + \dotfill - \dotfill + \item \textbf{Create a New Cluster and Add $(1,2)$:} - \item \textbf{Create a New Cluster and Add $(1,2)$:} + In this sample solution, we simply name this cluster "$0$": - In this sample solution, we simply name this cluster "$0$": + \CoordinateSystemForDBSCAN{(1,1), (1,4), (2,1), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,2)}{}{(1,2)}{}{} - \CoordinateSystemForDBSCAN{(1,1), (1,4), (2,1), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,2)}{}{(1,2)}{}{} + \item \textbf{Add Points in the $\varepsilon$-Neighborhood of $(1,2)$ to the Candidate Set $N$:} - \item \textbf{Add Points in the $\varepsilon$-Neighborhood of $(1,2)$ to the Candidate Set $N$:} + Only $(1,1)$ is an unvisited point in the $\varepsilon$-neighborhood of $(1,2)$: - Only $(1,1)$ is an unvisited point in the $\varepsilon$-neighborhood of $(1,2)$: + \begin{alignat*}{2} + Distance_{(1,2)\leftrightarrow(1,1)} & = \sqrt{(1-1)^2+(2-1)^2} = 1 \\ + \end{alignat*} - \begin{alignat*}{2} - Distance_{(1,2)\leftrightarrow(1,1)} & = \sqrt{(1-1)^2+(2-1)^2} = 1 \\ - \end{alignat*} + So only $(1,1)$ is added to the candidate set $N$: - So only $(1,1)$ is added to the candidate set $N$: + \begin{alignat*}{2} + N & = \{(1,1)\} \\ + \end{alignat*} - \begin{alignat*}{2} - N & = \{(1,1)\} \\ - \end{alignat*} - \end{enumerate} - \end{enumerate} - \item \textbf{Iterate through the Candidate Set $N$ (for Cluster $0$):} - The candidate set $N$ is iterated through until it is empty. 
All points visited in this iteration will be added to cluster $0$, since this iteration was started with point $(1,2)$: + \item \textbf{Iterate through the Candidate Set $N$ (for Cluster $0$):} - \begin{enumerate} - \item \textbf{For $(1,1)$:} + The candidate set $N$ is iterated through until it is empty. All points visited in this iteration will be added to cluster $0$, since this iteration was started with point $(1,2)$: - \begin{enumerate} - \item \textbf{Remove $(1,1)$ from the Candidate Set $N$:} + \begin{enumerate} + \item \textbf{For $(1,1)$:} - \begin{alignat*}{2} - N & = \{\} \\ - \end{alignat*} + \begin{enumerate} + \item \textbf{Remove $(1,1)$ from the Candidate Set $N$:} - \item \textbf{Mark $(1,1)$ as Visited:} + \begin{alignat*}{2} + N & = \{\} \\ + \end{alignat*} - \CoordinateSystemForDBSCAN{(1,1), (1,4), (2,1), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,1), (1,2)}{(1,1)}{(1,2)}{}{} + \item \textbf{Mark $(1,1)$ as Visited:} - \item \textbf{Add $(1,1)$ to Cluster $0$:} + \CoordinateSystemForDBSCAN{(1,4), (2,1), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,1), (1,2)}{(1,1)}{(1,2)}{}{} - \CoordinateSystemForDBSCAN{(1,1), (1,4), (2,1), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,1), (1,2)}{}{(1,2), (1,1)}{}{} + \item \textbf{Add $(1,1)$ to Cluster $0$:} - \item \textbf{Check if $(1,1)$ is a Core Point:} + \CoordinateSystemForDBSCAN{(1,4), (2,1), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,1), (1,2)}{}{(1,2), (1,1)}{}{} - If there are at least $2$ points in the $\varepsilon$-neighborhood of $(1,1)$, $(1,1)$ is a core point: + \item \textbf{Check if $(1,1)$ is a Core Point:} - \begin{alignat*}{2} - Distance_{(1,1)\leftrightarrow(1,1)} & = \sqrt{(1-1)^2+(1-1)^2} = 0 \\ - Distance_{(1,1)\leftrightarrow(1,2)} & = \sqrt{(1-1)^2+(1-2)^2} = 1 \\ - Distance_{(1,1)\leftrightarrow(2,1)} & = \sqrt{(1-2)^2+(1-1)^2} = 1 \\ - \end{alignat*} + If there are at least $2$ points in the $\varepsilon$-neighborhood of $(1,1)$, $(1,1)$ is a core point: - Therefore, $(1,1)$ 
is a core point. + \begin{alignat*}{2} + Distance_{(1,1)\leftrightarrow(1,1)} & = \sqrt{(1-1)^2+(1-1)^2} = 0 \\ + Distance_{(1,1)\leftrightarrow(1,2)} & = \sqrt{(1-1)^2+(1-2)^2} = 1 \\ + Distance_{(1,1)\leftrightarrow(2,1)} & = \sqrt{(1-2)^2+(1-1)^2} = 1 \\ + \end{alignat*} - \dotfill + Therefore, $(1,1)$ is a core point. - Shown in the coordinate system: + \dotfill - \CoordinateSystemWithCircle{(1,1), (1,2), (2,1)}{(1,4), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,1)}{1} + Shown in the coordinate system: - \dotfill + \CoordinateSystemWithCircle{(1,1), (1,2), (2,1)}{(1,4), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,1)}{1} - \item \textbf{Add Points in the $\varepsilon$-Neighborhood of $(1,1)$ to the Candidate Set $N$:} + \dotfill - Only $(2,1)$ is an unvisited point in the $\varepsilon$-neighborhood of $(1,1)$: + \item \textbf{Add Points in the $\varepsilon$-Neighborhood of $(1,1)$ to the Candidate Set $N$:} - \begin{alignat*}{2} - Distance_{(1,1)\leftrightarrow(2,1)} & = \sqrt{(1-2)^2+(1-1)^2} = 1 \\ - \end{alignat*} + Only $(2,1)$ is an unvisited point in the $\varepsilon$-neighborhood of $(1,1)$: - So only $(2,1)$ is added to the candidate set $N$: + \begin{alignat*}{2} + Distance_{(1,1)\leftrightarrow(2,1)} & = \sqrt{(1-2)^2+(1-1)^2} = 1 \\ + \end{alignat*} - \begin{alignat*}{2} - N & = \{(2,1)\} \\ - \end{alignat*} - \end{enumerate} + So only $(2,1)$ is added to the candidate set $N$: - \item \textbf{For $(2,1)$:} + \begin{alignat*}{2} + N & = \{(2,1)\} \\ + \end{alignat*} + \end{enumerate} - \begin{enumerate} - \item \textbf{Remove $(2,1)$ from the Candidate Set $N$:} + \item \textbf{For $(2,1)$:} - \begin{alignat*}{2} - N & = \{\} \\ - \end{alignat*} + \begin{enumerate} + \item \textbf{Remove $(2,1)$ from the Candidate Set $N$:} - \item \textbf{Mark $(2,1)$ as Visited:} + \begin{alignat*}{2} + N & = \{\} \\ + \end{alignat*} - \CoordinateSystemForDBSCAN{(1,1), (1,4), (2,1), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,1), (1,2), (2,1)}{(2,1)}{(1,1), 
(1,2)}{}{} + \item \textbf{Mark $(2,1)$ as Visited:} - \item \textbf{Add $(2,1)$ to Cluster $0$:} + \CoordinateSystemForDBSCAN{(1,4), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,1), (1,2), (2,1)}{(2,1)}{(1,1), (1,2)}{}{} - \CoordinateSystemForDBSCAN{(1,1), (1,4), (2,1), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,1), (1,2), (2,1)}{}{(1,1), (1,2), (2,1)}{}{} + \item \textbf{Add $(2,1)$ to Cluster $0$:} - \item \textbf{Check if $(2,1)$ is a Core Point:} + \CoordinateSystemForDBSCAN{(1,4), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,1), (1,2), (2,1)}{}{(1,1), (1,2), (2,1)}{}{} - If there are at least $2$ points in the $\varepsilon$-neighborhood of $(2,1)$, $(2,1)$ is a core point: + \item \textbf{Check if $(2,1)$ is a Core Point:} - \begin{alignat*}{2} - Distance_{(2,1)\leftrightarrow(1,1)} & = \sqrt{(2-1)^2+(1-1)^2} = 1 \\ - Distance_{(2,1)\leftrightarrow(2,1)} & = \sqrt{(2-2)^2+(1-1)^2} = 0 \\ - \end{alignat*} + If there are at least $2$ points in the $\varepsilon$-neighborhood of $(2,1)$, $(2,1)$ is a core point: - Therefore, $(2,1)$ is a core point. + \begin{alignat*}{2} + Distance_{(2,1)\leftrightarrow(1,1)} & = \sqrt{(2-1)^2+(1-1)^2} = 1 \\ + Distance_{(2,1)\leftrightarrow(2,1)} & = \sqrt{(2-2)^2+(1-1)^2} = 0 \\ + \end{alignat*} - \dotfill + Therefore, $(2,1)$ is a core point. - Shown in the coordinate system: + \dotfill - \CoordinateSystemWithCircle{(1,1), (2,1)}{(1,2), (1,4), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(2,1)}{1} + Shown in the coordinate system: - \dotfill + \CoordinateSystemWithCircle{(1,1), (2,1)}{(1,2), (1,4), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(2,1)}{1} - \item \textbf{Add Points in the $\varepsilon$-Neighborhood of $(2,1)$ to the Candidate Set $N$:} + \dotfill - There are no unvisited points in the $\varepsilon$-neighborhood of $(2,1)$. + \item \textbf{Add Points in the $\varepsilon$-Neighborhood of $(2,1)$ to the Candidate Set $N$:} - Thus, the candidate set $N$ remains empty. 
- \end{enumerate} + There are no unvisited points in the $\varepsilon$-neighborhood of $(2,1)$. - \item \textbf{Stop Iteration for Cluster $0$} - \end{enumerate} - \end{enumerate} + Thus, the candidate set $N$ remains empty. + \end{enumerate} + + \item \textbf{Stop the Iteration:} + + The candidate set $N$ is empty, thus the iteration is stopped. + \end{enumerate} +\end{enumerate} + +\item \textbf{Select $(1,4)$ as Random Point:} + +Every unvisited point can be selected as the next point to visit. In this sample solution, we decided to use $(1,4)$ as the next point: + +\begin{enumerate} + \item \textbf{Mark $(1,4)$ as Visited:} + + \CoordinateSystemForDBSCAN{(2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,1), (1,2), (1,4), (2,1)}{(1,4)}{(1,1), (1,2), (2,1)}{}{} + + \item \textbf{Check if $(1,4)$ is a Core Point:} + + If there are at least $2$ points in the $\varepsilon$-neighborhood of $(1,4)$, $(1,4)$ is a core point: + + \begin{alignat*}{2} + Distance_{(1,4)\leftrightarrow(1,4)} & = \sqrt{(1-1)^2+(4-4)^2} = 0 \\ + \end{alignat*} + + Only $(1,4)$ is in the $\varepsilon$-neighborhood of $(1,4)$, thus $(1,4)$ is not a core point. + + \dotfill + + Shown in the coordinate system: + + \CoordinateSystemWithCircle{(1,4)}{(1,1), (1,2), (2,1), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,4)}{1} + + \dotfill + + \item \textbf{Mark $(1,4)$ as Noise:} + + Since $(1,4)$ is not a core point, it is marked as noise: + + \CoordinateSystemForDBSCAN{(2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(1,1), (1,2), (1,4), (2,1)}{}{(1,1), (1,2), (2,1)}{}{(1,4)} +\end{enumerate} + +\item \textbf{Select $(4,4)$ as Random Point:} + +Every unvisited point can be selected as the next point to visit. 
In this sample solution, we decided to use $(4,4)$ as the next point: + +\begin{enumerate} + \item \textbf{Mark $(4,4)$ as Visited:} + + \CoordinateSystemForDBSCAN{(2,3), (3,2), (3,4), (4,1), (4,3)}{(1,1), (1,2), (1,4), (2,1), (4,4)}{(4,4)}{(1,1), (1,2), (2,1)}{}{(1,4)} + + \item \textbf{Check if $(4,4)$ is a Core Point:} + + If there are at least $2$ points in the $\varepsilon$-neighborhood of $(4,4)$, $(4,4)$ is a core point: + + \begin{alignat*}{2} + Distance_{(4,4)\leftrightarrow(4,4)} & = \sqrt{(4-4)^2+(4-4)^2} = 0 \\ + Distance_{(4,4)\leftrightarrow(3,4)} & = \sqrt{(4-3)^2+(4-4)^2} = 1 \\ + Distance_{(4,4)\leftrightarrow(4,3)} & = \sqrt{(4-4)^2+(4-3)^2} = 1 \\ + \end{alignat*} + + Therefore, $(4,4)$ is a core point. + + \dotfill + + Shown in the coordinate system: + + \CoordinateSystemWithCircle{(3,4), (4,3), (4,4)}{(1,1), (1,2), (1,4), (2,1), (2,3), (3,2), (4,1)}{(4,4)}{1} + + \dotfill + + + \item \textbf{Create a New Cluster and Add $(4,4)$:} + + In this sample solution, we simply name this cluster "$1$": + + \CoordinateSystemForDBSCAN{(2,3), (3,2), (3,4), (4,1), (4,3)}{(1,1), (1,2), (1,4), (2,1), (4,4)}{}{(1,1), (1,2), (2,1)}{(4,4)}{(1,4)} + + \item \textbf{Add Points in the $\varepsilon$-Neighborhood of $(4,4)$ to the Candidate Set $N$:} + + Both $(3,4)$ and $(4,3)$ are unvisited points in the $\varepsilon$-neighborhood of $(4,4)$: + + \begin{alignat*}{2} + Distance_{(4,4)\leftrightarrow(3,4)} & = \sqrt{(4-3)^2+(4-4)^2} = 1 \\ + Distance_{(4,4)\leftrightarrow(4,3)} & = \sqrt{(4-4)^2+(4-3)^2} = 1 \\ + \end{alignat*} + + Thus, both $(3,4)$ and $(4,3)$ are added to the candidate set $N$: + + \begin{alignat*}{2} + N & = \{(3,4), (4,3)\} \\ + \end{alignat*} + + \item \textbf{Iterate through the Candidate Set $N$ (for Cluster $1$):} + + The candidate set $N$ is iterated through until it is empty. 
All points visited in this iteration will be added to cluster $1$, since this iteration was started with point $(4,4)$: + + \begin{enumerate} + \item \textbf{For $(3,4)$:} + + \begin{enumerate} + \item \textbf{Remove $(3,4)$ from the Candidate Set $N$:} + + \begin{alignat*}{2} + N & = \{(4,3)\} \\ + \end{alignat*} + + \item \textbf{Mark $(3,4)$ as Visited:} + + \CoordinateSystemForDBSCAN{(2,3), (3,2), (4,1), (4,3)}{(1,1), (1,2), (1,4), (2,1), (3,4), (4,4)}{(3,4)}{(1,1), (1,2), (2,1)}{(4,4)}{(1,4)} + + \item \textbf{Add $(3,4)$ to Cluster $1$:} + + \CoordinateSystemForDBSCAN{(2,3), (3,2), (4,1), (4,3)}{(1,1), (1,2), (1,4), (2,1), (3,4), (4,4)}{}{(1,1), (1,2), (2,1)}{(3,4), (4,4)}{(1,4)} + + \item \textbf{Check if $(3,4)$ is a Core Point:} + + If there are at least $2$ points in the $\varepsilon$-neighborhood of $(3,4)$, $(3,4)$ is a core point: + + \begin{alignat*}{2} + Distance_{(3,4)\leftrightarrow(3,4)} & = \sqrt{(3-3)^2+(4-4)^2} = 0 \\ + Distance_{(3,4)\leftrightarrow(4,4)} & = \sqrt{(3-4)^2+(4-4)^2} = 1 \\ + \end{alignat*} + + Therefore, $(3,4)$ is a core point. + + \dotfill + + Shown in the coordinate system: + + \CoordinateSystemWithCircle{(3,4), (4,4)}{(1,1), (1,2), (1,4), (2,1), (2,3), (3,2), (4,1), (4,3)}{(3,4)}{1} + + \dotfill + + \item \textbf{Add Points in the $\varepsilon$-Neighborhood of $(3,4)$ to the Candidate Set $N$:} + + There are no unvisited points in the $\varepsilon$-neighborhood of $(3,4)$. + + Thus, the candidate set $N$ remains the same. 
+ \end{enumerate} + + \item \textbf{For $(4,3)$:} + + \begin{enumerate} + \item \textbf{Remove $(4,3)$ from the Candidate Set $N$:} + + \begin{alignat*}{2} + N & = \{\} \\ + \end{alignat*} + + \item \textbf{Mark $(4,3)$ as Visited:} + + \CoordinateSystemForDBSCAN{(2,3), (3,2), (4,1)}{(1,1), (1,2), (1,4), (2,1), (3,4), (4,3), (4,4)}{(4,3)}{(1,1), (1,2), (2,1)}{(3,4), (4,4)}{(1,4)} + + \item \textbf{Add $(4,3)$ to Cluster $1$:} + + \CoordinateSystemForDBSCAN{(2,3), (3,2), (4,1)}{(1,1), (1,2), (1,4), (2,1), (3,4), (4,3), (4,4)}{}{(1,1), (1,2), (2,1)}{(3,4), (4,3), (4,4)}{(1,4)} + + \item \textbf{Check if $(4,3)$ is a Core Point:} + + If there are at least $2$ points in the $\varepsilon$-neighborhood of $(4,3)$, $(4,3)$ is a core point: + + \begin{alignat*}{2} + Distance_{(4,3)\leftrightarrow(4,3)} & = \sqrt{(4-4)^2+(3-3)^2} = 0 \\ + Distance_{(4,3)\leftrightarrow(4,4)} & = \sqrt{(4-4)^2+(3-4)^2} = 1 \\ + \end{alignat*} + + Therefore, $(4,3)$ is a core point. + + \dotfill + + Shown in the coordinate system: + + \CoordinateSystemWithCircle{(4,3), (4,4)}{(1,1), (1,2), (1,4), (2,1), (2,3), (3,2), (3,4), (4,1)}{(4,3)}{1} + + \dotfill + + \item \textbf{Add Points in the $\varepsilon$-Neighborhood of $(4,3)$ to the Candidate Set $N$:} + + There are no unvisited points in the $\varepsilon$-neighborhood of $(4,3)$. + + Thus, the candidate set $N$ remains empty. + + \end{enumerate} + + \item \textbf{Stop the Iteration:} + + The candidate set $N$ is empty, thus the iteration is stopped. + \end{enumerate} + + \item \textbf{Select $(2,3)$ as Random Point:} + + Every unvisited point can be selected as the next point to visit. 
In this sample solution, we decided to use $(2,3)$ as the next point: + + \begin{enumerate} + \item \textbf{Mark $(2,3)$ as Visited:} + + \CoordinateSystemForDBSCAN{(3,2), (4,1)}{(1,1), (1,2), (1,4), (2,1), (2,3), (3,4), (4,3), (4,4)}{(2,3)}{(1,1), (1,2), (2,1)}{(3,4), (4,3), (4,4)}{(1,4)} + + \item \textbf{Check if $(2,3)$ is a Core Point:} + + If there are at least $2$ points in the $\varepsilon$-neighborhood of $(2,3)$, $(2,3)$ is a core point: + + \begin{alignat*}{2} + Distance_{(2,3)\leftrightarrow(2,3)} & = \sqrt{(2-2)^2+(3-3)^2} = 0 \\ + \end{alignat*} + + Only $(2,3)$ is in the $\varepsilon$-neighborhood of $(2,3)$, thus $(2,3)$ is not a core point. + + \dotfill + + Shown in the coordinate system: + + \CoordinateSystemWithCircle{(2,3)}{(1,1), (1,2), (1,4), (2,1), (3,2), (3,4), (4,1), (4,3), (4,4)}{(2,3)}{1} + + \dotfill + + \item \textbf{Mark $(2,3)$ as Noise:} + + Since $(2,3)$ is not a core point, it is marked as noise: + + \CoordinateSystemForDBSCAN{(3,2), (4,1)}{(1,1), (1,2), (1,4), (2,1), (2,3), (3,4), (4,3), (4,4)}{}{(1,1), (1,2), (2,1)}{(3,4), (4,3), (4,4)}{(1,4), (2,3)} + \end{enumerate} + + \item \textbf{Select $(3,2)$ as Random Point:} + + Every unvisited point can be selected as the next point to visit. In this sample solution, we decided to use $(3,2)$ as the next point: + + \begin{enumerate} + \item \textbf{Mark $(3,2)$ as Visited:} + + \CoordinateSystemForDBSCAN{(4,1)}{(1,1), (1,2), (1,4), (2,1), (2,3), (3,2), (3,4), (4,3), (4,4)}{(3,2)}{(1,1), (1,2), (2,1)}{(3,4), (4,3), (4,4)}{(1,4), (2,3)} + + \item \textbf{Check if $(3,2)$ is a Core Point:} + + If there are at least $2$ points in the $\varepsilon$-neighborhood of $(3,2)$, $(3,2)$ is a core point: + + \begin{alignat*}{2} + Distance_{(3,2)\leftrightarrow(3,2)} & = \sqrt{(3-3)^2+(2-2)^2} = 0 \\ + \end{alignat*} + + Only $(3,2)$ is in the $\varepsilon$-neighborhood of $(3,2)$, thus $(3,2)$ is not a core point. 
+ + \dotfill + + Shown in the coordinate system: + + \CoordinateSystemWithCircle{(3,2)}{(1,1), (1,2), (1,4), (2,1), (2,3), (3,4), (4,1), (4,3), (4,4)}{(3,2)}{1} + + \dotfill + + \item \textbf{Mark $(3,2)$ as Noise:} + + Since $(3,2)$ is not a core point, it is marked as noise: + + \CoordinateSystemForDBSCAN{(4,1)}{(1,1), (1,2), (1,4), (2,1), (2,3), (3,2), (3,4), (4,3), (4,4)}{}{(1,1), (1,2), (2,1)}{(3,4), (4,3), (4,4)}{(1,4), (2,3), (3,2)} + + \end{enumerate} + + \item \textbf{Select $(4,1)$ as Random Point:} + + Every unvisited point can be selected as the next point to visit. In this sample solution, we decided to use $(4,1)$ as the next point: + + \begin{enumerate} + \item \textbf{Mark $(4,1)$ as Visited:} + \CoordinateSystemForDBSCAN{}{(1,1), (1,2), (1,4), (2,1), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{(4,1)}{(1,1), (1,2), (2,1)}{(3,4), (4,3), (4,4)}{(1,4), (2,3), (3,2)} + + \item \textbf{Check if $(4,1)$ is a Core Point:} + + If there are at least $2$ points in the $\varepsilon$-neighborhood of $(4,1)$, $(4,1)$ is a core point: + + \begin{alignat*}{2} + Distance_{(4,1)\leftrightarrow(4,1)} & = \sqrt{(4-4)^2+(1-1)^2} = 0 \\ + \end{alignat*} + + Only $(4,1)$ is in the $\varepsilon$-neighborhood of $(4,1)$, thus $(4,1)$ is not a core point. + + \dotfill + + Shown in the coordinate system: + + \CoordinateSystemWithCircle{(4,1)}{(1,1), (1,2), (1,4), (2,1), (2,3), (3,2), (3,4), (4,3), (4,4)}{(4,1)}{1} + + \dotfill + + \item \textbf{Mark $(4,1)$ as Noise:} + + Since $(4,1)$ is not a core point, it is marked as noise: + + \CoordinateSystemForDBSCAN{}{(1,1), (1,2), (1,4), (2,1), (2,3), (3,2), (3,4), (4,1), (4,3), (4,4)}{}{(1,1), (1,2), (2,1)}{(3,4), (4,3), (4,4)}{(1,4), (2,3), (3,2), (4,1)} + \end{enumerate} + + \item \textbf{Stop the Algorithm:} + + All points have been visited, thus the algorithm is stopped. 
+ + The final results are: + + \begin{alignat*}{2} + \text{Cluster }{0} & : \{(1,1), (1,2), (2,1)\} \\ + \text{Cluster }{1} & : \{(3,4), (4,3), (4,4)\} \\ + \text{Noise} & : \{(1,4), (2,3), (3,2), (4,1)\} \\ + \end{alignat*} +\end{enumerate} \end{solution} diff --git a/exercise/5-Clustering/Clustering-in-Python.ipynb b/exercise/5-Clustering/Clustering-in-Python.ipynb new file mode 100644 index 0000000..666a17f --- /dev/null +++ b/exercise/5-Clustering/Clustering-in-Python.ipynb @@ -0,0 +1,882 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "specification" + ] + }, + "source": [ + "# Exercise 3: Clustering in Python\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Imagine the following scenario:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "specification" + ] + }, + "source": [ + "*You are once again a Data Scientist at the fictitious company Adventure Works GmbH. After your successful analyses on the topics of Frequent Patterns and Classification, your bosses now assign you the task of dividing the products into groups of different profitability.*\n", + "\n", + "*In discussions with your business administration colleagues, you learn that the decisive metrics here are probably the number of products actually sold and the profit per product (the sales price minus the production costs).*\n", + "\n", + "*The colleagues from IT tell you that you will probably find the required data in the tables `Product` and `SalesOrderDetail`.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "specification" + ] + }, + "outputs": [], + "source": [ + "# Import required libraries\n", + "import os\n", + "import tempfile\n", + "import sqlite3\n", + "import urllib.request\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "from matplotlib import pyplot as plt\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.cluster import DBSCAN\n", + 
"from sklearn.cluster import Birch\n", + "from sklearn.preprocessing import MinMaxScaler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "specification" + ] + }, + "outputs": [], + "source": [ + "# Create a temporary directory\n", + "dataset_folder = tempfile.mkdtemp()\n", + "\n", + "# Build path to database\n", + "database_path = os.path.join(dataset_folder, \"adventure-works.db\")\n", + "\n", + "# Get the database\n", + "urllib.request.urlretrieve(\n", + " \"https://github.com/FAU-CS6/KDD-Databases/raw/main/AdventureWorks/adventure-works.db\",\n", + " database_path,\n", + ")\n", + "\n", + "# Open connection to the adventure-works.db\n", + "connection = sqlite3.connect(database_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "specification" + ] + }, + "source": [ + "
\n", + "\n", + "**Task 1.1:** \n", + " \n", + "Cluster the products within the OLTP database of the fictitious Adventure Works GmbH according to their profitability. Furthermore, visualize the result.\n", + "\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the products based on their profitability (Code placeholder 01/10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the products based on their profitability (Code placeholder 02/10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the products based on their profitability (Code placeholder 03/10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the products based on their profitability (Code placeholder 04/10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the products based on their profitability (Code placeholder 05/10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the products based on their profitability (Code placeholder 06/10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the products based on their profitability (Code placeholder 07/10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the products based on their profitability (Code placeholder 08/10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the products based on their profitability (Code 
placeholder 09/10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the products based on their profitability (Code placeholder 10/10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ + "The first step in this task is obviously to read the data from the database into a DataFrame and to take a first look at the data at hand:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "solution" + ] + }, + "outputs": [], + "source": [ + "# Load Product into a DataFrame and display the first ten rows\n", + "product_df = pd.read_sql_query(\"SELECT * FROM Product\", connection)\n", + "product_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "solution" + ] + }, + "outputs": [], + "source": [ + "# Load SalesOrderDetail into a DataFrame and display the first ten rows\n", + "sales_order_detail_df = pd.read_sql_query(\"SELECT * FROM SalesOrderDetail\", connection)\n", + "sales_order_detail_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ + "While getting to know the data, it is important to identify the relevant attributes. In our case we need:\n", + "\n", + "- **Number of copies sold per product:** \n", + "The table `SalesOrderDetail` contains information on how many products have been sold within that single order (`OrderQty`). If we sum up the `OrderQty` per product we get the number of copies sold per product.\n", + "- **Average profit per copy sold (per product):** \n", + "First of all, it must be understood that the profit per sale can be calculated simply by subtracting the manufacturing cost (`StandardCost` in `Product`) from the actual selling price (`UnitPrice` in `SalesOrderDetail`). 
In this case, however, the profit should not be summed up for a product, but the average should be determined." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ + "These can either be created by preprocessing the initial DataFrames, or by using a simple SQL Statement for the aggregation. In this sample solution, we decided to do the latter:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "solution" + ] + }, + "outputs": [], + "source": [ + "# Load `ProductID`,`ProfitPerUnit` and `OverallOrderQty` into a DataFrame\n", + "product_overview_df = pd.read_sql_query(\n", + " \"SELECT p.ProductID, AVG(sod.UnitPrice - p.StandardCost) AS ProfitPerUnit, SUM(sod.OrderQty) AS OverallOrderQty FROM Product p, SalesOrderDetail sod WHERE p.ProductID = sod.ProductID GROUP BY p.ProductID\",\n", + " connection,\n", + ")\n", + "product_overview_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ + "Since the value range of `OverallOrderQty` goes from 4 to 8311 and the value range of `ProfitPerUnit` only goes from about -55 to about 1155, this dataset would not currently be a good fit for most clustering techniques. 
The `OverallOrderQty` would have a much higher influence in this case, which is why it makes sense to normalize the `product_overview_df` first:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "solution" + ] + }, + "outputs": [], + "source": [ + "# Normalize the product_overview_df\n", + "min_max_scaler = MinMaxScaler()\n", + "product_overview_df[\n", + " [\"ProfitPerUnit\", \"OverallOrderQty\"]\n", + "] = min_max_scaler.fit_transform(\n", + " product_overview_df[[\"ProfitPerUnit\", \"OverallOrderQty\"]]\n", + ")\n", + "product_overview_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ + "This preprocessed DataFrame can be used for clustering. However, the question is which clustering method should be used. We will focus on K-Means, DBSCAN and BIRCH. All of these methods have been presented in the lecture and are implemented in scikit-learn:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "solution" + ] + }, + "outputs": [], + "source": [ + "# Perform scikit-learn's K-means clustering on the dataset\n", + "kmeans = KMeans(n_clusters=6, n_init=\"auto\").fit(\n", + " product_overview_df[[\"ProfitPerUnit\", \"OverallOrderQty\"]]\n", + ")\n", + "\n", + "# Save the labels to a copy of product_overview_df to generate clustered_product_overview_df\n", + "clustered_product_overview_df = product_overview_df.copy()\n", + "clustered_product_overview_df[\"cluster\"] = kmeans.labels_\n", + "\n", + "# Print the result\n", + "plt.figure(figsize=(8, 8))\n", + "sns.scatterplot(\n", + " x=clustered_product_overview_df[\"ProfitPerUnit\"],\n", + " y=clustered_product_overview_df[\"OverallOrderQty\"],\n", + " hue=clustered_product_overview_df[\"cluster\"],\n", + " palette=\"deep\",\n", + ")\n", + "plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "tags": [ + "solution" + ] + }, + "outputs": [], + "source": [ + "# Perform scikit-learn's DBSCAN clustering on the dataset\n", + "dbscan = DBSCAN(eps=0.2, min_samples=5).fit(\n", + " product_overview_df[[\"ProfitPerUnit\", \"OverallOrderQty\"]]\n", + ")\n", + "\n", + "# Save the labels to a copy of the big_dataset to generate the equivalent of our clustered_big_dataset\n", + "clustered_product_overview_df = product_overview_df.copy()\n", + "clustered_product_overview_df[\"cluster\"] = dbscan.labels_\n", + "\n", + "# Print the result\n", + "plt.figure(figsize=(8, 8))\n", + "sns.scatterplot(\n", + " x=clustered_product_overview_df[\"ProfitPerUnit\"],\n", + " y=clustered_product_overview_df[\"OverallOrderQty\"],\n", + " hue=clustered_product_overview_df[\"cluster\"],\n", + " palette=\"deep\",\n", + ")\n", + "plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "solution" + ] + }, + "outputs": [], + "source": [ + "# Perform scikit-learn's BIRCH clustering on the dataset\n", + "birch = Birch(threshold=0.1, n_clusters=6).fit(\n", + " product_overview_df[[\"ProfitPerUnit\", \"OverallOrderQty\"]]\n", + ")\n", + "\n", + "# Save the labels to a copy of the big_dataset to generate the equivalent of our clustered_big_dataset\n", + "clustered_product_overview_df = product_overview_df.copy()\n", + "clustered_product_overview_df[\"cluster\"] = birch.labels_\n", + "\n", + "# Print the result\n", + "plt.figure(figsize=(8, 8))\n", + "sns.scatterplot(\n", + " x=clustered_product_overview_df[\"ProfitPerUnit\"],\n", + " y=clustered_product_overview_df[\"OverallOrderQty\"],\n", + " hue=clustered_product_overview_df[\"cluster\"],\n", + " palette=\"deep\",\n", + ")\n", + "plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ + "Of course, your 
fictitious bosses don't want to be presented with three different results from you. A clustering procedure should be chosen. For this purpose, it is useful to compare the results briefly:\n", + "\n", + "- **K-means and BIRCH:** \n", + "Both K-means and BIRCH produce quite similar results in this case. With our parameters we can interpret the found clusters the following way: \n", + " - Less interesting from a business perspective:\n", + " - The cluster that does not bring much profit per unit and that was not sold frequently.\n", + " - The cluster that does not bring much profit per unit, but was sold at least a little more often.\n", + " - The cluster that brings a little more profit per unit, but which was not sold frequently.\n", + " - Interesting from a business perspective:\n", + " - The cluster that does not bring much profit per unit, but which was sold at least extremely often.\n", + " - The cluster that is average both in terms of profit per unit and frequency of sales.\n", + " - The cluster that brings extremely much profit, but which was hardly sold.\n", + "\n", + "- **DBSCAN:** \n", + "With our parameters we can interpret the found clusters the following way:\n", + " - Less interesting from a business perspective:\n", + " - Merged into one cluster\n", + " - Interesting from a business perspective:\n", + " - The cluster that does not bring much profit per unit, but which was sold at least extremely often.\n", + " - The cluster that is average both in terms of profit per unit and frequency of sales.\n", + " - The cluster that brings extremely much profit, but which was hardly sold.\n", + " \n", + "Since DBSCAN merges the less interesting products into a cluster, it can be said that DBSCAN is probably the better choice here. This way, the focus in a presentation can be placed on the economically more interesting product clusters. 
\n", + "\n", + "However, it is of course not a big problem in this case if K-means or BIRCH are used, since one can at least argue that here a better distinction is made between completely uninteresting and at least somewhat interesting products." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "specification" + ] + }, + "source": [ + "Since the first scenario was quite straightforward, imagine a second scenario:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "specification" + ] + }, + "source": [ + "*As a Data Scientist who has already solved multiple different tasks for his bosses in the fictitious Adventure Works GmbH, you are immediately assigned another task. Your bosses want to make the sales team more efficient by assigning customers with similar product interests to the same employee.*\n", + "\n", + "*In order to be able to carry out this reassignment, you are tasked with dividing the customers into 16 clusters (there are 16 sales persons in the company). This classification is to be based on the products that the customers have ordered in the past.*\n", + "\n", + "*Via the IT department you learn that the customers can probably be found in the table `Customer`. You will need to join the table `SalesOrderHeader` and then the table `SalesOrderDetail` to get information on the ordered ProductIDs per customer.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\n", + "**Task 2:**\n", + " \n", + "Group the customers into 16 clusters by using K-means. You do not need to visualize the result.\n", + "\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the customers based on their interests (Code placeholder 01/10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the customers based on their interests (Code placeholder 02/10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the customers based on their interests (Code placeholder 03/10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the customers based on their interests (Code placeholder 04/10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the customers based on their interests (Code placeholder 05/10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the customers based on their interests (Code placeholder 06/10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the customers based on their interests (Code placeholder 07/10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the customers based on their interests (Code placeholder 08/10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the customers based on their interests (Code placeholder 09/10)" + ] + }, 
+ { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "student" + ] + }, + "outputs": [], + "source": [ + "# Cluster the customers based on their interests (Code placeholder 10/10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ + "The first step is again to load the data into a DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "solution" + ] + }, + "outputs": [], + "source": [ + "customer_purchases_df = pd.read_sql_query(\n", + " \"SELECT c.CustomerID, sod.ProductID, sod.OrderQty FROM Customer c JOIN SalesOrderHeader soh ON c.CustomerID = soh.CustomerID JOIN SalesOrderDetail sod ON sod.SalesOrderID = soh.SalesOrderID\",\n", + " connection,\n", + " index_col=\"CustomerID\",\n", + ")\n", + "customer_purchases_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ + "After taking a first look at the available data, you should realize one thing above all:\n", + "\n", + "The attribute `ProductID` is numeric. If the attribute were to be fed directly to the clustering, the numerical distance between two IDs would probably be used to calculate a distance between customers. \n", + "\n", + "A customer who bought the product with the ID `707` would thus be more dissimilar to a customer who bought the product with the ID `879` than to a customer who bought the product with the ID `712`. Usually, however, the distribution of product IDs is not based on similarity, but simply on the order in which the products were added to the catalog.\n", + "\n", + "So for this task, we first need to put the data into a different format. Basically, we need to compare each customer's interest in each product individually. 
\n", + "\n", + "We have to choose between two different possibilities:\n", + "\n", + "- **Concept 1:** Determine for each product if the customer has purchased the product (binary scale):\n", + "\n", + "| | Product 1 | Product 2 | Product 3 | Product 4 |\n", + "|------------|-----------|-----------|-----------|-----------|\n", + "| Customer 1 | 1 | 0 | 0 | 0 |\n", + "| Customer 2 | 1 | 1 | 0 | 0 |\n", + "| Customer 3 | 0 | 1 | 1 | 1 |\n", + "\n", + "\n", + "- **Concept 2:** Determine sum of copies purchased for each product purchased (continuous scale): \n", + "\n", + "| | Product 1 | Product 2 | Product 3 | Product 4 |\n", + "|------------|-----------|-----------|-----------|-----------|\n", + "| Customer 1 | 236 | 0 | 0 | 0 |\n", + "| Customer 2 | 1 | 199 | 0 | 0 |\n", + "| Customer 3 | 0 | 199 | 5 | 1 |\n", + "\n", + "In the first variant, `Customer 1` and `Customer 2` would be most similar. In the second variant, the interests of `Customer 2` and `Customer 3` would be most similar.\n", + "\n", + "Since both options have advantages and disadvantages, but we cannot discuss with our fictitious bosses which option is better for our company, we had to decide for ourselves and in this sample solution we simply have opted for concept 2." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ + "Of course, we also need the quantity of all purchases in the DataFrame, which we do using SQL:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "solution" + ] + }, + "outputs": [], + "source": [ + "# Load a DataFrame that contains the total copies purchased per customer and product\n", + "customer_interests_df = pd.read_sql_query(\n", + " \"SELECT c.CustomerID, sod.ProductID, SUM(sod.OrderQty) AS TotalOrderQty FROM Customer c JOIN SalesOrderHeader soh ON c.CustomerID = soh.CustomerID JOIN SalesOrderDetail sod ON sod.SalesOrderID = soh.SalesOrderID GROUP BY c.CustomerID, sod.ProductID\",\n", + " connection,\n", + ")\n", + "customer_interests_df.tail(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ + "This DataFrame can then be pivoted in order to arrive at a DataFrame corresponding to the above concept:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "solution" + ] + }, + "outputs": [], + "source": [ + "# Pivot the DataFrame\n", + "customer_interests_pivot_df = customer_interests_df.pivot(\n", + " index=\"CustomerID\", columns=\"ProductID\", values=\"TotalOrderQty\"\n", + ").fillna(0)\n", + "customer_interests_pivot_df.tail(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ + "It is now prepared to be used in clustering:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "solution" + ] + }, + "outputs": [], + "source": [ + "# Perform scikit-learn's K-means clustering on the dataset\n", + "kmeans = KMeans(n_clusters=16, n_init=\"auto\").fit(customer_interests_pivot_df)\n", + "\n", + "# Save the labels to a copy of the big_dataset to generate the equivalent of our clustered_big_dataset\n", + 
"clustered_customer_interests_pivot_df = customer_interests_pivot_df.copy()\n", + "clustered_customer_interests_pivot_df[\"cluster\"] = kmeans.labels_\n", + "\n", + "# Print the resulting DataFrame\n", + "clustered_customer_interests_pivot_df.tail(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ + "Even if this leads to a clustering result, you might want to take a look at the value distribution in these clusters:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "solution" + ] + }, + "outputs": [], + "source": [ + "# Get the count of customers per cluster\n", + "clustered_customer_interests_pivot_df[[\"cluster\"]].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "solution" + ] + }, + "source": [ + "A very unbalanced distribution can be seen, indicating that these clusters should definitely not be used in this way to assign customers to the 16 sales persons. If you try the different clustering methods implemented in scikit-learn, you will notice that none of the methods changes this imbalance decisively. \n", + "\n", + "This is simply because clustering is not designed to achieve (roughly) equal cluster sizes. There are ideas here on how to get around this (e.g., calculate significantly more clusters and then merge each of these with neighbors until about the required size is reached), but these new groups would then not necessarily only contain customers with similar interests. \n", + "\n", + "It would make more sense here to approach the fictitious bosses again and tell them that similarity of interests is probably not the best criterion for dividing customers among sales persons. \n", + "\n", + "Since it is of course a shame to end an exercise sheet with a perceived failure: \n", + "There are some interesting things that can be concluded from the identified clusters. 
We now know that the majority of the customer base seems to share similar interests. This can be pitched to the management to further specialize the focus of the company. Even failures in data science sometimes contain new insights, you just have to be open enough to discover them." + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "interpreter": { + "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/exercise/5-Clustering/requirements.txt b/exercise/5-Clustering/requirements.txt new file mode 100644 index 0000000..bc8bfa3 --- /dev/null +++ b/exercise/5-Clustering/requirements.txt @@ -0,0 +1,8 @@ +pandas==1.5.3 +seaborn==0.12.2 +numpy==1.26.1 +scikit-learn==1.2.2 +matplotlib==3.7.1 + +ipython==8.11.0 +jupyterlab==3.6.2