From a80183bc8b8681865392568b798357b10f4e253d Mon Sep 17 00:00:00 2001
From: Henriquelay <henriquelayber@gmail.com>
Date: Mon, 15 Mar 2021 05:01:28 -0300
Subject: [PATCH] Add distance set, Kruskal grouping and typed union-find

---
 Makefile                     |  2 ++
 lib/data.h                   | 15 ++++-----
 lib/distances.h              | 31 +++++++++++++++++
 lib/{fileReader.h => file.h} |  0
 lib/graph.h                  | 15 +++++++++
 lib/unionFind.h              | 12 +++----
 src/data.c                   | 35 ++++++++++---------
 src/distances.c              | 65 ++++++++++++++++++++++++++++++++++++
 src/{fileReader.c => file.c} |  2 +-
 src/graph.c                  | 45 +++++++++++++++++++++++++
 src/main.c                   | 33 ++++++++++--------
 src/unionFind.c              | 39 +++++++++++-----------
 12 files changed, 231 insertions(+), 63 deletions(-)
 create mode 100644 lib/distances.h
 rename lib/{fileReader.h => file.h} (100%)
 create mode 100644 lib/graph.h
 create mode 100644 src/distances.c
 rename src/{fileReader.c => file.c} (98%)
 create mode 100644 src/graph.c

diff --git a/Makefile b/Makefile
index 468c333..804c058 100644
--- a/Makefile
+++ b/Makefile
@@ -31,6 +31,8 @@ $(BUILD_DIR)/%.c.o: %.c
 clean:
 	$(RM) */*.o */*.d
 
+run: $(TARGET_EXEC)  # runs the executable from the build directory, passing ARGS through
+	$(BUILD_DIR)/$(TARGET_EXEC) $(ARGS)
 
 valzin: $(TARGET_EXEC)
 	$(VALGRIND) $(VALZIN_FLAGS) $(FLAGS) $(BUILD_DIR)/$(TARGET_EXEC) $(ARGS)
diff --git a/lib/data.h b/lib/data.h
index e93c4d0..133ac87 100644
--- a/lib/data.h
+++ b/lib/data.h
@@ -14,27 +14,24 @@
 #ifndef _DATA_H_
 #define _DATA_H_
 
-#define _GNU_SOURCE // Needed to use qsort_r when not using c11 or gnu89/gnu99
-#include <math.h>
-
-#include "./fileReader.h"
-#include "./unionFind.h"
+#include "./file.h"
 
 // "Rows"
-typedef struct sample {
+typedef struct sample_t {
     char* id;
     long double* features;
+    size_t index;  // position of this sample in its dataSet's samples array (assigned by loadData)
 } sample_t;
 
 // "Lines" of rows
-typedef struct dataSet {
+typedef struct dataSet_t {  // samples holds nElements entries, each with nFeatures features
     size_t nElements;
     size_t nFeatures;
     sample_t* samples;
 } dataSet_t;
 
-
-dataSet_t* loadData(FILE* file, const char *separator);
+dataSet_t* initDataSet(size_t nFeatures, size_t nElements);
+dataSet_t* loadData(FILE* file, const char* separator);
 void printSample(const sample_t* sample, const size_t* nFeatures);
 void printDataSet(dataSet_t* dataSet);
 void destroySample(sample_t* sample);
diff --git a/lib/distances.h b/lib/distances.h
new file mode 100644
index 0000000..d5a84ed
--- /dev/null
+++ b/lib/distances.h
@@ -0,0 +1,31 @@
+/**
+    * This library handles distance calculations and related matters.
+**/
+
+#ifndef _DISTS_H_
+#define _DISTS_H_
+
+#include <math.h>
+
+#include "./data.h"
+
+// One pairwise distance (a "row" of the distance set)
+typedef struct distanceSample_t {
+    char* id;  // NOTE(review): never assigned in src/distances.c; confirm whether it is still needed
+    long double distance;
+    // The two endpoints of the pair; calculateDistances points them into the source dataSet's samples
+    sample_t  *from, *to;
+} distanceSample_t;
+
+// Collection of pairwise distances ("lines" of rows)
+typedef struct distanceDataSet_t {
+    size_t nElements;  // number of entries in samples (one per pair of points)
+    size_t depth;      // number of points the distances were computed from
+    distanceSample_t* samples;
+} distanceDataSet_t;
+
+distanceDataSet_t* calculateDistances(dataSet_t* points);
+void printDistanceSet(distanceDataSet_t* dataSet);
+void destroyDistanceDataSet(distanceDataSet_t* dataSet);
+
+#endif
diff --git a/lib/fileReader.h b/lib/file.h
similarity index 100%
rename from lib/fileReader.h
rename to lib/file.h
diff --git a/lib/graph.h b/lib/graph.h
new file mode 100644
index 0000000..bf8ea11
--- /dev/null
+++ b/lib/graph.h
@@ -0,0 +1,15 @@
+/**
+* This library handles distance sets as if they were graphs. Any graph algorithm should live here.
+**/
+
+#ifndef _GRAPH_H_
+#define _GRAPH_H_
+
+#define _GNU_SOURCE // Needed to use qsort_r when not using c11 or gnu89/gnu99. NOTE(review): graph.c currently calls plain qsort; confirm this is still needed
+
+#include "./data.h"
+#include "./unionFind.h"  // also brings in distances.h, which declares distanceDataSet_t
+
+union_t* kruskal(distanceDataSet_t* dataSet, size_t groupsNumber);  // sorts dataSet->samples in place
+
+#endif
diff --git a/lib/unionFind.h b/lib/unionFind.h
index 9cd1c57..59091dd 100644
--- a/lib/unionFind.h
+++ b/lib/unionFind.h
@@ -4,19 +4,19 @@
 #include <stdlib.h>
 #include <stdio.h>
 
+#include "./distances.h"
+
 typedef struct union_t {
     size_t* array;
     // Array of sizes to calculate weight and balance the insertion
     size_t* size;
-    // Number of elements on array
-    size_t arraySize;
     // Tie-in to elem
-    void *elem;
+    distanceSample_t *sample;  // points into the distance-sample array handed to UF_init
 } union_t;
 
-union_t* UF_init(const size_t size, void** elemArray);
-union_t* UF_destroy(union_t* unionStruct);
+union_t* UF_init(const size_t size, distanceSample_t* samples);
+void UF_destroy(union_t* unionStruct);
 size_t UF_find(union_t* unionStruct, size_t index);
-void UF_union(union_t* unionStruct, const size_t p, const size_t q);
+char UF_union(union_t* unionStruct, const size_t p, const size_t q);
 
 #endif
diff --git a/src/data.c b/src/data.c
index 79bdc14..b8d5616 100644
--- a/src/data.c
+++ b/src/data.c
@@ -2,31 +2,32 @@
 
 #define BUFSIZE 1500
 
-dataSet_t* loadData(FILE* file, const char* separator) {
-    char buffer[BUFSIZE];
-    size_t bufferSize = BUFSIZE;
-
+dataSet_t* initDataSet(size_t nFeatures, size_t nElements) {  // allocates the set and its samples array; exits on allocation failure
     dataSet_t* dataSet = (dataSet_t*)malloc(sizeof(dataSet_t));
     if (dataSet == NULL) {
         perror("Error allocating new dataSet. Exiting");
         exit(1);
     };
-
-    dataSet->nFeatures = getLineSize(file, *separator, buffer, &bufferSize);
-    dataSet->nElements = countLines(file);
-
-
-    // 0 will always be id, read as `char*`.
+    dataSet->nFeatures = nFeatures;  // stored as given; no feature storage is allocated here
+    dataSet->nElements = nElements;
     dataSet->samples = (sample_t*)malloc(sizeof(sample_t) * dataSet->nElements);
     if (dataSet->samples == NULL) {
         perror("Error allocating new samples. Exiting");
         exit(1);
     }
+    return dataSet;  // the sample entries themselves are left uninitialized for the caller to fill
+}
+
+dataSet_t* loadData(FILE* file, const char* separator) {  // reads the whole file into a freshly allocated dataSet
+    char buffer[BUFSIZE];
+    size_t bufferSize = BUFSIZE;
+    size_t nFeatures = getLineSize(file, *separator, buffer, &bufferSize);  // must run before countLines: argument evaluation order is unspecified
+    dataSet_t* dataSet = initDataSet(nFeatures, countLines(file));
 
-    // void** line = NULL;
     for (size_t i = 0; i < dataSet->nElements; i++) {
         char** line = readLine(file, separator, buffer, &bufferSize, &dataSet->nFeatures);
         dataSet->samples[i].id = line[0];
+        dataSet->samples[i].index = i;
         dataSet->samples[i].features = (long double*)malloc(sizeof(long double) * dataSet->nFeatures);
         if (dataSet->samples == NULL) {
             perror("Error allocating features for new sample. Exiting");
@@ -34,7 +35,7 @@ dataSet_t* loadData(FILE* file, const char* separator) {
         }
         for (size_t j = 0; j < dataSet->nFeatures; j++) {
             dataSet->samples[i].features[j] = strtold(line[j + 1], NULL);
-            free(line[j+1]);
+            free(line[j + 1]);
         }
         free(line);
     }
@@ -43,9 +44,10 @@ dataSet_t* loadData(FILE* file, const char* separator) {
 }
 
 void printSample(const sample_t* sample, const size_t* nFeatures) {
-    printf("%s", sample->id);
+    printf("%s:", sample->id);  // the line starts with the sample id followed by a colon
     for (size_t j = 0; j < *nFeatures; j++) {
-        printf(",%Lf", sample->features[j]);
+        // Each of the *nFeatures values is preceded by a tab
+        printf("\t%Lf", sample->features[j]);
     }
     puts("");
 }
@@ -56,15 +58,18 @@ void printDataSet(dataSet_t* dataSet) {
     }
 }
 
+// Frees only the sample's feature array; the id string is not freed here
 void destroySample(sample_t* sample) {
     free(sample->features);
-    free(sample->id);
+    sample = NULL;  // clears the local parameter only; the caller's pointer is unaffected
 }
 
+// Frees every sample's features, the samples array and the set itself; ids are not freed here
 void destroyDataSet(dataSet_t* dataSet) {
     for (size_t i = 0; i < dataSet->nElements; i++) {
         destroySample(&dataSet->samples[i]);
     }
     free(dataSet->samples);
     free(dataSet);
+    dataSet = NULL;  // clears the local parameter only; the caller's pointer is unaffected
 }
diff --git a/src/distances.c b/src/distances.c
new file mode 100644
index 0000000..7f61f79
--- /dev/null
+++ b/src/distances.c
@@ -0,0 +1,65 @@
+#include "../lib/distances.h"
+// Allocates a distance set for nElements points, with one slot per unordered pair. Exits on allocation failure.
+distanceDataSet_t* initDistanceDataSet(size_t nElements) {
+    distanceDataSet_t* dataSet = (distanceDataSet_t*)malloc(sizeof(distanceDataSet_t));
+    if (dataSet == NULL) {
+        perror("Error allocating new dataSet. Exiting");
+        exit(1);
+    }
+    dataSet->depth = nElements;
+    dataSet->nElements = nElements * (nElements - 1) / 2;  // divide last: (n / 2) * (n - 1) undercounts the pairs for odd n
+    dataSet->samples = (distanceSample_t*)malloc(sizeof(distanceSample_t) * dataSet->nElements);
+    if (dataSet->samples == NULL) {
+        perror("Error allocating new samples. Exiting");
+        exit(1);
+    }
+    return dataSet;
+}
+
+// Euclidean (L2) distance between the feature vectors a and b.
+// Both arrays are read for *nFeatures components; nothing is written through the pointers.
+// Returns 0 when *nFeatures is 0, because the sum of squares stays at 0.
+long double euclidianDistance(long double* a, long double* b, size_t* nFeatures) {
+    const size_t dims = *nFeatures;
+    long double sumOfSquares = 0;
+
+    for (size_t d = 0; d < dims; d++) {
+        // The sign of the difference vanishes when squared, so a single
+        // subtraction covers both orderings of a[d] and b[d].
+        const long double delta = a[d] - b[d];
+        sumOfSquares += delta * delta;
+    }
+
+    // Square root of the summed squared differences
+    return sqrtl(sumOfSquares);
+}
+// Computes the distance for every unordered pair of samples; from/to point into locationSet, which must outlive the result.
+distanceDataSet_t* calculateDistances(dataSet_t* locationSet) {
+    distanceDataSet_t* distanceSet = initDistanceDataSet(locationSet->nElements);
+    // Pairs are stored row by row (every j < i for each i); count is the flat index of pair (i, j)
+    for (size_t i = 0, count = 0; i < locationSet->nElements; i++) {
+        for (size_t j = 0; j < i; j++, count++) {
+            distanceSet->samples[count].from = &locationSet->samples[i];
+            distanceSet->samples[count].to = &locationSet->samples[j];
+            printf("Calculating distance from %zu to %zu, which are %s and %s\n", i, j, locationSet->samples[i].id, locationSet->samples[j].id);
+            distanceSet->samples[count].distance = euclidianDistance(locationSet->samples[i].features, locationSet->samples[j].features, &locationSet->nFeatures);
+        }
+    }
+    return distanceSet;
+}
+// Prints the lower-triangular distance matrix, one "id:" row per point; expects the pair order written by calculateDistances.
+void printDistanceSet(distanceDataSet_t* dataSet) {
+    for (size_t i = 0, k = 0; dataSet->nElements > 0 && i < dataSet->depth; i++) {  // without pairs there is no id to read, so nothing is printed
+        printf("%s:", i == 0 ? dataSet->samples[0].to->id : dataSet->samples[k].from->id);  // row 0 owns no pair: its point is the "to" end of pair (1, 0)
+        for (size_t j = 0; j < i; j++, k++) {
+            printf("\t%Lf", dataSet->samples[k].distance);
+        }
+        puts("");
+    }
+}
+
+void destroyDistanceDataSet(distanceDataSet_t* dataSet) {
+    // Releases the pair array and the set itself; the samples behind from/to are not touched
+    distanceSample_t* pairs = dataSet->samples;
+    free(pairs);
+    free(dataSet);
+}
diff --git a/src/fileReader.c b/src/file.c
similarity index 98%
rename from src/fileReader.c
rename to src/file.c
index 060bf30..fc7adc5 100644
--- a/src/fileReader.c
+++ b/src/file.c
@@ -1,4 +1,4 @@
-#include "../lib/fileReader.h"
+#include "../lib/file.h"
 
 /**
  * Opens file as readonly, checks for errors, exit if any errors, then returns
diff --git a/src/graph.c b/src/graph.c
new file mode 100644
index 0000000..6964322
--- /dev/null
+++ b/src/graph.c
@@ -0,0 +1,45 @@
+#include "../lib/graph.h"
+
+int compareDistanceSamples(const void* a, const void* b) {
+    const long double lhs = ((const distanceSample_t*)a)->distance;
+    const long double rhs = ((const distanceSample_t*)b)->distance;
+    return (lhs > rhs) - (lhs < rhs);  // -1, 0 or 1: qsort comparator ordering by ascending distance
+}
+
+/**
+ * Splits the points behind dataSet into groupsNumber groups, Kruskal style:
+ * pairs are visited from the shortest distance up and their endpoints are
+ * united until only groupsNumber disjoint sets remain.
+ *
+ * Side effect: dataSet->samples is sorted in place by ascending distance.
+ * groupsNumber equal to the number of points performs no union at all.
+ * If groupsNumber is 0 or exceeds the number of points, the target can never
+ * be reached; the loop then ends once every pair was visited instead of
+ * reading past the end of the samples array.
+ *
+ * Returns the union-find structure; main releases it with UF_destroy.
+ */
+union_t* kruskal(distanceDataSet_t* dataSet, size_t groupsNumber) {
+    // Kruskal needs the pairs in ascending order of distance
+    qsort(dataSet->samples, dataSet->nElements, sizeof(distanceSample_t), &compareDistanceSamples);
+
+    union_t* un = UF_init(dataSet->depth, dataSet->samples);
+    size_t currentGroups = dataSet->depth;
+
+    // Stopping early leaves the longest pairs unused, which is what keeps the
+    // groups apart; nothing has to be removed afterwards.
+    for (size_t i = 0; i < dataSet->nElements && currentGroups > groupsNumber; i++) {
+        size_t p = dataSet->samples[i].from->index;
+        size_t q = dataSet->samples[i].to->index;
+        // UF_union returns 1 only when p and q were in different sets
+        if (UF_union(un, p, q) == 1) {
+            currentGroups--;
+        }
+    }
+
+    return un;
+}
+
+sample_t** sortMST(union_t* MST) {
+    return NULL;  // stub: MST is not used yet and NULL is always returned
+}
diff --git a/src/main.c b/src/main.c
index 3d5bb6b..c74a85f 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1,10 +1,11 @@
-#include "../lib/data.h"
+#include "../lib/distances.h"
+#include "../lib/graph.h"
 
 #define SEPARATOR ","
 
 int main(int argc, char** argv) {
     char* filename = argv[1];
-    // size_t K = strtoul(argv[2], NULL, 10);
+    size_t K = strtoul(argv[2], NULL, 10);  // NOTE(review): argv[1] and argv[2] are used without checking argc
     FILE* file = openFile(filename);
     // size_t tokenAmount = getLineSize(file, ",", buffer, &bufferSize);
     // char** tokens = readLine(file, ",", buffer, &bufferSize, &tokenAmount);
@@ -14,23 +15,29 @@ int main(int argc, char** argv) {
     // }
     // puts("");
 
-    dataSet_t* data = loadData(file, SEPARATOR);
+    dataSet_t* dataPlot = loadData(file, SEPARATOR);
     closeFile(file);
 
-    puts("Loaded data:");
-    printDataSet(data);
+    puts("Loaded dataPlot:");
+    printDataSet(dataPlot);
 
-    // data_t* distances = getDistances(data);
-    // puts("Distances matrix:");
-    // printDistanceMatrix(distances);
+    // FIXME: check the first row label printed by printDistanceSet ("B:" showed up on the first line)
+    distanceDataSet_t* distanceSet = calculateDistances(dataPlot);
+    puts("Loaded distanceSet:");
+    printDistanceSet(distanceSet);
 
-    // dataVector_t* dataVec = vectorizeData(distances, data);
-    // puts("Vectorized data:");
-    // printVectorizedData(dataVec);
+    union_t * MST = kruskal(distanceSet, K);
 
-    // kruskal(dataVec, K, data);
 
-    destroyDataSet(data);
+    // The ids are freed here...
+    for (size_t i = 0; i < dataPlot->nElements; i++) {
+        free(dataPlot->samples[i].id);
+    }
+    // ...because the destroy functions below do not free ids (the distance set only points at these samples)
+    destroyDataSet(dataPlot);
+    destroyDistanceDataSet(distanceSet);
+    UF_destroy(MST);
+
 
     // char* outputFile = argv[3];
 }
diff --git a/src/unionFind.c b/src/unionFind.c
index a41c742..d1072cc 100644
--- a/src/unionFind.c
+++ b/src/unionFind.c
@@ -1,58 +1,59 @@
 #include "../lib/unionFind.h"
 
-union_t* UF_init(const size_t size, void** elemArray) {
+union_t* UF_init(const size_t size, distanceSample_t* samples) {  // builds size singleton sets (own root, weight 1); exits on allocation failure
     union_t* newStruct = (union_t*)malloc(sizeof(union_t));
     if (newStruct == NULL) {
         perror("Error allocating unionFind struct. Exiting");
         exit(1);
     }
 
-    newStruct->arraySize = size;
-    newStruct->array = (size_t*)malloc(sizeof(size_t) * newStruct->arraySize);
+    newStruct->array = (size_t*)malloc(sizeof(size_t) * size);
     if (newStruct->array == NULL) {
         perror("Error allocating unionFind array. Exiting");
         exit(1);
     }
-    newStruct->size = (size_t*)malloc(sizeof(size_t) * newStruct->arraySize);
+    newStruct->size = (size_t*)malloc(sizeof(size_t) * size);
     if (newStruct->size == NULL) {
         perror("Error allocating unionFind ranks array. Exiting");
         exit(1);
     }
 
-    for (size_t i = 0;i < size;i++) {
+    for (size_t i = 0; i < size; i++) {
         newStruct->array[i] = i;
         newStruct->size[i] = 1;
-        newStruct->elem = elemArray[i];
     }
+    newStruct->sample = samples;  // keep the base of the array once; assigning &samples[i] per iteration left it at the last slot
 
     return newStruct;
 }
 
-union_t* UF_destroy(union_t* unionStruct) {
+void UF_destroy(union_t* unionStruct) {  // frees both index arrays and the struct; the samples it points at are not freed
     free(unionStruct->array);
+    free(unionStruct->size);
     free(unionStruct);
-    return NULL;
+    unionStruct = NULL;  // clears the local parameter only; the caller's pointer is unaffected
 }
 
-// Return who is the ancestor
+// Returns the root of the set containing index, following parent links until an element is its own parent
 size_t UF_find(union_t* unionStruct, size_t index) {
     while (unionStruct->array[index] != index) {
-        unionStruct->array[index] = unionStruct->array[unionStruct->array[index]];
+        // Path halving is switched off: unionStruct->array[index] = unionStruct->array[unionStruct->array[index]];
         index = unionStruct->array[index];
     }
     return index;
 }
 
 // Joins 2 elements
-void UF_union(union_t* unionStruct, const size_t p, const size_t q) {
-    size_t i = UF_find(unionStruct, p);
-    size_t j = UF_find(unionStruct, q);
-    if (i == j) return;
-    if (i < j) {
-        unionStruct->array[i] = j;
-        unionStruct->size[j] += unionStruct->size[i];
+char UF_union(union_t* unionStruct, size_t p, size_t q) {  // returns 1 if two sets were merged, 0 if p and q already shared a root
+    p = UF_find(unionStruct, p);
+    q = UF_find(unionStruct, q);
+    if (p == q) return 0;
+    if (unionStruct->size[p] < unionStruct->size[q]) {  // weighted union: hang the smaller tree under the larger root
+        unionStruct->array[p] = q;
+        unionStruct->size[q] += unionStruct->size[p];
     } else {
-        unionStruct->array[j] = i;
-        unionStruct->size[i] += unionStruct->size[j];
+        unionStruct->array[q] = p;
+        unionStruct->size[p] += unionStruct->size[q];
     }
+    return 1;
 }