From a80183bc8b8681865392568b798357b10f4e253d Mon Sep 17 00:00:00 2001
From: Henriquelay
Date: Mon, 15 Mar 2021 05:01:28 -0300
Subject: [PATCH] Very big size update

Split the clustering code into separate modules:

- lib/distances.h, src/distances.c: build the set of pairwise Euclidean
  distances between the loaded samples.
- lib/graph.h, src/graph.c: Kruskal-style merging of the sorted distance
  set through the union-find structure until K groups remain.
- data: add initDataSet(), store each sample's index, and stop freeing
  sample ids in destroySample() (main() frees them now).
- unionFind: tie the structure to distanceSample_t, make UF_union()
  report whether a merge happened, and free the size array in
  UF_destroy().
- fileReader.{h,c} renamed to file.{h,c}; Makefile gains a `run` target.
---
Review notes (not part of the commit message):

This patch was rebuilt from a copy whose line breaks and indentation had
been collapsed. Indentation (4 spaces assumed) and the position of blank
context lines are best-effort; hunk line counts and the diffstat are
consistent with the hunks, but applying may still need manual fix-up or
`git apply --ignore-whitespace`. Header names were lost from the bare
`#include` lines; only the one added in lib/distances.h is restored
(math.h, needed for sqrtl). The removed/context ones in lib/data.h and
lib/unionFind.h are left as found.

Fixes folded into the added lines:
 * src/distances.c: pair count is n * (n - 1) / 2; (n / 2) * (n - 1)
   truncated for odd n and under-allocated the samples array.
 * src/distances.c: print size_t with %zu instead of %ld.
 * src/graph.c: bound the Kruskal loop by the number of edges and stop as
   soon as currentGroups <= groupsNumber, so K == 0 or K > n cannot run
   past the edge array.
 * src/graph.c: add the missing `;` after `return NULL` in sortMST().
 * src/unionFind.c: store the samples base pointer once instead of
   overwriting it on every loop iteration; weight unions by tree size
   rather than by root index.
 * src/data.c: call getLineSize() before countLines() explicitly; the
   order of evaluation of function arguments is unspecified in C.

Not addressed (context lines, must match the target tree):
 * loadData() still tests dataSet->samples after allocating features.
 * main() still reads argv[1] / argv[2] without checking argc.

 Makefile                     |  2 ++
 lib/data.h                   | 15 ++++-----
 lib/distances.h              | 31 +++++++++++++++++
 lib/{fileReader.h => file.h} |  0
 lib/graph.h                  | 15 +++++++++
 lib/unionFind.h              | 12 +++----
 src/data.c                   | 35 ++++++++++---------
 src/distances.c              | 65 ++++++++++++++++++++++++++++++++++++
 src/{fileReader.c => file.c} |  2 +-
 src/graph.c                  | 45 +++++++++++++++++++++++++
 src/main.c                   | 33 ++++++++++--------
 src/unionFind.c              | 39 +++++++++++-----------
 12 files changed, 231 insertions(+), 63 deletions(-)
 create mode 100644 lib/distances.h
 rename lib/{fileReader.h => file.h} (100%)
 create mode 100644 lib/graph.h
 create mode 100644 src/distances.c
 rename src/{fileReader.c => file.c} (98%)
 create mode 100644 src/graph.c

diff --git a/Makefile b/Makefile
index 468c333..804c058 100644
--- a/Makefile
+++ b/Makefile
@@ -31,6 +31,8 @@ $(BUILD_DIR)/%.c.o: %.c
 
 clean:
 	$(RM) */*.o */*.d
+run: $(TARGET_EXEC)
+	$(BUILD_DIR)/$(TARGET_EXEC) $(ARGS)
 
 valzin: $(TARGET_EXEC)
 	$(VALGRIND) $(VALZIN_FLAGS) $(FLAGS) $(BUILD_DIR)/$(TARGET_EXEC) $(ARGS)
diff --git a/lib/data.h b/lib/data.h
index e93c4d0..133ac87 100644
--- a/lib/data.h
+++ b/lib/data.h
@@ -14,27 +14,24 @@
 #ifndef _DATA_H_
 #define _DATA_H_
 
-#define _GNU_SOURCE // Needed to use qsort_r when not using c11 or gnu89/gnu99
-#include
-
-#include "./fileReader.h"
-#include "./unionFind.h"
+#include "./file.h"
 
 // "Rows"
-typedef struct sample {
+typedef struct sample_t {
     char* id;
     long double* features;
+    size_t index;
 } sample_t;
 
 // "Lines" of rows
-typedef struct dataSet {
+typedef struct dataSet_t {
     size_t nElements;
     size_t nFeatures;
     sample_t* samples;
 } dataSet_t;
 
-
-dataSet_t* loadData(FILE* file, const char *separator);
+dataSet_t* initDataSet(size_t nFeatures, size_t nElements);
+dataSet_t* loadData(FILE* file, const char* separator);
 void printSample(const sample_t* sample, const size_t* nFeatures);
 void printDataSet(dataSet_t* dataSet);
 void destroySample(sample_t* sample);
diff --git a/lib/distances.h b/lib/distances.h
new file mode 100644
index 0000000..d5a84ed
--- /dev/null
+++ b/lib/distances.h
@@ -0,0 +1,31 @@
+/**
+ * This library is for handling distances calculations and related matters.
+**/
+
+#ifndef _DISTS_H_
+#define _DISTS_H_
+
+#include <math.h>
+
+#include "./data.h"
+
+// "Rows"
+typedef struct distanceSample_t {
+    char* id;
+    long double distance;
+    // On distanceSet
+    sample_t *from, *to;
+} distanceSample_t;
+
+// "Lines" of rows
+typedef struct distanceDataSet_t {
+    size_t nElements;
+    size_t depth;
+    distanceSample_t* samples;
+} distanceDataSet_t;
+
+distanceDataSet_t* calculateDistances(dataSet_t* points);
+void printDistanceSet(distanceDataSet_t* dataSet);
+void destroyDistanceDataSet(distanceDataSet_t* dataSet);
+
+#endif
diff --git a/lib/fileReader.h b/lib/file.h
similarity index 100%
rename from lib/fileReader.h
rename to lib/file.h
diff --git a/lib/graph.h b/lib/graph.h
new file mode 100644
index 0000000..bf8ea11
--- /dev/null
+++ b/lib/graph.h
@@ -0,0 +1,15 @@
+/**
+* This library is for handling dataSets as if they were a graph. Any graph algorithm should be here
+**/
+
+#ifndef _GRAPH_H_
+#define _GRAPH_H_
+
+#define _GNU_SOURCE // Needed to use qsort_r when not using c11 or gnu89/gnu99
+
+#include "./data.h"
+#include "./unionFind.h"
+
+union_t* kruskal(distanceDataSet_t* dataSet, size_t groupsNumber);
+
+#endif
diff --git a/lib/unionFind.h b/lib/unionFind.h
index 9cd1c57..59091dd 100644
--- a/lib/unionFind.h
+++ b/lib/unionFind.h
@@ -4,19 +4,19 @@
 #include
 #include
 
+#include "./distances.h"
+
 typedef struct union_t {
     size_t* array;
     // Array of sizes to calculate weight and balance the insertion
     size_t* size;
-    // Number of elements on array
-    size_t arraySize;
     // Tie-in to elem
-    void *elem;
+    distanceSample_t *sample;
 } union_t;
 
-union_t* UF_init(const size_t size, void** elemArray);
-union_t* UF_destroy(union_t* unionStruct);
+union_t* UF_init(const size_t size, distanceSample_t* samples);
+void UF_destroy(union_t* unionStruct);
 size_t UF_find(union_t* unionStruct, size_t index);
-void UF_union(union_t* unionStruct, const size_t p, const size_t q);
+char UF_union(union_t* unionStruct, const size_t p, const size_t q);
 
 #endif
diff --git a/src/data.c b/src/data.c
index 79bdc14..b8d5616 100644
--- a/src/data.c
+++ b/src/data.c
@@ -2,31 +2,32 @@
 
 #define BUFSIZE 1500
 
-dataSet_t* loadData(FILE* file, const char* separator) {
-    char buffer[BUFSIZE];
-    size_t bufferSize = BUFSIZE;
-
+dataSet_t* initDataSet(size_t nFeatures, size_t nElements) {
     dataSet_t* dataSet = (dataSet_t*)malloc(sizeof(dataSet_t));
     if (dataSet == NULL) {
         perror("Error allocating new dataSet. Exiting");
         exit(1);
     };
-
-    dataSet->nFeatures = getLineSize(file, *separator, buffer, &bufferSize);
-    dataSet->nElements = countLines(file);
-
-
-    // 0 will always be id, read as `char*`.
+    dataSet->nFeatures = nFeatures;
+    dataSet->nElements = nElements;
     dataSet->samples = (sample_t*)malloc(sizeof(sample_t) * dataSet->nElements);
     if (dataSet->samples == NULL) {
         perror("Error allocating new samples. Exiting");
         exit(1);
     }
+    return dataSet;
+}
+
+dataSet_t* loadData(FILE* file, const char* separator) {
+    char buffer[BUFSIZE];
+    size_t bufferSize = BUFSIZE;
+    size_t nFeatures = getLineSize(file, *separator, buffer, &bufferSize);
+    dataSet_t* dataSet = initDataSet(nFeatures, countLines(file));
 
-    // void** line = NULL;
     for (size_t i = 0; i < dataSet->nElements; i++) {
         char** line = readLine(file, separator, buffer, &bufferSize, &dataSet->nFeatures);
         dataSet->samples[i].id = line[0];
+        dataSet->samples[i].index = i;
         dataSet->samples[i].features = (long double*)malloc(sizeof(long double) * dataSet->nFeatures);
         if (dataSet->samples == NULL) {
             perror("Error allocating features for new sample. Exiting");
@@ -34,7 +35,7 @@ dataSet_t* loadData(FILE* file, const char* separator) {
         }
         for (size_t j = 0; j < dataSet->nFeatures; j++) {
             dataSet->samples[i].features[j] = strtold(line[j + 1], NULL);
-            free(line[j+1]);
+            free(line[j + 1]);
         }
         free(line);
     }
@@ -43,9 +44,10 @@ dataSet_t* loadData(FILE* file, const char* separator) {
 }
 
 void printSample(const sample_t* sample, const size_t* nFeatures) {
-    printf("%s", sample->id);
+    printf("%s:", sample->id);
     for (size_t j = 0; j < *nFeatures; j++) {
-        printf(",%Lf", sample->features[j]);
+        // printf("\t[%ld]", j);
+        printf("\t%Lf", sample->features[j]);
     }
     puts("");
 }
@@ -56,15 +58,18 @@ void printDataSet(dataSet_t* dataSet) {
     }
 }
 
+// Does not free IDs
 void destroySample(sample_t* sample) {
     free(sample->features);
-    free(sample->id);
+    sample = NULL;
 }
 
+// Does not free IDs
 void destroyDataSet(dataSet_t* dataSet) {
     for (size_t i = 0; i < dataSet->nElements; i++) {
         destroySample(&dataSet->samples[i]);
     }
     free(dataSet->samples);
     free(dataSet);
+    dataSet = NULL;
 }
diff --git a/src/distances.c b/src/distances.c
new file mode 100644
index 0000000..7f61f79
--- /dev/null
+++ b/src/distances.c
@@ -0,0 +1,65 @@
+#include "../lib/distances.h"
+
+distanceDataSet_t* initDistanceDataSet(size_t nElements) {
+    distanceDataSet_t* dataSet = (distanceDataSet_t*)malloc(sizeof(distanceDataSet_t));
+    if (dataSet == NULL) {
+        perror("Error allocating new dataSet. Exiting");
+        exit(1);
+    };
+    dataSet->depth = nElements;
+    dataSet->nElements = nElements * (nElements - 1) / 2;
+    dataSet->samples = (distanceSample_t*)malloc(sizeof(distanceSample_t) * dataSet->nElements);
+    if (dataSet->samples == NULL) {
+        perror("Error allocating new samples. Exiting");
+        exit(1);
+    }
+    return dataSet;
+}
+
+long double euclidianDistance(long double* a, long double* b, size_t* nFeatures) {
+    long double accumulator = 0;
+
+    // SUM(abs(a - b)^2)
+    for (size_t i = 0; i < *nFeatures; i++) {
+        // printf("[dim = %ld Val a = %Lf b = %Lf]", *nFeatures, a[i], b[i]);
+
+        if (a[i] < b[i]) {
+            accumulator += (b[i] - a[i]) * (b[i] - a[i]);
+        } else {
+            accumulator += (a[i] - b[i]) * (a[i] - b[i]);
+        }
+    }
+    long double dist = sqrtl(accumulator);
+    // printf(" Dist: %Lf\n", dist);
+    return dist;
+}
+
+distanceDataSet_t* calculateDistances(dataSet_t* locationSet) {
+    distanceDataSet_t* distanceSet = initDistanceDataSet(locationSet->nElements);
+
+    for (size_t i = 0, count = 0; i < locationSet->nElements; i++) {
+        for (size_t j = 0; j < i; j++, count++) {
+            distanceSet->samples[count].from = &locationSet->samples[i];
+            distanceSet->samples[count].to = &locationSet->samples[j];
+            printf("Calculating distance from %zu to %zu, which are %s and %s\n", i, j, locationSet->samples[i].id, locationSet->samples[j].id);
+            distanceSet->samples[count].distance = euclidianDistance(locationSet->samples[i].features, locationSet->samples[j].features, &locationSet->nFeatures);
+        }
+    }
+    return distanceSet;
+}
+
+void printDistanceSet(distanceDataSet_t* dataSet) {
+    for (size_t i = 0, k = 0; i < dataSet->depth; i++) {
+        printf("%s:", dataSet->samples[k].from->id);
+        for (size_t j = 0; j < i; j++, k++) {
+            printf("\t%Lf", dataSet->samples[k].distance);
+        }
+        puts("");
+    }
+}
+
+void destroyDistanceDataSet(distanceDataSet_t* dataSet) {
+    free(dataSet->samples);
+    free(dataSet);
+    dataSet = NULL;
+}
diff --git a/src/fileReader.c b/src/file.c
similarity index 98%
rename from src/fileReader.c
rename to src/file.c
index 060bf30..fc7adc5 100644
--- a/src/fileReader.c
+++ b/src/file.c
@@ -1,4 +1,4 @@
-#include "../lib/fileReader.h"
+#include "../lib/file.h"
 
 /**
  * Opens file as readonly, checks for errors, exit if any errors, then returns
diff --git a/src/graph.c b/src/graph.c
new file mode 100644
index 0000000..6964322
--- /dev/null
+++ b/src/graph.c
@@ -0,0 +1,45 @@
+#include "../lib/graph.h"
+
+int compareDistanceSamples(const void* a, const void* b) {
+    if (((distanceSample_t*)a)->distance < ((distanceSample_t*)b)->distance) return -1;
+    if (((distanceSample_t*)a)->distance > ((distanceSample_t*)b)->distance) return 1;
+    return 0;
+}
+
+union_t* kruskal(distanceDataSet_t* dataSet, size_t groupsNumber) {
+    // Qsorting dataSet, Kruskal needs a sorted set
+    qsort(dataSet->samples, dataSet->nElements, sizeof(distanceSample_t), &compareDistanceSamples);
+
+
+    union_t* un = UF_init(dataSet->depth, dataSet->samples);
+    size_t currentGroups = dataSet->depth;
+
+    // Stop merging once groupsNumber groups remain: the skipped edges are the
+    // largest ones (the set is sorted), so nothing has to be removed afterwards.
+    for (size_t i = 0; i < dataSet->nElements && currentGroups > groupsNumber; i++) {
+        size_t p = dataSet->samples[i].from->index, q = dataSet->samples[i].to->index;
+        if (UF_union(un, p, q) == 1) {
+            currentGroups--;
+        }
+    }
+
+    // puts("Final");
+    // for (size_t i = 0; i < dataSet->depth; i++) {
+    //     printf("%ld ", i);
+    // }
+    // puts("");
+    // for (size_t i = 0; i < dataSet->depth; i++) {
+    //     printf("%ld ", UF_find(un, i));
+    // }
+    // puts("");
+    // for (size_t i = 0; i < dataSet->depth; i++) {
+    //     printf("%s ", un->sample[i].from->id);
+    // }
+    // puts("");
+
+    return un;
+}
+
+sample_t** sortMST(union_t* MST) {
+    return NULL;
+}
diff --git a/src/main.c b/src/main.c
index 3d5bb6b..c74a85f 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1,10 +1,11 @@
-#include "../lib/data.h"
+#include "../lib/distances.h"
+#include "../lib/graph.h"
 
 #define SEPARATOR ","
 
 int main(int argc, char** argv) {
     char* filename = argv[1];
-    // size_t K = strtoul(argv[2], NULL, 10);
+    size_t K = strtoul(argv[2], NULL, 10);
     FILE* file = openFile(filename);
     // size_t tokenAmount = getLineSize(file, ",", buffer, &bufferSize);
     // char** tokens = readLine(file, ",", buffer, &bufferSize, &tokenAmount);
@@ -14,23 +15,29 @@ int main(int argc, char** argv) {
     // }
     // puts("");
 
-    dataSet_t* data = loadData(file, SEPARATOR);
+    dataSet_t* dataPlot = loadData(file, SEPARATOR);
     closeFile(file);
 
-    puts("Loaded data:");
-    printDataSet(data);
+    puts("Loaded dataPlot:");
+    printDataSet(dataPlot);
 
-    // data_t* distances = getDistances(data);
-    // puts("Distances matrix:");
-    // printDistanceMatrix(distances);
+    // FIXME printing B: in in on first line
+    distanceDataSet_t* distanceSet = calculateDistances(dataPlot);
+    puts("Loaded distanceSet:");
+    printDistanceSet(distanceSet);
 
-    // dataVector_t* dataVec = vectorizeData(distances, data);
-    // puts("Vectorized data:");
-    // printVectorizedData(dataVec);
+    union_t * MST = kruskal(distanceSet, K);
 
-    // kruskal(dataVec, K, data);
 
-    destroyDataSet(data);
+    // Freeing ids here...
+    for (size_t i = 0; i < dataPlot->nElements; i++) {
+        free(dataPlot->samples[i].id);
+    }
+    // ...because these don't free IDs, they reuse the same address
+    destroyDataSet(dataPlot);
+    destroyDistanceDataSet(distanceSet);
+    UF_destroy(MST);
+
 
     // char* outputFile = argv[3];
 }
diff --git a/src/unionFind.c b/src/unionFind.c
index a41c742..d1072cc 100644
--- a/src/unionFind.c
+++ b/src/unionFind.c
@@ -1,58 +1,59 @@
 #include "../lib/unionFind.h"
 
-union_t* UF_init(const size_t size, void** elemArray) {
+union_t* UF_init(const size_t size, distanceSample_t* samples) {
     union_t* newStruct = (union_t*)malloc(sizeof(union_t));
     if (newStruct == NULL) {
         perror("Error allocating unionFind struct. Exiting");
         exit(1);
     }
 
-    newStruct->arraySize = size;
-    newStruct->array = (size_t*)malloc(sizeof(size_t) * newStruct->arraySize);
+    newStruct->array = (size_t*)malloc(sizeof(size_t) * size);
     if (newStruct->array == NULL) {
         perror("Error allocating unionFind array. Exiting");
         exit(1);
     }
 
-    newStruct->size = (size_t*)malloc(sizeof(size_t) * newStruct->arraySize);
+    newStruct->size = (size_t*)malloc(sizeof(size_t) * size);
     if (newStruct->size == NULL) {
         perror("Error allocating unionFind ranks array. Exiting");
         exit(1);
     }
 
-    for (size_t i = 0;i < size;i++) {
+    for (size_t i = 0; i < size; i++) {
         newStruct->array[i] = i;
         newStruct->size[i] = 1;
-        newStruct->elem = elemArray[i];
     }
+    newStruct->sample = samples;
     return newStruct;
 }
 
-union_t* UF_destroy(union_t* unionStruct) {
+void UF_destroy(union_t* unionStruct) {
     free(unionStruct->array);
+    free(unionStruct->size);
     free(unionStruct);
-    return NULL;
+    unionStruct = NULL;
 }
 
-// Return who is the ancestor
+// Return the ancestor
 size_t UF_find(union_t* unionStruct, size_t index) {
     while (unionStruct->array[index] != index) {
-        unionStruct->array[index] = unionStruct->array[unionStruct->array[index]];
+        // unionStruct->array[index] = unionStruct->array[unionStruct->array[index]];
         index = unionStruct->array[index];
     }
     return index;
 }
 
 // Joins 2 elements
-void UF_union(union_t* unionStruct, const size_t p, const size_t q) {
-    size_t i = UF_find(unionStruct, p);
-    size_t j = UF_find(unionStruct, q);
-    if (i == j) return;
-    if (i < j) {
-        unionStruct->array[i] = j;
-        unionStruct->size[j] += unionStruct->size[i];
+char UF_union(union_t* unionStruct, size_t p, size_t q) {
+    p = UF_find(unionStruct, p);
+    q = UF_find(unionStruct, q);
+    if (p == q) return 0;
+    if (unionStruct->size[p] < unionStruct->size[q]) {
+        unionStruct->array[p] = q;
+        unionStruct->size[q] += unionStruct->size[p];
     } else {
-        unionStruct->array[j] = i;
-        unionStruct->size[i] += unionStruct->size[j];
+        unionStruct->array[q] = p;
+        unionStruct->size[p] += unionStruct->size[q];
     }
+    return 1;
 }