From fc67b3d06f22da240958d8ba4894756a1c5ba42a Mon Sep 17 00:00:00 2001 From: Henriquelay Date: Sun, 14 Mar 2021 22:22:41 -0300 Subject: [PATCH] Bulletando --- lib/data.h | 48 ++++++------ lib/fileReader.h | 4 +- src/data.c | 188 +++++++++++------------------------------------ src/fileReader.c | 54 +++++--------- src/main.c | 22 ++++-- 5 files changed, 96 insertions(+), 220 deletions(-) diff --git a/lib/data.h b/lib/data.h index 98d50ad..e93c4d0 100644 --- a/lib/data.h +++ b/lib/data.h @@ -1,5 +1,5 @@ /** - * This library is for correcly handling the data structure defined for this project. + * This library is for correcly handling the data structures defined for this project. **/ /** @@ -14,34 +14,30 @@ #ifndef _DATA_H_ #define _DATA_H_ +#define _GNU_SOURCE // Needed to use qsort_r when not using c11 or gnu89/gnu99 #include + #include "./fileReader.h" #include "./unionFind.h" -typedef struct data_t { - void*** dataMatrix; - size_t i, j; -} data_t; - -typedef struct dataVectorCell_t { - long double* distance; - // I and J from the original matrix the vector is pointing to - size_t i; - size_t j; -} dataVectorCell_t; - -typedef struct dataVector_t { - dataVectorCell_t* vec; - // Number of indexed in vec - size_t size; -} dataVector_t; - -data_t* loadData(FILE* file, const char* separator); -void printData(const data_t* dataStruct); -data_t* getDistances(data_t* data); -void destroyData(data_t* data); -dataVector_t* vectorizeData(data_t* data); -union_t* kruskal(dataVector_t* dataVec, size_t K); - +// "Rows" +typedef struct sample { + char* id; + long double* features; +} sample_t; + +// "Lines" of rows +typedef struct dataSet { + size_t nElements; + size_t nFeatures; + sample_t* samples; +} dataSet_t; + + +dataSet_t* loadData(FILE* file, const char *separator); +void printSample(const sample_t* sample, const size_t* nFeatures); +void printDataSet(dataSet_t* dataSet); +void destroySample(sample_t* sample); +void destroyDataSet(dataSet_t* dataSet); #endif diff --git a/lib/fileReader.h b/lib/fileReader.h index 7859d1d..61ca4d3 100644 --- a/lib/fileReader.h +++ b/lib/fileReader.h @@ -11,8 +11,8 @@ FILE* openFile(const char* fileName); char closeFile(FILE* file); -size_t getLineSize(FILE* file, const char* separator, char* buffer, size_t* bufferSize); +char** readLine(FILE* file, const char *separator, char* buffer, size_t* bufferSize, const size_t* nFeatures); +size_t getLineSize(FILE* file, const char separator, char* buffer, size_t* bufferSize); size_t countLines(FILE* file); -void** readLine(FILE* file, const char* separator, char* buffer, size_t* bufferSize, const size_t* tokenAmount); #endif diff --git a/src/data.c b/src/data.c index 6dadacd..79bdc14 100644 --- a/src/data.c +++ b/src/data.c @@ -1,176 +1,70 @@ #include "../lib/data.h" -// TODO - Check if a Macro to derefence (*((long double*)b[i])) is viable +#define BUFSIZE 1500 -void printData(const data_t* dataStruct) { - for (size_t i = 0; i < dataStruct->i; i++) { - printf("%s", (char*)dataStruct->dataMatrix[i][0]); - for (size_t j = 1; j < dataStruct->j; j++) { - printf(",%.15Lf", *((long double*)dataStruct->dataMatrix[i][j])); - } - puts(""); - } -} +dataSet_t* loadData(FILE* file, const char* separator) { + char buffer[BUFSIZE]; + size_t bufferSize = BUFSIZE; -// TODO check before freeing -- don't check whole array -void destroyData(data_t* data) { - for (size_t i = 0; i < data->i; i++) { - for (size_t j = 0; j < data->j; j++) { - free(data->dataMatrix[i][j]); - } - free(data->dataMatrix[i]); - } - free(data->dataMatrix); - free(data); -} - -data_t* loadData(FILE* file, const char* separator) { - char buffer[5000]; - size_t bufferSize = 0; - - data_t* dataStruct = (data_t*)malloc(sizeof(data_t)); - if (dataStruct == NULL) { - perror("Error allocating new dataStructure. Exiting"); + dataSet_t* dataSet = (dataSet_t*)malloc(sizeof(dataSet_t)); + if (dataSet == NULL) { + perror("Error allocating new dataSet. Exiting"); exit(1); }; - dataStruct->i = countLines(file); - dataStruct->j = getLineSize(file, separator, buffer, &bufferSize); + dataSet->nFeatures = getLineSize(file, *separator, buffer, &bufferSize); + dataSet->nElements = countLines(file); // 0 will always be id, read as `char*`. - dataStruct->dataMatrix = (void***)malloc(sizeof(void**) * dataStruct->i); - if (dataStruct->dataMatrix == NULL) { - perror("Error allocating new dataString lines. Exiting"); + dataSet->samples = (sample_t*)malloc(sizeof(sample_t) * dataSet->nElements); + if (dataSet->samples == NULL) { + perror("Error allocating new samples. Exiting"); exit(1); } // void** line = NULL; - for (size_t i = 0; i < dataStruct->i; i++) { - dataStruct->dataMatrix[i] = readLine(file, separator, buffer, &bufferSize, &dataStruct->j); - } - - return dataStruct; -} - -long double distance(long double** a, long double** b, size_t dimensions) { - long double accumulator = 0; - for (size_t i = 1; i <= dimensions; i++) { - // printf("dim = %ld Val a = %Lf b = %Lf\n", dimensions, *a[i], *b[i]); - if ((*a[i]) < (*b[i])) { - accumulator += ((*b[i]) - (*a[i])) * ((*b[i]) - (*a[i])); - } else { - accumulator += ((*a[i]) - (*b[i])) * ((*a[i]) - (*b[i])); - } - } - // TODO Try using sqrtl. Defined in - return sqrt(accumulator); -} - -data_t* getDistances(data_t* data) { - data_t* distances = (data_t*)malloc(sizeof(data_t)); - if (distances == NULL) { - perror("Error allocating new distancesure. Exiting"); - exit(1); - }; - - distances->i = data->i; - // Data has [0] as identifiers, won't be using for calculations; - distances->j = data->j - 1; - // 0 will always be id, read as `char*`. - distances->dataMatrix = (void***)malloc(sizeof(long double**) * distances->i); - if (distances->dataMatrix == NULL) { - perror("Error allocating new dataString lines. Exiting"); - exit(1); - } - - - for (size_t i = 0; i < distances->i; i++) { - distances->dataMatrix[i] = (void**)malloc(sizeof(void*) * i); - if (distances->dataMatrix[i] == NULL) { - perror("Error allocating new dataString lines on distances struct. Exiting"); + for (size_t i = 0; i < dataSet->nElements; i++) { + char** line = readLine(file, separator, buffer, &bufferSize, &dataSet->nFeatures); + dataSet->samples[i].id = line[0]; + dataSet->samples[i].features = (long double*)malloc(sizeof(long double) * dataSet->nFeatures); + if (dataSet->samples == NULL) { + perror("Error allocating features for new sample. Exiting"); exit(1); } - for (size_t j = 0; j < i; j++) { - distances->dataMatrix[i][j] = (long double*)malloc(sizeof(long double)); - if (distances->dataMatrix[i] == NULL) { - perror("Error allocating new dataString lines on distances struct. Exiting"); - exit(1); - } - *((long double*)distances->dataMatrix[i][j]) = distance((long double**)data->dataMatrix[i], (long double**)data->dataMatrix[j], distances->j); + for (size_t j = 0; j < dataSet->nFeatures; j++) { + dataSet->samples[i].features[j] = strtold(line[j + 1], NULL); + free(line[j+1]); } + free(line); } - return distances; -} - -int compareDataVecs(const void* a, const void* b) { - if (*(((dataVectorCell_t*)a)->distance) < *(((dataVectorCell_t*)b)->distance)) return -1; - if (*(((dataVectorCell_t*)a)->distance) > *(((dataVectorCell_t*)b)->distance)) return 1; - return 0; + return dataSet; } -dataVector_t* vectorizeData(data_t* data) { - // It's a triangle - size_t cells = data->i * (data->i - 1) / 2; - dataVectorCell_t* vector = (dataVectorCell_t*)malloc(sizeof(dataVectorCell_t) * cells); - if (vector == NULL) { - perror("Erro allocating dataVector. Exiting"); - exit(1); +void printSample(const sample_t* sample, const size_t* nFeatures) { + printf("%s", sample->id); + for (size_t j = 0; j < *nFeatures; j++) { + printf(",%Lf", sample->features[j]); } + puts(""); +} - size_t k = 0; - for (size_t i = 0; i < data->i; i++) { - for (size_t j = 0; j < i; j++, k++) { - vector[k].distance = data->dataMatrix[i][j]; - vector[k].i = i; - vector[k].j = j; - } - } - - // Sorting on distances - qsort(vector, cells, sizeof(dataVectorCell_t), &compareDataVecs); - - dataVector_t* dataVec = (dataVector_t*)malloc(sizeof(dataVector_t)); - if (dataVec == NULL) { - perror("Erro allocating dataVector. Exiting"); - exit(1); +void printDataSet(dataSet_t* dataSet) { + for (size_t i = 0; i < dataSet->nElements; i++) { + printSample(&dataSet->samples[i], &dataSet->nFeatures); } - - dataVec->vec = vector; - dataVec->size = k; - - // for (size_t i = 0; i < cells; i++) { - // printf("%Lf ", *vector[i].distance); - // } - // puts(""); - - return dataVec; } -union_t* kruskal(dataVector_t* dataVec, size_t groupsNumber) { - union_t* un = UF_init(dataVec->size); - size_t currentGroups = dataVec->size; - for (size_t i = 0; currentGroups != groupsNumber; i++, currentGroups--) { - - printf("I=%ld if=%d ancestor[i]=%ld ancestor[j]=%ld\n", i, UF_find(un, dataVec->vec[i].i) != UF_find(un, dataVec->vec[i].j), UF_find(un, dataVec->vec[i].i), UF_find(un, dataVec->vec[i].j)); +void destroySample(sample_t* sample) { + free(sample->features); + free(sample->id); +} - if (UF_find(un, dataVec->vec[i].i) != UF_find(un, dataVec->vec[i].j)) { - UF_union(un, UF_find(un, dataVec->vec[i].i), UF_find(un, dataVec->vec[i].j)); - } - } - puts("\nFinal"); - for (size_t i = 0; i < dataVec->size; i++) { - printf("%ld %ld | ", dataVec->vec[i].i, dataVec->vec[i].j); +void destroyDataSet(dataSet_t* dataSet) { + for (size_t i = 0; i < dataSet->nElements; i++) { + destroySample(&dataSet->samples[i]); } - - // puts("\nRemovendo os 3 maiores"); - - // for(size_t i = un->size - 1; i > un->size - 3; i--) { - // free(un->array); - // free(un->arraySize); - // } - // un->size = un->size - 3; - - return un; + free(dataSet->samples); + free(dataSet); } diff --git a/src/fileReader.c b/src/fileReader.c index 462c79b..060bf30 100644 --- a/src/fileReader.c +++ b/src/fileReader.c @@ -24,15 +24,16 @@ char closeFile(FILE* file) { } -// TODO usar o getline esperamente para não precisar de getLineSize e countLines +// TODO usar o getline espertamente para não precisar de getLineSize e countLines /** - * Returns how many tokens are in a like of input, including the identifier. + * Returns how many features are in a like of input, not including the identifier. * Takes a buffer as input to avoid keep re-allocating memory * */ -size_t getLineSize(FILE* file, const char* separator, char* buffer, size_t* bufferSize) { - size_t charsRead = 0; +size_t getLineSize(FILE* file, const char separator, char* buffer, size_t* bufferSize) { fpos_t previousPosition; // Stores previous position fgetpos(file, &previousPosition); + + size_t charsRead = 0; if ((charsRead = getline(&buffer, bufferSize, file)) == -1) { perror("Error getting new line. Exiting"); exit(1); @@ -40,14 +41,12 @@ size_t getLineSize(FILE* file, const char* separator, char* buffer, size_t* buff fsetpos(file, &previousPosition); // Restores previous position size_t counter = 0; - for (int i = 0; i < charsRead; i++) { - if (buffer[i] == *separator) { + for (size_t i = 0; i < charsRead; i++) { + if (buffer[i] == separator) { counter++; } } - // Buffer is not used, delete it after reading content - free(buffer); - return ++counter; + return counter; } /** @@ -58,11 +57,8 @@ size_t countLines(FILE* file) { fgetpos(file, &previousPosition); size_t lines = 0; - for (char ch = fgetc(file); ch != EOF; ch = fgetc(file)) { - if (ch == '\n') { - lines++; - } - } + for (char ch = fgetc(file); !feof(file); ch = fgetc(file)) + if (ch == '\n') lines++; fsetpos(file, &previousPosition); // Restores previous position return lines; @@ -72,14 +68,12 @@ size_t countLines(FILE* file) { * Takes an open file, returns an allocated struct with identifier and values * Takes a buffer as input to avoid keep re-allocating memory * */ -void** readLine(FILE* file, const char* separator, char* buffer, size_t* bufferSize, const size_t* tokenAmount) { +char** readLine(FILE* file, const char *separator, char* buffer, size_t* bufferSize, const size_t* nFeatures) { if (getline(&buffer, bufferSize, file) == -1) { - // perror("Error getting new line. Exiting"); - // exit(1); return NULL; } - void** tokens = (void**)malloc(sizeof(void*) * *tokenAmount); + char** tokens = (char**)malloc(sizeof(char*) * (*nFeatures + 1)); if (tokens == NULL) { perror("Error allocating tokens array"); exit(1); @@ -87,30 +81,16 @@ void** readLine(FILE* file, const char* separator, char* buffer, size_t* bufferS // Alternativelly, a null pointer may be specified, in which case the function //continues scanning where a previous successful call to the function ended. - char* token; - unsigned int i = 0; - char* identifier = strtok(buffer, separator); - // Allocates space for identifies cell - tokens[0] = (char*)malloc(sizeof(char) * ((strlen(identifier)) + 1)); - if (tokens[0] == NULL) { - perror("Error allocating new dataString cells. Exiting"); - exit(1); - } - strcpy(tokens[0], identifier); - + char* token = strtok(buffer, separator); // NOTE - Try removing one of the checks for's second term to gain a bit of time if it works - for (i = 1, token = strtok(NULL, separator); token != NULL && i < *tokenAmount; token = strtok(NULL, separator), i++) { + for (size_t i = 0; token != NULL; token = strtok(NULL, separator), i++) { // Allocates space for data cell - tokens[i] = (long double*)malloc(sizeof(long double)); + tokens[i] = (char*)malloc(sizeof(long double)); if (tokens[i] == NULL) { - perror("Error allocating token cell value"); + perror("Error allocating feature for dataPoint. Exiting"); exit(1); } - *((long double*)tokens[i]) = strtold(token, NULL); + strcpy(tokens[i], token); } - - // Remove trailing '\n' - // size_t lastStringLen = strlen(tokens[--i]); - // tokens[i][lastStringLen - 1] = '\0'; return tokens; } diff --git a/src/main.c b/src/main.c index 2ea6675..3d5bb6b 100644 --- a/src/main.c +++ b/src/main.c @@ -1,8 +1,10 @@ #include "../lib/data.h" +#define SEPARATOR "," + int main(int argc, char** argv) { char* filename = argv[1]; - size_t K = strtoul(argv[2], NULL, 10); + // size_t K = strtoul(argv[2], NULL, 10); FILE* file = openFile(filename); // size_t tokenAmount = getLineSize(file, ",", buffer, &bufferSize); // char** tokens = readLine(file, ",", buffer, &bufferSize, &tokenAmount); @@ -12,19 +14,23 @@ int main(int argc, char** argv) { // } // puts(""); - data_t* data = loadData(file, ","); + dataSet_t* data = loadData(file, SEPARATOR); closeFile(file); - data_t* distances = getDistances(data); + puts("Loaded data:"); + printDataSet(data); - printData(data); + // data_t* distances = getDistances(data); + // puts("Distances matrix:"); + // printDistanceMatrix(distances); + // dataVector_t* dataVec = vectorizeData(distances, data); + // puts("Vectorized data:"); + // printVectorizedData(dataVec); - kruskal(vectorizeData(distances), K); + // kruskal(dataVec, K, data); - destroyData(data); + destroyDataSet(data); - // unsigned int k = atoi(argv[2]); // char* outputFile = argv[3]; - // free(tokens); }