-
Notifications
You must be signed in to change notification settings - Fork 2
/
invertedIndex.c
96 lines (83 loc) · 2.52 KB
/
invertedIndex.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
//Ass1 , Luka Gamulin z5163726
#include <assert.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <sysexits.h>
#include "DLList.h"
#include <math.h>
#include "Tree.h"
#include "List.h"
#include "invertedIndex.h"
// Converts every character of `str` to lowercase in place.
//
// str: NUL-terminated, writable string; must not be NULL.
// Returns the same pointer that was passed in, so calls can be chained.
static char *lowerCase(char *str) {
    for (char *p = str; *p != '\0'; p++) {
        // <ctype.h> functions are only defined for EOF and values
        // representable as unsigned char; a plain char may be negative
        // for bytes >= 0x80, so convert before calling tolower().
        *p = (char)tolower((unsigned char)*p);
    }
    return str;
}
// Functions for Part-1
// Normalises a word in place: drops one trailing punctuation mark
// ('.', '?', ',', ';' or ':') if the word ends with one, then converts
// the whole word to lowercase.
//
// str: NUL-terminated, writable string; asserted to be non-NULL.
// Returns `str` itself (the buffer is modified, not copied).
char *normaliseWord(char *str) {
    assert(str != NULL);

    // Only the final character is examined, so at most one mark is removed
    // ("end.." becomes "end.") and punctuation inside the word is kept.
    char *terminator = str + strlen(str);
    if (terminator != str && strchr(".?,;:", terminator[-1]) != NULL) {
        terminator[-1] = '\0';
    }

    return lowerCase(str);
}
// Builds an inverted index from the collection file.
//
// Every whitespace-separated token read from `collectionFilename` is handed
// to GenTree() (Tree.c), which returns the updated tree.
// NOTE(review): the tokens are presumably the names of the documents to
// index -- confirm against GenTree.
//
// collectionFilename: path of the collection file to read.
// Returns the resulting tree, or NULL if the file cannot be opened or
// contains no tokens.
InvertedIndexBST generateInvertedIndex(char *collectionFilename)
{
    FILE *collection = fopen(collectionFilename, "r");
    if (collection == NULL) {
        printf("no such file.");
        // Nothing can be read from a stream that failed to open.
        return NULL;
    }

    char buf[1000];
    InvertedIndexBST tree = NULL;
    // The field width (sizeof buf - 1) keeps an over-long token from
    // overflowing buf; such a token is delivered in several pieces instead.
    while (fscanf(collection, "%999s", buf) == 1) {
        tree = GenTree(tree, buf);
    }

    fclose(collection);
    return tree;
}
// Writes the inverted index to the file "invertedIndex.txt" in the current
// working directory, replacing any previous contents.
//
// The output itself is produced by BSTreeInfix(), which receives the tree
// and the open stream.
// NOTE(review): presumably an in-order traversal that prints each word in
// ascending order with its files -- confirm against BSTreeInfix.
//
// tree: the index to print.
// If the file cannot be opened, or closing it fails, the error is reported
// on stderr via perror().
void printInvertedIndex(InvertedIndexBST tree)
{
    FILE *out = fopen("invertedIndex.txt", "w");
    if (out == NULL) {
        perror("invertedIndex.txt");
        return;
    }

    BSTreeInfix(tree, out);

    // fclose() flushes buffered output, so a failure here means the file
    // may be incomplete.
    if (fclose(out) != 0) {
        perror("invertedIndex.txt");
    }
}
// Builds the tf-idf list for a single search word.
//
// tree:       inverted index to search; asserted to be non-NULL.
// searchWord: word to look up via getListTree().
// D:          forwarded to idf() together with the word's file list
//             (presumably the total number of documents -- confirm
//             against idf()).
// Returns the list produced by getListTfIdf() starting from an empty list,
// or NULL when the word is not found in the tree.
TfIdfList calculateTfIdf(InvertedIndexBST tree, char *searchWord, int D) {
    assert(tree != NULL);

    FileList matches = getListTree(tree, searchWord);
    if (matches == NULL) {
        // Word is not in the index.
        return NULL;
    }

    double wordIdf = idf(matches, D);
    return getListTfIdf(matches, wordIdf, NULL);
}
// Builds one tf-idf list covering every word in a NULL-terminated array of
// search words.
//
// tree:        inverted index to search.
// searchWords: array of words whose last element is NULL; a NULL array is
//              treated as an empty query.
// D:           forwarded to idf() for each word (presumably the total
//              number of documents -- confirm against idf()).
// Returns the list accumulated by feeding each found word's file list
// through getListTfIdf(), or NULL if no word was found.
// NOTE(review): how getListTfIdf() merges entries for a file that matches
// several words is not visible here.
TfIdfList retrieve(InvertedIndexBST tree, char *searchWords[], int D) {
    TfIdfList results = NULL;
    if (searchWords == NULL) {
        return results;
    }

    for (int i = 0; searchWords[i] != NULL; i++) {
        FileList files = getListTree(tree, searchWords[i]);
        if (files == NULL) {
            // Word not found: skip it, as calculateTfIdf() does, rather
            // than passing a NULL list to idf() and getListTfIdf().
            continue;
        }
        double wordIdf = idf(files, D);
        results = getListTfIdf(files, wordIdf, results);
    }
    return results;
}