From 6feb8a1cf7e28c7632d99ab5a61df49e5ef638ef Mon Sep 17 00:00:00 2001 From: Charles-Emmanuel Dias Date: Fri, 13 Jun 2014 17:25:34 +0200 Subject: [PATCH 01/10] n gram in vocab, need ngram in train --- word2vec.c | 92 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 76 insertions(+), 16 deletions(-) diff --git a/word2vec.c b/word2vec.c index e844f8a..75715a0 100644 --- a/word2vec.c +++ b/word2vec.c @@ -32,6 +32,7 @@ #define MAX_EXP 6 #define MAX_SENTENCE_LENGTH 1000 #define MAX_CODE_LENGTH 40 +#define NGRAM 3 const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary @@ -156,6 +157,10 @@ int ReadWordIndex(FILE *fin) { if (feof(fin)) return -1; + + + + return SearchVocab(word); } @@ -214,7 +219,7 @@ void SortVocab() { } else { // Hash will be re-computed, as after the sorting it is not actual - hash=GetWordHash(vocab[a].word); + hash = GetWordHash(vocab[a].word); while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; @@ -349,13 +354,29 @@ void CreateBinaryTree() { free(parent_node); } +//Look if word already in vocab, if not add, if yes, increment. +void searchAndAddToVocab(char* word){ + long long a,i; + i = SearchVocab(word); + + if (i == -1) { + a = AddWordToVocab(word); + vocab[a].cn = 1; + } else + vocab[i].cn++; + + if (vocab_size > vocab_hash_size * 0.7) + ReduceVocab(); +} + void LearnVocabFromTrainFile() { char word[MAX_STRING]; FILE *fin; - long long a, i; + int i,start,end; + char* gram; - for (a = 0; a < vocab_hash_size; a++) //init vocab hashtable - vocab_hash[a] = -1; + for (i = 0; i < vocab_hash_size; i++) //init vocab hashtable + vocab_hash[i] = -1; fin = fopen(train_file, "rb"); @@ -370,6 +391,26 @@ void LearnVocabFromTrainFile() { while (1) { ReadWord(word, fin); + if(NGRAM > 0) //learn ngrams instead of words + { + gram = (char*)calloc(NGRAM,sizeof(char)); + start = 0; + end = NGRAM-1; + //printf("word: %s, len: %d\n",word,(int) strlen(word)); + while(end vocab_hash_size * 0.7) - ReduceVocab(); } SortVocab(); @@ -401,6 +431,7 @@ void LearnVocabFromTrainFile() { file_size = ftell(fin); fclose(fin); + free(gram); } void SaveVocab() { @@ -508,6 +539,11 @@ void *TrainModelThread(void *id) { real f, g; clock_t now; + char wordToGram[MAX_STRING]; + char* gram; + int start = 0; + int end = NGRAM-1; + real *neu1 = (real *)calloc(layer1_size, sizeof(real)); //one vector real *neu1e = (real *)calloc(layer1_size, sizeof(real)); FILE *fi = fopen(train_file, "rb"); @@ -537,8 +573,32 @@ void *TrainModelThread(void *id) { if (sentence_length == 0) { while (1) { - word = ReadWordIndex(fi); + + + if(NGRAM > 0) //learn ngrams instead of words + { + + if( (start == 0 && end == NGRAM) || (end > strlen(wordToGram)) ){ + ReadWord(wordToGram, fi); + gram = (char*)calloc(NGRAM,sizeof(char)); + start == 0; + end == NGRAM-1; + } + + /// SEGMENTATION FAULT IN THE COUIN + + strncpy(gram,wordToGram+sizeof(char)*start,NGRAM); + word = SearchVocab(gram); + end++; + start++; + + } + else + { + word = ReadWordIndex(fi); + + } if (feof(fi)) break; if (word == -1) From 4ace57eb26e82dc82cf44bb765fc8a76d23f628b Mon Sep 17 00:00:00 2001 From: charles-emmanuel Date: Mon, 16 Jun 2014 09:31:40 +0200 Subject: [PATCH 02/10] no rmaly w/out hashes ok --- word2vec.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/word2vec.c b/word2vec.c index 75715a0..f2b4e74 100644 --- a/word2vec.c +++ b/word2vec.c @@ -543,6 +543,9 @@ void *TrainModelThread(void *id) { char* gram; int start = 0; int end = NGRAM-1; + gram 
= (char*)calloc(NGRAM,sizeof(char)); + int newWord = 1; + int wordLength = 0; real *neu1 = (real *)calloc(layer1_size, sizeof(real)); //one vector real *neu1e = (real *)calloc(layer1_size, sizeof(real)); @@ -579,19 +582,32 @@ void *TrainModelThread(void *id) { if(NGRAM > 0) //learn ngrams instead of words { - if( (start == 0 && end == NGRAM) || (end > strlen(wordToGram)) ){ + + if(newWord){ ReadWord(wordToGram, fi); - gram = (char*)calloc(NGRAM,sizeof(char)); - start == 0; - end == NGRAM-1; + start = 0; + end = NGRAM-1; + wordLength = strlen(wordToGram); + // printf("new word: %s, length:%d\n",wordToGram,wordLength); + newWord = 0; } - /// SEGMENTATION FAULT IN THE COUIN + if(wordLength <= NGRAM){ + word = SearchVocab(wordToGram); + newWord = 1; + continue; + } + + strncpy(gram,wordToGram+sizeof(char)*start,NGRAM); word = SearchVocab(gram); + //printf("word: %s, gram: %s,index:%lld, start: %d, end %d \n",wordToGram,gram,word,start,end); end++; start++; + + if(end == wordLength) + newWord = 1; } else @@ -599,6 +615,8 @@ void *TrainModelThread(void *id) { word = ReadWordIndex(fi); } + + if (feof(fi)) break; if (word == -1) @@ -623,7 +641,7 @@ void *TrainModelThread(void *id) { if (sentence_length >= MAX_SENTENCE_LENGTH) break; } - + sentence_position = 0; } From e75d65e95ab5df15b96ff26bed5be8a92c40723b Mon Sep 17 00:00:00 2001 From: charles-emmanuel Date: Mon, 16 Jun 2014 09:49:14 +0200 Subject: [PATCH 03/10] [wip] adding args to handle grams/hash --- word2vec.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/word2vec.c b/word2vec.c index f2b4e74..285ad8e 100644 --- a/word2vec.c +++ b/word2vec.c @@ -33,6 +33,7 @@ #define MAX_SENTENCE_LENGTH 1000 #define MAX_CODE_LENGTH 40 #define NGRAM 3 +#define HASHBANG 1 const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary @@ -47,7 +48,7 @@ struct vocab_word { char train_file[MAX_STRING], output_file[MAX_STRING]; char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING]; struct vocab_word *vocab; -int binary = 0, cbow = 0, debug_mode = 2, window = 5, min_count = 5, num_threads = 1, min_reduce = 1; +int binary = 0, cbow = 0, debug_mode = 2, window = 5, min_count = 5, num_threads = 1, min_reduce = 1, ngram = 0, hashbang = 0; int *vocab_hash; long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100; long long train_words = 0, word_count_actual = 0, file_size = 0, classes = 0; @@ -393,8 +394,8 @@ void LearnVocabFromTrainFile() { if(NGRAM > 0) //learn ngrams instead of words { - gram = (char*)calloc(NGRAM,sizeof(char)); - start = 0; + gram = (char*)calloc(NGRAM,sizeof(char)); //leak + start = 0; end = NGRAM-1; //printf("word: %s, len: %d\n",word,(int) strlen(word)); while(end #go goo ood od# int newWord = 1; int wordLength = 0; @@ -1047,6 +1049,10 @@ int main(int argc, char **argv) { printf("\t\tThe vocabulary will be read from , not constructed from the training data\n"); printf("\t-cbow \n"); printf("\t\tUse the continuous bag of words model; default is 0 (skip-gram model)\n"); + printf("\t\tUse N-GRAM model instead of words to train vectors \n"); + printf("\t-ngram (default 0 - use words) \n"); + printf("\t\tUse hashbang on n-grams - i.e #good# -> #go,goo,ood,od#\n"); + printf("\t-wordhash <0-1> (default 0)\n"); printf("\nExamples:\n"); printf("./word2vec -train data.txt -output vec.txt -debug 2 -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1\n\n"); return 0; @@ -1072,6 +1078,8 @@ int main(int argc, char **argv) { if ((i = ArgPos((char *)"-threads", argc, 
argv)) > 0) num_threads = atoi(argv[i + 1]); if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); + //if ((i = ArgPos ((char *) "-ngram", argc, argv)) > 0 ) ngram = atoi(argv[i + 1]) + //if ((i = ArgPos ((char *) "-wordhash", argc, argv)) > 0 ) wordhash = atoi(argv[i + 1]) vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); From 68ff1a65322321f8f43fd8192750eef2578e58ab Mon Sep 17 00:00:00 2001 From: Charles-Emmanuel Dias Date: Mon, 16 Jun 2014 15:18:17 +0200 Subject: [PATCH 04/10] ngram + hashbangs training ok --- word2vec.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/word2vec.c b/word2vec.c index 285ad8e..a20d5ff 100644 --- a/word2vec.c +++ b/word2vec.c @@ -90,6 +90,11 @@ void InitUnigramTable() { // Reads a single word from a file, assuming space + tab + EOL to be word boundaries void ReadWord(char *word, FILE *fin) { int a = 0, character; + + if(HASHBANG){ + word[a] = '#'; //words starts with # + a++; + } while (!feof(fin)) { character = fgetc(fin); @@ -119,6 +124,12 @@ void ReadWord(char *word, FILE *fin) { if (a >= MAX_STRING - 1) a--; // Truncate too long words } + + if(HASHBANG){ + word[a] = '#'; //words ends with # + a++; + } + word[a] = 0; } @@ -373,7 +384,7 @@ void searchAndAddToVocab(char* word){ void LearnVocabFromTrainFile() { char word[MAX_STRING]; FILE *fin; - int i,start,end; + int i,start,end,lenWord; char* gram; for (i = 0; i < vocab_hash_size; i++) //init vocab hashtable @@ -394,11 +405,14 @@ void LearnVocabFromTrainFile() { if(NGRAM > 0) //learn ngrams instead of words { + lenWord = strlen(word); + gram = (char*)calloc(NGRAM,sizeof(char)); //leak start = 0; end = NGRAM-1; //printf("word: %s, len: %d\n",word,(int) strlen(word)); - while(end #go goo ood od# + //gramBang = (char*)calloc(NGRAM,sizeof(char)+2); // w/ hashbang i.e: #good# -> #go goo ood od# int newWord = 1; int wordLength = 0; @@ -557,6 +571,7 @@ void *TrainModelThread(void *id) { while (1) { + if (word_count - last_word_count > 10000) { word_count_actual += word_count - last_word_count; last_word_count = word_count; @@ -578,8 +593,11 @@ void *TrainModelThread(void *id) { if (sentence_length == 0) { while (1) { + - + if (feof(fi)) + break; + if(NGRAM > 0) //learn ngrams instead of words { @@ -619,8 +637,6 @@ void *TrainModelThread(void *id) { } - if (feof(fi)) - break; if (word == -1) continue; @@ -903,6 +919,8 @@ void TrainModel() { for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); + printf("Training Ended !\n"); + fo = fopen(output_file, "wb"); if (classes == 0) { From 7c9b629a5cf43bb61b6d9f1ef72b2f0398cc04d3 Mon Sep 17 00:00:00 2001 From: Charles-Emmanuel Dias Date: Mon, 16 Jun 2014 17:19:52 +0200 Subject: [PATCH 05/10] [wip] From N-Gram to Words, to Accuracy computation --- compute-accuracy.c | 175 ++++++++++++++++++++++++++++++++-------- script-tools/main.py | 6 ++ script-tools/tooling.py | 42 ++++++++++ word2vec.c | 37 ++++----- 4 files changed, 208 insertions(+), 52 deletions(-) create mode 100644 script-tools/main.py create mode 100644 script-tools/tooling.py diff --git a/compute-accuracy.c b/compute-accuracy.c index d83fcbb..a1166c6 100644 --- a/compute-accuracy.c +++ b/compute-accuracy.c @@ -32,106 +32,217 @@ int main(int argc, char **argv) float *M; char *vocab; int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0; + if (argc < 2) { 
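/*
 * Aside on the word2vec.c hunks above: with -hashbang on, ReadWord()
 * frames every token in '#' so that LearnVocabFromTrainFile() can slide
 * an n-character window over it ("#good#" -> #go, goo, ood, od#, as the
 * new -hashbang help text says). A standalone sketch of that windowing --
 * names and buffer sizes are illustrative, and printf stands in for the
 * vocabulary calls; this is not code from the patch itself:
 */
#include <stdio.h>
#include <string.h>

static void EnumerateGrams(const char *word, int n, int hashbang) {
  char framed[64], gram[16];   /* assumes strlen(word) < 62 and n < 16 */
  int start, len;

  if (hashbang)
    snprintf(framed, sizeof(framed), "#%s#", word);  /* mirror ReadWord's framing */
  else
    snprintf(framed, sizeof(framed), "%s", word);

  len = (int)strlen(framed);
  if (len <= n) {                    /* words shorter than n are kept whole */
    printf("%s\n", framed);
    return;
  }
  for (start = 0; start + n <= len; start++) {
    strncpy(gram, framed + start, n);
    gram[n] = '\0';                  /* strncpy does not terminate the copy here */
    printf("%s\n", gram);            /* the real code looks this up / adds it to the vocab */
  }
}
/* EnumerateGrams("good", 3, 1) prints #go, goo, ood, od#. */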
printf("Usage: ./compute-accuracy \nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n"); return 0; } + strcpy(file_name, argv[1]); - if (argc > 2) threshold = atoi(argv[2]); + + if (argc > 2) + threshold = atoi(argv[2]); + f = fopen(file_name, "rb"); + if (f == NULL) { printf("Input file not found\n"); return -1; } + fscanf(f, "%lld", &words); - if (threshold) if (words > threshold) words = threshold; + + if (threshold) + if (words > threshold) + words = threshold; + fscanf(f, "%lld", &size); + vocab = (char *)malloc(words * max_w * sizeof(char)); + M = (float *)malloc(words * size * sizeof(float)); + if (M == NULL) { printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576); return -1; } + for (b = 0; b < words; b++) { + fscanf(f, "%s%c", &vocab[b * max_w], &ch); - for (a = 0; a < max_w; a++) vocab[b * max_w + a] = toupper(vocab[b * max_w + a]); - for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); + + for (a = 0; a < max_w; a++) + vocab[b * max_w + a] = toupper(vocab[b * max_w + a]); + + for (a = 0; a < size; a++) + fread(&M[a + b * size], sizeof(float), 1, f); + len = 0; - for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; + + for (a = 0; a < size; a++) + len += M[a + b * size] * M[a + b * size]; + len = sqrt(len); - for (a = 0; a < size; a++) M[a + b * size] /= len; + + for (a = 0; a < size; a++) + M[a + b * size] /= len; } + fclose(f); + TCN = 0; + while (1) { - for (a = 0; a < N; a++) bestd[a] = 0; - for (a = 0; a < N; a++) bestw[a][0] = 0; + + for (a = 0; a < N; a++) + bestd[a] = 0; + + for (a = 0; a < N; a++) + bestw[a][0] = 0; + scanf("%s", st1); - for (a = 0; a < strlen(st1); a++) st1[a] = toupper(st1[a]); + + for (a = 0; a < strlen(st1); a++) + st1[a] = toupper(st1[a]); + if ((!strcmp(st1, ":")) || (!strcmp(st1, "EXIT")) || feof(stdin)) { - if (TCN == 0) TCN = 1; + + if (TCN == 0) + TCN = 1; + if (QID != 0) { printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN); printf("Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100); } + QID++; scanf("%s", st1); - if (feof(stdin)) break; + + if (feof(stdin)) + break; + printf("%s:\n", st1); TCN = 0; CCN = 0; continue; } - if (!strcmp(st1, "EXIT")) break; + + if (!strcmp(st1, "EXIT")) + break; + scanf("%s", st2); - for (a = 0; a < strlen(st2); a++) st2[a] = toupper(st2[a]); + + for (a = 0; a < strlen(st2); a++) + st2[a] = toupper(st2[a]); + scanf("%s", st3); - for (a = 0; a bestd[a]) { + for (d = N - 1; d > a; d--) { bestd[d] = bestd[d - 1]; strcpy(bestw[d], bestw[d - 1]); } + bestd[a] = dist; strcpy(bestw[a], &vocab[c * max_w]); break; } } } + if (!strcmp(st4, bestw[0])) { CCN++; CACN++; - if (QID <= 5) SEAC++; else SYAC++; + + if (QID <= 5) + SEAC++; + else + SYAC++; } - if (QID <= 5) SECN++; else SYCN++; + + if (QID <= 5) + SECN++; + else + SYCN++; + TCN++; TACN++; } + printf("Questions seen / total: %d %d %.2f %% \n", TQS, TQ, TQS/(float)TQ*100); return 0; } diff --git a/script-tools/main.py b/script-tools/main.py new file mode 100644 index 0000000..e73d16e --- /dev/null +++ b/script-tools/main.py @@ -0,0 +1,6 @@ +#main.py +import tooling as tools + + +vectors = tools.toDicVec("vec.txt"); +print vectors["#th"] diff --git a/script-tools/tooling.py b/script-tools/tooling.py new file mode 100644 index 0000000..7f588c1 --- 
/dev/null +++ b/script-tools/tooling.py @@ -0,0 +1,42 @@ +import numpy as np + + +def toDicVec(filename): + dic = {} + first = True + vecfile = open(filename,"r") + vecfile.readline(); #1st line = useless + for line in vecfile: + for word in line.split(" "): + if(first): + key = word + dic[key]=[] + first = False + else: + dic[key].append(word) + dic[key].pop() + first = True + + return dic + + +def wordCorpusSum(filename,corpus,gramsize,hashbangs): + dic = toDicVec(filename) + wordDic = {} + + cfile = open(corpus,"r") + + for line in cfile: + for word in line.split(" "): + key = word + + if(hashbangs): + word = '#'+word+'#' + + start=0 + end=gramsize-1 + + + + + diff --git a/word2vec.c b/word2vec.c index a20d5ff..c9f3724 100644 --- a/word2vec.c +++ b/word2vec.c @@ -32,8 +32,7 @@ #define MAX_EXP 6 #define MAX_SENTENCE_LENGTH 1000 #define MAX_CODE_LENGTH 40 -#define NGRAM 3 -#define HASHBANG 1 + const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary @@ -91,7 +90,7 @@ void InitUnigramTable() { void ReadWord(char *word, FILE *fin) { int a = 0, character; - if(HASHBANG){ + if(hashbang > 0){ word[a] = '#'; //words starts with # a++; } @@ -125,7 +124,7 @@ void ReadWord(char *word, FILE *fin) { a--; // Truncate too long words } - if(HASHBANG){ + if(hashbang>0){ word[a] = '#'; //words ends with # a++; } @@ -385,7 +384,7 @@ void LearnVocabFromTrainFile() { char word[MAX_STRING]; FILE *fin; int i,start,end,lenWord; - char* gram; + char* gram = (char*)calloc(ngram,sizeof(char)); for (i = 0; i < vocab_hash_size; i++) //init vocab hashtable vocab_hash[i] = -1; @@ -403,17 +402,16 @@ void LearnVocabFromTrainFile() { while (1) { ReadWord(word, fin); - if(NGRAM > 0) //learn ngrams instead of words + if(ngram > 0) //learn ngrams instead of words { lenWord = strlen(word); - gram = (char*)calloc(NGRAM,sizeof(char)); //leak start = 0; - end = NGRAM-1; + end = ngram-1; //printf("word: %s, len: %d\n",word,(int) strlen(word)); while(end #go goo ood od# + int end = ngram-1; + gram = (char*)calloc(ngram,sizeof(char)); int newWord = 1; int wordLength = 0; @@ -599,28 +596,28 @@ void *TrainModelThread(void *id) { break; - if(NGRAM > 0) //learn ngrams instead of words + if(ngram > 0) //learn ngrams instead of words { if(newWord){ ReadWord(wordToGram, fi); start = 0; - end = NGRAM-1; + end = ngram-1; wordLength = strlen(wordToGram); // printf("new word: %s, length:%d\n",wordToGram,wordLength); newWord = 0; } - if(wordLength <= NGRAM){ + if(wordLength <= ngram){ word = SearchVocab(wordToGram); newWord = 1; continue; } - strncpy(gram,wordToGram+sizeof(char)*start,NGRAM); + strncpy(gram,wordToGram+sizeof(char)*start,ngram); word = SearchVocab(gram); //printf("word: %s, gram: %s,index:%lld, start: %d, end %d \n",wordToGram,gram,word,start,end); end++; @@ -1067,10 +1064,10 @@ int main(int argc, char **argv) { printf("\t\tThe vocabulary will be read from , not constructed from the training data\n"); printf("\t-cbow \n"); printf("\t\tUse the continuous bag of words model; default is 0 (skip-gram model)\n"); - printf("\t\tUse N-GRAM model instead of words to train vectors \n"); printf("\t-ngram (default 0 - use words) \n"); + printf("\t\tUse N-GRAM model instead of words to train vectors \n"); + printf("\t-hashbang <0-1> (default 0)\n"); printf("\t\tUse hashbang on n-grams - i.e #good# -> #go,goo,ood,od#\n"); - printf("\t-wordhash <0-1> (default 0)\n"); printf("\nExamples:\n"); printf("./word2vec -train data.txt -output vec.txt -debug 2 -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 
0 -cbow 1\n\n"); return 0; @@ -1096,8 +1093,8 @@ int main(int argc, char **argv) { if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); - //if ((i = ArgPos ((char *) "-ngram", argc, argv)) > 0 ) ngram = atoi(argv[i + 1]) - //if ((i = ArgPos ((char *) "-wordhash", argc, argv)) > 0 ) wordhash = atoi(argv[i + 1]) + if ((i = ArgPos ((char *) "-ngram", argc, argv)) > 0 ) ngram = atoi(argv[i + 1]); + if ((i = ArgPos ((char *) "-hashbang", argc, argv)) > 0 ) hashbang = atoi(argv[i + 1]); vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); From c8b5642adc5d8b23c02c8cc54832daad90958733 Mon Sep 17 00:00:00 2001 From: Charles-Emmanuel Dias Date: Tue, 17 Jun 2014 17:01:49 +0200 Subject: [PATCH 06/10] [wip] find segfault... --- script-tools/main.py | 21 ++++++++- script-tools/tooling.py | 53 +++++++++++++++++++++-- word2vec.c | 96 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 164 insertions(+), 6 deletions(-) diff --git a/script-tools/main.py b/script-tools/main.py index e73d16e..14ffdfd 100644 --- a/script-tools/main.py +++ b/script-tools/main.py @@ -3,4 +3,23 @@ vectors = tools.toDicVec("vec.txt"); -print vectors["#th"] +wordDic = tools.wordCorpusSum("vec.txt","text8",3,True,200) + +print "wordDic has %d words in it" % (len(wordDic)) +"""print "Starting kmean:" + +a = tools.kmean(wordDic) +clusters = zip(a,wordDic.keys()) +clusters = sorted(clusters) + +f = open("/tmp/Clusts","w") + +for clust,word in clusters: + line = str(clust) + " => " + word + "\n" + f.write(line) + +f.close()""" + +tools.writeWordDicBin(wordDic, 200) + + diff --git a/script-tools/tooling.py b/script-tools/tooling.py index 7f588c1..617b16f 100644 --- a/script-tools/tooling.py +++ b/script-tools/tooling.py @@ -1,5 +1,5 @@ import numpy as np - +from sklearn import cluster def toDicVec(filename): dic = {} @@ -17,25 +17,72 @@ def toDicVec(filename): dic[key].pop() first = True + + for key in dic: + dic[key] = np.array(dic[key],float) + return dic -def wordCorpusSum(filename,corpus,gramsize,hashbangs): +def wordCorpusSum(filename,corpus,gramsize,hashbangs,vsize): dic = toDicVec(filename) wordDic = {} + errorCpt = 0; cfile = open(corpus,"r") for line in cfile: for word in line.split(" "): + + if(wordDic.has_key(word)): + continue + key = word + if(hashbangs): word = '#'+word+'#' start=0 - end=gramsize-1 + end=gramsize + vec = np.zeros(vsize) + while end <= len(word): + + try: + vec = np.add(vec,dic[word[start:end]]) + except: + #print "the %d-gram %s from word %s is not in the dictionnary "%(gramsize,word[start:end],word) + end = end+1 + start = start+1 + errorCpt += 1 + continue + + end = end+1 + start = start+1 + + wordDic[key] = vec + + print "%d grams where missing from vocabulary" % (errorCpt) + return wordDic + + +def kmean(wordDic): + km = cluster.KMeans(n_clusters=100, init='k-means++', n_init=10, max_iter=30, tol=0.0001, precompute_distances=True, verbose=1, n_jobs=8) + km.fit(wordDic.values()) + return km.predict(wordDic.values()) + + +def writeWordDicBin(wordDic,size): + f = open("/tmp/vec.bin","wb") + string = str(len(wordDic))+" "+str(size) + f.write(bytearray(string)) + for word in wordDic.keys(): + f.write(bytearray(word+" ")) + for num in wordDic[word]: + f.write(bytearray(str(num))) + f.write(bytearray("\n")) + f.close() diff --git a/word2vec.c b/word2vec.c index 
c9f3724..edbfb26 100644 --- a/word2vec.c +++ b/word2vec.c @@ -181,9 +181,11 @@ int AddWordToVocab(char *word) { if (length > MAX_STRING) length = MAX_STRING; - + printf("hello\n"); vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); + printf("bye12\n"); strcpy(vocab[vocab_size].word, word); + printf("bye1\n"); vocab[vocab_size].cn = 0; vocab_size++; @@ -192,7 +194,6 @@ int AddWordToVocab(char *word) { vocab_max_size += 1000; vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); } - hash = GetWordHash(word); while (vocab_hash[hash] != -1) @@ -1009,6 +1010,94 @@ void TrainModel() { fclose(fo); } + +void createWordVectorFile(){ + + char word[MAX_STRING]; + FILE *fin, *fo; + int i,start,end,lenWord,indGram, offset; + char* gram = (char*)calloc(ngram,sizeof(char)); + real wordVec[layer1_size]; + + fin = fopen(train_file, "rb"); + fo = fopen(output_file, "wb"); + + if (fin == NULL) { + printf("ERROR: training data file not found!\n"); + exit(1); + } + + fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); + + /*write + if (binary) + for (i = 0; i < layer1_size; i++) + fwrite(&word2vec[i], sizeof(real), 1, fo); + else + for (i = 0; i < layer1_size; i++) + fprintf(fo, "%lf ", wordVec[i]); + */ + while (1) { + printf("new iter\n"); + if (feof(fin)) + break; + + ReadWord(word, fin); + + for(i=0;i -1) + offset = indGram * layer1_size; + else + { + end++; + start++; + continue; + } + + + for(i=0;i 0) + createWordVectorFile(); return 0; } \ No newline at end of file From f85cab531347512f5f88318ba6c1577c0cada1cb Mon Sep 17 00:00:00 2001 From: Charles-Emmanuel Dias Date: Wed, 18 Jun 2014 16:45:58 +0200 Subject: [PATCH 07/10] crap --- makefile | 3 +- word2vec.c | 115 ++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 84 insertions(+), 34 deletions(-) diff --git a/makefile b/makefile index d446b1a..23c95e7 100644 --- a/makefile +++ b/makefile @@ -1,6 +1,7 @@ CC = gcc #The -Ofast might not work with older versions of gcc; in that case, use -O2 -CFLAGS = -lm -pthread -Ofast -march=native -Wall -funroll-loops -Wno-unused-result +CFLAGS = -lm -pthread -Ofast -march=native -Wall -funroll-loops -Wno-unused-result -g +# all: word2vec word2phrase distance word-analogy compute-accuracy diff --git a/word2vec.c b/word2vec.c index edbfb26..10e2faa 100644 --- a/word2vec.c +++ b/word2vec.c @@ -42,7 +42,7 @@ struct vocab_word { long long cn; //times of occurence in train file int *point; char *word, *code, codelen; -}; +}typedef vword; char train_file[MAX_STRING], output_file[MAX_STRING]; char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING]; @@ -129,7 +129,7 @@ void ReadWord(char *word, FILE *fin) { a++; } - word[a] = 0; + word[a] = '\0'; } // Returns hash value of a word @@ -143,6 +143,36 @@ int GetWordHash(char *word) { return hash; } +void DestroyVocab() { + int a; + + for (a = 0; a < vocab_size; a++) { + if (vocab[a].word != NULL) { + free(vocab[a].word); + } + if (vocab[a].code != NULL) { + free(vocab[a].code); + } + if (vocab[a].point != NULL) { + free(vocab[a].point); + } + } + free(vocab[vocab_size].word); + free(vocab); +} + +void DestroyNet() { + if (syn0 != NULL) { + free(syn0); + } + if (syn1 != NULL) { + free(syn1); + } + if (syn1neg != NULL) { + free(syn1neg); + } +} + // Returns position of a word in the vocabulary; if the word is not found, returns -1 int SearchVocab(char *word) { unsigned int hash = GetWordHash(word); @@ -168,24 +198,22 @@ int ReadWordIndex(FILE *fin) { if (feof(fin)) return -1; - - - - return SearchVocab(word); } 
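/*
 * For context at this point in the file: SearchVocab() and AddWordToVocab()
 * share one open-addressed hash table, vocab_hash, using linear probing
 * with -1 marking a free slot -- which is also why SortVocab() and
 * ReduceVocab() must recompute every hash after they move entries.
 * A minimal sketch of the scheme (table size, types, and names are
 * simplified and illustrative; the real table holds 30M slots):
 */
#include <string.h>

#define TABLE_SIZE 1024

struct entry { char *word; };
static struct entry dict[TABLE_SIZE];
static int slots[TABLE_SIZE];            /* slot -> dict index, -1 if free */

static void InitTable(void) {            /* mirrors the vocab_hash init loop */
  for (int i = 0; i < TABLE_SIZE; i++) slots[i] = -1;
}

static unsigned int HashOf(const char *w) {
  unsigned int h = 0;
  while (*w) h = h * 257 + (unsigned char)*w++;   /* same mixing as GetWordHash */
  return h % TABLE_SIZE;
}

static int LookupWord(const char *w) {
  unsigned int h = HashOf(w);
  while (slots[h] != -1) {
    if (strcmp(w, dict[slots[h]].word) == 0) return slots[h];
    h = (h + 1) % TABLE_SIZE;            /* collision: probe the next slot */
  }
  return -1;                             /* not in the vocabulary */
}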
// Adds a word to the vocabulary int AddWordToVocab(char *word) { + unsigned int hash, length = strlen(word) + 1; if (length > MAX_STRING) length = MAX_STRING; - printf("hello\n"); + + vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); - printf("bye12\n"); + strcpy(vocab[vocab_size].word, word); - printf("bye1\n"); + vocab[vocab_size].cn = 0; vocab_size++; @@ -223,11 +251,13 @@ void SortVocab() { size = vocab_size; train_words = 0; - for (a = 0; a < size; a++) { + for (a = 1; a < size; a++) { // Words occuring less than min_count times will be discarded from the vocab if (vocab[a].cn < min_count) { vocab_size--; - free(vocab[vocab_size].word); + free(vocab[vocab_size].word); + //free(vocab[a].word); + vocab[a].word = NULL; } else { // Hash will be re-computed, as after the sorting it is not actual @@ -241,6 +271,7 @@ void SortVocab() { } } + vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); // Allocate memory for the binary tree construction @@ -252,16 +283,20 @@ void SortVocab() { // Reduces the vocabulary by removing infrequent tokens void ReduceVocab() { + int a, b = 0; unsigned int hash; - for (a = 0; a < vocab_size; a++) + for (a = 0; a < vocab_size; a++){ if (vocab[a].cn > min_reduce) { + vocab[b].cn = vocab[a].cn; vocab[b].word = vocab[a].word; b++; - } else - free(vocab[a].word); + + } else + free(vocab[a].word); + } vocab_size = b; @@ -385,7 +420,8 @@ void LearnVocabFromTrainFile() { char word[MAX_STRING]; FILE *fin; int i,start,end,lenWord; - char* gram = (char*)calloc(ngram,sizeof(char)); + + char* gram = (char*)calloc(ngram,sizeof(char)); //"\0" for (i = 0; i < vocab_hash_size; i++) //init vocab hashtable vocab_hash[i] = -1; @@ -407,13 +443,18 @@ void LearnVocabFromTrainFile() { { lenWord = strlen(word); + if(lenWord 0) + return; + + fo = fopen(output_file, "wb"); + if (classes == 0) { // Save the word vectors fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); @@ -1012,13 +1058,19 @@ void TrainModel() { void createWordVectorFile(){ - + int hashset[vocab_hash_size]; + int hash =0; char word[MAX_STRING]; FILE *fin, *fo; int i,start,end,lenWord,indGram, offset; - char* gram = (char*)calloc(ngram,sizeof(char)); + + char gram[ngram+1]; + real wordVec[layer1_size]; + for(i=0;i if (binary) @@ -1038,27 +1090,28 @@ void createWordVectorFile(){ fprintf(fo, "%lf ", wordVec[i]); */ while (1) { - printf("new iter\n"); + //printf("new iter\n"); if (feof(fin)) break; ReadWord(word, fin); - + hash = GetWordHash(word); for(i=0;i 0) createWordVectorFile(); return 0; From 8bf92142b087e6686b68ea9ae801c1df56c2ba8d Mon Sep 17 00:00:00 2001 From: Charles-Emmanuel Dias Date: Thu, 19 Jun 2014 16:48:22 +0200 Subject: [PATCH 08/10] problem w/ determinism.. --- demo-word-accuracy.sh | 4 +- word2vec.c | 148 +++++++++++++++++++++++++++++++----------- 2 files changed, 113 insertions(+), 39 deletions(-) diff --git a/demo-word-accuracy.sh b/demo-word-accuracy.sh index ffe828a..218cacc 100755 --- a/demo-word-accuracy.sh +++ b/demo-word-accuracy.sh @@ -3,6 +3,6 @@ if [ ! 
-e text8 ]; then wget http://mattmahoney.net/dc/text8.zip -O text8.gz gzip -d text8.gz -f fi -time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -./compute-accuracy vectors.bin 30000 < questions-words.txt +time ./word2vec -train text8 -output /tmp/vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -ngram 3 -hashbang 1 +./compute-accuracy /tmp/vectors.bin 30000 < questions-words.txt # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt diff --git a/word2vec.c b/word2vec.c index 10e2faa..edcaf36 100644 --- a/word2vec.c +++ b/word2vec.c @@ -90,20 +90,18 @@ void InitUnigramTable() { void ReadWord(char *word, FILE *fin) { int a = 0, character; - if(hashbang > 0){ - word[a] = '#'; //words starts with # - a++; - } - while (!feof(fin)) { character = fgetc(fin); + + if (character == 13) //Carriage Return continue; if ((character == ' ') || (character == '\t') || (character == '\n')) { if (a > 0) { + if (character == '\n') ungetc(character, fin); //we don't want the new line char. break; @@ -124,12 +122,29 @@ void ReadWord(char *word, FILE *fin) { a--; // Truncate too long words } - if(hashbang>0){ - word[a] = '#'; //words ends with # + word[a] = '\0'; + + if(hashbang > 0) + { + + a = strlen(word); //'\0' + word[a] = '#'; a++; + word[a] = '\0'; + a++; + + + while(a>0) + { + word[a] = word[a-1]; + a--; + } + + word[0] ='#'; } - word[a] = '\0'; + + return; } // Returns hash value of a word @@ -421,7 +436,7 @@ void LearnVocabFromTrainFile() { FILE *fin; int i,start,end,lenWord; - char* gram = (char*)calloc(ngram,sizeof(char)); //"\0" + char gram[ngram+1]; for (i = 0; i < vocab_hash_size; i++) //init vocab hashtable vocab_hash[i] = -1; @@ -443,18 +458,29 @@ void LearnVocabFromTrainFile() { { lenWord = strlen(word); - if(lenWord - if (binary) - for (i = 0; i < layer1_size; i++) - fwrite(&word2vec[i], sizeof(real), 1, fo); + while (1) { + + if (feof(fin)) + break; + + ReadWord(word, fin); + hash = GetWordHash(word); + + if (hashset[hash] != -1) + continue; else - for (i = 0; i < layer1_size; i++) - fprintf(fo, "%lf ", wordVec[i]); - */ + hashset[hash] = 1; + cptWord++; + } + + fprintf(fo, "%lld %lld\n", cptWord, layer1_size); //prints size + printf("number of words: %lld\n",cptWord ); + + /*write missing */ + + /*reset*/ + rewind(fin); + for(i=0;i -1) offset = indGram * layer1_size; @@ -1124,7 +1183,7 @@ void createWordVectorFile(){ continue; } - printf("gram: %s\n",gram ); + //printf("gram: %s\n",grama ); for(i=0;i Date: Thu, 19 Jun 2014 17:59:10 +0200 Subject: [PATCH 09/10] [wip] Correct ngram bug + add normalisation and mean --- demo-word-accuracy.sh | 2 +- word2vec.c | 45 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/demo-word-accuracy.sh b/demo-word-accuracy.sh index 218cacc..1388fde 100755 --- a/demo-word-accuracy.sh +++ b/demo-word-accuracy.sh @@ -3,6 +3,6 @@ if [ ! 
-e text8 ]; then wget http://mattmahoney.net/dc/text8.zip -O text8.gz gzip -d text8.gz -f fi -time ./word2vec -train text8 -output /tmp/vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -ngram 3 -hashbang 1 +time ./word2vec -train text8 -output /tmp/vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 0 -threads 12 -binary 1 -ngram 8 -hashbang 1 -min-count 0 ./compute-accuracy /tmp/vectors.bin 30000 < questions-words.txt # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt diff --git a/word2vec.c b/word2vec.c index edcaf36..ad921dd 100644 --- a/word2vec.c +++ b/word2vec.c @@ -109,6 +109,7 @@ void ReadWord(char *word, FILE *fin) { if (character == '\n') { strcpy(word, (char *)""); //newline become in corpus + printf("READ newline\n"); return; } else @@ -257,6 +258,9 @@ void SortVocab() { int a, size; unsigned int hash; + if(debug_mode > 2) + printf("Sorting Vocab...\n"); + // Sort the vocabulary and keep at the first position qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); @@ -294,6 +298,9 @@ void SortVocab() { vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); } + + printf("Sorting ended !\n"); + } // Reduces the vocabulary by removing infrequent tokens @@ -1095,7 +1102,9 @@ void createWordVectorFile(){ int i,start,end,lenWord,indGram, offset; int *hashset; long long unsigned int cptWord=0; - + int skipCpt=0; + int unexistCpt=0; + int gramCpt=0; @@ -1132,12 +1141,27 @@ void createWordVectorFile(){ fprintf(fo, "%lld %lld\n", cptWord, layer1_size); //prints size printf("number of words: %lld\n",cptWord ); - /*write missing */ + /*reset*/ rewind(fin); for(i=0;i*/ + indGram = SearchVocab(""); + offset = indGram * layer1_size; + fprintf(fo, " "); + for (i = 0; i < layer1_size; i++){ + if (binary) + fwrite(&wordVec[i], sizeof(real), 1, fo); + else + fprintf(fo, "%lf ", wordVec[i]); + } + fprintf(fo, "\n"); + + while (1) { @@ -1147,7 +1171,7 @@ void createWordVectorFile(){ ReadWord(word, fin); hash = GetWordHash(word); - for(i=0;i Date: Mon, 23 Jun 2014 16:08:18 +0200 Subject: [PATCH 10/10] test version --- compute-accuracy-syntax.c | 262 ++++++++++++++++++++++++++++++++++++++ compute-accuracy.c | 26 +++- demo-word-accuracy.sh | 2 +- makefile | 8 +- test-ngram-w2vec.py | 36 ++++++ testNgrams.sh | 16 +++ word2vec.c | 42 +++--- 7 files changed, 365 insertions(+), 27 deletions(-) create mode 100644 compute-accuracy-syntax.c create mode 100644 test-ngram-w2vec.py create mode 100755 testNgrams.sh diff --git a/compute-accuracy-syntax.c b/compute-accuracy-syntax.c new file mode 100644 index 0000000..e1fbd47 --- /dev/null +++ b/compute-accuracy-syntax.c @@ -0,0 +1,262 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
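/*
 * What this new tool computes: each analogy question "a : b :: c : d" is
 * scored by forming vec = M[b] - M[a] + M[c] over the L2-normalized rows
 * of M and predicting the word whose row has the largest dot product with
 * it. A compact sketch of that inner step under those assumptions -- the
 * real loop additionally keeps the top-N list and the accuracy counters,
 * and the function name here is illustrative:
 */
static long long BestAnalogy(const float *M, long long words, long long size,
                             long long a, long long b, long long c) {
  long long w, i, best = -1;
  float bestd = -1e30f;
  for (w = 0; w < words; w++) {
    float dist = 0.0f;
    /* dot product of the query vector with every (unit-length) word vector */
    for (i = 0; i < size; i++)
      dist += (M[b * size + i] - M[a * size + i] + M[c * size + i]) * M[w * size + i];
    if (dist > bestd) { bestd = dist; best = w; }
  }
  return best;                           /* index of the predicted word */
}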
+ +#include +#include +#include +#include +#include +#include + +const long long max_size = 2000; // max length of strings +const long long N = 1; // number of closest words +const long long max_w = 50; // max length of vocabulary entries + +int main(int argc, char **argv) +{ + FILE *f; + char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch; + float dist, len, bestd[N], vec[max_size]; + long long words, size, a, b, c, d, b1, b2, b3, threshold = 0; + float *M; + char *vocab; + int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0; + int small = 0; + + + if (argc < 2) { + printf("Usage: ./compute-accuracy \nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n"); + return 0; + } + + strcpy(file_name, argv[1]); + + if (argc > 2) + threshold = atoi(argv[2]); + + if (argc > 3) + small = 1; + + f = fopen(file_name, "rb"); + + if (f == NULL) { + printf("Input file not found\n"); + return -1; + } + + fscanf(f, "%lld", &words); + + if (threshold) + if (words > threshold) + words = threshold; + + fscanf(f, "%lld", &size); + + vocab = (char *)malloc(words * max_w * sizeof(char)); + + M = (float *)malloc(words * size * sizeof(float)); + + if (M == NULL) { + printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576); + return -1; + } + + for (b = 0; b < words; b++) { + + fscanf(f, "%s%c", &vocab[b * max_w], &ch); + + for (a = 0; a < max_w; a++) + vocab[b * max_w + a] = toupper(vocab[b * max_w + a]); + + for (a = 0; a < size; a++) + fread(&M[a + b * size], sizeof(float), 1, f); + + len = 0; + + for (a = 0; a < size; a++) + len += M[a + b * size] * M[a + b * size]; + + len = sqrt(len); + + for (a = 0; a < size; a++) + M[a + b * size] /= len; + } + + fclose(f); + + TCN = 0; + + while (1) { + + for (a = 0; a < N; a++) + bestd[a] = 0; + + for (a = 0; a < N; a++) + bestw[a][0] = 0; + + scanf("%s", st1); + + for (a = 0; a < strlen(st1); a++) + st1[a] = toupper(st1[a]); + + if ((!strcmp(st1, ":")) || (!strcmp(st1, "EXIT")) || feof(stdin)) { + + if (TCN == 0) + TCN = 1; + + if (QID != 0){ + if(small ==0) + printf("%.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN); + else + printf("%.2f\t", CCN / (float)TCN * 100); + } + + + + QID++; + scanf("%s", st1); + + if (feof(stdin)) + break; + + if(small==0) + printf("%s\t", st1); + + + TCN = 0; + CCN = 0; + + continue; + + } + + if (!strcmp(st1, "EXIT")) + break; + + scanf("%s", st2); + + for (a = 0; a < strlen(st2); a++) + st2[a] = toupper(st2[a]); + + scanf("%s", st3); + + for (a = 0; a bestd[a]) { + + for (d = N - 1; d > a; d--) { + bestd[d] = bestd[d - 1]; + strcpy(bestw[d], bestw[d - 1]); + } + + bestd[a] = dist; + strcpy(bestw[a], &vocab[c * max_w]); + break; + } + } + } + + if (!strcmp(st4, bestw[0])) { + CCN++; + CACN++; + SYAC++; + } + + + SYCN++; + TCN++; + TACN++; + + } + + if(small == 0){ + printf("Total accuracy: %.2f %%\n", CACN / (float)TACN * 100); + printf("Questions seen / total: %d %d %.2f %% \n", TQS, TQ, TQS/(float)TQ*100); + }else{ + printf("%.2f\n",CACN / (float)TACN * 100); + } + return 0; + +} diff --git a/compute-accuracy.c b/compute-accuracy.c index a1166c6..193fa18 100644 --- a/compute-accuracy.c +++ b/compute-accuracy.c @@ -26,15 +26,15 @@ const long long max_w = 50; // max length of vocabulary entries int main(int argc, char **argv) { FILE *f; - char st1[max_size], st2[max_size], 
st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch; + char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size],output_file[max_size], ch; float dist, len, bestd[N], vec[max_size]; long long words, size, a, b, c, d, b1, b2, b3, threshold = 0; float *M; char *vocab; int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0; - + int small_print =0; if (argc < 2) { - printf("Usage: ./compute-accuracy \nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n"); + printf("Usage: ./compute-accuracy \nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n"); return 0; } @@ -43,6 +43,9 @@ int main(int argc, char **argv) if (argc > 2) threshold = atoi(argv[2]); + if (argc > 3) + small_print = 1; //output is smaller + f = fopen(file_name, "rb"); if (f == NULL) { @@ -91,7 +94,8 @@ int main(int argc, char **argv) fclose(f); TCN = 0; - + if(small_print) + printf("Type\tAccuracy(top1)%%\tTotal Acc%%\tSemantic Acc%%\tSyntactic Acc%%\tSuccess\tTotal\n"); while (1) { for (a = 0; a < N; a++) @@ -111,8 +115,12 @@ int main(int argc, char **argv) TCN = 1; if (QID != 0) { - printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN); - printf("Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100); + if(small_print){ + printf("%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\n", CCN / (float)TCN * 100,CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100, CCN, TCN); + }else{ + printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN); + printf("Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100); + } } QID++; @@ -121,7 +129,11 @@ int main(int argc, char **argv) if (feof(stdin)) break; - printf("%s:\n", st1); + if(small_print) + printf("%s\t", st1); + else + printf("%s:\n", st1); + TCN = 0; CCN = 0; continue; diff --git a/demo-word-accuracy.sh b/demo-word-accuracy.sh index 1388fde..8ec7622 100755 --- a/demo-word-accuracy.sh +++ b/demo-word-accuracy.sh @@ -3,6 +3,6 @@ if [ ! 
-e text8 ]; then wget http://mattmahoney.net/dc/text8.zip -O text8.gz gzip -d text8.gz -f fi -time ./word2vec -train text8 -output /tmp/vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 0 -threads 12 -binary 1 -ngram 8 -hashbang 1 -min-count 0 +time ./word2vec -train text8 -output /tmp/vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 0 -threads 12 -binary 1 ./compute-accuracy /tmp/vectors.bin 30000 < questions-words.txt # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt diff --git a/makefile b/makefile index 23c95e7..da67021 100644 --- a/makefile +++ b/makefile @@ -1,9 +1,9 @@ CC = gcc #The -Ofast might not work with older versions of gcc; in that case, use -O2 -CFLAGS = -lm -pthread -Ofast -march=native -Wall -funroll-loops -Wno-unused-result -g +CFLAGS = -lm -pthread -Ofast -march=native -Wall -funroll-loops -Wno-unused-result # -all: word2vec word2phrase distance word-analogy compute-accuracy +all: word2vec word2phrase distance word-analogy compute-accuracy compute-accuracy-syntax word2vec : word2vec.c $(CC) word2vec.c -o word2vec $(CFLAGS) @@ -15,7 +15,9 @@ word-analogy : word-analogy.c $(CC) word-analogy.c -o word-analogy $(CFLAGS) compute-accuracy : compute-accuracy.c $(CC) compute-accuracy.c -o compute-accuracy $(CFLAGS) +compute-accuracy-syntax : compute-accuracy-syntax.c + $(CC) compute-accuracy-syntax.c -o compute-accuracy-syntax $(CFLAGS) chmod +x *.sh clean: - rm -rf word2vec word2phrase distance word-analogy compute-accuracy \ No newline at end of file + rm -rf word2vec word2phrase distance word-analogy compute-accuracy compute-accuracy-syntax \ No newline at end of file diff --git a/test-ngram-w2vec.py b/test-ngram-w2vec.py new file mode 100644 index 0000000..85d213f --- /dev/null +++ b/test-ngram-w2vec.py @@ -0,0 +1,36 @@ +#test-ngram-w2vec.py +import subprocess as sp +import numpy as np + +sizes = range(200,350,50) +samples = ["0","1e-5"] +negatives = range(0,10,5) +alphas = np.arange(0.025,0.060,0.015) +ngrams = range(2,5,1) +hashbs = [0,1] +cbows = [0,1] +hsE = [0,1] + + +cpt = 1 +logFile = open("results.txt" , "w") +lofFile2 = open("parameters.txt", "w") +lofFile2.write("size\tsample\tnegative\talpha\tngram\thashbang\tcbow\ths\n"); +for size in sizes: + for sample in samples: + for negative in negatives: + for hs in hsE: + if negative == 0 and hs == 0: + continue; + for alpha in alphas: + for ngram in ngrams: + for hashb in hashbs: + for cbow in cbows: + print "iteration %d on 649" % (cpt) + argsLine= "./testNgrams.sh %s %s %s %s %s %s %s %s" % (str(size),str(sample),str(negative),str(alpha),str(ngram),str(hashb),str(cbow),str(hs)) + argu= "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (str(size),str(sample),str(negative),str(alpha),str(ngram),str(hashb),str(cbow),str(hs)) + lofFile2.write(argu); + sp.call(args=argsLine,shell=True,stdout=logFile) + cpt = cpt+1 + + diff --git a/testNgrams.sh b/testNgrams.sh new file mode 100755 index 0000000..cb3bed7 --- /dev/null +++ b/testNgrams.sh @@ -0,0 +1,16 @@ +if [ "$#" -ne 8 ]; then + echo "Illegal number of parameters" + echo "Usage: testNgram size sample neg alpha ngram hashbang cbow hs" +fi + +p_size=$1 +p_sample=$2 +p_neg=$3 +p_alpha=$4 +p_ngram=$5 +p_hashb=$6 +p_cbow=$7 +p_hs=$8 + +./word2vec -train text8 -output /tmp/vectors.bin -debug 0 -min-count 0 -window 5 -threads 12 -binary 1 -cbow $p_cbow -size $p_size -negative $p_neg -hs $p_neg -sample $p_sample -ngram $p_ngram -hashbang $p_hashb -alpha $p_alpha +./compute-accuracy-syntax /tmp/vectors.bin 10000 2 < 
questions-words-syntax.txt diff --git a/word2vec.c b/word2vec.c index ad921dd..3b70592 100644 --- a/word2vec.c +++ b/word2vec.c @@ -109,7 +109,6 @@ void ReadWord(char *word, FILE *fin) { if (character == '\n') { strcpy(word, (char *)""); //newline become in corpus - printf("READ newline\n"); return; } else @@ -299,8 +298,6 @@ void SortVocab() { vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); } - printf("Sorting ended !\n"); - } // Reduces the vocabulary by removing infrequent tokens @@ -467,13 +464,18 @@ void LearnVocabFromTrainFile() { if(lenWord<=ngram){ //word smaller or equal to ngram var. searchAndAddToVocab(word); - continue; + //printf("smaller\n"); + + if (feof(fin)) + break; + else + continue; } start = 0; end = ngram-1; i=0; - + //printf("%s\n",word ); while(end 0) { + if (debug_mode > 1) { printf("Vocab size: %lld\n", vocab_size); printf("Words in train file: %lld\n", train_words); } @@ -560,7 +562,7 @@ void ReadVocab() { SortVocab(); - if (debug_mode > 0) { + if (debug_mode > 1) { printf("Vocab size: %lld\n", vocab_size); printf("Words in train file: %lld\n", train_words); } @@ -968,7 +970,10 @@ void TrainModel() { long a, b, c, d; FILE *fo; pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); - printf("Starting training using file %s\n", train_file); + + if(debug_mode>0) + printf("Starting training using file %s\n", train_file); + starting_alpha = alpha; if (read_vocab_file[0] != 0) @@ -995,7 +1000,8 @@ void TrainModel() { for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); - printf("Training Ended !\n"); + if(debug_mode > 0) + printf("Training Ended !\n"); if(ngram > 0) return; @@ -1139,7 +1145,9 @@ void createWordVectorFile(){ } fprintf(fo, "%lld %lld\n", cptWord, layer1_size); //prints size - printf("number of words: %lld\n",cptWord ); + + if(debug_mode > 0) + printf("number of words: %lld\n",cptWord ); @@ -1228,10 +1236,12 @@ void createWordVectorFile(){ //removes #bangs - for(i=1;i 0){ + for(i=1;i 0) + printf("Saved %lld word vectors, %d grams weren't in dictionnary, %d words were skipped (doubles)\n",cptWord,unexistCpt,skipCpt); fclose(fo); fclose(fin);
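
Taken together, the series trains vectors for character n-grams and then rebuilds full word vectors from them. As a reference for what createWordVectorFile() (and the Python wordCorpusSum() helper) compute, here is a minimal sketch under simplifying assumptions: SearchVocab() is the real lookup from word2vec.c, the incoming word is already hashbang-framed, unknown grams are skipped, and the helper name and buffer sizes are illustrative.

#include <math.h>
#include <string.h>

extern int SearchVocab(char *word);          /* lookup from word2vec.c */

static void WordVecFromGrams(char *word, int n, long long size,
                             const float *syn0, float *out /* size floats */) {
  char gram[16];                             /* assumes n < 16 */
  int start, len = (int)strlen(word), found = 0;
  long long i, idx;

  memset(out, 0, size * sizeof(float));
  for (start = 0; start + n <= len; start++) {
    strncpy(gram, word + start, n);
    gram[n] = '\0';
    idx = SearchVocab(gram);                 /* -1 when the gram is unknown */
    if (idx < 0) continue;
    for (i = 0; i < size; i++)               /* sum the gram vectors */
      out[i] += syn0[idx * size + i];
    found++;
  }
  if (!found) return;

  /* L2-normalize, as patch 09's subject ("add normalisation and mean") does;
     dividing by found instead gives the mean variant it also mentions */
  float norm = 0.0f;
  for (i = 0; i < size; i++) norm += out[i] * out[i];
  norm = sqrtf(norm);
  if (norm > 0.0f)
    for (i = 0; i < size; i++) out[i] /= norm;
}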