Tokenizer.h
368 lines (337 loc) · 19.3 KB
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\
* Header File: Definition of the 'Tokenizer' Class *
* CISC 3160 - Programming Languages - Fall 2019 - Toy Language Project *
* Author: *
* Miriam Briskman *
* Supervised by: *
* Professor Neng-Fa Zhou *
\* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
#ifndef TOKENIZER_H
#define TOKENIZER_H
#include <cctype> // ---> "Character handling functions"
#include <fstream> // ---> Header providing file stream classes
#include <iomanip> // ---> "parametric manipulators"
#include <iostream> // ---> Input / Output streams controlling objects
#include <map> // ---> The map container
#include <sstream> // ---> Used by the 'to_string' function
#include <string> // ---> "String types, character traits and a set of converting functions"
#include "Symbol_Table_Row.h" // ---> The "Symbol_Table_Row" Object
using namespace std;
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\
* The Token Class *
* Purpose: *
* To preserve both the content of a token (as a string) and its type (based on the Token *
* 'Type' Categorizations below.) *
* Data variables (private): *
* 1) 'type' (char) - denotes the type of the kept token (see Token 'Type' Categorizations) *
* 2) 'name' (string) - keeps the token's name and key to the corresponding Symbol Table value. *
* Functions (public): *
* 1) Optional constructor. *
* 2) 'getName' - receives no arguments and returns the content of 'name' (string). *
* 3) 'getType' - receives no arguments and returns the content of 'type' (char). *
* Class's Friends: *
* 1) Tokenizer *
* Token 'Type' Categorizations: *
* 1) Identifier ---> the 'type' variable contains the character '1' *
* 2) Literal ---> the 'type' variable contains the character '2' *
* 3) Symbol ---> if from [= ; + - * ( )], the 'type' variable contains the character '3' *
* 4) Input End ---> if the end of the input was reached, the 'type' variable contains '5' *
* 5) Disallowed ---> if none of the above, the 'type' variable contains the character '0' *
* -------------------------------------------------------- *
\* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
class Token
{
private:
/* Data Fields: */
string name; // Contains the token's name and key to the corresponding Symbol Table value
char type; // Digit ('0'-'3' or '5') categorizing the token (See "Token Categorizations" above)
public:
/* Optional Constructor: */
Token (char type = '0', string name = "")
{
this -> name = name;
this -> type = type;
}
/* Copy Constructor: */
Token (const Token & t2)
{
this -> name = t2.name;
this -> type = t2.type;
}
// Friend of the class:
friend class Tokenizer;
/* Accessors (Getters) Functions (all inline functions) */
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\
* 'getName': *
* Input: None. *
* Output: Return 'name' (string). *
\* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
string getName (void) const { return name; }
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\
* 'getType': *
* Input: None. *
* Output: Return 'type' (char). *
\* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
char getType (void) const { return type; }
}; // End of the Token class
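/* Usage sketch (illustrative only; 'assignOp' is a hypothetical variable name):
 *
 *     Token assignOp ('3', "=");                                   // a 'symbol' token
 *     cout << assignOp.getName () << " : " << assignOp.getType (); // prints "= : 3"
 */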
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\
* The Tokenizer Class *
* Purpose: *
* Generation of tokens from a string of text based on the Definition of the Toy Language. *
* Data variables (private): *
* 1) 'iter' (long int) - iterates across the string of text to find tokens. *
* 2) 'code' (string) - contains the text with unprocessed tokens. *
* 3) 'currLine' (long int) - the number of the currently scanned line. *
* 4) 'currCol' (long int) - the number of the currently scanned column (character on the line.)*
* 5) 'currLinestr' (string) - copy of lines that remain to be processed. *
* 6) 'symbolTable' (map pointer) - pointer to the Symbol Table map object *
* Functions (public): *
* 1) Optional Constructor. Arguments : name of file to process and symbol table pointer. *
* 2) Copy Constructor. *
* 3) 'next' - sets the Token argument's type and content to those of the next token. *
* 4) 'to_string' - returns the string object representing the parameter passed to it. Static. *
* 5) 'getCurrLine' - returns the currently processed line. *
* 6) 'getCurrCol' - returns the currently processed column. *
* 7) 'getRemainingCode' - returns a string with code that remains to process. *
\* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
class Tokenizer
{
private:
/* Data Fields: */
long iter; // iterates across the code source
string code; // String variable containing source code
long currLine; // The number of the currently scanned line
long currCol; // The number of the currently scanned column
string currLinestr; // Copy of the lines that remain to be processed
map<string, Symbol_Table_Row>* symbolTable; // Pointer to the Symbol Table map object
public:
/* Optional Constructor: */
Tokenizer (string fileName = "", map<string, Symbol_Table_Row>* symbolTable = 0)
{
this -> iter = -1;
this -> currLine = 1;
this -> currCol = -1;
this -> symbolTable = symbolTable;
// Strip surrounding single or double quotes from the filename and trim surrounding whitespace:
if (fileName != "")
{
// Trim leading quotes and spaces (always re-examine the new first character):
while (fileName.length() > 0 && (fileName[0] == '\'' || fileName[0] == '"' || isspace(fileName[0])))
fileName = fileName.substr(1);
// Trim trailing quotes and spaces:
int i = (int) fileName.length() - 1;
while (i >= 0 && (fileName[i] == '\'' || fileName[i] == '"' || isspace(fileName[i])))
{
fileName = fileName.substr(0, fileName.length()-1);
i--;
}
if (fileName.find(".txt") == string::npos)
fileName += ".txt"; // Add the extension '.txt' to the filename.
ifstream infile(fileName.c_str());
cout << "\t" << fileName.c_str() << endl; // Print the filename to the user.
if (infile.is_open())
{
// Copy the content of the file into 'code'.
code.assign ((istreambuf_iterator<char>(infile)), istreambuf_iterator<char>());
infile.close();
cout << endl << "The file " << endl
<< "\t" << fileName << endl << "has successfully opened."
<< endl << endl << "Interpreting the code..." << endl << endl;
}
else
cout << endl << "Error: the file " << endl << "\t" << fileName << endl
<< "whose you name you entered could not open." << endl;
}
this -> currLinestr = this -> code;
}
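/* Filename-cleaning sketch (illustrative only; the name below is hypothetical): an argument
 * such as
 *     "  'toy_program'  "
 * is stripped of the surrounding spaces and quotes and, since it lacks the ".txt" extension,
 * becomes
 *     "toy_program.txt"
 * before the file is opened.
 */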
/* Copy Constructor: */
Tokenizer (const Tokenizer & myTokenizr2)
{
this -> code = myTokenizr2.code;
this -> iter = myTokenizr2.iter;
this -> currLine = myTokenizr2.currLine;
this -> currCol = myTokenizr2.currCol;
this -> symbolTable = myTokenizr2.symbolTable;
this -> currLinestr = this -> code;
}
/* Accessors (Getters) Functions (all inline functions) */
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\
* 'getCurrLine': *
* Input: None. *
* Output: 'currLine' (long int). *
\* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
long getCurrLine (void) { return currLine; }
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\
* 'getCurrCol': *
* Input: None. *
* Output: 'currCol' (long int). *
\* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
long getCurrCol (void) { return currCol; }
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\
* 'getRemainingCode': *
* Input: None. *
* Output: 'currLinestr' (string). *
\* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
string getRemainingCode (void) { return currLinestr; }
/* Function to Retrieve the Next Token */
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\
* 'next': *
* Input: 't1' (Token object). *
* Output: Put the currently processed token's information into 't1'. *
* No object is returned. *
\* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
void next (Token& t1)
{
iter++; // Increment 'iter' for the next search below
currCol++; // Increment 'currCol' as well to know on which place in the line we look
if (iter >= code.length())
t1.type = '5'; // Signifies the end of the token stream.
else
{
map<string, Symbol_Table_Row>::iterator mapIter; // Declare map's iterator.
// Delete whitespace characters:
while (iter < (long) code.length() && (isspace(code[iter]) || code[iter] == 0))
{
if (code[iter] == '\n')
{
// Increment the line number (for better error messages)
currLine++;
// Restart the column counting
currCol = -1;
// Chop-off the old line from currLinestr:
currLinestr = currLinestr.substr(currLinestr.find("\n") + 1);
}
iter++;
currCol++;
}
// If the end of the file is reached,
if (iter >= code.length())
{
t1.type = '5'; // Signifies the end of the token stream.
return;
}
// Search for tokens!
switch(code[iter])
{
case '=':
case ';':
case '+':
case '-':
case '*':
case '(':
case ')': t1.type = '3'; // This is a 'symbol' token!
t1.name = to_string(code[iter]);
break;
case '0': t1.type = '2'; // This is a 'numeric literal' token!
t1.name = "0";
break;
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': t1.type = '2'; // This is a 'numeric literal' token!
t1.name = ""; // Clean the content of the token
do
{
t1.name += to_string(code[iter]);
iter++;
currCol++;
} while (iter < code.length() && isdigit(code[iter]));
iter--;
currCol--;
break;
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z':
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z':
case '_': t1.type = '1'; // This is an 'identifier' token!
t1.name = ""; // Clean the content of the token
do
{
t1.name += to_string(code[iter]);
iter++;
currCol++;
} while (iter < code.length() && (isalnum(code[iter]) || code[iter] == '_'));
iter--;
currCol--;
// Check if that identifier already exists in the symbol table:
mapIter = (*symbolTable).find(t1.name);
// If not, create a new symbol table row:
if (mapIter == (*symbolTable).end())
(*symbolTable)[t1.name] = Symbol_Table_Row ();
break;
default: t1.type = '0'; // This is a disallowed symbol token!
t1.name = to_string(code[iter]);
}
}
}
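/* Tokenization sketch (illustrative only): for the source line
 *
 *     x = 52 + _y ;
 *
 * successive calls to 'next' fill the Token argument with, in order,
 *     ("x", '1'), ("=", '3'), ("52", '2'), ("+", '3'), ("_y", '1'), (";", '3'),
 * and a token of type '5' once the end of the input is reached.
 */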
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
// Following is a template function converting the passed parameter into a string object and *
// returning it. It serves as an alternative to the C++11 'std::to_string' function declared *
// in the <string> header. The function declares an 'ostringstream' into which it streams *
// the value of the generic parameter, and the accumulated stream content is returned as *
// a C++ string. *
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
template <class T>
static string to_string (T obj)
{
ostringstream mystream;
mystream << obj;
return mystream.str();
}
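// Usage sketch (illustrative only): to_string ('x') yields the string "x",
// and to_string (42) yields the string "42".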
}; // End of the Tokenizer class
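/* Driver sketch (illustrative only; the file name "toy_program.txt" and the loop body are
 * hypothetical, assuming a main() defined elsewhere):
 *
 *     map<string, Symbol_Table_Row> table;
 *     Tokenizer lexer ("toy_program.txt", &table);
 *     Token t;
 *     do
 *     {
 *         lexer.next (t);
 *         // Hand 't' to the parser here; type '0' marks a disallowed character.
 *     } while (t.getType () != '5'); // '5' marks the end of the token stream
 */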
#endif