-
Notifications
You must be signed in to change notification settings - Fork 0
/
Scanner.cpp
377 lines (347 loc) · 10.2 KB
/
Scanner.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
#include "Scanner.h"
#include "Admin.h"
#include <climits>
#include <iostream>
#include <sstream>
// Printable names for the token kinds, one string per slot of the 50-entry table.
// NOTE(review): presumably indexed by the token's integer value, so the order here
// must match the token enum declared elsewhere - confirm against the header.
string Scanner::namesRev[50] = {
    // literals, identifiers and bookkeeping
    "ID", "NUM", "BLIT", "ENDFILE", "ERROR",
    // reserved words
    "AND", "BOOL", "BRANCH", "CASE", "CONTINUE", "DEFAULT", "ELSE", "END",
    "EXIT", "IF", "INT", "LOOP", "MOD", "NOT", "OR", "REF", "RETURN", "VOID",
    // operators
    "PLUS", "MINUS", "MULT", "DIV", "ANDTHEN", "ORELSE",
    "LT", "LTEQ", "GT", "GTEQ", "EQ", "NEQ", "ASSIGN",
    // punctuation
    "SEMI", "COMMA", "LPAREN", "RPAREN", "LSQR", "RSQR", "LCRLY", "RCRLY",
    // comment markers and scanner-internal kinds
    "INLINECOMMENT", "COMMENTOPEN", "COMMENTCLOSE",
    "IGNORED", "COLON", "NULLSTMT"
};
// Builds a scanner that is not attached to any Admin.
// All counters start at zero and the keyword table and the built-in I/O
// identifiers are pre-loaded. The admin pointer is explicitly nulled so that
// it never holds an indeterminate value; a scanner built this way must be
// given an Admin (e.g. by assignment from another Scanner) before any
// lexeme is requested, because the reading functions dereference admin.
Scanner::Scanner() : symbolCount(0), commentDepth(0), errorCount(0), wordTable(unordered_map<string, pair<int, int>>()),
    symbolTable(unordered_map<string, pair<int, int>>()), admin(nullptr)
{
    populateWordTable();
    populateSymbolsReadWrite();
}
// Builds a scanner that reads its characters through the given Admin.
// Only the address of adminMod is stored; the scanner does not copy it.
// Counters start at zero, then the keyword table and the built-in I/O
// identifiers are loaded.
Scanner::Scanner(Admin& adminMod)
    : symbolCount(0),
      commentDepth(0),
      errorCount(0),
      wordTable(unordered_map<string, pair<int, int>>()),
      symbolTable(unordered_map<string, pair<int, int>>()),
      admin(&adminMod)
{
    populateWordTable();
    populateSymbolsReadWrite();
}
/* Copy constructor.
 * The Admin is shared, not duplicated: the copy points at the same Admin
 * instance as the original.
 * The spelling and error tables are copied together with the counters and
 * the symbol table. They are indexed by the ids stored in those structures,
 * so leaving them empty would make getIdentifierName()/getErrorName() throw
 * on the copy for ids that the copied symbol table still hands out.
 */
Scanner::Scanner(const Scanner &other) : symbolCount(other.symbolCount), commentDepth(other.commentDepth),
    errorCount(other.errorCount), wordTable(other.wordTable), symbolTable(other.symbolTable),
    spellingTable(other.spellingTable), errorTable(other.errorTable), admin(other.admin)
{
}
// Copy assignment. Every piece of scanner state is taken from rhs, including
// the spelling and error tables: they are indexed by the ids recorded in
// symbolTable/errorCount, so they must travel with them or later name
// lookups on this object would go out of range. The Admin pointer is shared
// with rhs, not duplicated.
Scanner& Scanner::operator= (const Scanner &rhs)
{
    symbolCount = rhs.symbolCount;
    commentDepth = rhs.commentDepth;
    errorCount = rhs.errorCount;
    wordTable = rhs.wordTable;
    symbolTable = rhs.symbolTable;
    spellingTable = rhs.spellingTable;
    errorTable = rhs.errorTable;
    admin = rhs.admin;
    // return the existing object so assignments can be chained
    return *this;
}
// Empties the scanner's own tables. The Admin pointer is deliberately left
// alone: nothing in this file allocates it, so it is not deleted here.
Scanner::~Scanner() {
    errorTable.clear();
    spellingTable.clear();
    symbolTable.clear();
    wordTable.clear();
}
// True when c is an unaccented Latin letter, upper or lower case.
bool Scanner::isLetter(char c) {
    const bool upperCase = ('A' <= c) && (c <= 'Z');
    const bool lowerCase = ('a' <= c) && (c <= 'z');
    return upperCase || lowerCase;
}
// True when c is one of the decimal digits '0' through '9'.
bool Scanner::isDigit(char c) {
    return ('0' <= c) && (c <= '9');
}
/* Tells whether c may appear inside an identifier after its first character.
 * Grammar:
 *   ID = letter (letter | underscore | dollarsign | digit)*
 */
bool Scanner::isValidIdChar(char c) {
    switch(c) {
        case '$':
        case '_':
            return true;
        default:
            return isLetter(c) || isDigit(c);
    }
}
// Reads the next lexeme from the Admin and classifies it by its first character.
// Returns a (text, value) pair:
//   letter        -> (identifier spelling, -4); the -4 is a placeholder because
//                    no symbol-table index has been assigned yet
//   digit         -> ("NUM", numeric value)
//   anything else -> ("SYMBOL", token kind), or (the character itself, ERROR)
//                    when the symbol is not recognised
//   zero char     -> ("SYMBOL", ENDFILE)
// NOTE(review): getCh(true) presumably skips invisible characters before
// returning - confirm against Admin.
pair<string, int> Scanner::getLexeme() {
    const char first = admin->getCh(true);

    // A zero character means the Admin has nothing more to hand out.
    if(!first) {
        return make_pair("SYMBOL", ENDFILE);
    }
    if(isLetter(first)) {
        return make_pair(getIdentifier(first), -4);
    }
    if(isDigit(first)) {
        return make_pair("NUM", getNumeral(first));
    }

    const int kind = getSpecial(first);
    if(kind != ERROR) {
        return make_pair("SYMBOL", kind);
    }
    // Unrecognised symbol: hand the offending character back as the lexeme text.
    return make_pair(string(1, first), ERROR);
}
// Collects an identifier whose first character c has already been read.
// Characters are appended while they are valid identifier characters; the
// first one that is not valid is pushed back to the Admin. A zero character
// from the Admin ends the identifier without a push-back.
string Scanner::getIdentifier(char c) {
    string spelling(1, c);
    for(char next = admin->getCh(false); next; next = admin->getCh(false)) {
        if(!isValidIdChar(next)) {
            admin->unget();
            break;
        }
        spelling += next;
    }
    return spelling;
}
// Reads a decimal numeral whose first digit c has already been consumed and
// returns its value.
// Digits are accumulated until a non-digit is met, which is pushed back to
// the Admin; a zero character from the Admin ends the numeral without a
// push-back. The caller is expected to pass a digit in c.
// A literal too large for int saturates at INT_MAX instead of going through
// atoi(), whose result is undefined when the value does not fit.
int Scanner::getNumeral(char c) {
    int value = c - '0';
    while((c = admin->getCh(false))) {
        if(!isDigit(c)) {
            admin->unget();
            break;
        }
        const int digit = c - '0';
        // value * 10 + digit fits in int exactly when value <= (INT_MAX - digit) / 10
        if(value > (INT_MAX - digit) / 10) {
            value = INT_MAX;
        }
        else {
            value = value * 10 + digit;
        }
    }
    return value;
}
/* DFA state when a symbol has been initially detected.
* getSpecial() breaks down into multiple other states depending on the symbol found.
* Comment-related states use the commentDepth integer to determine the level of comment nesting
*/
int Scanner::getSpecial(char c) {
switch(c) {
// First bunch are for trivial cases
case '+':
return PLUS;
case '=':
return EQ;
case ';':
return SEMI;
case ',':
return COMMA;
case '(':
return LPAREN;
case ')':
return RPAREN;
case '[':
return LSQR;
case ']':
return RSQR;
case '{':
return LCRLY;
case '}':
return RCRLY;
case '-':
c = admin->getCh(false);
if(c != '-') {
admin->unget();
return MINUS;
}
else {
return INLINECOMMENT;
}
break;
case '*':
if(commentDepth == 0) {
return MULT;
}
else {
c = admin->getCh(false);
if(c != '/') {
admin->unget();
return MULT;
}
else {
commentDepth--;
return COMMENTCLOSE;
}
}
break;
case '/':
c = admin->getCh(false);
if(c == '=') {
return NEQ;
}
else if(c == '*') {
commentDepth++;
return COMMENTOPEN;
}
else {
admin->unget();
return DIV;
}
break;
case '&':
c = admin->getCh(false);
if(c == '&') {
return ANDTHEN;
}
else {
admin->unget();
return ERROR;
}
break;
case '|':
c = admin->getCh(false);
if(c == '|') {
return ORELSE;
}
else {
admin->unget();
return ERROR;
}
break;
case '<':
c = admin->getCh(false);
if(c == '=') {
return LTEQ;
}
else {
admin->unget();
return LT;
}
break;
case '>':
c = admin->getCh(false);
if(c == '=') {
return GTEQ;
}
else {
admin->unget();
return GT;
}
break;
case ':':
c = admin->getCh(false);
if(c == '=') {
return ASSIGN;
}
else {
return COLON;
}
break;
case EOF:
return ENDFILE;
break;
default:
return ERROR;
}
}
// Pre-registers the built-in I/O routine names as identifiers.
// Each name receives the next free id (symbolCount) in the symbol table and
// its spelling is appended to spellingTable in the same order.
void Scanner::populateSymbolsReadWrite() {
    static const char* const builtins[] = {
        "readint", "writeint", "outint",
        "readbool", "writebool", "outbool"
    };
    for(const char* name : builtins) {
        symbolTable.insert(make_pair(name, make_pair(ID, symbolCount++)));
        spellingTable.push_back(name);
    }
}
// Loads the reserved words into wordTable.
// Every entry maps the spelling to (token kind, token value). Plain keywords
// carry the value -2; the two boolean literals share the kind BLIT and carry
// 1 for "true" and 0 for "false".
void Scanner::populateWordTable() {
    struct ReservedWord {
        const char* spelling;
        int kind;
        int value;
    };
    static const ReservedWord reserved[] = {
        {"and",      AND,      -2},
        {"branch",   BRANCH,   -2},
        {"bool",     BOOL,     -2},
        {"case",     CASE,     -2},
        {"continue", CONTINUE, -2},
        {"default",  DEFAULT,  -2},
        {"else",     ELSE,     -2},
        {"end",      END,      -2},
        {"exit",     EXIT,     -2},
        {"if",       IF,       -2},
        {"int",      INT,      -2},
        {"loop",     LOOP,     -2},
        {"mod",      MOD,      -2},
        {"not",      NOT,      -2},
        {"or",       OR,       -2},
        {"ref",      REF,      -2},
        {"return",   RETURN,   -2},
        {"void",     VOID,     -2},
        {"true",     BLIT,      1},
        {"false",    BLIT,      0}
    };
    for(const ReservedWord& word : reserved) {
        wordTable.insert(make_pair(word.spelling, make_pair(word.kind, word.value)));
    }
}
// Number of error tokens produced so far.
int Scanner::getErrorCount() {
    return errorCount;
}
// Number of entries currently held in the symbol table.
int Scanner::getIdentifierCount() {
    return static_cast<int>(symbolTable.size());
}
// Returns the next token that is not of kind IGNORED.
// Lexemes are fetched and converted one at a time with
// getToken(pair<string, int>) until a non-ignored token appears.
Token Scanner::getToken() {
    for(;;) {
        Token candidate = getToken(getLexeme());
        if(candidate.getTokenType() != IGNORED) {
            return candidate;
        }
    }
}
// Builds the token for one lexeme produced by getLexeme().
//
// tok.first / tok.second are interpreted as follows:
//   (spelling, -4)     a word: reserved keyword or identifier
//   ("NUM", value)     a numeric literal
//   ("SYMBOL", kind)   a symbol; kind is the token kind
//   (text, ERROR)      an unrecognised character
//
// The lexeme category is decided before the payload is examined:
//   * the comment markers are only honoured on symbol lexemes, so a numeric
//     literal whose value happens to equal COMMENTCLOSE or INLINECOMMENT is
//     no longer thrown away as a comment;
//   * a word is recognised by the -4 marker, so an identifier that is spelled
//     "NUM" or "SYMBOL" is treated as an identifier.
//
// Side effects: may call admin->endLine(), append to errorTable or
// spellingTable, insert into symbolTable, and advance errorCount/symbolCount.
Token Scanner::getToken(pair<string, int> tok) {
    const string& text = tok.first;
    const int value = tok.second;

    // getLexeme() tags every identifier/keyword lexeme with -4.
    const bool isWord = (value == -4);
    const bool isSymbol = !isWord && text == "SYMBOL";
    const bool isNumeral = !isWord && text == "NUM";

    // Inside a comment (or on the marker that just closed the outermost one)
    // everything is discarded except end-of-file.
    if(commentDepth > 0 || (isSymbol && value == COMMENTCLOSE)) {
        if(isSymbol && value == ENDFILE) {
            return Token(value, -2);
        }
        return Token(IGNORED, -2);
    }

    // An inline comment discards the rest of the current source line.
    if(isSymbol && value == INLINECOMMENT) {
        Token ignored(IGNORED, -2);
        admin->endLine();
        return ignored;
    }

    if(isNumeral) {
        return Token(NUM, value);
    }
    if(isSymbol) {
        return Token(value, -2);
    }

    // Unrecognised character: remember its text under the error's index.
    if(value == ERROR) {
        Token error(ERROR, errorCount++);
        errorTable.push_back(text);
        return error;
    }

    // Reserved keyword?
    const auto keyword = wordTable.find(text);
    if(keyword != wordTable.end()) {
        return Token(keyword->second.first, keyword->second.second);
    }

    // Identifier that has been seen before: reuse its id.
    const auto known = symbolTable.find(text);
    if(known != symbolTable.end()) {
        return Token(known->second.first, known->second.second);
    }

    // New identifier: symbolCount is its unique id, and spellingTable lets
    // the spelling be recovered from that id later.
    symbolTable.insert(make_pair(text, make_pair(ID, symbolCount)));
    spellingTable.push_back(text);
    return Token(ID, symbolCount++);
}
// Looks up the spelling recorded for the identifier with the given id.
// Uses a bounds-checked lookup, so an unknown id raises the container's
// at() exception instead of returning garbage.
string Scanner::getIdentifierName(int id) {
    const string& spelling = spellingTable.at(id);
    return spelling;
}
// Looks up the text that produced the error token with the given id.
// Uses a bounds-checked lookup, so an unknown id raises the container's
// at() exception.
string Scanner::getErrorName(int id) {
    const string& offendingText = errorTable.at(id);
    return offendingText;
}