Skip to content

Commit

Permalink
Complete the lexer
Browse files Browse the repository at this point in the history
  • Loading branch information
Frusadev committed Nov 6, 2024
0 parents commit 82554ed
Show file tree
Hide file tree
Showing 7 changed files with 418 additions and 0 deletions.
175 changes: 175 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
# Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore

# Logs

logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*

# Caches

.cache

# Diagnostic reports (https://nodejs.org/api/report.html)

report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data

pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover

lib-cov

# Coverage directory used by tools like istanbul

coverage
*.lcov

# nyc test coverage

.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)

.grunt

# Bower dependency directory (https://bower.io/)

bower_components

# node-waf configuration

.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)

build/Release

# Dependency directories

node_modules/
jspm_packages/

# Snowpack dependency directory (https://snowpack.dev/)

web_modules/

# TypeScript cache

*.tsbuildinfo

# Optional npm cache directory

.npm

# Optional eslint cache

.eslintcache

# Optional stylelint cache

.stylelintcache

# Microbundle cache

.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history

.node_repl_history

# Output of 'npm pack'

*.tgz

# Yarn Integrity file

.yarn-integrity

# dotenv environment variable files

.env
.env.development.local
.env.test.local
.env.production.local
.env.local

# parcel-bundler cache (https://parceljs.org/)

.parcel-cache

# Next.js build output

.next
out

# Nuxt.js build / generate output

.nuxt
dist

# Gatsby files

# Uncomment the `public` line below if your project uses Gatsby and not Next.js

# https://nextjs.org/blog/next-9-1#public-directory-support

# public

# vuepress build output

.vuepress/dist

# vuepress v2.x temp and cache directory

.temp

# Docusaurus cache and generated files

.docusaurus

# Serverless directories

.serverless/

# FuseBox cache

.fusebox/

# DynamoDB Local files

.dynamodb/

# TernJS port file

.tern-port

# Stores VSCode versions used for testing VSCode extensions

.vscode-test

# yarn v2

.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*

# IntelliJ based IDEs
.idea

# Finder (MacOS) folder config
.DS_Store
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# smdparser

To install dependencies:

```bash
bun install
```

To run:

```bash
bun run parser.ts
```

This project was created using `bun init` in bun v1.1.29. [Bun](https://bun.sh) is a fast all-in-one JavaScript runtime.
Binary file added bun.lockb
Binary file not shown.
11 changes: 11 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"name": "smdparser",
"module": "parser.ts",
"type": "module",
"devDependencies": {
"@types/bun": "latest"
},
"peerDependencies": {
"typescript": "^5.0.0"
}
}
183 changes: 183 additions & 0 deletions parser.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
/**
* GRAMMAR
* formatStmt: text |
* bold |
* italic |
* code |
* multiline_code |
* text: TEXT ?EOF
* bold: STAR innerBoldStmt STAR
* innerBoldStmt: italic | text
* italic: UNDERSCORE innerItalicStmt UNDERSCORE
* innerItalicStmt: bold | text
* code: BACKTICK formatStmt BACKTICK
* multiline_code: BACKTICK BACKTICK BACKTICK text BACKTICK BACKTICK BACKTICK
*/

import { isAsciiAlpha, isAlphaNumeric } from "./utils.ts";

/**
 * Kinds of token the lexer can emit.
 *
 * This is a numeric enum: the values below are the ordinals the members
 * receive from their declaration order, spelled out so that reordering the
 * list can never silently renumber them.
 */
enum TokenType {
  /** A run of plain text. */
  STRING = 0,
  /** The `**` bold delimiter. */
  BOLD = 1,
  /** The `_` italic delimiter. */
  ITALIC = 2,
  /** A single `-`, the unordered-list marker. */
  UNORDEREDLI = 3,
  /** Header markers, one member per level. */
  H1 = 4,
  H2 = 5,
  H3 = 6,
  H4 = 7,
  H5 = 8,
  /** A single backtick (inline code delimiter). */
  MONOSPACE = 9,
  /** A triple backtick (code fence). */
  CODE = 10,
  /** Placeholder type of a token that has not been classified yet. */
  UNTYPED = 11,
  /** End of input. */
  EOF = 12,
  /** The `---` sequence. */
  LINEBREAK = 13,
}

/**
 * One lexical unit produced by the lexer: the kind of token together with the
 * exact source text it stands for.
 */
type Token = {
  /** Classification of the token. */
  tokenType: TokenType;
  /** Source text of the token (e.g. `"**"` for a bold delimiter). */
  tokenValue: string;
};

/**
 * Hand-written, pull-based lexer for the simplified-markdown dialect this
 * file parses.
 *
 * Every call to `getNextToken` consumes the characters of exactly one token
 * and returns it, so repeated calls always make progress. Once the input is
 * exhausted (or a literal `"\0"` character is reached, which doubles as the
 * end-of-input sentinel) every further call returns an `EOF` token.
 */
class Lexer {
  /**
   * Characters that begin a markup token, plus the `"\0"` end-of-input
   * sentinel. A text run stops in front of any of them.
   */
  private static readonly TEXT_TERMINATORS = "_*-#`\0";

  /** Deepest header level that has its own token type (`H1`..`H5`). */
  private static readonly MAX_HEADER_LEVEL = 5;

  /** Index of `currentChar` in `input`; equals `input.length` at end of input. */
  private position = 0;
  private readonly input: string;
  /** Character under the cursor, or `"\0"` once the input is exhausted. */
  private currentChar: string;
  /** Token most recently returned by `getNextToken`. */
  public currentToken: Token;

  /**
   * @param input Source text to tokenize. May be empty.
   * @param currentToken Initial value of `currentToken`; defaults to an empty
   *   `UNTYPED` token. The default is spelled out as a literal rather than
   *   read from `this`, because a parameter default that reads a class field
   *   sees `undefined` when fields are compiled to constructor assignments.
   */
  constructor(
    input: string,
    currentToken: Token = { tokenValue: "", tokenType: TokenType.UNTYPED },
  ) {
    this.input = input;
    this.currentToken = currentToken;
    // An empty input starts directly on the end-of-input sentinel.
    this.currentChar = input.length > 0 ? input.charAt(0) : "\0";
  }

  /**
   * Looks ahead without consuming anything.
   *
   * @param stroke Offset from the current position.
   * @returns The character at that offset, or `""` when it lies outside the input.
   */
  private peek(stroke = 1): string {
    // `charAt` already yields "" for any out-of-range index.
    return this.input.charAt(this.position + stroke);
  }

  /**
   * Reads a run of characters starting at the current position without
   * consuming them.
   *
   * @param end Number of characters to read.
   * @returns Up to `end` characters; shorter when the input ends first.
   */
  private peekSequence(end: number): string {
    const length = Math.max(0, end);
    return this.input.slice(this.position, this.position + length);
  }

  /**
   * Reports the character under the cursor as invalid.
   *
   * @throws Error always; the message names the character and its position.
   */
  private invalidCharacterError(): never {
    throw new Error(
      `Invalid character: \`${this.currentChar}\` at position ${this.position}`,
    );
  }

  /**
   * Moves the cursor forward and refreshes `currentChar`.
   *
   * @param stroke Number of characters to skip. Moving past the last
   *   character parks the cursor at `input.length` with `currentChar === "\0"`.
   */
  private advance(stroke = 1): void {
    this.position = Math.min(this.position + stroke, this.input.length);
    this.currentChar =
      this.position < this.input.length
        ? this.input.charAt(this.position)
        : "\0";
  }

  /**
   * Consumes a run of `#` characters and returns the matching header token.
   * At most `MAX_HEADER_LEVEL` characters are taken; a longer run is split
   * across several header tokens.
   */
  private getHeaderToken(): Token {
    // Try the longest prefix first; level 1 is the floor so that one
    // character is always consumed.
    let level = Lexer.MAX_HEADER_LEVEL;
    while (level > 1 && this.peekSequence(level) !== "#".repeat(level)) {
      level--;
    }

    const token: Token = {
      tokenValue: "#".repeat(level),
      tokenType: TokenType[`H${level}` as keyof typeof TokenType],
    };
    this.advance(level);
    return token;
  }

  /**
   * Consumes plain text up to (not including) the next markup character or
   * the end of input. Punctuation and whitespace are ordinary text.
   */
  private getTextToken(): Token {
    let text = "";
    while (!Lexer.TEXT_TERMINATORS.includes(this.currentChar)) {
      text += this.currentChar;
      this.advance();
    }
    return {
      tokenValue: text,
      tokenType: TokenType.STRING,
    };
  }

  /**
   * Consumes either a triple-backtick fence (`CODE`) or a single backtick
   * (`MONOSPACE`).
   */
  private getCodeToken(): Token {
    if (this.peekSequence(3) === "```") {
      this.advance(3);
      return { tokenValue: "```", tokenType: TokenType.CODE };
    }
    this.advance();
    return { tokenValue: "`", tokenType: TokenType.MONOSPACE };
  }

  /** Recognizes and consumes the token that starts at the cursor. */
  private scanToken(): Token {
    switch (this.currentChar) {
      case "\0":
        // Deliberately not consumed: EOF is returned on every later call too.
        return { tokenValue: "\0", tokenType: TokenType.EOF };
      case "_":
        this.advance();
        return { tokenValue: "_", tokenType: TokenType.ITALIC };
      case "*":
        // Only the doubled form is a token; a lone `*` is rejected.
        if (this.peek() !== "*") {
          this.invalidCharacterError();
        }
        this.advance(2);
        return { tokenValue: "**", tokenType: TokenType.BOLD };
      case "-":
        if (this.peek() === "-" && this.peek(2) === "-") {
          this.advance(3);
          return { tokenValue: "---", tokenType: TokenType.LINEBREAK };
        }
        this.advance();
        return { tokenValue: "-", tokenType: TokenType.UNORDEREDLI };
      case "#":
        return this.getHeaderToken();
      case "`":
        return this.getCodeToken();
      default:
        return this.getTextToken();
    }
  }

  /**
   * Consumes and returns the next token, also storing it in `currentToken`.
   *
   * @returns The next token; an `EOF` token once the input is exhausted.
   * @throws Error when a `*` is not immediately followed by a second `*`.
   */
  public getNextToken(): Token {
    this.currentToken = this.scanToken();
    return this.currentToken;
  }
}

// Sample lexer over a snippet that nests bold inside italic; nothing in the
// visible code pulls tokens from it yet.
const lexer = new Lexer("_**Hello** World_");
Loading

0 comments on commit 82554ed

Please sign in to comment.