diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9b1ee42 --- /dev/null +++ b/.gitignore @@ -0,0 +1,175 @@ +# Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore + +# Logs + +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +lerna-debug.log* +.pnpm-debug.log* + +# Caches + +.cache + +# Diagnostic reports (https://nodejs.org/api/report.html) + +report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json + +# Runtime data + +pids +*.pid +*.seed +*.pid.lock + +# Directory for instrumented libs generated by jscoverage/JSCover + +lib-cov + +# Coverage directory used by tools like istanbul + +coverage +*.lcov + +# nyc test coverage + +.nyc_output + +# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) + +.grunt + +# Bower dependency directory (https://bower.io/) + +bower_components + +# node-waf configuration + +.lock-wscript + +# Compiled binary addons (https://nodejs.org/api/addons.html) + +build/Release + +# Dependency directories + +node_modules/ +jspm_packages/ + +# Snowpack dependency directory (https://snowpack.dev/) + +web_modules/ + +# TypeScript cache + +*.tsbuildinfo + +# Optional npm cache directory + +.npm + +# Optional eslint cache + +.eslintcache + +# Optional stylelint cache + +.stylelintcache + +# Microbundle cache + +.rpt2_cache/ +.rts2_cache_cjs/ +.rts2_cache_es/ +.rts2_cache_umd/ + +# Optional REPL history + +.node_repl_history + +# Output of 'npm pack' + +*.tgz + +# Yarn Integrity file + +.yarn-integrity + +# dotenv environment variable files + +.env +.env.development.local +.env.test.local +.env.production.local +.env.local + +# parcel-bundler cache (https://parceljs.org/) + +.parcel-cache + +# Next.js build output + +.next +out + +# Nuxt.js build / generate output + +.nuxt +dist + +# Gatsby files + +# Comment in the public line in if your project uses Gatsby and not Next.js + +# https://nextjs.org/blog/next-9-1#public-directory-support + +# public + +# vuepress build
output + +.vuepress/dist + +# vuepress v2.x temp and cache directory + +.temp + +# Docusaurus cache and generated files + +.docusaurus + +# Serverless directories + +.serverless/ + +# FuseBox cache + +.fusebox/ + +# DynamoDB Local files + +.dynamodb/ + +# TernJS port file + +.tern-port + +# Stores VSCode versions used for testing VSCode extensions + +.vscode-test + +# yarn v2 + +.yarn/cache +.yarn/unplugged +.yarn/build-state.yml +.yarn/install-state.gz +.pnp.* + +# IntelliJ based IDEs +.idea + +# Finder (MacOS) folder config +.DS_Store diff --git a/README.md b/README.md new file mode 100644 index 0000000..bb2f95c --- /dev/null +++ b/README.md @@ -0,0 +1,15 @@ +# smdparser + +To install dependencies: + +```bash +bun install +``` + +To run: + +```bash +bun run parser.ts +``` + +This project was created using `bun init` in bun v1.1.29. [Bun](https://bun.sh) is a fast all-in-one JavaScript runtime. diff --git a/bun.lockb b/bun.lockb new file mode 100755 index 0000000..bb705be Binary files /dev/null and b/bun.lockb differ diff --git a/package.json b/package.json new file mode 100644 index 0000000..4c6307e --- /dev/null +++ b/package.json @@ -0,0 +1,11 @@ +{ + "name": "smdparser", + "module": "parser.ts", + "type": "module", + "devDependencies": { + "@types/bun": "latest" + }, + "peerDependencies": { + "typescript": "^5.0.0" + } +} \ No newline at end of file diff --git a/parser.ts b/parser.ts new file mode 100644 index 0000000..e01d8a8 --- /dev/null +++ b/parser.ts @@ -0,0 +1,183 @@ +/** + * GRAMMAR + * formatStmt: text | + * bold | + * italic | + * code | + * multiline_code | + * text: TEXT ?EOF + * bold: STAR innerBoldStmt STAR + * innerBoldStmt: italic | text + * italic: UNDERSCORE innerItalicStmt UNDERSCORE + * innerItalicStmt: bold | text + * code: BACKTICK formatStmt BACKTICK + * multiline_code: BACKTICK BACKTICK BACKTICK text BACKTICK BACKTICK BACKTICK + */ + +import { isAsciiAlpha, isAlphaNumeric } from "./utils.ts"; + +enum TokenType { + STRING, + BOLD, + 
/**
 * Hand-written lexer for the small markdown dialect described by the grammar
 * at the top of this file.
 *
 * Each call to `getNextToken()` consumes exactly one token from the input and
 * also stores it in `currentToken`. Once the input is exhausted every further
 * call yields an `EOF` token, so a consumer can loop until it sees `EOF`.
 *
 * Every token-producing path consumes at least one character (or throws), so
 * the lexer always makes progress.
 */
class Lexer {
  /** Index of `currentChar` within `input`; equals `input.length` at end of input. */
  private position = 0;
  private input: string;
  /** Character under the cursor, or "\0" once the input is exhausted. */
  private currentChar: string;
  /** The token most recently returned by `getNextToken()`. */
  public currentToken: Token;

  /**
   * @param input - Source text to tokenize.
   * @param currentToken - Initial value of `currentToken`; defaults to an
   *   empty `UNTYPED` token.
   */
  constructor(
    input: string,
    // The default is built here rather than read from `this`: `this` is not
    // allowed in constructor parameter initializers and is still
    // uninitialized when class fields are down-levelled.
    currentToken: Token = { tokenValue: "", tokenType: TokenType.UNTYPED },
  ) {
    this.currentToken = currentToken;
    this.input = input;
    // An empty input starts directly at end of input instead of `undefined`.
    this.currentChar = input.length > 0 ? input[0] : "\0";
  }

  /**
   * Returns the character `stroke` positions ahead of the cursor without
   * consuming anything, or "" when that position is outside the input.
   */
  private peek(stroke = 1): string {
    return this.input.charAt(this.position + stroke);
  }

  /**
   * Returns up to `length` characters starting at the cursor (inclusive)
   * without consuming them. The result is shorter near the end of the input.
   */
  private peekSequence(length: number): string {
    return this.input.slice(this.position, this.position + length);
  }

  /**
   * Reports the character under the cursor as not tokenizable.
   *
   * @throws Error always.
   */
  private invalidCharacterError(): never {
    throw new Error(
      `Invalid character: \`${this.currentChar}\` at position ${this.position}`,
    );
  }

  /**
   * Moves the cursor forward by `stroke` characters. Moving onto or past the
   * end of the input parks the cursor at `input.length` and sets
   * `currentChar` to the "\0" end-of-input sentinel.
   */
  private advance(stroke = 1): void {
    this.position = Math.min(this.position + stroke, this.input.length);
    this.currentChar =
      this.position < this.input.length ? this.input[this.position] : "\0";
  }

  /**
   * Consumes a run of one to five `#` characters and returns the matching
   * `H1`..`H5` token. Longer runs are split: the first five form an `H5` and
   * the remainder is lexed by the next call.
   */
  private getHeaderToken(): Token {
    const headerTypes = [
      TokenType.H1,
      TokenType.H2,
      TokenType.H3,
      TokenType.H4,
      TokenType.H5,
    ];

    // Try the longest prefix first so "###" is one H3 rather than three H1s.
    for (let level = headerTypes.length; level > 0; level--) {
      const prefix = "#".repeat(level);
      if (this.peekSequence(level) === prefix) {
        this.advance(level);
        return { tokenValue: prefix, tokenType: headerTypes[level - 1] };
      }
    }

    // Only reachable if called while the cursor is not on "#".
    return this.invalidCharacterError();
  }

  /**
   * Consumes a run of ASCII letters, digits and spaces and returns it as a
   * `STRING` token.
   *
   * @throws Error if the character under the cursor cannot start a text run;
   *   returning an empty token here would leave the cursor in place and make
   *   callers loop forever.
   */
  private getTextToken(): Token {
    let text = "";
    while (isAlphaNumeric(this.currentChar) || this.currentChar === " ") {
      text += this.currentChar;
      this.advance();
    }
    if (text === "") {
      return this.invalidCharacterError();
    }
    return { tokenValue: text, tokenType: TokenType.STRING };
  }

  /**
   * Consumes either a triple-backtick fence (`CODE`) or a single backtick
   * (`MONOSPACE`).
   */
  private getCodeToken(): Token {
    if (this.peek() === "`" && this.peek(2) === "`") {
      this.advance(3);
      return { tokenValue: "```", tokenType: TokenType.CODE };
    }
    this.advance();
    return { tokenValue: "`", tokenType: TokenType.MONOSPACE };
  }

  /** Recognizes and consumes the token that starts at the cursor. */
  private scanToken(): Token {
    switch (this.currentChar) {
      case "\0":
        // End of input is sticky: nothing is consumed.
        return { tokenValue: "\0", tokenType: TokenType.EOF };
      case "_":
        this.advance();
        return { tokenValue: "_", tokenType: TokenType.ITALIC };
      case "*":
        // Only the two-character "**" marker is recognized.
        if (this.peek() !== "*") {
          return this.invalidCharacterError();
        }
        this.advance(2);
        return { tokenValue: "**", tokenType: TokenType.BOLD };
      case "-":
        if (this.peek() === "-" && this.peek(2) === "-") {
          this.advance(3);
          return { tokenValue: "---", tokenType: TokenType.LINEBREAK };
        }
        this.advance();
        return { tokenValue: "-", tokenType: TokenType.UNORDEREDLI };
      case "#":
        return this.getHeaderToken();
      case "`":
        return this.getCodeToken();
      default:
        return this.getTextToken();
    }
  }

  /**
   * Consumes and returns the next token, also storing it in `currentToken`.
   *
   * @returns The next token; an `EOF` token once the input is exhausted.
   * @throws Error when the character under the cursor cannot start any token
   *   (for example a lone `*` or punctuation). `currentToken` is left
   *   unchanged in that case.
   */
  public getNextToken(): Token {
    this.currentToken = this.scanToken();
    return this.currentToken;
  }
}
/** Matches a non-empty run made up solely of ASCII letters and digits. */
const ALPHANUMERIC_RUN = /^[A-Za-z0-9]+$/;

/** Matches a non-empty run made up solely of ASCII letters. */
const ASCII_ALPHA_RUN = /^[A-Za-z]+$/;

/**
 * Tells whether `str` is non-empty and contains only ASCII letters
 * (`A`-`Z`, `a`-`z`) and digits (`0`-`9`).
 *
 * @param str - Text to check; may be a single character or a longer string.
 * @returns `true` when every character is an ASCII letter or digit and the
 *   string is not empty, otherwise `false`.
 */
export function isAlphaNumeric(str: string): boolean {
  const matches = ALPHANUMERIC_RUN.test(str);
  return matches;
}

/**
 * Tells whether `str` is non-empty and contains only ASCII letters
 * (`A`-`Z`, `a`-`z`).
 *
 * @param str - Text to check; may be a single character or a longer string.
 * @returns `true` when every character is an ASCII letter and the string is
 *   not empty, otherwise `false`.
 */
export function isAsciiAlpha(str: string): boolean {
  const matches = ASCII_ALPHA_RUN.test(str);
  return matches;
}