Skip to content

Commit

Permalink
core: rewrite
Browse files Browse the repository at this point in the history
  • Loading branch information
ahmedriad1 committed Sep 12, 2024
1 parent 6f5cf29 commit 8833005
Show file tree
Hide file tree
Showing 14 changed files with 1,089 additions and 495 deletions.
484 changes: 340 additions & 144 deletions src/parser.ts

Large diffs are not rendered by default.

42 changes: 0 additions & 42 deletions src/types/Block.ts

This file was deleted.

8 changes: 0 additions & 8 deletions src/types/ParseResult.ts

This file was deleted.

3 changes: 1 addition & 2 deletions src/types/index.ts
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
export * from './Block';
export * from './ParseResult';
export {};
45 changes: 0 additions & 45 deletions src/utils/block-quote.ts

This file was deleted.

21 changes: 21 additions & 0 deletions src/utils/functions.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/**
 * Splits `str` on `separator`, starting from the right, performing at most
 * `limit` splits (modelled on Python's `str.rsplit`).
 *
 * The result therefore has at most `limit + 1` items: everything to the left
 * of the last `limit` separators is kept together as the first item.
 *
 * @param str - The string to split.
 * @param separator - The literal separator to split on.
 * @param limit - Maximum number of splits. `0` performs no split and returns
 *   `[str]`; a negative value means "no limit" and returns every part.
 * @returns The parts, in their original left-to-right order.
 *
 * @example
 * rsplit('a.b.c', '.', 1); // ['a.b', 'c']
 */
export function rsplit(
  str: string,
  separator: string,
  limit: number
): string[] {
  // No splits requested: the whole string is the single part.
  if (limit === 0) {
    return [str];
  }

  const parts = str.split(separator);

  // A negative limit means unlimited splits; and when the string contains no
  // more than `limit` separators there is nothing to glue back together.
  if (limit < 0 || parts.length <= limit + 1) {
    return parts;
  }

  // Re-join everything left of the last `limit` parts into a single head item.
  const headCount = parts.length - limit;
  return [
    parts.slice(0, headCount).join(separator),
    ...parts.slice(headCount),
  ];
}
26 changes: 0 additions & 26 deletions src/utils/page.ts

This file was deleted.

89 changes: 89 additions & 0 deletions src/utils/sanitizer.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/**
 * Runs a table of regular-expression substitutions over `text`.
 *
 * Each key of `replacements` is compiled as a RegExp source with the global
 * and multiline flags (`gm`), and every match is replaced by the associated
 * value. Entries are applied one after another in `Object.entries` order, so
 * each rule sees the output of the rules before it.
 *
 * @param text - The input string.
 * @param replacements - Map of RegExp source to replacement string.
 * @returns The text after all substitutions have been applied.
 */
function regReplace(
  text: string,
  replacements: { [key: string]: string }
): string {
  return Object.entries(replacements).reduce(
    (current, [source, substitute]) =>
      current.replace(new RegExp(source, 'gm'), substitute),
    text
  );
}

/**
 * Pre-processing pass that rewrites structural markup (headings, paragraph
 * starts and page markers) into placeholder tokens before the text is handed
 * to a cleaner.
 *
 * The rules are applied strictly in order; later rules depend on the
 * normalisation done by earlier ones.
 *
 * @param text - Raw marked-up text.
 * @returns The text with headings as `~~<h1>…</h1>`, paragraph starts as
 *   `~~<p>` and page markers as `~~a11b<digits>a11b<digits>`.
 */
export function replaceChapterHeadings(text: string): string {
  const rules: ReadonlyArray<readonly [RegExp, string]> = [
    // "### | Title" -> "~~<h1>Title</h1>". The title stops at an inline page
    // marker ("PageV…") or at the end of the line. The lookahead is "PageV"
    // rather than a bare "P" so titles containing a capital P are not cut.
    [/###\s*\|+\s*(.*?)(?=PageV|$)/gm, '~~<h1>$1</h1>'],
    // Placeholder marker "PageVPP" -> "PageV01P" (digits after P are added by
    // the next rule when the following character is not a digit).
    [/PageVPP/gm, 'PageV01P'],
    // "PageV" followed by a non-digit gets the default "01P000" inserted.
    [/PageV(\D)/gm, 'PageV01P000$1'],
    // Drop "#<digits>#" annotations.
    [/#\d{1,}#/gm, ''],
    // Remove a stray space between "PageV<n>P" and the following digits.
    [/(PageV\d+P) (\d+)/gm, '$1$2'],
    // Remove a single leading space before a page marker that follows a newline.
    [/\n (PageV\d+P\d+)/gm, '\n$1'],
    // Lines starting with "# " that are not immediately a page marker open a paragraph.
    [/^# (?!PageV\d+P\d+)(.*)/gm, '~~<p>$1'],
    // Finally rewrite every page marker into its placeholder token.
    [/PageV(\d+)P(\d+)/gm, '~~a11b$1a11b$2'],
  ];

  let output = text;
  for (const [matcher, substitute] of rules) {
    output = output.replace(matcher, substitute);
  }
  return output;
}

// Ordered clean-up rules for leftover annotations and extraneous characters
// not removed by the cleaner. chunkAndPage feeds this table to regReplace.
//
// Keys are RegExp sources (compiled with the 'gm' flags) and values are
// JavaScript replacement strings: capture groups are written `$1` (not the
// Python-style `\1`) and a newline must be a real '\n' character.
// Order matters: each rule runs on the output of the rules above it.
const replacements = {
  // Flatten the text: every newline becomes a space.
  '\\n': ' ',
  // Strip leftover annotation tokens: CHECK, AUTO, '=', '¬', '_', a '</p>' at
  // the start of a line, '@..@' wrapping up to two non-digits, and
  // '@<non-digits><digits><whitespace>'. (`{0,2}` because JS has no `{,2}`.)
  'CHECK|AUTO|=|¬|_|^</p>|@\\D{0,2}@|@\\D{1,}\\d{1,}\\s': '',
  // Put a line break after every page marker.
  '(a11b\\d{2}a11b\\d{3})': '$1\n',
  // Collapse runs of newlines into one.
  '\\n+': '\n',
  // Collapse runs of spaces into one.
  '[ ]+': ' ',
  // Close the previous paragraph in front of every paragraph opener.
  '<p>': '</p><p>',
  // Headings are not wrapped in paragraphs.
  '<p><h1>': '<h1>',
  '</h1></p>': '</h1>',
  // Close the paragraph before a page marker unless it is already closed.
  '(?<!</p>)(?=(a11b\\d{2}a11b\\d{3}))': '</p>',
  // Open a paragraph at the start of every line.
  '^': '<p>',
  // Remove doubled paragraph openers produced by the rules above.
  '<p><p>': '<p>',
  '<p> <p>': '<p>',
  // A heading that follows other content closes the running paragraph first.
  '(?<=.)<h1>': '</p><h1>',
  // Text after a heading starts a new paragraph, unless the heading is
  // followed by ',<digits>' or an explicit '<p>'.
  '</h1>(?!(,\\d+|<p>))': '</h1><p>',
  // Drop empty paragraphs.
  '<p> </p>': '',
  '<p></p>': '',
  // Drop a dangling paragraph opener at the end of a line.
  '\\n<p> $': '',
  // Explicit "no page number" separators become the page marker 00/000.
  '-+NO PAGE NO-+': 'a11b00a11b000',
};

/**
 * Cleans `inputText` with the module's replacement table and, when the result
 * carries no page marker, splits it into synthetic pages.
 *
 * - If the cleaned text already contains a marker of the form
 *   `a11b<2 digits>a11b<3+ digits>`, it is returned as is.
 * - Otherwise the `<p>…</p>` / `<h1>…</h1>` elements (and stray `</p>` tags)
 *   are grouped into chunks of at most 1800 visible characters (tags are not
 *   counted), and each chunk is followed by `a11b01a11bNNN` plus a newline,
 *   where `NNN` is the 1-based chunk number padded to three digits.
 *
 * An element is never split: one that is longer than the limit on its own
 * becomes a chunk by itself. Text outside the recognised elements is not
 * carried into the chunked output.
 *
 * @param inputText - Text to clean and paginate.
 * @returns The cleaned text, paginated if it had no page markers.
 */
export function chunkAndPage(inputText: string): string {
  // Maximum number of visible (tag-stripped) characters per synthetic page.
  const maxPageLength = 1800;

  const cleanText = regReplace(inputText, replacements);

  // Already paginated: leave the existing page markers alone.
  if (cleanText.match(/a11b\d{2}a11b\d{3,}/) !== null) {
    return cleanText;
  }

  // With the `g` flag, match() returns every full match; `s` lets `.` span newlines.
  const parts = cleanText.match(/(<\/p>|<p>.*?<\/p>|<h1>.*?<\/h1>)/gs) || [];

  const chunks: string[] = [];
  let currentChunk = '';
  let currentLength = 0;

  for (const part of parts) {
    const contentLength = part.replace(/<[^>]+>/g, '').length;

    if (currentLength + contentLength <= maxPageLength) {
      currentChunk += part;
      currentLength += contentLength;
      continue;
    }

    // Flush the running chunk. Skip it when empty, which happens when the very
    // first element is already over the limit; otherwise page 001 would be blank.
    if (currentChunk) {
      chunks.push(currentChunk);
    }
    currentChunk = part;
    currentLength = contentLength;
  }

  if (currentChunk) {
    chunks.push(currentChunk);
  }

  return chunks
    .map(
      (chunk, i) => `${chunk}a11b01a11b${String(i + 1).padStart(3, '0')}\n`
    )
    .join('');
}
Loading

0 comments on commit 8833005

Please sign in to comment.