Skip to content

Commit

Permalink
Fix HTML to markdown parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
Skalakid committed Nov 22, 2024
1 parent 4665020 commit 17c5562
Showing 1 changed file with 39 additions and 2 deletions.
41 changes: 39 additions & 2 deletions lib/ExpensiMark.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1097,8 +1097,8 @@ export default class ExpensiMark {
return;
}

// Insert '\n' unless it ends with '\n' or '>' or it's the last element, or if it's a header ('# ') with a space.
if (text.match(/[\n|>][>]?[\s]?$/) || index === splitText.length - 1 || text === '# ') {
// Append '\n' unless the text already ends with '\n' (optionally followed by one whitespace character), it is the last element, or it is exactly the header marker '# '.
if (text.match(/[\n][\s]?$/) || index === splitText.length - 1 || text === '# ') {
joinedText += text;
} else {
joinedText += `${text}\n`;
Expand All @@ -1110,6 +1110,41 @@ export default class ExpensiMark {
return joinedText;
}

/**
 * Rewrites blockquote HTML so that each line of a quote is closed before the next line begins.
 *
 * The text is cut at every `<br />` and after every run of `</blockquote>` tags. A running depth is
 * kept: a segment that starts with `<blockquote>` adds the number of opening tags it contains, and a
 * segment that ends with `</blockquote>` subtracts the number of closing tags it contains. While the
 * depth is positive, each segment is terminated and the quote is re-opened `depth` times for the
 * following segment.
 *
 * Outside of quotes the original line breaks are preserved exactly, including blank lines and a
 * trailing `<br />`, so text without blockquotes is returned unchanged. A `<br />` is inserted after
 * a closed quote that is immediately followed by more text.
 *
 * @param text - HTML in which lines are separated by `<br />`
 * @returns the HTML with multi-line quotes split into per-line quotes
 */
splitNestedQuotesIntoSeparateOnes(text: string): string {
    const lineBreak = '<br />';
    const openTag = '<blockquote>';
    const closeTag = '</blockquote>';
    const openTagPattern = /<blockquote>/g;
    const closeTagPattern = /<\/blockquote>/g;
    // Cuts a piece right after every run of closing tags; the second alternative picks up whatever follows the last run.
    const chunkPattern = /[\s\S]*?(?:<\/blockquote>)+|[\s\S]+$/g;

    const pieces = text.split(lineBreak);
    let depth = 0;
    let result = '';

    pieces.forEach((piece, pieceIndex) => {
        const isLastPiece = pieceIndex === pieces.length - 1;

        if (piece === '') {
            // An empty piece before the end of the text is a blank line. Keep it when no quote is open.
            // Inside an open quote it is still dropped, and the empty piece after a trailing '<br />' adds nothing.
            if (!isLastPiece && depth <= 0) {
                result += lineBreak;
            }
            return;
        }

        // A non-empty piece always yields at least one chunk; the fallback only satisfies the type checker.
        const chunks = piece.match(chunkPattern) ?? [piece];

        chunks.forEach((chunk, chunkIndex) => {
            const endsText = isLastPiece && chunkIndex === chunks.length - 1;

            if (chunk.startsWith(openTag)) {
                depth += (chunk.match(openTagPattern) ?? []).length;
            }

            if (chunk.endsWith(closeTag)) {
                depth -= (chunk.match(closeTagPattern) ?? []).length;
                if (depth > 0) {
                    // The chunk already closes itself; re-open the quotes that are still pending.
                    result += `${chunk}${openTag.repeat(depth)}`;
                    return;
                }
            }

            if (depth > 0) {
                // Still inside a quote: close this line and re-open the pending quotes for the next one.
                result += `${chunk}${closeTag}${openTag.repeat(depth)}`;
                return;
            }

            // Outside of any quote: every chunk except the very last one in the text is followed by a line break.
            result += endsText ? chunk : `${chunk}${lineBreak}`;
        });
    });

    return result;
}

/**
* Replaces HTML with markdown
*/
Expand All @@ -1118,6 +1153,8 @@ export default class ExpensiMark {
const body = /<(body)(?:"[^"]*"|'[^']*'|[^'"><])*>(?:\n|\r\n)?([\s\S]*?)(?:\n|\r\n)?<\/\1>(?![^<]*(<\/pre>|<\/code>))/im;
const parseBodyTag = generatedMarkdown.match(body);

generatedMarkdown = this.splitNestedQuotesIntoSeparateOnes(generatedMarkdown);

// If body tag is found then use the content of body rather than the whole HTML
if (parseBodyTag) {
generatedMarkdown = parseBodyTag[2];
Expand Down

0 comments on commit 17c5562

Please sign in to comment.