Skip to content

Commit

Permalink
Merge pull request #675 from brandonhenry/add-markdown-trunctation-util
Browse files Browse the repository at this point in the history
ExpensiMark: Add New Markdown Truncation Utility
  • Loading branch information
puneetlath authored Jul 15, 2024
2 parents 629f703 + 01311ad commit cd42f4e
Show file tree
Hide file tree
Showing 2 changed files with 307 additions and 0 deletions.
108 changes: 108 additions & 0 deletions __tests__/ExpensiMark-Markdown-Truncate-test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import ExpensiMark from '../lib/ExpensiMark';

// Shared parser instance: every case below renders markdown to HTML with `parser.replace`
// and then feeds that HTML to `parser.truncateHTML`.
const parser = new ExpensiMark();

// Test suite for ExpensiMark.truncateHTML.
// In every case `limit` is compared against text length only: the expected strings show that
// characters inside HTML tags are kept in full and do not count towards the limit.
describe('truncateHTML', () => {
// Text shorter than the limit must come back unchanged, with no ellipsis appended.
test('should return original text as HTML if it does not exceed the limit', () => {
const markdown = 'This is a *short* text that does not exceed the character limit.';
const limit = 100;
const html = parser.replace(markdown);
// A truthy `ellipsis` option enables the '...' suffix on truncated output.
const result = parser.truncateHTML(html, limit, {ellipsis: true});
expect(result).toBe(html);
});

// The cut falls in the plain text that follows the last tag; the default options allow cutting mid-word.
test('should truncate HTML and add ellipsis if it exceeds the limit', () => {
const markdown =
'This is a *long* text that exceeds the character limit. It contains multiple sentences to test the truncation functionality. The truncation should occur at the specified character count.';
const limit = 80;
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, {ellipsis: true});
expect(result).toBe('This is a <strong>long</strong> text that exceeds the character limit. It contains multiple sente...');
});

// Several sibling inline elements (<strong>, <em>, <del>) before the cut must all be emitted intact.
test('should handle HTML with multiple markdown elements', () => {
const markdown =
'This is a *long* text with _multiple_ markdown ~elements~. It includes *bold*, _italic_, and ~strikethrough~ formatting. The truncation should preserve the markdown syntax.';
const limit = 150;
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, {ellipsis: true});
expect(result).toBe(
'This is a <strong>long</strong> text with <em>multiple</em> markdown <del>elements</del>. It includes <strong>bold</strong>, <em>italic</em>, and <del>strikethrough</del> formatting. The truncation should preserve the markdo...',
);
});

// Nested elements must be opened and closed in the right order.
test('should handle HTML with nested markdown elements', () => {
const markdown = 'This is a *long _nested_ markdown* text. It contains *_bold italic_* and ~_strikethrough italic_~ formatting. The truncation should handle the nesting correctly.';
const limit = 120;
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, {ellipsis: true});
expect(result).toBe(
'This is a <strong>long <em>nested</em> markdown</strong> text. It contains <strong><em>bold italic</em></strong> and <del><em>strikethrough italic</em></del> formatting. The truncation should ...',
);
});

// The limit is reached exactly at the end of the second link's text, so the ellipsis follows the closing </a>.
test('should handle HTML with links', () => {
const markdown =
'This is a text with [a link](https://example.com) that exceeds the limit. The link should be preserved in the truncated text. Additionally, it includes [another link](https://example.org) for testing purposes.';
const limit = 141;
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, {ellipsis: true});
expect(result).toBe(
`This is a text with <a href=\"https://example.com\" target=\"_blank\" rel=\"noreferrer noopener\">a link</a> that exceeds the limit. The link should be preserved in the truncated text. Additionally, it includes <a href=\"https://example.org\" target=\"_blank\" rel=\"noreferrer noopener\">another link</a>...`,
);
});

// The cut falls inside an open <code> element: the ellipsis is placed before the generated closing tag.
test('should handle HTML with inline code', () => {
const markdown = 'This is a text with `inline code`. The inline code should be preserved in the truncated text.';
const limit = 25;
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, {ellipsis: true});
expect(result).toBe('This is a text with <code>inlin...</code>');
});

// Block-level heading followed by a self-closing <br />; the cut happens before the second heading is reached.
test('should handle HTML with headings', () => {
const markdown =
'# Heading 1\n\nThis is a text with headings that exceeds the limit. The headings should be preserved in the truncated text. Here is an example of a heading:\n\n## Heading 2\n\nThe truncation should handle headings correctly.';
const limit = 100;
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, {ellipsis: true});
expect(result).toBe('<h1>Heading 1</h1><br />This is a text with headings that exceeds the limit. The headings should be preserved in th...');
});

// The expected HTML contains only <br /> separators (no list markup), and the limit is above the text
// length, so this case checks that nothing is cut and that `ellipsis: false` adds no suffix.
// NOTE(review): whitespace inside the template literal and the expected string is significant — confirm
// it against the repository before editing either.
test('should handle HTML with lists', () => {
const markdown = `
This is a text with lists. Here are examples:
Unordered list:
- Item 1
- Item 2
- Item 3
Ordered list:
1. First item
2. Second item
3. Third item
`;
const limit = 250; // Increased to accommodate full lists
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, {ellipsis: false});
expect(result).toBe(`<br /> This is a text with lists. Here are examples:<br /> <br /> Unordered list:<br /> - Item 1<br /> - Item 2<br /> - Item 3<br /> <br /> Ordered list:<br /> 1. First item<br /> 2. Second item<br /> 3. Third item<br /> `);
});

// The expected HTML keeps `---` as literal text between <br /> tags, so it counts towards the limit.
test('should handle HTML with horizontal rules', () => {
const markdown = 'This is a text with horizontal rules. Here is an example of a horizontal rule:\n\n---\n\nThe truncation should handle horizontal rules correctly.';
const limit = 100;
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, {ellipsis: true});
expect(result).toBe('This is a text with horizontal rules. Here is an example of a horizontal rule:<br /><br />---<br /><br />The truncation shou...');
});

// A self-closing <img /> before the cut is kept whole (removeImageTag is not set here).
test('should handle HTML with images', () => {
const markdown =
'This is a text with an image ![alt text](https://example.com/image.jpg) that exceeds the limit. The image should be preserved in the truncated text. Additionally, it includes another image ![alt text 2](https://example.org/image2.jpg) for testing purposes.';
const limit = 80;
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, {ellipsis: true});
expect(result).toBe('This is a text with an image <img src="https://example.com/image.jpg" alt="alt text" /> that exceeds the limit. The image should be preser...');
});
});
199 changes: 199 additions & 0 deletions lib/ExpensiMark.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ type ReplaceOptions = {
shouldKeepRawInput?: boolean;
};

/** Options accepted by `truncateHTML` (and forwarded to `getEndPosition`). */
type TruncateOptions = {
/** A truthy value enables an ellipsis suffix on output that was cut short; an empty/falsy value disables it. Defaults to `'...'`. */
ellipsis?: string;
/** When true (the default) text is cut exactly at the limit, possibly mid-word; when false the cut is moved to a nearby word boundary. */
truncateLastWord?: boolean;
/** Tolerance, in characters, around the cut position in which a word boundary is searched. Defaults to `Math.min(10, maxLength)`; a falsy value disables the search. */
slop?: number;
/** When true, the first self-closing `<img … />` tag is stripped from the truncated result. */
removeImageTag?: boolean;
};

// Matches a markdown link `[text](url)`. Capture group 1 is the link text, which may itself contain one level
// of nested square brackets; the URL part is delegated to UrlPatterns.MARKDOWN_URL_REGEX.
// The trailing negative lookahead rejects a match when the next tag after it is a closing </pre> or </code>
// (presumably so links inside code are left alone — confirm against the rule that uses this regex).
const MARKDOWN_LINK_REGEX = new RegExp(`\\[([^\\][]*(?:\\[[^\\][]*][^\\][]*)*)]\\(${UrlPatterns.MARKDOWN_URL_REGEX}\\)(?![^<]*(<\\/pre>|<\\/code>))`, 'gi');
// Same shape as MARKDOWN_LINK_REGEX but for images: a leading `!`, an optional `[alt text]` group, then `(url)`.
const MARKDOWN_IMAGE_REGEX = new RegExp(`\\!(?:\\[([^\\][]*(?:\\[[^\\][]*][^\\][]*)*)])?\\(${UrlPatterns.MARKDOWN_URL_REGEX}\\)(?![^<]*(<\\/pre>|<\\/code>))`, 'gi');

Expand Down Expand Up @@ -1259,6 +1266,198 @@ export default class ExpensiMark {
return Utils.escape(originalContent);
}

/**
 * Determines how many leading characters of `content` should be kept when truncating,
 * optionally moving the cut to a nearby word boundary.
 *
 * When `opts.slop` is falsy or `opts.truncateLastWord` is true, the result is simply the remaining
 * budget (`maxLength - totalLength`). Otherwise a window of `slop` characters around that position is
 * scanned for word breaks (`\W+`) and the cut is moved to the break closest to the hard limit.
 *
 * @param content - The text (starting at the current read position) that is about to be cut.
 * @param tailPosition - Index in `content` of the next HTML tag, or `undefined` when no tag follows.
 * @param maxLength - The maximum number of text characters allowed in the whole output.
 * @param totalLength - The number of text characters already emitted.
 * @param opts - Truncation options; only `slop` and `truncateLastWord` are read here.
 * @returns The index in `content` at which to cut.
 */
getEndPosition(content: string, tailPosition: number | undefined, maxLength: number, totalLength: number, opts: TruncateOptions): number {
    // Created per call on purpose: the `g` flag makes `exec` stateful through `lastIndex`.
    const WORD_BREAK_REGEX = /\W+/g;

    // Hard cut position: whatever is left of the character budget.
    const defaultPosition = maxLength - totalLength;

    // Without a slop there is no window in which to look for a word boundary.
    const slop = opts.slop;
    if (!slop) {
        return defaultPosition;
    }

    let position = defaultPosition;

    // "Short" means the window would start before the beginning of `content`.
    const isShort = defaultPosition < slop;

    // Index of the hard cut position expressed relative to the start of the scanned window.
    const slopPos = isShort ? defaultPosition : slop - 1;

    // Window around the hard cut position; it ends at the next tag when there is one.
    const substr = content.slice(isShort ? 0 : defaultPosition - slop, tailPosition !== undefined ? tailPosition : defaultPosition + slop);

    if (!opts.truncateLastWord) {
        if (tailPosition && substr.length <= tailPosition) {
            // The window ends at the next tag: keep everything in the window.
            position = substr.length;
        } else {
            // Walk over every word break in the window. The match MUST be refreshed on each iteration:
            // testing a single, never-updated match would spin forever whenever a break lies before
            // the hard cut position.
            let wordBreakMatch = WORD_BREAK_REGEX.exec(substr);
            while (wordBreakMatch !== null) {
                const breakIndex = wordBreakMatch.index;
                if (breakIndex < slopPos) {
                    // A natural break before the hard cut: remember it, but keep looking for a closer one,
                    // unless the break is at position 0 and there is (almost) no budget left.
                    position = defaultPosition - (slopPos - breakIndex);
                    if (breakIndex === 0 && defaultPosition <= 1) {
                        break;
                    }
                } else if (breakIndex === slopPos) {
                    // A natural break exactly at the hard cut position.
                    position = defaultPosition;
                    break;
                } else {
                    // First natural break after the hard cut position: extend up to it and stop.
                    position = defaultPosition + (breakIndex - slopPos);
                    break;
                }
                wordBreakMatch = WORD_BREAK_REGEX.exec(substr);
            }
        }

        // Do not end the kept text on a whitespace character.
        if (content.charAt(position - 1).match(/\s$/)) {
            position--;
        }
    }

    return position;
}

/**
 * Truncates an HTML string to at most `maxLength` text characters while keeping the markup well formed.
 * Adapted from https://github.com/huang47/nodejs-html-truncate/blob/master/lib/truncate.js
 *
 * Tags are copied verbatim and do not count towards `maxLength`; only the text between them does.
 * Elements that are still open at the cut position are closed in the output, and a URL or e-mail
 * address in the trailing plain text is never split in half.
 *
 * @param html - The HTML string that needs to be truncated.
 * @param maxLength - Maximum number of text characters to keep.
 * @param options - Optional configuration; see `TruncateOptions`.
 * @returns The truncated HTML string.
 */
truncateHTML(html: string, maxLength: number, options?: TruncateOptions) {
    const EMPTY_STRING = '';
    const DEFAULT_TRUNCATE_SYMBOL = '...';
    const DEFAULT_SLOP = Math.min(10, maxLength);
    // Names of the elements that are currently open, innermost last.
    const tagsStack: string[] = [];
    const KEY_VALUE_REGEX = '((?:\\s+(?:\\w+|-)+(?:\\s*=\\s*(?:"(?:\\\\.|[^"\\\\])*"|\'(?:\\\\.|[^\'\\\\])*\'|[^\'">\\s]+))?)*)';
    const IS_CLOSE_REGEX = '\\s*\\/?\\s*';
    const CLOSE_REGEX = '\\s*\\/\\s*';
    const SELF_CLOSE_REGEX = new RegExp(`<\\/?(\\w+)${KEY_VALUE_REGEX}${CLOSE_REGEX}>`);
    const HTML_TAG_REGEX = new RegExp(`<\\/?(\\w+)${KEY_VALUE_REGEX}${IS_CLOSE_REGEX}>`);
    const URL_REGEX = /(((ftp|https?):\/\/)[\\-\w@:%_\\+.~#?,&\\/\\/=]+)|((mailto:)?[_.\w\\-]+@([\w][\w\\-]+\.)+[a-zA-Z]{2,3})/g;
    const IMAGE_TAG_REGEX = new RegExp(`<img\\s*${KEY_VALUE_REGEX}${CLOSE_REGEX}>`);

    const opts = {
        ellipsis: DEFAULT_TRUNCATE_SYMBOL,
        truncateLastWord: true,
        slop: DEFAULT_SLOP,
        ...options,
    };

    // Strips the first self-closing <img … /> tag from `content`, if there is one.
    const removeImageTag = (content: string): string => {
        const match = IMAGE_TAG_REGEX.exec(content);
        if (!match) {
            return content;
        }
        return content.substring(0, match.index) + content.substring(match.index + match[0].length);
    };

    // Builds the closing tags for every open element, innermost first, without mutating the stack.
    const closeTags = (tags: readonly string[]): string =>
        [...tags]
            .reverse()
            .map((openTag) => `</${openTag}>`)
            .join('');

    let truncatedContent = EMPTY_STRING;
    // Number of text (non-tag) characters emitted so far.
    let totalLength = 0;
    // The part of the input that has not been consumed yet.
    let htmlString = html;
    // Whether any input was left out of the output.
    let isTruncated = false;

    // Phase 1: consume "text + tag" pairs for as long as the text in front of the next tag fits the budget.
    let tagMatch = HTML_TAG_REGEX.exec(htmlString);
    while (tagMatch) {
        const tagText = tagMatch[0];
        const tagIndex = tagMatch.index;

        if (totalLength + tagIndex > maxLength) {
            // The text in front of this tag does not fit: cut inside it and stop.
            truncatedContent += htmlString.substring(0, this.getEndPosition(htmlString, tagIndex, maxLength, totalLength, opts));
            isTruncated = true;
            break;
        }

        totalLength += tagIndex;
        truncatedContent += htmlString.substring(0, tagIndex);

        let selfClose: RegExpExecArray | null = null;
        if (tagText[1] === '/') {
            // Closing tag: the innermost open element is now closed.
            tagsStack.pop();
        } else {
            selfClose = SELF_CLOSE_REGEX.exec(tagText);
            if (!selfClose) {
                tagsStack.push(tagMatch[1]);
            }
        }

        truncatedContent += selfClose ? selfClose[0] : tagText;
        htmlString = htmlString.substring(tagIndex + tagText.length);
        tagMatch = HTML_TAG_REGEX.exec(htmlString);
    }

    // Phase 2: no tag is left, `htmlString` is plain text. Running this phase unconditionally (instead of
    // only after at least one tag was found) is what makes tag-free input work.
    if (!isTruncated) {
        const remainingBudget = maxLength - totalLength;
        if (htmlString.length <= remainingBudget) {
            // Everything that is left fits: keep it verbatim.
            truncatedContent += htmlString;
        } else if (remainingBudget <= 0) {
            // The budget is already used up and there is still text left.
            isTruncated = true;
        } else {
            let cutPosition = this.getEndPosition(htmlString, undefined, maxLength, totalLength, opts);

            // Never split a URL or e-mail address: if the cut lands inside one, extend it to the end of that match.
            URL_REGEX.lastIndex = 0;
            let urlMatch = URL_REGEX.exec(htmlString);
            while (urlMatch && urlMatch.index < cutPosition) {
                const urlEnd = urlMatch.index + urlMatch[0].length;
                if (urlEnd > cutPosition) {
                    cutPosition = urlEnd;
                    break;
                }
                urlMatch = URL_REGEX.exec(htmlString);
            }

            truncatedContent += htmlString.substring(0, cutPosition);
            isTruncated = cutPosition < htmlString.length;
        }
    }

    if (isTruncated && opts.ellipsis) {
        // Honour a custom ellipsis string. Some JavaScript callers pass `true` instead of a string;
        // any non-string truthy value falls back to the default symbol.
        truncatedContent += typeof opts.ellipsis === 'string' ? opts.ellipsis : DEFAULT_TRUNCATE_SYMBOL;
    }
    truncatedContent += closeTags(tagsStack);

    if (opts.removeImageTag) {
        truncatedContent = removeImageTag(truncatedContent);
    }

    return truncatedContent;
}

/**
* Replaces text with a replacement based on a regex
* @param text - The text to replace
Expand Down

0 comments on commit cd42f4e

Please sign in to comment.