Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ExpensiMark: Add New Markdown Truncation Utility #675

Merged
merged 44 commits into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from 36 commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
2a9ad51
initial concept
brandonhenry Apr 1, 2024
d6d5f72
swap to for loop + fix multiple function calls
brandonhenry Apr 2, 2024
b21b4ec
lint updates
brandonhenry Apr 3, 2024
4cb7381
Update lib/ExpensiMark.js
brandonhenry Apr 4, 2024
6f2c823
Update lib/ExpensiMark.js
brandonhenry Apr 4, 2024
7c88b3f
Added several tests
brandonhenry Apr 8, 2024
156b994
Update ExpensiMark.js
brandonhenry Apr 8, 2024
902c518
Update ExpensiMark.js
brandonhenry Apr 8, 2024
722d943
Merge remote-tracking branch 'upstream/main' into add-markdown-trunct…
brandonhenry Apr 8, 2024
9465d70
ran prettier
brandonhenry Apr 8, 2024
cae5564
Update ExpensiMark.js
brandonhenry Apr 8, 2024
5acadc8
Update ExpensiMark.js
brandonhenry Apr 9, 2024
eddb343
swap to truncated converted markdown instead
brandonhenry Apr 12, 2024
7d19047
test updates
brandonhenry Apr 16, 2024
8694bf9
rework so we truly only work with markdown. tests are passing now
brandonhenry Apr 16, 2024
380564c
Merge remote-tracking branch 'upstream/main' into add-markdown-trunct…
brandonhenry Apr 16, 2024
7516410
Update ExpensiMark.js
brandonhenry Apr 16, 2024
58cb81b
fixed merged of main
brandonhenry Apr 16, 2024
8d14bdd
Update ExpensiMark.js
brandonhenry Apr 16, 2024
77de7e7
added more tests and variance for safety
brandonhenry Apr 18, 2024
c567d56
Merge remote-tracking branch 'upstream/main' into add-markdown-trunct…
brandonhenry Apr 23, 2024
049d3e0
Merge remote-tracking branch 'upstream/main' into add-markdown-trunct…
brandonhenry May 31, 2024
263f7de
changed truncation method
brandonhenry Jun 7, 2024
f3112d3
Update CONST.d.ts
brandonhenry Jun 7, 2024
13e78ff
Delete CONST.d.ts
brandonhenry Jun 7, 2024
9ace1a3
Update ExpensiMark-Markdown-Truncate-test.js
brandonhenry Jun 7, 2024
4f0b0ec
fixed tests
brandonhenry Jun 8, 2024
679f15b
Merge remote-tracking branch 'upstream/main' into add-markdown-trunct…
brandonhenry Jun 10, 2024
099a7c3
lint updates
brandonhenry Jun 11, 2024
ba56236
fixed tests and updated function with comments
brandonhenry Jun 17, 2024
c258791
Merge remote-tracking branch 'upstream/main' into add-markdown-trunct…
brandonhenry Jun 18, 2024
97ce9cd
Merge branch 'Expensify:main' into add-markdown-trunctation-util
brandonhenry Jun 18, 2024
fdfe713
added underscore back in
brandonhenry Jun 18, 2024
7d50785
removed underscore
brandonhenry Jun 18, 2024
72b0bba
Merge remote-tracking branch 'upstream/main' into add-markdown-trunct…
brandonhenry Jun 21, 2024
8f0d25b
Fixed type script errors
brandonhenry Jun 21, 2024
69940fa
Update lib/ExpensiMark.ts
brandonhenry Jun 28, 2024
52ce61d
update tests
brandonhenry Jul 1, 2024
f84b0b5
Merge remote-tracking branch 'upstream/main' into add-markdown-trunct…
brandonhenry Jul 1, 2024
bae4a59
Merge remote-tracking branch 'upstream/main' into add-markdown-trunct…
brandonhenry Jul 3, 2024
e44a504
Update ExpensiMark-Markdown-Truncate-test.js
brandonhenry Jul 3, 2024
30c6ecb
Update ExpensiMark-Markdown-Truncate-test.js
brandonhenry Jul 4, 2024
f46585b
Merge remote-tracking branch 'upstream/main' into add-markdown-trunct…
brandonhenry Jul 4, 2024
01311ad
Update ExpensiMark-Markdown-Truncate-test.js
brandonhenry Jul 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions __tests__/ExpensiMark-Markdown-Truncate-test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import ExpensiMark from '../lib/ExpensiMark';

// Shared parser instance; every test converts markdown to HTML with parser.replace() and then truncates that HTML.
const parser = new ExpensiMark();

// Covers ExpensiMark.truncateHTML for plain emphasis, nested emphasis, links, inline code, headings,
// list-like text, horizontal-rule-like text and images.
// NOTE(review): every case passes `ellipsis` as a boolean, while `TruncateOptions` in lib/ExpensiMark.ts
// declares `ellipsis?: string` — confirm which contract is intended.
describe('truncateHTML', () => {
// Input shorter than the limit must come back unchanged.
test('should return original text as HTML if it does not exceed the limit', () => {
const markdown = 'This is a *short* text that does not exceed the character limit.';
const limit = 100;
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, { ellipsis: true });
expect(result).toBe(html);
});

// The expected string keeps 80 characters of text; the <strong> tags are not counted towards the limit.
test('should truncate HTML and add ellipsis if it exceeds the limit', () => {
const markdown = 'This is a *long* text that exceeds the character limit. It contains multiple sentences to test the truncation functionality. The truncation should occur at the nearest space to avoid cutting off words.';
const limit = 80;
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, { ellipsis: true });
expect(result).toBe('This is a <strong>long</strong> text that exceeds the character limit. It contains multiple sente...');
});

// Same input as above, but a falsy `ellipsis` suppresses the trailing marker.
test('should truncate HTML without adding ellipsis if specified', () => {
const markdown = 'This is a *long* text that exceeds the character limit. It contains multiple sentences to test the truncation functionality. The truncation should occur at the nearest space to avoid cutting off words.';
const limit = 80;
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, { ellipsis: false });
expect(result).toBe('This is a <strong>long</strong> text that exceeds the character limit. It contains multiple sente');
brandonhenry marked this conversation as resolved.
Show resolved Hide resolved
});

test('should handle HTML with multiple markdown elements', () => {
const markdown = 'This is a *long* text with _multiple_ markdown ~elements~. It includes *bold*, _italic_, and ~strikethrough~ formatting. The truncation should preserve the markdown syntax.';
const limit = 80;
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, { ellipsis: true });
expect(result).toBe('This is a <strong>long</strong> text with <em>multiple</em> markdown <del>elements</del>. It includes <strong>bold</strong>, <em>italic</em>, a...');
});

// The cut lands inside <del><em>…; the ellipsis is placed before the closing tags that are appended.
test('should handle HTML with nested markdown elements', () => {
const markdown = 'This is a *long _nested_ markdown* text. It contains *_bold italic_* and ~_strikethrough italic_~ formatting. The truncation should handle the nesting correctly.';
brandonhenry marked this conversation as resolved.
Show resolved Hide resolved
const limit = 80;
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, { ellipsis: true });
expect(result).toBe('This is a <strong>long <em>nested</em> markdown</strong> text. It contains <strong><em>bold italic</em></strong> and <del><em>strikethrough i...</em></del>');
});

test('should handle HTML with links', () => {
const markdown = 'This is a text with [a link](https://example.com) that exceeds the limit. The link should be preserved in the truncated text. Additionally, it includes [another link](https://example.org) for testing purposes.';
const limit = 80;
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, { ellipsis: true });
expect(result).toBe('This is a text with <a href=\"https://example.com\" target=\"_blank\" rel=\"noreferrer noopener\">a link</a> that exceeds the limit. The link should be preserved ...');
});

test('should handle HTML with inline code', () => {
const markdown = 'This is a text with `inline code` that exceeds the limit. The inline code should be preserved in the truncated text. Additionally, it includes `another inline code` for testing purposes.';
const limit = 80;
brandonhenry marked this conversation as resolved.
Show resolved Hide resolved
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, { ellipsis: true });
expect(result).toBe('This is a text with <code>inline code</code> that exceeds the limit. The inline code should b...');
});

test('should handle HTML with headings', () => {
const markdown = '# Heading 1\n\nThis is a text with headings that exceeds the limit. The headings should be preserved in the truncated text. Here is an example of a heading:\n\n## Heading 2\n\nThe truncation should handle headings correctly.';
const limit = 100;
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, { ellipsis: true });
expect(result).toBe('<h1>Heading 1</h1><br />This is a text with headings that exceeds the limit. The headings should be preserved in th...');
});

// The "- Item" lines stay as plain text separated by <br /> in the expected output.
test('should handle HTML with lists', () => {
const markdown = 'This is a text with lists. Here is an example of a list:\n\n- Item 1\n- Item 2\n- Item 3\n\nThe truncation should handle lists correctly.';
brandonhenry marked this conversation as resolved.
Show resolved Hide resolved
const limit = 100;
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, { ellipsis: true });
expect(result).toBe('This is a text with lists. Here is an example of a list:<br /><br />- Item 1<br />- Item 2<br />- Item 3<br /><br />The truncation shoul...');
});

// The "---" line stays as literal text in the expected output.
test('should handle HTML with horizontal rules', () => {
const markdown = 'This is a text with horizontal rules. Here is an example of a horizontal rule:\n\n---\n\nThe truncation should handle horizontal rules correctly.';
const limit = 100;
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, { ellipsis: true });
expect(result).toBe('This is a text with horizontal rules. Here is an example of a horizontal rule:<br /><br />---<br /><br />The truncation shou...');
brandonhenry marked this conversation as resolved.
Show resolved Hide resolved
});

// `removeImageTag` is not set here, so the <img> tag is expected to survive truncation.
test('should handle HTML with images', () => {
const markdown = 'This is a text with an image ![alt text](https://example.com/image.jpg) that exceeds the limit. The image should be preserved in the truncated text. Additionally, it includes another image ![alt text 2](https://example.org/image2.jpg) for testing purposes.';
const limit = 80;
const html = parser.replace(markdown);
const result = parser.truncateHTML(html, limit, { ellipsis: true });
expect(result).toBe('This is a text with an image <img src=\"https://example.com/image.jpg\" alt=\"alt text\" /> that exceeds the limit. The image should be preser...');
});
});

200 changes: 200 additions & 0 deletions lib/ExpensiMark.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ type ReplaceOptions = {
shouldKeepRawInput?: boolean;
};

/** Options accepted by `truncateHTML` (and forwarded to `getEndPosition`). */
type TruncateOptions = {
// When truthy, a truncation marker is appended to output that was cut short. `truncateHTML` defaults this to '...'.
ellipsis?: string;
// When false, `getEndPosition` moves the cut to a nearby word boundary instead of cutting mid-word. `truncateHTML` defaults this to true.
truncateLastWord?: boolean;
// Size of the window around the cut position that is searched for a word boundary. `truncateHTML` defaults this to min(10, maxLength).
slop?: number;
// When truthy, `truncateHTML` strips the first `<img ... />` tag from the truncated result.
removeImageTag?: boolean;
};

// Matches a markdown link `[label](url)`. The label may itself contain one level of `[...]`; the URL part comes from
// UrlPatterns.MARKDOWN_URL_REGEX. The trailing negative lookahead rejects a match when the text that follows reaches
// `</pre>` or `</code>` before any other `<`. Global and case-insensitive.
const MARKDOWN_LINK_REGEX = new RegExp(`\\[([^\\][]*(?:\\[[^\\][]*][^\\][]*)*)]\\(${UrlPatterns.MARKDOWN_URL_REGEX}\\)(?![^<]*(<\\/pre>|<\\/code>))`, 'gi');
// Same shape as the link pattern, but prefixed with `!` and with the `[alt]` part optional: `![alt](url)` or `!(url)`.
const MARKDOWN_IMAGE_REGEX = new RegExp(`\\!(?:\\[([^\\][]*(?:\\[[^\\][]*][^\\][]*)*)])?\\(${UrlPatterns.MARKDOWN_URL_REGEX}\\)(?![^<]*(<\\/pre>|<\\/code>))`, 'gi');

Expand Down Expand Up @@ -1233,6 +1240,199 @@ export default class ExpensiMark {
return Utils.escape(originalContent);
}

/**
 * Computes the index at which `content` should be cut so that the overall truncated output stays near `maxLength`.
 *
 * With `opts.truncateLastWord` enabled (or no slop) the cut is simply the remaining budget. Otherwise the cut is
 * moved to a word boundary found inside a window of `opts.slop` characters around the hard cut position, and a
 * whitespace character immediately before the cut is dropped.
 *
 * @param content - The remaining content that has not been emitted yet.
 * @param tailPosition - End index of the text segment under consideration, or undefined to use the slop window end.
 * @param maxLength - The maximum length of the truncated content.
 * @param totalLength - The length of the content emitted so far.
 * @param opts - Truncation options; only `slop` and `truncateLastWord` are read here.
 * @returns The index in `content` at which to cut.
 */
getEndPosition(content: string, tailPosition: number | undefined, maxLength: number, totalLength: number, opts: TruncateOptions): number {
    // Hard cut position: whatever is left of the budget after the content emitted so far.
    const defaultPosition = maxLength - totalLength;
    const slop = opts.slop;

    // Without a slop window there is nothing to search; when words may be cut the hard position is final.
    if (!slop || opts.truncateLastWord) {
        return defaultPosition;
    }

    // When the budget is smaller than the slop, the window starts at the beginning of the content.
    const isShort = defaultPosition < slop;

    // Index of the hard cut position relative to the start of the window.
    const slopPos = isShort ? defaultPosition : slop - 1;

    // The window that is scanned for word boundaries.
    const substr = content.slice(isShort ? 0 : defaultPosition - slop, tailPosition !== undefined ? tailPosition : defaultPosition + slop);

    let position = defaultPosition;

    if (tailPosition && substr.length <= tailPosition) {
        // NOTE(review): mirrors the upstream html-truncate behaviour; the result is the window length, which can
        // exceed the remaining budget — confirm this is intended.
        position = substr.length;
    } else {
        // A fresh global regex per call so that `lastIndex` always starts at 0 and advances match by match.
        const WORD_BREAK_REGEX = /\W+/g;
        let wordBreakMatch = WORD_BREAK_REGEX.exec(substr);

        while (wordBreakMatch !== null) {
            const breakIndex = wordBreakMatch.index;

            if (breakIndex < slopPos) {
                // A boundary before the hard cut: remember it, but keep looking for one closer to the hard cut
                // unless the boundary sits at the very start of a (nearly) exhausted budget.
                position = defaultPosition - (slopPos - breakIndex);
                if (breakIndex === 0 && defaultPosition <= 1) {
                    break;
                }
            } else if (breakIndex === slopPos) {
                // A boundary exactly at the hard cut.
                position = defaultPosition;
                break;
            } else {
                // The first boundary after the hard cut: extend the cut up to it.
                position = defaultPosition + (breakIndex - slopPos);
                break;
            }

            // Advance to the next boundary; without this the loop would re-test the same match forever.
            wordBreakMatch = WORD_BREAK_REGEX.exec(substr);
        }
    }

    // Do not end the kept text on a whitespace character.
    if (/\s/.test(content.charAt(position - 1))) {
        position--;
    }

    return position;
}

/**
 * Truncates an HTML string to about `maxLength` characters of text while keeping the markup balanced.
 * Only text between tags counts towards the length; tags that are still open at the cut are closed in the result.
 * Adapted from https://github.com/huang47/nodejs-html-truncate/blob/master/lib/truncate.js
 *
 * @param html - The string that needs to be truncated
 * @param maxLength - Length of truncated string
 * @param options - Optional overrides. Defaults: `ellipsis` '...', `truncateLastWord` true, `slop` min(10, maxLength).
 *                  A string `ellipsis` is appended verbatim; any other truthy value appends '...'.
 * @returns The truncated string
 */
truncateHTML(html: string, maxLength: number, options?: TruncateOptions): string {
    const EMPTY_STRING = '';
    const DEFAULT_TRUNCATE_SYMBOL = '...';
    const DEFAULT_SLOP = Math.min(10, maxLength);
    const KEY_VALUE_REGEX = '((?:\\s+(?:\\w+|-)+(?:\\s*=\\s*(?:"(?:\\\\.|[^"\\\\])*"|\'(?:\\\\.|[^\'\\\\])*\'|[^\'">\\s]+))?)*)';
    const IS_CLOSE_REGEX = '\\s*\\/?\\s*';
    const CLOSE_REGEX = '\\s*\\/\\s*';
    const SELF_CLOSE_REGEX = new RegExp(`<\\/?(\\w+)${KEY_VALUE_REGEX}${CLOSE_REGEX}>`);
    const HTML_TAG_REGEX = new RegExp(`<\\/?(\\w+)${KEY_VALUE_REGEX}${IS_CLOSE_REGEX}>`);
    const URL_REGEX = /(((ftp|https?):\/\/)[\\-\w@:%_\\+.~#?,&\\/\\/=]+)|((mailto:)?[_.\w\\-]+@([\w][\w\\-]+\.)+[a-zA-Z]{2,3})/g;
    const IMAGE_TAG_REGEX = new RegExp(`<img\\s*${KEY_VALUE_REGEX}${CLOSE_REGEX}>`);

    // Names of the tags that are currently open, innermost last.
    const tagsStack: string[] = [];
    let truncatedContent = EMPTY_STRING;
    // Number of text characters (tags excluded) emitted so far.
    let totalLength = 0;
    // The part of the input that has not been consumed yet.
    let htmlString = html;

    const opts = {
        ellipsis: DEFAULT_TRUNCATE_SYMBOL,
        truncateLastWord: true,
        slop: DEFAULT_SLOP,
        ...options,
    };

    // Removes the first self-closed <img ... /> tag from the given content.
    function removeImageTag(content: string): string {
        const match = IMAGE_TAG_REGEX.exec(content);
        if (!match) {
            return content;
        }

        const matchIndex = match.index;
        const matchLength = match[0].length;

        return content.substring(0, matchIndex) + content.substring(matchIndex + matchLength);
    }

    // Builds the closing tags for everything still open, innermost first.
    function closeTags(tags: string[]): string {
        return tags
            .reverse()
            .map((mappedTag) => {
                return `</${mappedTag}>`;
            })
            .join('');
    }

    // The body must run at least once so that input without any tags is still truncated
    // instead of being dropped entirely.
    let tagMatch: RegExpExecArray | null;
    do {
        tagMatch = HTML_TAG_REGEX.exec(htmlString);

        if (!tagMatch) {
            // No more tags: only trailing text is left.
            if (totalLength >= maxLength) {
                break;
            }

            let urlMatch = URL_REGEX.exec(htmlString);
            if (!urlMatch || urlMatch.index >= maxLength) {
                truncatedContent += htmlString.substring(0, this.getEndPosition(htmlString, undefined, maxLength, totalLength, opts));
                break;
            }

            // NOTE(review): kept as in the original port. URL_REGEX is global, so `lastIndex` carries over to the
            // shortened string on each pass, and text after the last matched URL is not emitted — confirm intent.
            while (urlMatch) {
                const url = urlMatch[0];
                const urlIndex = urlMatch.index;
                truncatedContent += htmlString.substring(0, urlIndex + url.length - totalLength);
                htmlString = htmlString.substring(urlIndex + url.length);
                urlMatch = URL_REGEX.exec(htmlString);
            }
            break;
        }

        const tagText = tagMatch[0];
        const tagIndex = tagMatch.index;

        if (totalLength + tagIndex > maxLength) {
            // The text in front of this tag already exceeds the budget: cut inside it and stop.
            truncatedContent += htmlString.substring(0, this.getEndPosition(htmlString, tagIndex, maxLength, totalLength, opts));
            break;
        }

        // The text in front of the tag fits completely.
        totalLength += tagIndex;
        truncatedContent += htmlString.substring(0, tagIndex);

        let selfClose: RegExpExecArray | null = null;
        if (tagText[1] === '/') {
            // Closing tag: the innermost open tag is done.
            tagsStack.pop();
        } else {
            selfClose = SELF_CLOSE_REGEX.exec(tagText);
            if (!selfClose) {
                // Opening tag: remember it so it can be closed if the cut happens inside it.
                tagsStack.push(tagMatch[1]);
            }
        }

        truncatedContent += selfClose ? selfClose[0] : tagText;
        htmlString = htmlString.substring(tagIndex + tagText.length);

        // `tagMatch` is always set here, so the loop only ends through one of the breaks above.
    } while (tagMatch);

    // Content was left over, so mark the cut. A string option is used verbatim; other truthy values
    // (for example `true` from JavaScript callers) fall back to the default marker.
    if (opts.ellipsis && htmlString.length > maxLength - totalLength) {
        truncatedContent += typeof opts.ellipsis === 'string' ? opts.ellipsis : DEFAULT_TRUNCATE_SYMBOL;
    }
    truncatedContent += closeTags(tagsStack);

    if (opts.removeImageTag) {
        truncatedContent = removeImageTag(truncatedContent);
    }

    return truncatedContent;
}

/**
* Replaces text with a replacement based on a regex
* @param text - The text to replace
Expand Down
Loading