From 2d844974c2a4294582f4e2ef345abdeac0eb5702 Mon Sep 17 00:00:00 2001 From: gautamkrishnar Date: Sun, 24 Jul 2022 18:14:11 +0530 Subject: [PATCH] added duplicate blog posts filter --- README.md | 63 +++++++++---------- action.yml | 4 ++ local-run.js | 1 + src/blog-post-workflow.js | 14 +++++ test/Readme.removeDuplicates.md.snap | 9 +++ test/sample.duplicate.xml | 90 ++++++++++++++++++++++++++++ test/test-server.js | 4 ++ test/test.js | 12 +++- 8 files changed, 165 insertions(+), 32 deletions(-) create mode 100644 test/Readme.removeDuplicates.md.snap create mode 100644 test/sample.duplicate.xml diff --git a/README.md b/README.md index 4da5ca6..4163806 100644 --- a/README.md +++ b/README.md @@ -41,37 +41,38 @@ This workflow has additional options that you can use to customize it for your use case. The following are the list of options available: -| Option | Default Value | Description | Required | -|---------------------------|------------------------------------------------------------------------------||----------| -| `feed_list` | `""` | Comma-separated list of RSS feed urls, eg: `https://example1.com,https://example2.com` | Yes | -| `max_post_count` | `5` | Maximum number of posts you want to show on your readme, all feeds combined | No | -| `readme_path` | `./README.md` | Comma separated paths of the readme files you want to update | No | -| `gh_token` | your GitHub token with repo scope | Use this to configure the token of the user that commits the workflow result to GitHub | No | -| `comment_tag_name` | `BLOG-POST-LIST` | Allows you to override the default comment tag name (``), if you want to show multiple instances of the action on the same repo, see advanced usage for more info | No | -| `disable_sort` | `false` | Disables the sorting of the list based on publish date | No | -| `feed_names` | `""` | Comma-separated list of RSS feed names, this is intended to be used with `template` option. 
eg: `Blog,Dev.to` | No | -| `template` | `default` | Allows you to change the structure of the posts list by using different variables. By default this workflow uses markdown list format to render the posts, you can override this behavior using this option. Eg: `[$title]($url) ` will give you a space-separated list of posts.

**Supported variables** | No | -| `categories_template` | `default` | By default `$categories` variable in the template is a comma separated string of categories. This option will allow you override it and customize the category item according to your use case. Following are the variables available: eg: `$category ` will show category list as `category1` `category2` `category3` etc | No | -| `date_format` | `UTC:ddd mmm dd yyyy h:MM TT` | Allows you to change the format of the date or time displayed when using the $date in the template option. This uses NPM dateformat library, please read the library [documentation](https://www.npmjs.com/package/dateformat#named-formats) for the supported formats | No | -| `user_agent` | `rss-parser` | Allows you to customize the user agent used by the RSS feed crawler | No | -| `accept_header` | `application/rss+xml` | Allows you to customize the accept header of the http requests | No | -| `tag_post_pre_newline` | `true` if you are not using **template** option | Allows you to insert a newline before the closing tag and after the opening tag when using the template option if needed, for better formatting | No | -| `filter_comments` | `medium,stackoverflow/Comment by $author/,stackexchange/Comment by $author/` | Comma separated list of platforms you want to enable the comment filter.

**Available filters** | No | -| `custom_tags` | `""` | Allows you to use the custom tags from your feed items in your template. Format: `variableName/tagName/,variableName/tagName/`. Please see the [issue comment](https://github.com/gautamkrishnar/blog-post-workflow/issues/28#issuecomment-696024087) for more details | No | -| `title_max_length` | `""` | Allows you to trim the title in the posts list, excess text will be appended with an ellipsis `...` | No | -| `description_max_length` | `""` | Allows you to trim the description in the posts list, excess text will be appended with an ellipsis `...` | No | -| `item_exec` | `""` | Allows you to execute custom JavaScript code on each post item fetched from the xml to do advanced text manipulation. Please see the [issue comment](https://github.com/gautamkrishnar/blog-post-workflow/issues/34#issuecomment-706582788) as an example | No | -| `commit_message` | `Updated with the latest blog posts` | Allows you to customize the commit message | No | -| `committer_username` | `blog-post-bot` | Allows you to customize the committer username | No | -| `committer_email` | `blog-post-bot@example.com` | Allows you to customize the committer email | No | -| `output_only` | `false` | Sets the generated array as `results` [output variable](https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-syntax-for-github-actions#jobsjob_idoutputs) so that it can be consumed in other actions and parsed via utilities like [jq](https://stedolan.github.io/jq/). This will also prevent committing to readme. See [#51](https://github.com/gautamkrishnar/blog-post-workflow/issues/51#issuecomment-758570235) for more details about the output format and how to use it. This will also generate a JSON file named `/tmp/blog_post_workflow_output.json` that you can use to consume the generated data and avoid issues like [#110](https://github.com/gautamkrishnar/blog-post-workflow/issues/110). 
| No | -| `enable_keepalive` | `true` | Workflow will automatically do a dummy commit to keep the repository active if there is no commit activity for the last 50 days. GitHub will stop running all cron based triggers if the repository is not active for more than 60 days. This flag allows you to disable this feature. See [#53](https://git.io/Jtm4V) for more details. | No | -| `retry_count` | `0` | Maximum number of times to retry the fetch operation if it fails, See [#66](https://github.com/gautamkrishnar/blog-post-workflow/issues/66) for more details. | No | -| `retry_wait_time` | `1` | Time to wait before each retry operation in seconds. | No | -| `disable_html_encoding` | `false` | Disables html encoding of the feed contents. | No | -| `disable_item_validation` | `false` | Disables the validation checks for Title, publish date and URL. | No | -| `filter_dates` | `""` | Allows you to filter post items based on date range.

**Supported Values** Make sure that you set the `max_post_count` to a higher value to get rid of max post count filtering, before using the above options. | No | -| `rand_seed` | `"username/repository"` | Provide your own seeding string for the randomness. More info: [#152](https://github.com/gautamkrishnar/blog-post-workflow/issues/152) | No | +| Option | Default Value | Description | Required | +|------------------------------|------------------------------------------------------------------------------||----------| +| `feed_list` | `""` | Comma-separated list of RSS feed urls, eg: `https://example1.com,https://example2.com` | Yes | +| `max_post_count` | `5` | Maximum number of posts you want to show on your readme, all feeds combined | No | +| `readme_path` | `./README.md` | Comma separated paths of the readme files you want to update | No | +| `gh_token` | your GitHub token with repo scope | Use this to configure the token of the user that commits the workflow result to GitHub | No | +| `comment_tag_name` | `BLOG-POST-LIST` | Allows you to override the default comment tag name (``), if you want to show multiple instances of the action on the same repo, see advanced usage for more info | No | +| `disable_sort` | `false` | Disables the sorting of the list based on publish date | No | +| `feed_names` | `""` | Comma-separated list of RSS feed names, this is intended to be used with `template` option. eg: `Blog,Dev.to` | No | +| `template` | `default` | Allows you to change the structure of the posts list by using different variables. By default this workflow uses markdown list format to render the posts, you can override this behavior using this option. Eg: `[$title]($url) ` will give you a space-separated list of posts.

**Supported variables** | No | +| `categories_template` | `default` | By default `$categories` variable in the template is a comma separated string of categories. This option will allow you override it and customize the category item according to your use case. Following are the variables available: eg: `$category ` will show category list as `category1` `category2` `category3` etc | No | +| `date_format` | `UTC:ddd mmm dd yyyy h:MM TT` | Allows you to change the format of the date or time displayed when using the $date in the template option. This uses NPM dateformat library, please read the library [documentation](https://www.npmjs.com/package/dateformat#named-formats) for the supported formats | No | +| `user_agent` | `rss-parser` | Allows you to customize the user agent used by the RSS feed crawler | No | +| `accept_header` | `application/rss+xml` | Allows you to customize the accept header of the http requests | No | +| `tag_post_pre_newline` | `true` if you are not using **template** option | Allows you to insert a newline before the closing tag and after the opening tag when using the template option if needed, for better formatting | No | +| `filter_comments` | `medium,stackoverflow/Comment by $author/,stackexchange/Comment by $author/` | Comma separated list of platforms you want to enable the comment filter.

**Available filters** | No | +| `custom_tags` | `""` | Allows you to use the custom tags from your feed items in your template. Format: `variableName/tagName/,variableName/tagName/`. Please see the [issue comment](https://github.com/gautamkrishnar/blog-post-workflow/issues/28#issuecomment-696024087) for more details | No | +| `title_max_length` | `""` | Allows you to trim the title in the posts list, excess text will be appended with an ellipsis `...` | No | +| `description_max_length` | `""` | Allows you to trim the description in the posts list, excess text will be appended with an ellipsis `...` | No | +| `item_exec` | `""` | Allows you to execute custom JavaScript code on each post item fetched from the xml to do advanced text manipulation. Please see the [issue comment](https://github.com/gautamkrishnar/blog-post-workflow/issues/34#issuecomment-706582788) as an example | No | +| `commit_message` | `Updated with the latest blog posts` | Allows you to customize the commit message | No | +| `committer_username` | `blog-post-bot` | Allows you to customize the committer username | No | +| `committer_email` | `blog-post-bot@example.com` | Allows you to customize the committer email | No | +| `output_only` | `false` | Sets the generated array as `results` [output variable](https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-syntax-for-github-actions#jobsjob_idoutputs) so that it can be consumed in other actions and parsed via utilities like [jq](https://stedolan.github.io/jq/). This will also prevent committing to readme. See [#51](https://github.com/gautamkrishnar/blog-post-workflow/issues/51#issuecomment-758570235) for more details about the output format and how to use it. This will also generate a JSON file named `/tmp/blog_post_workflow_output.json` that you can use to consume the generated data and avoid issues like [#110](https://github.com/gautamkrishnar/blog-post-workflow/issues/110). 
| No | +| `enable_keepalive` | `true` | Workflow will automatically do a dummy commit to keep the repository active if there is no commit activity for the last 50 days. GitHub will stop running all cron based triggers if the repository is not active for more than 60 days. This flag allows you to disable this feature. See [#53](https://git.io/Jtm4V) for more details. | No | +| `retry_count` | `0` | Maximum number of times to retry the fetch operation if it fails, See [#66](https://github.com/gautamkrishnar/blog-post-workflow/issues/66) for more details. | No | +| `retry_wait_time` | `1` | Time to wait before each retry operation in seconds. | No | +| `disable_html_encoding` | `false` | Disables html encoding of the feed contents. | No | +| `disable_item_validation` | `false` | Disables the validation checks for Title, publish date and URL. | No | +| `filter_dates` | `""` | Allows you to filter post items based on date range.

**Supported Values** Make sure that you set the `max_post_count` to a higher value to get rid of max post count filtering, before using the above options. | No | +| `rand_seed` | `"username/repository"` | Provide your own seeding string for the randomness. More info: [#152](https://github.com/gautamkrishnar/blog-post-workflow/issues/152) | No | +| `remove_duplicates` | `false` | Allows you to remove duplicate blog posts from multiple sources, if you are cross-posting. A post is treated as a duplicate when its title or description matches an earlier post. | No | ## Advanced usage examples diff --git a/action.yml b/action.yml index f8e99f7..4f3730b 100644 --- a/action.yml +++ b/action.yml @@ -117,6 +117,10 @@ inputs: description: "Provide your own seeding string for the randomness" default: "" required: false + remove_duplicates: + description: "Allows you to remove duplicate blog posts from multiple sources" + default: "" + required: false outputs: results: description: "JSON stringified array of posts" diff --git a/local-run.js b/local-run.js index 3b322f3..dafa59f 100644 --- a/local-run.js +++ b/local-run.js @@ -34,6 +34,7 @@ fs.writeFile(path.join(__dirname, 'test', 'Readme.md'), template, () => { process.env.INPUT_DISABLE_ITEM_VALIDATION = 'false'; process.env.INPUT_FILTER_DATES = ''; process.env.INPUT_RAND_SEED = ''; + process.env.INPUT_REMOVE_DUPLICATES = ''; const testFile = process.env.DIST ? 
'./dist/blog-post-workflow' : './src/blog-post-workflow'; console.log('Testing: ', testFile); require(testFile); diff --git a/src/blog-post-workflow.js b/src/blog-post-workflow.js index 5769050..3ecf7e8 100644 --- a/src/blog-post-workflow.js +++ b/src/blog-post-workflow.js @@ -124,6 +124,9 @@ feedList.forEach((siteUrl) => { reject('Cannot read response->item'); } else { const responsePosts = data.items; + // To handle duplicate filter + const appendedPostTitles = []; + const appendedPostDesc = []; const posts = responsePosts .filter(ignoreMediumComments) .filter(ignoreStackOverflowComments) @@ -181,6 +184,17 @@ feedList.forEach((siteUrl) => { process.exit(1); } } + if (post && core.getInput('remove_duplicates') === 'true') { + if ( + appendedPostTitles.indexOf(post.title) !== -1 || + appendedPostDesc.indexOf(post.description) !== -1 + ) { + post = null; + } else { + post.title && appendedPostTitles.push(post.title) + post.description && appendedPostDesc.push(post.description); + } + } // Doing HTML encoding at last ref: #117 const disableHtmlEncoding = core.getInput('disable_html_encoding') !== 'false'; diff --git a/test/Readme.removeDuplicates.md.snap b/test/Readme.removeDuplicates.md.snap new file mode 100644 index 0000000..66e7789 --- /dev/null +++ b/test/Readme.removeDuplicates.md.snap @@ -0,0 +1,9 @@ +# Readme test +Post list example: + +- [God Mode in browsers: document.designMode = "on"](https://dev.to/gautamkrishnar/god-mode-in-browsers-document-designmode-on-2pmo) +- [Hi, I'm Gautam krishna.R](https://dev.to/gautamkrishnar/hi-im-gautam-krishnar) + + +# Other contents +Test content diff --git a/test/sample.duplicate.xml b/test/sample.duplicate.xml new file mode 100644 index 0000000..2ad2114 --- /dev/null +++ b/test/sample.duplicate.xml @@ -0,0 +1,90 @@ + + + + Gautam krishna.R + Gautam krishna.R + Developer, open source lover and after all a wonderful human being... 
Software Engineer at Red Hat | DuckDuckGo Community Leader | Polygot + https://dev.to/gautamkrishnar + en + + Hi, I'm Gautam krishna.R + Gautam krishna.R + Sun, 02 Apr 2017 19:04:43 +0000 + https://dev.to/gautamkrishnar/hi-im-gautam-krishnar + https://dev.to/gautamkrishnar/hi-im-gautam-krishnar + <p>I have been coding for 9 years, I am a Microsoft student partner and DuckDuckGo community leader who's actively involved in several open source projects.</p> + + <p>You can find me on GitHub at <a href="https://github.com/gautamkrishnar">@gautamkrishnar</a></p> + + <p>You can also find me on Twitter as <a href="https://twitter.com/gautamkrishnar">@gautamkrishnar</a></p> + + <p>I live in Kollam.</p> + + <p>I mostly program in these languages: Phython, NodeJS, PHP, C, C#, Java.</p> + + <p>I am currently learning more about ML.</p> + + <p>Nice to meet you.</p> + + + hello + apple + introductions + + + God Mode in browsers: document.designMode = "on" + Gautam krishna.R + Thu, 16 Jul 2020 10:40:11 +0000 + world + https://dev.to/gautamkrishnar/god-mode-in-browsers-document-designmode-on-2pmo + https://dev.to/gautamkrishnar/god-mode-in-browsers-document-designmode-on-2pmo + <p>Just type <code>document.designMode = "on"</code> on you favourite browser devtools and see the magic. </p> + + <p>It will make any website editable:</p> + + <p><a href="https://i.giphy.com/media/iDByhJJoZGSac2RH5z/giphy.gif" class="article-body-image-wrapper"><img src="https://i.giphy.com/media/iDByhJJoZGSac2RH5z/giphy.gif" alt="preview"></a></p> + + + chrome + firefox + devtools + webdev + + + God Mode in browsers: document.designMode = "on" + Gautam krishna.R + Thu, 16 Jul 2020 10:40:11 +0000 + world + https://dev.to/gautamkrishnar/god-mode-in-browsers-document-designmode-on-2pmo + https://dev.to/gautamkrishnar/god-mode-in-browsers-document-designmode-on-2pmo + <p>Just type <code>document.designMode = "on"</code> on you favourite browser devtools and see the magic. 
</p> + + <p>It will make any website editable:</p> + + <p><a href="https://i.giphy.com/media/iDByhJJoZGSac2RH5z/giphy.gif" class="article-body-image-wrapper"><img src="https://i.giphy.com/media/iDByhJJoZGSac2RH5z/giphy.gif" alt="preview"></a></p> + + + chrome + firefox + devtools + webdev + + + Comment by Gautam Krishna R on test + Gautam Krishna R + Wed, 19 Apr 2017 12:24:15 +0000 + https://stackoverflow.com/questions/49310909/omniauth-facebook-login-does-not-work/49311637?cid=85720166#49311637 + https://stackoverflow.com/questions/49310909/omniauth-facebook-login-does-not-work/49311637?cid=85720166#49311637 + desc + + + Comment by Gautam Krishna R on test + Gautam Krishna R + Wed, 19 Apr 2017 12:24:15 +0000 + https://stackoverflow.com/questions/49310909/omniauth-facebook-login-does-not-work/49311637?cid=85720166#49311637 + https://stackoverflow.com/questions/49310909/omniauth-facebook-login-does-not-work/49311637?cid=85720166#49311637 + desc + + + + diff --git a/test/test-server.js b/test/test-server.js index dc197c0..13f35a2 100644 --- a/test/test-server.js +++ b/test/test-server.js @@ -11,6 +11,8 @@ const sendResponse = (res, statusCode, data) => { }; const xmlData = fs.readFileSync(path.join(__dirname, 'sample.xml'), 'utf-8'); +const duplicateXmlData = fs.readFileSync(path.join(__dirname, 'sample.duplicate.xml'), 'utf-8'); + http.createServer(function (req, res) { if (req.url === '/failtest') { @@ -42,6 +44,8 @@ http.createServer(function (req, res) { `; sendResponse(res, 200, emptyTagResponse); + } else if (req.url === '/duplicates') { + sendResponse(res, 200, duplicateXmlData); } else { sendResponse(res, 200, xmlData); diff --git a/test/test.js b/test/test.js index f7a0ea1..10435f5 100644 --- a/test/test.js +++ b/test/test.js @@ -29,7 +29,8 @@ const DEFAULT_TEST_ENV = { INPUT_CATEGORIES_TEMPLATE: 'default', INPUT_DISABLE_ITEM_VALIDATION: 'false', INPUT_FILTER_DATES: '', - INPUT_RAND_SEED: '' + INPUT_RAND_SEED: '', + INPUT_REMOVE_DUPLICATES: 'false' }; // 
Folder with readme snapshots @@ -256,4 +257,13 @@ describe('Blog post workflow tests', function () { }; await runAndCompareSnap('Readme.filter_dates.currentYear.md', envObj); }); + it('Generated readme with remove duplicates flag should match the snapshot', async function () { + const envObj = { + ...process.env, + ...DEFAULT_TEST_ENV, + INPUT_FEED_LIST: 'http://localhost:8080/duplicates', + INPUT_REMOVE_DUPLICATES: 'true' + }; + await runAndCompareSnap('Readme.removeDuplicates.md', envObj); + }); });