From c41de8642ac9a5ffd77a8f0530550a686088e096 Mon Sep 17 00:00:00 2001 From: Rajat Parashar Date: Thu, 22 Jul 2021 03:30:05 +0530 Subject: [PATCH 1/4] Fix HTML-> MD Conversion --- lib/ExpensiMark.js | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/lib/ExpensiMark.js b/lib/ExpensiMark.js index 811bd4be..3528c40b 100644 --- a/lib/ExpensiMark.js +++ b/lib/ExpensiMark.js @@ -1,3 +1,4 @@ +import _ from 'underscore'; import Str from './str'; import TLD_REGEX from './tlds'; @@ -33,7 +34,7 @@ export default class ExpensiMark { // with the new lines here since they need to be converted into
. And we don't // want to do this anywhere else since that would break HTML. //   will create styling issues so use - replacement: (match, _, textWithinFences) => { + replacement: (match, __, textWithinFences) => { const group = textWithinFences.replace(/(?:(?![\n\r])\s)/g, ' '); return `
${group}
`; }, @@ -195,6 +196,13 @@ export default class ExpensiMark { regex: /<(del)(?:"[^"]*"|'[^']*'|[^'">])*>(.*?)<\/\1>(?![^<]*(<\/pre>|<\/code>))/gi, replacement: '~$2~' }, + + // Used to Exclude tags + { + name: 'exclude', + regex: /<(script|style)(?:"[^"]*"|'[^']*'|[^'">])*>(.*?)<\/\1>(?![^<]*(<\/pre>|<\/code>))/gi, + replacement: '', + } ]; } @@ -298,7 +306,7 @@ export default class ExpensiMark { * @returns {String} */ htmlToMarkdown(htmlString) { - let generatedMarkdown = htmlString; + let generatedMarkdown = _.unescape(htmlString); this.htmlToMarkdownRules.forEach((rule) => { // Pre-processes input HTML before applying regex if (rule.pre) { From 4f1ca41f048218dfd97833ffec997bb894fa3ab5 Mon Sep 17 00:00:00 2001 From: Rajat Parashar Date: Tue, 27 Jul 2021 15:11:52 +0530 Subject: [PATCH 2/4] fix: extra newlines before after text --- lib/ExpensiMark.js | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/lib/ExpensiMark.js b/lib/ExpensiMark.js index 3528c40b..414aea8e 100644 --- a/lib/ExpensiMark.js +++ b/lib/ExpensiMark.js @@ -171,6 +171,18 @@ export default class ExpensiMark { * @type {Object[]} */ this.htmlToMarkdownRules = [ + { + name: 'Strip Special Tags', + regex: /<\/?(html|body)(?:"[^"]*"|'[^']*'|[^'"><])*>(?![^<]*(<\/pre>|<\/code>))(\n|\r\n)?/gim, + replacement: '' + }, + + // Used to Exclude tags + { + name: 'exclude', + regex: /<(script|style)(?:"[^"]*"|'[^']*'|[^'">])*>(.*?)<\/\1>(?![^<]*(<\/pre>|<\/code>))/gi, + replacement: '', + }, { name: 'newline', @@ -196,13 +208,6 @@ export default class ExpensiMark { regex: /<(del)(?:"[^"]*"|'[^']*'|[^'">])*>(.*?)<\/\1>(?![^<]*(<\/pre>|<\/code>))/gi, replacement: '~$2~' }, - - // Used to Exclude tags - { - name: 'exclude', - regex: /<(script|style)(?:"[^"]*"|'[^']*'|[^'">])*>(.*?)<\/\1>(?![^<]*(<\/pre>|<\/code>))/gi, - replacement: '', - } ]; } @@ -307,6 +312,7 @@ export default class ExpensiMark { */ htmlToMarkdown(htmlString) { let generatedMarkdown = _.unescape(htmlString); + this.htmlToMarkdownRules.forEach((rule) => { // Pre-processes input HTML before applying regex if (rule.pre) { From a64c086266153a5445598d69a6cd667355061cfc Mon Sep 17 00:00:00 2001 From: Rajat Parashar Date: Tue, 27 Jul 2021 15:47:45 +0530 Subject: [PATCH 3/4] fix newlines --- lib/ExpensiMark.js | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/lib/ExpensiMark.js b/lib/ExpensiMark.js index 414aea8e..acd22d1b 100644 --- a/lib/ExpensiMark.js +++ b/lib/ExpensiMark.js @@ -173,14 +173,20 @@ export default class ExpensiMark { this.htmlToMarkdownRules = [ { name: 'Strip Special Tags', - regex: /<\/?(html|body)(?:"[^"]*"|'[^']*'|[^'"><])*>(?![^<]*(<\/pre>|<\/code>))(\n|\r\n)?/gim, + regex: /(\n|\r\n)?<\/?(html|body)(?:"[^"]*"|'[^']*'|[^'"><])*>(?![^<]*(<\/pre>|<\/code>))(\n|\r\n)?/gim, replacement: '' }, // Used to Exclude tags { name: 'exclude', - regex: /<(script|style)(?:"[^"]*"|'[^']*'|[^'">])*>(.*?)<\/\1>(?![^<]*(<\/pre>|<\/code>))/gi, + regex: new RegExp( + [ + '<(script|style)(?:"[^"]*"|\'[^\']*\'|[^\'">])*>([\\s\\S]*?)<\\/\\1>', + '(?![^<]*(<\\/pre>|<\\/code>))(\n|\r\n)?' + ].join(''), + 'gim' + ), replacement: '', }, { @@ -190,7 +196,7 @@ export default class ExpensiMark { pre: inputString => inputString.replace('

', '
').replace('

', '
'), // Include the immediately followed newline as `
\n` should be equal to one \n. - regex: /])*>(?![^<]*(<\/pre>|<\/code>))\n?/gi, + regex: /<])*>(?![^<]*(<\/pre>|<\/code>))\n?/gi, replacement: '\n' }, { From 501ee5e618019d1adab43633c82d714cfef504db Mon Sep 17 00:00:00 2001 From: Rajat Parashar Date: Tue, 27 Jul 2021 15:47:59 +0530 Subject: [PATCH 4/4] added more tests --- __tests__/ExpensiMark-Markdown-test.js | 60 +++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 5 deletions(-) diff --git a/__tests__/ExpensiMark-Markdown-test.js b/__tests__/ExpensiMark-Markdown-test.js index e6b4a12c..cfd9e288 100644 --- a/__tests__/ExpensiMark-Markdown-test.js +++ b/__tests__/ExpensiMark-Markdown-test.js @@ -5,9 +5,9 @@ const parser = new ExpensiMark(); test('Test bold HTML replacement', () => { const boldTestStartString = 'This is a sentence, and it has some punctuation, words, and spaces. ' - + 'test * testing* test*test*test. * testing * *testing * ' - + 'This is a sentence, and it has some punctuation, words, and spaces. ' - + 'test * testing* test*test*test. * testing * *testing *'; + + 'test * testing* test*test*test. * testing * *testing * ' + + 'This is a sentence, and it has some punctuation, words, and spaces. ' + + 'test * testing* test*test*test. * testing * *testing *'; const boldTestReplacedString = 'This is a *sentence,* and it has some *punctuation, words, and spaces*. ' + '*test* * testing* test*test*test. * testing * *testing * ' + 'This is a *sentence,* and it has some *punctuation, words, and spaces*. ' @@ -18,9 +18,9 @@ test('Test bold HTML replacement', () => { test('Test italic HTML replacement', () => { const italicTestStartString = 'This is a sentence, and it has some punctuation, words, and spaces. test _ testing_ test_test_test. _ test _ _test _ ' - + 'This is a sentence, and it has some punctuation, words, and spaces. test _ testing_ test_test_test. _ test _ _test _'; + + 'This is a sentence, and it has some punctuation, words, and spaces. test _ testing_ test_test_test. _ test _ _test _'; const italicTestReplacedString = 'This is a _sentence,_ and it has some _punctuation, words, and spaces_. _test_ _ testing_ test_test_test. _ test _ _test _ ' - + 'This is a _sentence,_ and it has some _punctuation, words, and spaces_. _test_ _ testing_ test_test_test. _ test _ _test _'; + + 'This is a _sentence,_ and it has some _punctuation, words, and spaces_. _test_ _ testing_ test_test_test. _ test _ _test _'; expect(parser.htmlToMarkdown(italicTestStartString)).toBe(italicTestReplacedString); }); @@ -64,3 +64,53 @@ test('Test HTML string with attributes', () => { expect(parser.htmlToMarkdown(testString)).toBe(resultString); }); + +test('Test HTML string with spcial Tags', () => { + const testString = '\n\ntest message\n\n\n'; + const resultString = 'test message'; + + expect(parser.htmlToMarkdown(testString)).toBe(resultString); +}); + + +test('Test HTML string with Internal Tags', () => { + const testString = ` + +

test message

`; + const resultString = 'test message'; + + expect(parser.htmlToMarkdown(testString)).toBe(resultString); +}); + +test('Test HTML string with encoded entities', () => { + const testString = 'Text Entity & "'; + const resultString = 'Text Entity & "'; + + expect(parser.htmlToMarkdown(testString)).toBe(resultString); +});