From c41de8642ac9a5ffd77a8f0530550a686088e096 Mon Sep 17 00:00:00 2001
From: Rajat Parashar <parasharrajat@users.noreply.github.com>
Date: Thu, 22 Jul 2021 03:30:05 +0530
Subject: [PATCH 1/4] Fix HTML -> MD Conversion

---
 lib/ExpensiMark.js | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/lib/ExpensiMark.js b/lib/ExpensiMark.js
index 811bd4be..3528c40b 100644
--- a/lib/ExpensiMark.js
+++ b/lib/ExpensiMark.js
@@ -1,3 +1,4 @@
+import _ from 'underscore';
 import Str from './str';
 import TLD_REGEX from './tlds';
 
@@ -33,7 +34,7 @@ export default class ExpensiMark {
                 // with the new lines here since they need to be converted into <br>. And we don't
                 // want to do this anywhere else since that would break HTML.
                 // &nbsp; will create styling issues so use &#32;
-                replacement: (match, _, textWithinFences) => {
+                replacement: (match, __, textWithinFences) => {
                     const group = textWithinFences.replace(/(?:(?![\n\r])\s)/g, '&#32;');
                     return `<pre>${group}</pre>`;
                 },
@@ -195,6 +196,13 @@ export default class ExpensiMark {
                 regex: /<(del)(?:"[^"]*"|'[^']*'|[^'">])*>(.*?)<\/\1>(?![^<]*(<\/pre>|<\/code>))/gi,
                 replacement: '~$2~'
             },
+
+            // Used to Exclude tags
+            {
+                name: 'exclude',
+                regex: /<(script|style)(?:"[^"]*"|'[^']*'|[^'">])*>(.*?)<\/\1>(?![^<]*(<\/pre>|<\/code>))/gi,
+                replacement: '',
+            }
         ];
     }
 
@@ -298,7 +306,7 @@ export default class ExpensiMark {
      * @returns {String}
      */
     htmlToMarkdown(htmlString) {
-        let generatedMarkdown = htmlString;
+        let generatedMarkdown = _.unescape(htmlString);
         this.htmlToMarkdownRules.forEach((rule) => {
             // Pre-processes input HTML before applying regex
             if (rule.pre) {

From 4f1ca41f048218dfd97833ffec997bb894fa3ab5 Mon Sep 17 00:00:00 2001
From: Rajat Parashar <parasharrajat@users.noreply.github.com>
Date: Tue, 27 Jul 2021 15:11:52 +0530
Subject: [PATCH 2/4] fix: extra newlines before and after text

---
 lib/ExpensiMark.js | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/lib/ExpensiMark.js b/lib/ExpensiMark.js
index 3528c40b..414aea8e 100644
--- a/lib/ExpensiMark.js
+++ b/lib/ExpensiMark.js
@@ -171,6 +171,18 @@ export default class ExpensiMark {
          * @type {Object[]}
          */
         this.htmlToMarkdownRules = [
+            {
+                name: 'Strip Special Tags',
+                regex: /<\/?(html|body)(?:"[^"]*"|'[^']*'|[^'"><])*>(?![^<]*(<\/pre>|<\/code>))(\n|\r\n)?/gim,
+                replacement: ''
+            },
+
+            // Used to Exclude tags
+            {
+                name: 'exclude',
+                regex: /<(script|style)(?:"[^"]*"|'[^']*'|[^'">])*>(.*?)<\/\1>(?![^<]*(<\/pre>|<\/code>))/gi,
+                replacement: '',
+            },
             {
                 name: 'newline',
 
@@ -196,13 +208,6 @@ export default class ExpensiMark {
                 regex: /<(del)(?:"[^"]*"|'[^']*'|[^'">])*>(.*?)<\/\1>(?![^<]*(<\/pre>|<\/code>))/gi,
                 replacement: '~$2~'
             },
-
-            // Used to Exclude tags
-            {
-                name: 'exclude',
-                regex: /<(script|style)(?:"[^"]*"|'[^']*'|[^'">])*>(.*?)<\/\1>(?![^<]*(<\/pre>|<\/code>))/gi,
-                replacement: '',
-            }
         ];
     }
 
@@ -307,6 +312,7 @@ export default class ExpensiMark {
      */
     htmlToMarkdown(htmlString) {
         let generatedMarkdown = _.unescape(htmlString);
+
         this.htmlToMarkdownRules.forEach((rule) => {
             // Pre-processes input HTML before applying regex
             if (rule.pre) {

From a64c086266153a5445598d69a6cd667355061cfc Mon Sep 17 00:00:00 2001
From: Rajat Parashar <parasharrajat@users.noreply.github.com>
Date: Tue, 27 Jul 2021 15:47:45 +0530
Subject: [PATCH 3/4] fix newlines

---
 lib/ExpensiMark.js | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/lib/ExpensiMark.js b/lib/ExpensiMark.js
index 414aea8e..acd22d1b 100644
--- a/lib/ExpensiMark.js
+++ b/lib/ExpensiMark.js
@@ -173,14 +173,20 @@ export default class ExpensiMark {
         this.htmlToMarkdownRules = [
             {
                 name: 'Strip Special Tags',
-                regex: /<\/?(html|body)(?:"[^"]*"|'[^']*'|[^'"><])*>(?![^<]*(<\/pre>|<\/code>))(\n|\r\n)?/gim,
+                regex: /(\n|\r\n)?<\/?(html|body)(?:"[^"]*"|'[^']*'|[^'"><])*>(?![^<]*(<\/pre>|<\/code>))(\n|\r\n)?/gim,
                 replacement: ''
             },
 
             // Used to Exclude tags
             {
                 name: 'exclude',
-                regex: /<(script|style)(?:"[^"]*"|'[^']*'|[^'">])*>(.*?)<\/\1>(?![^<]*(<\/pre>|<\/code>))/gi,
+                regex: new RegExp(
+                    [
+                        '<(script|style)(?:"[^"]*"|\'[^\']*\'|[^\'">])*>([\\s\\S]*?)<\\/\\1>',
+                        '(?![^<]*(<\\/pre>|<\\/code>))(\n|\r\n)?'
+                    ].join(''),
+                    'gim'
+                ),
                 replacement: '',
             },
             {
@@ -190,7 +196,7 @@ export default class ExpensiMark {
                 pre: inputString => inputString.replace('<br></br>', '<br/>').replace('<br><br/>', '<br/>'),
 
                 // Include the immediately followed newline as `<br>\n` should be equal to one \n.
-                regex: /<br(?:"[^"]*"|'[^']*'|[^'">])*>(?![^<]*(<\/pre>|<\/code>))\n?/gi,
+                regex: /<br(?:"[^"]*"|'[^']*'|[^'"><])*>(?![^<]*(<\/pre>|<\/code>))\n?/gi,
                 replacement: '\n'
             },
             {

From 501ee5e618019d1adab43633c82d714cfef504db Mon Sep 17 00:00:00 2001
From: Rajat Parashar <parasharrajat@users.noreply.github.com>
Date: Tue, 27 Jul 2021 15:47:59 +0530
Subject: [PATCH 4/4] added more tests

---
 __tests__/ExpensiMark-Markdown-test.js | 60 +++++++++++++++++++++++---
 1 file changed, 55 insertions(+), 5 deletions(-)

diff --git a/__tests__/ExpensiMark-Markdown-test.js b/__tests__/ExpensiMark-Markdown-test.js
index e6b4a12c..cfd9e288 100644
--- a/__tests__/ExpensiMark-Markdown-test.js
+++ b/__tests__/ExpensiMark-Markdown-test.js
@@ -5,9 +5,9 @@ const parser = new ExpensiMark();
 
 test('Test bold HTML replacement', () => {
     const boldTestStartString = 'This is a <strong>sentence,</strong> and it has some <strong>punctuation, words, and spaces</strong>. '
-    + '<strong>test</strong> * testing* test*test*test. * testing * *testing * '
-    + 'This is a <b>sentence,</b> and it has some <b>punctuation, words, and spaces</b>. '
-    + '<b>test</b> * testing* test*test*test. * testing * *testing *';
+        + '<strong>test</strong> * testing* test*test*test. * testing * *testing * '
+        + 'This is a <b>sentence,</b> and it has some <b>punctuation, words, and spaces</b>. '
+        + '<b>test</b> * testing* test*test*test. * testing * *testing *';
     const boldTestReplacedString = 'This is a *sentence,* and it has some *punctuation, words, and spaces*. '
         + '*test* * testing* test*test*test. * testing * *testing * '
         + 'This is a *sentence,* and it has some *punctuation, words, and spaces*. '
@@ -18,9 +18,9 @@ test('Test bold HTML replacement', () => {
 
 test('Test italic HTML replacement', () => {
     const italicTestStartString = 'This is a <em>sentence,</em> and it has some <em>punctuation, words, and spaces</em>. <em>test</em> _ testing_ test_test_test. _ test _ _test _ '
-    + 'This is a <i>sentence,</i> and it has some <i>punctuation, words, and spaces</i>. <i>test</i> _ testing_ test_test_test. _ test _ _test _';
+        + 'This is a <i>sentence,</i> and it has some <i>punctuation, words, and spaces</i>. <i>test</i> _ testing_ test_test_test. _ test _ _test _';
     const italicTestReplacedString = 'This is a _sentence,_ and it has some _punctuation, words, and spaces_. _test_ _ testing_ test_test_test. _ test _ _test _ '
-    + 'This is a _sentence,_ and it has some _punctuation, words, and spaces_. _test_ _ testing_ test_test_test. _ test _ _test _';
+        + 'This is a _sentence,_ and it has some _punctuation, words, and spaces_. _test_ _ testing_ test_test_test. _ test _ _test _';
     expect(parser.htmlToMarkdown(italicTestStartString)).toBe(italicTestReplacedString);
 });
 
@@ -64,3 +64,53 @@ test('Test HTML string with attributes', () => {
 
     expect(parser.htmlToMarkdown(testString)).toBe(resultString);
 });
+
+test('Test HTML string with spcial Tags', () => {
+    const testString = '<html>\n<body>\n<!--StartFragment--><span style="color: rgb(0, 0, 0); font-family: &quot;Times New Roman&quot;; font-size: medium; font-style: normal; font-variant-ligatures: normal; font-variant-caps: normal; font-weight: 400; letter-spacing: normal; orphans: 2; text-align: start; text-indent: 0px; text-transform: none; white-space: pre-wrap; widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px; text-decoration-thickness: initial; text-decoration-style: initial; text-decoration-color: initial; display: inline !important; float: none;">test message</span><!--EndFragment-->\n</body>\n</html>\n';
+    const resultString = 'test message';
+
+    expect(parser.htmlToMarkdown(testString)).toBe(resultString);
+});
+
+
+test('Test HTML string with Internal Tags', () => { // expects <style> and <script> blocks to be dropped together with their contents
+    const testString = `<style>
+    span {
+        color: rgb(0, 0, 0);
+        font-family: "Times New Roman";
+        font-size: medium;
+        font-style: normal;
+        font-variant-ligatures: normal;
+        font-variant-caps: normal;
+        font-weight: 400;
+        letter-spacing: normal;
+        orphans: 2;
+        text-align: start;
+        text-indent: 0px;
+        text-transform: none;
+        white-space: pre-wrap;
+        widows: 2;
+        word-spacing: 0px;
+        -webkit-text-stroke-width: 0px;
+        text-decoration-thickness: initial;
+        text-decoration-style: initial;
+        text-decoration-color: initial;
+        display: inline !important;
+        float: none;
+    }
+</style>
+<script type="text/javascript">
+    document.write('Hacked');
+</script>
+<p>test message</p>`;
+    const resultString = 'test message'; // only the paragraph text should remain, with no leading blank lines
+
+    expect(parser.htmlToMarkdown(testString)).toBe(resultString);
+});
+
+test('Test HTML string with encoded entities', () => {
+    const testString = 'Text Entity &amp; &quot;';
+    const resultString = 'Text Entity & "'; // &amp; and &quot; are expected back as their literal characters
+
+    expect(parser.htmlToMarkdown(testString)).toBe(resultString);
+});