From e7a28dc282365f6d1d4aa5c5ac9315ee3f314c51 Mon Sep 17 00:00:00 2001 From: Kevin Dew Date: Wed, 3 Apr 2019 18:27:12 +0100 Subject: [PATCH] Fix the mysterious case of spaces being removed This was a rather hard problem to solve, so brace yourself as this message might be complex. We were having cases of HTML pasted from browsers adding span elements in for spaces. So we could end up with HTML like: `Some text Link` where, depending on the environment, the contents of the span could be ` ` or ` `. In browsers it was the former, whereas we experienced the latter on JSDOM. This causes some poor behaviour in turndown where the presence of a nbsp; character (which has an ASCII code of 160) causes the whole space to be stripped out, resulting in `Some text[Link]()` as output. On the other hand the presence of a normal space (ASCII code 32) causes a different problem of two spaces, resulting in output such as `Some text [Link]()`. This problem is due to Turndowns whitespace rules where they only match a normal space character: https://github.com/domchristie/turndown/blob/80297cebeae4b35c8d299b1741b383c74eddc7c1/src/node.js#L25-L33 With our change to blankReplacement we can fix the case for   (which is the more common on we're witnessing) however we can't easily fix the double space issue. We have left tests for both cases to help any future developers that may venture into this hole. We've raised a PR with turndown to fix both of these issues, https://github.com/domchristie/turndown/pull/281, so hopefully in the non-too-distant future we can remove this code. --- src/to-govspeak.js | 14 +++++++++++++- test/to-govspeak.test.js | 16 +++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/to-govspeak.js b/src/to-govspeak.js index 33869a2..5248404 100644 --- a/src/to-govspeak.js +++ b/src/to-govspeak.js @@ -2,7 +2,19 @@ import TurndownService from 'turndown' const service = new TurndownService({ bulletListMarker: '-', - listIndent: ' ' // 3 spaces + listIndent: ' ', // 3 spaces + blankReplacement: (content, node) => { + if (node.isBlock) { + return '\n\n' + } + + // This fixes an issue with turndown where an element with a space + // inside can be removed causing a jarring HTML coversion. + const hasWhitespace = /\s/.test(node.textContent) + const hasFlanking = node.flankingWhitespace.trailing || node.flankingWhitespace.leading + + return hasWhitespace && !hasFlanking ? ' ' : '' + } }) // As a user may have pasted markdown we rather crudley diff --git a/test/to-govspeak.test.js b/test/to-govspeak.test.js index 7698b2f..f0b9133 100644 --- a/test/to-govspeak.test.js +++ b/test/to-govspeak.test.js @@ -5,7 +5,6 @@ import toGovspeak from '../src/to-govspeak' it('converts HTML to govspeak', () => { expect(toGovspeak('

Hello

')).toEqual('Hello') }) - it("doesn't escape markdown", () => { const html = '

[Markdown](link)

' expect(toGovspeak(html)).toEqual('[Markdown](link)') @@ -184,3 +183,18 @@ it('fixes an invalid nested ordered list that Google Docs produces', () => { '2. Parent sibling' ) }) + +it('Fixes cases where a   has the space stripped', () => { + const html = `Some text and some more text` + + expect(toGovspeak(html)).toEqual('Some text and some more text') +}) + +// This test is here to document an undesirable case that took a lot of +// investigation. This should be resolved when/if https://github.com/domchristie/turndown/pull/281 +// is released. +it('Maintains behaviour where a produces a double space', () => { + const html = `Some text and some more text` + + expect(toGovspeak(html)).toEqual('Some text and some more text') +})