diff --git a/src/Markdown.ts b/src/Markdown.ts index 96169d4011d..c6c77245393 100644 --- a/src/Markdown.ts +++ b/src/Markdown.ts @@ -17,6 +17,8 @@ limitations under the License. import * as commonmark from 'commonmark'; import { escape } from "lodash"; +import { logger } from 'matrix-js-sdk/src/logger'; +import * as linkify from 'linkifyjs'; const ALLOWED_HTML_TAGS = ['sub', 'sup', 'del', 'u']; @@ -29,6 +31,9 @@ interface CommonmarkHtmlRendererInternal extends commonmark.HtmlRenderer { link: (node: commonmark.Node, entering: boolean) => void; html_inline: (node: commonmark.Node) => void; // eslint-disable-line camelcase html_block: (node: commonmark.Node) => void; // eslint-disable-line camelcase + text: (node: commonmark.Node) => void; + out: (text: string) => void; + emph: (node: commonmark.Node) => void; } function isAllowedHtmlTag(node: commonmark.Node): boolean { @@ -61,6 +66,33 @@ function isMultiLine(node: commonmark.Node): boolean { return par.firstChild != par.lastChild; } +function getTextUntilEndOrLinebreak(node: commonmark.Node) { + let currentNode = node; + let text = ''; + while (currentNode !== null && currentNode.type !== 'softbreak' && currentNode.type !== 'linebreak') { + const { literal, type } = currentNode; + if (type === 'text' && literal) { + let n = 0; + let char = literal[n]; + while (char !== ' ' && char !== null && n <= literal.length) { + if (char === ' ') { + break; + } + if (char) { + text += char; + } + n += 1; + char = literal[n]; + } + if (char === ' ') { + break; + } + } + currentNode = currentNode.next; + } + return text; +} + /** * Class that wraps commonmark, adding the ability to see whether * a given message actually uses any markdown syntax or whether @@ -70,11 +102,103 @@ export default class Markdown { private input: string; private parsed: commonmark.Node; - constructor(input) { + constructor(input: string) { this.input = input; const parser = new commonmark.Parser(); this.parsed = parser.parse(this.input); + this.parsed = this.repairLinks(this.parsed); + } + + /** + * This method is modifying the parsed AST in such a way that links are always + * properly linkified instead of sometimes being wrongly emphasised in case + * if you were to write a link like the example below: + * https://my_weird-link_domain.domain.com + * ^ this link would be parsed to something like this: + * https://myweird-linkdomain.domain.com + * This method makes it so the link gets properly modified to a version where it is + * not emphasised until it actually ends. + * See: https://github.com/vector-im/element-web/issues/4674 + * @param parsed + */ + private repairLinks(parsed: commonmark.Node) { + const walker = parsed.walker(); + let event: commonmark.NodeWalkingStep = null; + let text = ''; + let isInPara = false; + let previousNode: commonmark.Node | null = null; + let shouldUnlinkEmphasisNode = false; + while ((event = walker.next())) { + const { node } = event; + if (node.type === 'paragraph') { + if (event.entering) { + isInPara = true; + } else { + isInPara = false; + } + } + if (isInPara) { + // Clear saved string when line ends + if ( + node.type === 'softbreak' || + node.type === 'linebreak' || + // Also start calculating the text from the beginning on any spaces + (node.type === 'text' && node.literal === ' ') + ) { + text = ''; + } + if (node.type === 'text') { + text += node.literal; + } + // We should not do this if previous node was not a textnode, as we can't combine it then. + if (node.type === 'emph' && previousNode.type === 'text') { + if (event.entering) { + const foundLinks = linkify.find(text); + for (const { value } of foundLinks) { + if (node.firstChild.literal) { + /** + * NOTE: This technically should unlink the emph node and create LINK nodes instead, adding all the next elements as siblings + * but this solution seems to work well and is hopefully slightly easier to understand too + */ + const nonEmphasizedText = `_${node.firstChild.literal}_`; + const f = getTextUntilEndOrLinebreak(node); + const newText = value + nonEmphasizedText + f; + const newLinks = linkify.find(newText); + // Should always find only one link here, if it finds more it means that the algorithm is broken + if (newLinks.length === 1) { + const emphasisTextNode = new commonmark.Node('text'); + emphasisTextNode.literal = nonEmphasizedText; + previousNode.insertAfter(emphasisTextNode); + node.firstChild.literal = ''; + event = node.walker().next(); + // Remove `em` opening and closing nodes + node.unlink(); + previousNode.insertAfter(event.node); + shouldUnlinkEmphasisNode = true; + } else { + logger.error( + "Markdown links escaping found too many links for following text: ", + text, + ); + logger.error( + "Markdown links escaping found too many links for modified text: ", + newText, + ); + } + } + } + } else { + if (shouldUnlinkEmphasisNode) { + node.unlink(); + shouldUnlinkEmphasisNode = false; + } + } + } + } + previousNode = node; + } + return parsed; } isPlainText(): boolean { @@ -120,9 +244,7 @@ export default class Markdown { // you can nest them. // // Let's try sending with

s anyway for now, though. - const realParagraph = renderer.paragraph; - renderer.paragraph = function(node: commonmark.Node, entering: boolean) { // If there is only one top level node, just return the // bare text: it's a single line of text and so should be diff --git a/test/Markdown-test.ts b/test/Markdown-test.ts new file mode 100644 index 00000000000..602fda3dfc1 --- /dev/null +++ b/test/Markdown-test.ts @@ -0,0 +1,143 @@ +/* +Copyright 2021 The Matrix.org Foundation C.I.C. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +import * as linkifyjs from 'linkifyjs'; +import Markdown from "../src/Markdown"; +import matrixLinkify from '../src/linkify-matrix'; + +beforeAll(() => { + // We need to call linkifier plugins before running those tests + matrixLinkify(linkifyjs); +}); + +describe("Markdown parser test", () => { + describe("fixing HTML links", () => { + const testString = [ + "Test1:", + "#_foonetic_xkcd:matrix.org", + "http://google.com/_thing_", + "https://matrix.org/_matrix/client/foo/123_", + "#_foonetic_xkcd:matrix.org", + "", + "Test1A:", + "#_foonetic_xkcd:matrix.org", + "http://google.com/_thing_", + "https://matrix.org/_matrix/client/foo/123_", + "#_foonetic_xkcd:matrix.org", + "", + "Test2:", + "http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg", + "http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg", + "", + "Test3:", + "https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org", + "https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org", + ].join("\n"); + + it('tests that links are getting properly HTML formatted', () => { + /* eslint-disable max-len */ + const expectedResult = [ + "

Test1:
#_foonetic_xkcd:matrix.org
http://google.com/_thing_
https://matrix.org/_matrix/client/foo/123_
#_foonetic_xkcd:matrix.org

", + "

Test1A:
#_foonetic_xkcd:matrix.org
http://google.com/_thing_
https://matrix.org/_matrix/client/foo/123_
#_foonetic_xkcd:matrix.org

", + "

Test2:
http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg
http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg

", + "

Test3:
https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org
https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org

", + "", + ].join("\n"); + /* eslint-enable max-len */ + const md = new Markdown(testString); + expect(md.toHTML()).toEqual(expectedResult); + }); + it('tests that links with autolinks are not touched at all and are still properly formatted', () => { + const test = [ + "Test1:", + "<#_foonetic_xkcd:matrix.org>", + "", + "", + "<#_foonetic_xkcd:matrix.org>", + "", + "Test1A:", + "<#_foonetic_xkcd:matrix.org>", + "", + "", + "<#_foonetic_xkcd:matrix.org>", + "", + "Test2:", + "", + "", + "", + "Test3:", + "", + "", + ].join("\n"); + /* eslint-disable max-len */ + /** + * NOTE: I'm not entirely sure if those "<"" and ">" should be visible in here for #_foonetic_xkcd:matrix.org + * but it seems to be actually working properly + */ + const expectedResult = [ + "

Test1:
<#_foonetic_xkcd:matrix.org>
http://google.com/_thing_
https://matrix.org/_matrix/client/foo/123_
<#_foonetic_xkcd:matrix.org>

", + "

Test1A:
<#_foonetic_xkcd:matrix.org>
http://google.com/_thing_
https://matrix.org/_matrix/client/foo/123_
<#_foonetic_xkcd:matrix.org>

", + "

Test2:
http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg
http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg

", + "

Test3:
https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org
https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org

", + "", + ].join("\n"); + /* eslint-enable max-len */ + const md = new Markdown(test); + expect(md.toHTML()).toEqual(expectedResult); + }); + + it('expects that links in codeblock are not modified', () => { + const expectedResult = [ + '
#_foonetic_xkcd:matrix.org',
+                'http://google.com/_thing_',
+                'https://matrix.org/_matrix/client/foo/123_',
+                '#_foonetic_xkcd:matrix.org',
+                '',
+                'Test1A:',
+                '#_foonetic_xkcd:matrix.org',
+                'http://google.com/_thing_',
+                'https://matrix.org/_matrix/client/foo/123_',
+                '#_foonetic_xkcd:matrix.org',
+                '',
+                'Test2:',
+                'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
+                'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
+                '',
+                'Test3:',
+                'https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org',
+                'https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org```',
+                '
', + '', + ].join('\n'); + const md = new Markdown("```" + testString + "```"); + expect(md.toHTML()).toEqual(expectedResult); + }); + + it('expects that links in one line will be "escaped" properly', () => { + /* eslint-disable max-len */ + const testString = [ + 'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg' + " " + 'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg', + 'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg' + " " + 'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg', + ].join('\n'); + const expectedResult = [ + "http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg", + "http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg", + ].join('
'); + /* eslint-enable max-len */ + const md = new Markdown(testString); + expect(md.toHTML()).toEqual(expectedResult); + }); + }); +}); diff --git a/test/editor/deserialize-test.js b/test/editor/deserialize-test.js index 17e8a317801..998e1d01907 100644 --- a/test/editor/deserialize-test.js +++ b/test/editor/deserialize-test.js @@ -197,7 +197,6 @@ describe('editor/deserialize', function() { it('code block with no trailing text', function() { const html = "
0xDEADBEEF\n
\n"; const parts = normalize(parseEvent(htmlMessage(html), createPartCreator())); - console.log(parts); expect(parts.length).toBe(5); expect(parts[0]).toStrictEqual({ type: "plain", text: "```" }); expect(parts[1]).toStrictEqual({ type: "newline", text: "\n" });