fix: Fix issue #21 by implementing a better parser for img and video blocks.
plone · Mar 6, 2023 · b49614e · b49614e
1 parent ebea336
commit b49614e
Show file tree

Hide file tree

Showing 3 changed files with 112 additions and 14 deletions.
diff --git a/src/converters/fromHtml.js b/src/converters/fromHtml.js
@@ -14,10 +14,41 @@ const parser = new DOMParser();
 
 global.document = new JSDOM('...').window.document;
 
-const TEXT = 3;
+const TEXT_NODE = 3;
 const COMMENT = 8;
 
 const elementsWithConverters = ['IMG', 'VIDEO', 'TABLE', 'IFRAME'];
+const elementsShouldHaveText = [
+  'B',
+  'BLOCKQUOTE',
+  'BODY',
+  'CODE',
+  'DEL',
+  'DIV',
+  'EM',
+  'H1',
+  'H2',
+  'H3',
+  'H4',
+  'H5',
+  'H6',
+  'I',
+  'P',
+  'PRE',
+  'S',
+  'SPAN',
+  'STRONG',
+  'SUB',
+  'SUP',
+  'U',
+];
+
+const shouldKeepWrapper = (el) => {
+  if (elementsShouldHaveText.includes(el.tagName)) {
+    return el.textContent ? true : false;
+  }
+  return true;
+};
 
 const blockFromElement = (el, defaultTextBlock) => {
   let textBlock = slateTextBlock;
@@ -52,13 +83,13 @@ const skipCommentsAndWhitespace = (elements) => {
     (node) =>
       !(
         node.nodeType === COMMENT ||
-        (node.nodeType === TEXT && isWhitespace(node.textContent))
+        (node.nodeType === TEXT_NODE && isWhitespace(node.textContent))
       ),
   );
 };
 
 const isInline = (n) =>
-  n.nodeType === TEXT || isGlobalInline(n.tagName.toLowerCase());
+  n.nodeType === TEXT_NODE || isGlobalInline(n.tagName.toLowerCase());
 
 const convertFromHTML = (input, defaultTextBlock) => {
   const document = parser.parseFromString(input, 'text/html');
@@ -87,7 +118,7 @@ const convertFromHTML = (input, defaultTextBlock) => {
   // convert to blocks
   for (const el of elements) {
     const children = el.childNodes;
-    let keepWrapper = el.textContent ? true : false;
+    let keepWrapper = shouldKeepWrapper(el);
     for (const child of children) {
       // With children nodes, we keep the wrapper only
       // if at least one child is not in elementsWithConverters

diff --git a/src/converters/fromHtml.test.js b/src/converters/fromHtml.test.js
@@ -82,7 +82,7 @@ describe('convertFromHTML parsing html with images nested in h2', () => {
     const result = convertFromHTML(html, 'draftjs');
 
     test('will return an array of blocks', () => {
-      expect(result).toHaveLength(8);
+      expect(result).toHaveLength(10);
     });
 
     test('will have a first block with an image', () => {
@@ -113,7 +113,7 @@ describe('convertFromHTML parsing html with images nested in h2', () => {
     const result = convertFromHTML(html, 'slate');
 
     test('will return an array of blocks', () => {
-      expect(result).toHaveLength(8);
+      expect(result).toHaveLength(10);
     });
 
     test('will have a first block with an image', () => {
@@ -330,10 +330,12 @@ describe('convertFromHTML parsing whitespace inside unknown tags', () => {
   });
 });
 
-describe('convertFromHTML parsing image inside a p element', () => {
-  const html = '<p><img src="image.jpeg"></p>';
+describe('convertFromHTML parsing image', () => {
+  // https://github.com/plone/blocks-conversion-tool/issues/21
+
+  describe('on its own', () => {
+    const html = '<img src="image.jpeg">';
 
-  describe('returns a block with an image', () => {
     const result = convertFromHTML(html, 'slate');
     expect(result).toHaveLength(1);
     expect(result).toEqual([
@@ -347,12 +349,63 @@ describe('convertFromHTML parsing image inside a p element', () => {
       },
     ]);
   });
-});
 
-describe('convertFromHTML parsing image inside a span element', () => {
-  const html = '<p><span><img src="image.jpeg"></span></p>';
+  describe('inside a p element', () => {
+    const html = '<p><img src="image.jpeg"></p>';
+
+    const result = convertFromHTML(html, 'slate');
+    expect(result).toHaveLength(1);
+    expect(result).toEqual([
+      {
+        '@type': 'image',
+        align: 'center',
+        alt: '',
+        size: 'l',
+        title: '',
+        url: 'image.jpeg',
+      },
+    ]);
+  });
+
+  describe('inside a span element', () => {
+    const html = '<p><span><img src="image.jpeg"></span></p>';
+
+    const result = convertFromHTML(html, 'slate');
+    expect(result).toHaveLength(1);
+    expect(result).toEqual([
+      {
+        '@type': 'image',
+        align: 'center',
+        alt: '',
+        size: 'l',
+        title: '',
+        url: 'image.jpeg',
+      },
+    ]);
+  });
+
+  describe('inside a div element', () => {
+    // https://github.com/plone/blocks-conversion-tool/issues/21#issuecomment-1455176066
+    const html = '<div><img src="image.jpeg"></div>';
+
+    const result = convertFromHTML(html, 'slate');
+    expect(result).toHaveLength(1);
+    expect(result).toEqual([
+      {
+        '@type': 'image',
+        align: 'center',
+        alt: '',
+        size: 'l',
+        title: '',
+        url: 'image.jpeg',
+      },
+    ]);
+  });
+
+  describe('inside a nested div element', () => {
+    // https://github.com/plone/blocks-conversion-tool/issues/21#issuecomment-1455176066
+    const html = '<div><div><img src="image.jpeg"></div></div>';
 
-  describe('returns valid result preserving the whitespace', () => {
     const result = convertFromHTML(html, 'slate');
     expect(result).toHaveLength(1);
     expect(result).toEqual([

diff --git a/src/converters/slate.js b/src/converters/slate.js
@@ -148,7 +148,21 @@ const bodyTagDeserializer = (el) => {
 };
 
 const divTagDeserializer = (el) => {
-  const children = Array.from(el.childNodes)
+  let children = el.childNodes;
+  if (children.length === 1) {
+    const child = children[0];
+    if (
+      // handle formatting from OpenOffice
+      child.nodeType === TEXT_NODE &&
+      child.textContent === '\n'
+    ) {
+      return jsx('text', {}, ' ');
+    } else if (elementsWithConverters.hasOwnProperty(child.tagName)) {
+      // If we have a child element that has its own converter, use it
+      return elementsWithConverters[child.tagName](child);
+    }
+  }
+  children = Array.from(children)
     .map((child) => {
       if (child.nodeType === TEXT_NODE) {
         let value = deserialize(child);