From f70cb1dbf41bcbb4ee3209adb02f809c97039995 Mon Sep 17 00:00:00 2001 From: Tyler Barnes Date: Fri, 26 Feb 2021 10:46:19 -0800 Subject: [PATCH] fix(gatsby-source-wordpress): HTML image regex's (#29778) Co-authored-by: gatsbybot (cherry picked from commit f6edccf8440acc29002ea2c89a815fe863c94670) --- .../__tests__/process-node.test.js | 25 +++++++ .../source-nodes/create-nodes/process-node.js | 71 +++++++++++-------- .../fetch-referenced-media-items.js | 3 +- 3 files changed, 68 insertions(+), 31 deletions(-) create mode 100644 packages/gatsby-source-wordpress/__tests__/process-node.test.js diff --git a/packages/gatsby-source-wordpress/__tests__/process-node.test.js b/packages/gatsby-source-wordpress/__tests__/process-node.test.js new file mode 100644 index 0000000000000..2e74410265063 --- /dev/null +++ b/packages/gatsby-source-wordpress/__tests__/process-node.test.js @@ -0,0 +1,25 @@ +import { + getImgSrcRemoteFileMatchesFromNodeString, + getImgTagMatchesWithUrl, +} from "../dist/steps/source-nodes/create-nodes/process-node" + +test(`HTML image transformation regex matches images`, async () => { + const wpUrl = `http://wp.fakesite.com` + + const nodeString = ` + + + + ` + + const matches = getImgSrcRemoteFileMatchesFromNodeString(nodeString) + + expect(matches.length).toBe(3) + + const imgTagMatches = getImgTagMatchesWithUrl({ + nodeString, + wpUrl, + }) + + expect(imgTagMatches.length).toBe(3) +}) diff --git a/packages/gatsby-source-wordpress/src/steps/source-nodes/create-nodes/process-node.js b/packages/gatsby-source-wordpress/src/steps/source-nodes/create-nodes/process-node.js index 4ac9c42581c9b..f2cb23da72615 100644 --- a/packages/gatsby-source-wordpress/src/steps/source-nodes/create-nodes/process-node.js +++ b/packages/gatsby-source-wordpress/src/steps/source-nodes/create-nodes/process-node.js @@ -1,3 +1,4 @@ +/* eslint-disable no-useless-escape */ import { isWebUri } from "valid-url" import { fluid } from "gatsby-plugin-sharp" import Img from "gatsby-image" @@ -30,7 +31,7 @@ const getNodeEditLink = node => { const findReferencedImageNodeIds = ({ nodeString, pluginOptions, node }) => { // if the lazyNodes plugin option is set we don't need to find - // image node id's because those nodes will be fetched lazily in resolvers + // image node id's because those nodes will be fetched lazily in resolvers. if (pluginOptions.type.MediaItem.lazyNodes) { return [] } @@ -327,6 +328,17 @@ const getCheerioElementFromMatch = wpUrl => ({ match, tag = `img` }) => { } } +const getCheerioElementsFromMatches = ({ imgTagMatches, wpUrl }) => + imgTagMatches + .map(getCheerioElementFromMatch(wpUrl)) + .filter(({ cheerioImg: { attribs } }) => { + if (!attribs.src) { + return false + } + + return isWebUri(encodeURI(attribs.src)) + }) + const getLargestSizeFromSizesAttribute = sizesString => { const sizesStringsArray = sizesString.split(`,`) @@ -444,6 +456,28 @@ const cacheCreatedFileNodeBySrc = ({ node, src }) => { } } +const imgSrcRemoteFileRegex = /(?:src=\\")((?:(?:https?|ftp|file):\/\/|www\.|ftp\.|\/)(?:[^'"])*\.(?:jpeg|jpg|png|gif|ico|mpg|ogv|svg|bmp|tif|tiff))(\?[^\\" \.]*|)(?=\\"| |\.)/gim + +export const getImgSrcRemoteFileMatchesFromNodeString = nodeString => + execall(imgSrcRemoteFileRegex, nodeString).filter(({ subMatches }) => { + // if our match is json encoded, that means it's inside a JSON + // encoded string field. + const isInJSON = subMatches[0].includes(`\\/\\/`) + + // we shouldn't process encoded JSON, so skip this match if it's JSON + return !isInJSON + }) + +export const getImgTagMatchesWithUrl = ({ nodeString, wpUrl }) => + execall( + //gim, + nodeString + // we don't want to match images inside pre + .replace(/.*(<\/pre>)/gim, ``) + // and code tags, so temporarily remove those tags and everything inside them + .replace(/.*(<\/code>)/gim, ``) + ).filter(filterMatches(wpUrl)) + const replaceNodeHtmlImages = async ({ nodeString, node, @@ -456,38 +490,15 @@ const replaceNodeHtmlImages = async ({ return nodeString } - const imgSrcRemoteFileRegex = /(?:src=\\")((?:(?:https?|ftp|file):\/\/|www\.|ftp\.|\/)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])\.(?:jpeg|jpg|png|gif|ico|mpg|ogv|svg|bmp|tif|tiff))(\?[^\\" .]*|)(?=\\"| |\.)/gim + const imageUrlMatches = getImgSrcRemoteFileMatchesFromNodeString(nodeString) - const imageUrlMatches = execall(imgSrcRemoteFileRegex, nodeString).filter( - ({ subMatches }) => { - // if our match is json encoded, that means it's inside a JSON - // encoded string field. - const isInJSON = subMatches[0].includes(`\\/\\/`) - - // we shouldn't process encoded JSON, so skip this match if it's JSON - return !isInJSON - } - ) - - const imgTagMatches = execall( - //gim, - nodeString - // we don't want to match images inside pre - .replace(/.*(<\/pre>)/gim, ``) - // and code tags, so temporarily remove those tags and everything inside them - .replace(/.*(<\/code>)/gim, ``) - ).filter(filterMatches(wpUrl)) + const imgTagMatches = getImgTagMatchesWithUrl({ nodeString, wpUrl }) if (imageUrlMatches.length && imgTagMatches.length) { - const cheerioImages = imgTagMatches - .map(getCheerioElementFromMatch(wpUrl)) - .filter(({ cheerioImg: { attribs } }) => { - if (!attribs.src) { - return false - } - - return isWebUri(attribs.src) - }) + const cheerioImages = getCheerioElementsFromMatches({ + imgTagMatches, + wpUrl, + }) const htmlMatchesToMediaItemNodesMap = await fetchNodeHtmlImageMediaItemNodes( { diff --git a/packages/gatsby-source-wordpress/src/steps/source-nodes/fetch-nodes/fetch-referenced-media-items.js b/packages/gatsby-source-wordpress/src/steps/source-nodes/fetch-nodes/fetch-referenced-media-items.js index ac3112a4f4976..64523645ca96b 100644 --- a/packages/gatsby-source-wordpress/src/steps/source-nodes/fetch-nodes/fetch-referenced-media-items.js +++ b/packages/gatsby-source-wordpress/src/steps/source-nodes/fetch-nodes/fetch-referenced-media-items.js @@ -211,7 +211,8 @@ export const stripImageSizesFromUrl = url => { const fileExtension = urlToFileExtension(url) const imageSizesPattern = new RegExp( - `(?:[-_]([0-9]+)x([0-9]+))${fileExtension ? `.${fileExtension}` : ``}` + // eslint-disable-next-line no-useless-escape + `(?:[-_]([0-9]+)x([0-9]+))${fileExtension ? `\.${fileExtension}` : ``}` ) let urlWithoutSizes = url.replace(imageSizesPattern, ``)