diff --git a/README.md b/README.md index c93aefe6e..e5bcf5d98 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ Docsmith is a RESTful API, built using Node.js and the [Fastify](https://fastify.io/) web framework, that can convert files from: +- DOC to TXT - DOCX to HTML - DOCX to TXT - PDF to HTML diff --git a/package-lock.json b/package-lock.json index 8e448a650..a974e178f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -6,7 +6,7 @@ "packages": { "": { "name": "docsmith", - "version": "10.0.14", + "version": "10.0.15", "license": "MIT", "dependencies": { "@fastify/accepts": "^4.1.0", @@ -20,6 +20,7 @@ "@fastify/static": "^6.10.1", "@fastify/swagger": "^8.3.1", "@fastify/under-pressure": "^8.2.0", + "cfb": "^1.2.2", "clean-css": "^5.3.2", "cssesc": "^3.0.0", "cssom": "^0.5.0", @@ -47,7 +48,8 @@ "redoc": "^2.0.0", "secure-json-parse": "^2.7.0", "tesseract.js": "^4.0.5", - "upath": "^2.0.1" + "upath": "^2.0.1", + "word-extractor": "^1.0.4" }, "devDependencies": { "@commitlint/cli": "^17.6.3", @@ -2865,6 +2867,14 @@ "node": ">=0.4.0" } }, + "node_modules/adler-32": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/adler-32/-/adler-32-1.3.1.tgz", + "integrity": "sha512-ynZ4w/nUUv5rrsR8UUGoe1VC9hZj6V5hU9Qw1HlMDJGEJw5S7TfTErWTjMys6M7vr0YWcPqs3qAr4ss0nDfP+A==", + "engines": { + "node": ">=0.8" + } + }, "node_modules/agent-base": { "version": "6.0.2", "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-6.0.2.tgz", @@ -3405,6 +3415,14 @@ "ieee754": "^1.2.1" } }, + "node_modules/buffer-crc32": { + "version": "0.2.13", + "resolved": "https://registry.npmjs.org/buffer-crc32/-/buffer-crc32-0.2.13.tgz", + "integrity": "sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==", + "engines": { + "node": "*" + } + }, "node_modules/buffer-from": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz", @@ -3500,6 +3518,18 @@ } ] }, + "node_modules/cfb": { + 
"version": "1.2.2", + "resolved": "https://registry.npmjs.org/cfb/-/cfb-1.2.2.tgz", + "integrity": "sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA==", + "dependencies": { + "adler-32": "~1.3.0", + "crc-32": "~1.2.0" + }, + "engines": { + "node": ">=0.8" + } + }, "node_modules/chalk": { "version": "4.1.2", "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", @@ -3856,6 +3886,17 @@ "typescript": ">=3" } }, + "node_modules/crc-32": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/crc-32/-/crc-32-1.2.2.tgz", + "integrity": "sha512-ROmzCKrTnOwybPcJApAA6WBWij23HVfGVNKqqrZpuyZOHqK2CwHSvpGuyt/UNNvaIjEd8X5IFGp4Mh+Ie1IHJQ==", + "bin": { + "crc32": "bin/crc32.njs" + }, + "engines": { + "node": ">=0.8" + } + }, "node_modules/create-require": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/create-require/-/create-require-1.1.1.tgz", @@ -5254,6 +5295,14 @@ "bser": "2.1.1" } }, + "node_modules/fd-slicer": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.1.0.tgz", + "integrity": "sha512-cE1qsB/VwyQozZ+q1dGxR8LBYNZeofhEdUNGSMbQD3Gw2lAzX9Zb3uIU6Ebc/Fmyjo9AWWfnn0AUCHqtevs/8g==", + "dependencies": { + "pend": "~1.2.0" + } + }, "node_modules/file-entry-cache": { "version": "6.0.1", "resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-6.0.1.tgz", @@ -9138,6 +9187,11 @@ "xtend": "~4.0.1" } }, + "node_modules/pend": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz", + "integrity": "sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==" + }, "node_modules/perfect-scrollbar": { "version": "1.5.5", "resolved": "https://registry.npmjs.org/perfect-scrollbar/-/perfect-scrollbar-1.5.5.tgz", @@ -12131,6 +12185,26 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/word-extractor": { + "version": "1.0.4", + "resolved": 
"https://registry.npmjs.org/word-extractor/-/word-extractor-1.0.4.tgz", + "integrity": "sha512-PyAGZQ2gjnVA5kcZAOAxoYciCMaAvu0dbVlw/zxHphhy+3be8cDeYKHJPO8iedIM3Sx0arA/ugKTJyXhZNgo6g==", + "dependencies": { + "saxes": "^5.0.1", + "yauzl": "^2.10.0" + } + }, + "node_modules/word-extractor/node_modules/saxes": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/saxes/-/saxes-5.0.1.tgz", + "integrity": "sha512-5LBh1Tls8c9xgGjw3QrMwETmTMVk0oFgvrFSvWx62llR2hcEInrKNZ2GZCCuuy2lvWrdl5jhbpeqc5hRYKFOcw==", + "dependencies": { + "xmlchars": "^2.2.0" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/word-wrap": { "version": "1.2.3", "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.3.tgz", @@ -12302,6 +12376,15 @@ "node": ">=12" } }, + "node_modules/yauzl": { + "version": "2.10.0", + "resolved": "https://registry.npmjs.org/yauzl/-/yauzl-2.10.0.tgz", + "integrity": "sha512-p4a9I6X6nu6IhoGmBqAcbJy1mlC4j27vEPZX9F4L4/vZT3Lyq1VkFHw/V/PUcB9Buo+DG3iHkT0x3Qya58zc3g==", + "dependencies": { + "buffer-crc32": "~0.2.3", + "fd-slicer": "~1.1.0" + } + }, "node_modules/yn": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/yn/-/yn-3.1.1.tgz", diff --git a/package.json b/package.json index 809aac5db..78d3de5ff 100644 --- a/package.json +++ b/package.json @@ -101,6 +101,7 @@ "@fastify/static": "^6.10.1", "@fastify/swagger": "^8.3.1", "@fastify/under-pressure": "^8.2.0", + "cfb": "^1.2.2", "clean-css": "^5.3.2", "cssesc": "^3.0.0", "cssom": "^0.5.0", @@ -128,6 +129,7 @@ "redoc": "^2.0.0", "secure-json-parse": "^2.7.0", "tesseract.js": "^4.0.5", - "upath": "^2.0.1" + "upath": "^2.0.1", + "word-extractor": "^1.0.4" } } diff --git a/src/config/index.js b/src/config/index.js index fbed8d774..c299a242b 100644 --- a/src/config/index.js +++ b/src/config/index.js @@ -263,6 +263,11 @@ async function getConfig() { : undefined, }, tags: [ + { + name: "DOC", + description: + "Endpoints used for the conversion of DOC documents", + }, { name: "DOCX", 
/**
 * @author Frazer Smith
 * @description Pre-handler plugin that uses Word-Extractor to convert Buffer containing
 * DOC file in `req.body` to TXT.
 * `req` object is decorated with `conversionResults.body` holding the converted document.
 *
 * Two hooks are added to the Fastify instance:
 * - `onRequest` initialises `req.conversionResults` to `{ body: undefined }`.
 * - `preHandler` runs the conversion, stores the text in `req.conversionResults.body`
 *   and sets the response type to `text/plain; charset=utf-8`.
 *
 * Any request whose body is not a Buffer, or whose Buffer cannot be parsed by
 * Word-Extractor, is rejected with `server.httpErrors.badRequest()`.
 * @param {object} server - Fastify instance; must expose `httpErrors` (@fastify/sensible).
 */
async function plugin(server) {
	const wordExtractor = new WordExtractor();

	server.addHook("onRequest", async (req) => {
		req.conversionResults = { body: undefined };
	});

	server.addHook("preHandler", async (req, res) => {
		/**
		 * Word-Extractor accepts either a Buffer or a string, and treats a
		 * string as a path to read from the local filesystem.
		 * Only ever hand it a Buffer so a request body parsed as a string
		 * can never be used to read files from the server
		 */
		if (!Buffer.isBuffer(req.body)) {
			throw server.httpErrors.badRequest();
		}

		try {
			const results = await wordExtractor.extract(req.body);

			// Sections are joined in reading order: headers, text boxes, body, notes, footers
			const value = `${results.getHeaders({
				includeFooters: false,
			})}\n${results.getTextboxes({
				includeHeadersAndFooters: false,
			})}\n${results.getBody()}\n${results.getEndnotes()}\n${results.getFootnotes()}\n${results.getFooters()}`.trim();

			req.conversionResults.body = value;
			res.type("text/plain; charset=utf-8");
		} catch {
			/**
			 * Word-Extractor will throw if the .doc file provided
			 * by client is malformed, thus client error code
			 */
			throw server.httpErrors.badRequest();
		}
	});
}
beforeAll(async () => { + server = Fastify(); + + server.addContentTypeParser( + "application/msword", + { parseAs: "buffer" }, + async (_req, payload) => payload + ); + + await server.register(sensible).register(plugin); + + server.post("/", (req, res) => { + res.header("content-type", "application/json").send( + req.conversionResults + ); + }); + + await server.ready(); + }); + + afterAll(async () => { + await server.close(); + }); + + it("Converts DOC file to TXT", async () => { + const response = await server.inject({ + method: "POST", + url: "/", + body: await fs.readFile( + "./test_resources/test_files/valid_doc.doc" + ), + headers: { + "content-type": "application/msword", + }, + }); + + const { body } = JSON.parse(response.payload); + + // String found in first heading of the test document + expect(body).toMatch( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio." + ); + // String found at end of the test document + expect(body).toMatch( + /Nullam venenatis commodo imperdiet. Morbi velit neque, semper quis lorem quis, efficitur dignissim ipsum. Ut ac lorem sed turpis imperdiet eleifend sit amet id sapien$/m + ); + expect(isHtml(body)).toBe(false); + expect(response.statusCode).toBe(200); + }); + + // TODO: use `it.concurrent.each()` once it is no longer experimental + it.each([ + { testName: "is missing" }, + { + testName: "is not a valid DOC file", + readFile: true, + }, + ])( + "Returns HTTP status code 400 if DOC file $testName", + async ({ readFile }) => { + const response = await server.inject({ + method: "POST", + url: "/", + headers: { + "content-type": "application/msword", + }, + body: readFile + ? 
/**
 * @author Frazer Smith
 * @description Sets routing options for server: registers an
 * `application/msword` content-type parser that verifies the payload is a
 * Word document, registers the CORS and DOC-to-TXT plugins, and adds the
 * `POST /` route that returns the converted text.
 * @param {object} server - Fastify instance.
 * @param {object} options - Route config values.
 * @param {*=} options.bearerTokenAuthKeys - Apply `bearerToken` security scheme to route if defined.
 * @param {object} options.cors - CORS settings.
 */
async function route(server, options) {
	// Media types this route is able to respond with
	const acceptedTypes = ["text/plain"];

	/**
	 * Work on a per-registration copy; the imported schema object is shared
	 * module state, so writing `security`/`response[401]` onto it directly would
	 * leak the bearer token scheme into every later registration of this route
	 */
	const schema = {
		...docToTxtPostSchema,
		response: { ...docToTxtPostSchema.response },
	};

	if (options.bearerTokenAuthKeys) {
		schema.security = [{ bearerToken: [] }];
		schema.response[401] = {
			$ref: "responses#/properties/unauthorized",
			description: "Unauthorized",
		};
	}

	server.addContentTypeParser(
		"application/msword",
		{ parseAs: "buffer" },
		async (_req, payload) => {
			/**
			 * The Content-Type header can be spoofed so is not trusted implicitly,
			 * this checks the file is actually a DOC file.
			 * DOC files use the Compound File Binary Format (CFBF), which the
			 * file-type package does not support; use cfb instead
			 */
			let isWordDocument = false;
			try {
				const container = cfb.parse(payload);
				// A CFBF file is only a DOC file if it holds a WordDocument stream
				isWordDocument =
					container?.FileIndex?.some(
						(entry) => entry.name === "WordDocument"
					) === true;
			} catch {
				// cfb throws on anything that is not a CFBF container; treated as unsupported below
			}

			if (!isWordDocument) {
				throw server.httpErrors.unsupportedMediaType();
			}
			return payload;
		}
	);

	// Register plugins
	await server
		// Enable CORS if options passed
		.register(cors, {
			...options.cors,
			methods: ["POST"],
		})
		.register(docToTxt);

	server.route({
		method: "POST",
		url: "/",
		schema,
		onRequest: async (req) => {
			if (
				// Catch unsupported Accept header media types
				!req.accepts().type(acceptedTypes)
			) {
				throw server.httpErrors.notAcceptable();
			}
		},
		handler: (req, res) => {
			res.send(req.conversionResults.body);
		},
	});
}
body: await fs.readFile( + "./test_resources/test_files/valid_doc.doc" + ), + headers: { + accept: "application/json, text/plain", + "content-type": "application/msword", + }, + }); + + expect(response.payload).toMatch( + "Etiam vehicula luctus fermentum. In vel metus congue, pulvinar lectus vel, fermentum dui." + ); + expect(isHtml(response.payload)).toBe(false); + expect(response.headers).toMatchObject({ + "content-type": "text/plain; charset=utf-8", + }); + expect(response.statusCode).toBe(200); + }); + + it("Returns HTTP status code 415 if file is missing", async () => { + const response = await server.inject({ + method: "POST", + url: "/", + + headers: { + accept: "application/json, text/plain", + "content-type": "application/msword", + }, + }); + + expect(JSON.parse(response.payload)).toEqual({ + error: "Unsupported Media Type", + message: "Unsupported Media Type", + statusCode: 415, + }); + expect(response.statusCode).toBe(415); + }); + + it.each([ + { + testName: "with '.doc' extension is not a valid DOC file", + filePath: "./test_resources/test_files/invalid_doc.doc", + }, + { + testName: "is a valid CFBF file but is not a Microsoft Word file", + filePath: "./test_resources/test_files/valid_xls.xls", + }, + ])( + "Returns HTTP status code 415 if file $testName", + async ({ filePath }) => { + const response = await server.inject({ + method: "POST", + url: "/", + // eslint-disable-next-line security/detect-non-literal-fs-filename + body: await fs.readFile(filePath), + query: { + lastPageToConvert: 1, + }, + headers: { + accept: "application/json, text/plain", + "content-type": "application/msword", + }, + }); + + expect(JSON.parse(response.payload)).toEqual({ + error: "Unsupported Media Type", + message: "Unsupported Media Type", + statusCode: 415, + }); + expect(response.statusCode).toBe(415); + } + ); + + it("Returns HTTP status code 415 if file media type is not supported by route", async () => { + const response = await server.inject({ + method: "POST", 
+ url: "/", + body: await fs.readFile( + "./test_resources/test_files/valid_empty_html.html" + ), + headers: { + accept: "application/json, text/plain", + "content-type": "application/html", + }, + }); + + expect(JSON.parse(response.payload)).toEqual({ + error: "Unsupported Media Type", + message: "Unsupported Media Type: application/html", + statusCode: 415, + }); + expect(response.statusCode).toBe(415); + }); + + it("Returns HTTP status code 406 if media type in `Accept` request header is unsupported", async () => { + const response = await server.inject({ + method: "POST", + url: "/", + body: await fs.readFile( + "./test_resources/test_files/valid_doc.doc" + ), + headers: { + accept: "application/javascript", + "content-type": "application/msword", + }, + }); + + expect(JSON.parse(response.payload)).toEqual({ + error: "Not Acceptable", + message: "Not Acceptable", + statusCode: 406, + }); + expect(response.statusCode).toBe(406); + }); +}); diff --git a/src/routes/doc/txt/schema.js b/src/routes/doc/txt/schema.js new file mode 100644 index 000000000..3a5e9fac6 --- /dev/null +++ b/src/routes/doc/txt/schema.js @@ -0,0 +1,48 @@ +const S = require("fluent-json-schema"); + +const tags = ["DOC"]; + +/** + * Fastify uses AJV for JSON Schema Validation, + * see https://fastify.io/docs/latest/Reference/Validation-and-Serialization/ + * + * Input validation protects against XSS, HPP, prototype pollution, + * and most other injection attacks. 
/**
 * Route schema for `POST` DOC-to-TXT conversion.
 *
 * The 200 response is declared as a `text/plain` string; each error status
 * references the matching entry of the shared `responses` schema and carries
 * a human-readable description.
 *
 * NOTE(review): `produces` lists JSON and XML while the success response is
 * declared as `text/plain` - confirm this is intended.
 */
const docToTxtPostSchema = {
	tags,
	summary: "Convert DOC to TXT",
	description:
		"Returns the result of converting a DOC document to TXT format.",
	operationId: "postDocToTxt",
	consumes: ["application/msword"],
	produces: ["application/json", "application/xml"],
	response: {
		200: {
			content: {
				"text/plain": {
					schema: {
						type: "string",
					},
				},
			},
		},
		// Error responses: [status code, shared schema reference, description]
		...Object.fromEntries(
			[
				[400, "responses#/properties/badRequest", "Bad Request"],
				[406, "responses#/properties/notAcceptable", "Not Acceptable"],
				[
					415,
					"responses#/properties/unsupportedMediaType",
					"Unsupported Media Type",
				],
				[
					429,
					"responses#/properties/tooManyRequests",
					"Too Many Requests",
				],
				[
					503,
					"responses#/properties/serviceUnavailable",
					"Service Unavailable",
				],
			].map(([statusCode, reference, text]) => [
				statusCode,
				S.ref(reference).description(text),
			])
		),
	},
};
+ ); + expect(isHtml(response.payload)).toBe(false); + expect(response.headers).toEqual(expResHeaders); + expect(response.statusCode).toBe(200); + }); + }); + describe("/docx/html route", () => { it("Returns DOCX file converted to HTML, with expected headers set", async () => { const response = await server.inject({ diff --git a/test_resources/test_files/invalid_doc.doc b/test_resources/test_files/invalid_doc.doc new file mode 100644 index 000000000..30d74d258 --- /dev/null +++ b/test_resources/test_files/invalid_doc.doc @@ -0,0 +1 @@ +test \ No newline at end of file diff --git a/test_resources/test_files/valid_doc.doc b/test_resources/test_files/valid_doc.doc new file mode 100644 index 000000000..914742ac3 Binary files /dev/null and b/test_resources/test_files/valid_doc.doc differ diff --git a/test_resources/test_files/valid_xls.xls b/test_resources/test_files/valid_xls.xls new file mode 100644 index 000000000..694996d0d Binary files /dev/null and b/test_resources/test_files/valid_xls.xls differ