From 6ba79e55a26c614f63a8685adb0477f8ef5eab68 Mon Sep 17 00:00:00 2001 From: Frazer Smith Date: Sun, 18 Feb 2024 14:16:42 +0000 Subject: [PATCH 1/2] refactor(routes): cache accepted types from route schemas --- src/routes/admin/healthcheck/index.js | 3 ++- src/routes/doc/txt/index.js | 3 ++- src/routes/docs/index.js | 3 ++- src/routes/docs/openapi/index.js | 1 + src/routes/docx/html/index.js | 3 ++- src/routes/docx/txt/index.js | 3 ++- src/routes/html/txt/index.js | 3 ++- src/routes/pdf/html/index.js | 3 ++- src/routes/pdf/txt/index.js | 3 ++- src/routes/rtf/html/index.js | 3 ++- src/routes/rtf/txt/index.js | 3 ++- 11 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/routes/admin/healthcheck/index.js b/src/routes/admin/healthcheck/index.js index a66cb3766..a6d57a94d 100644 --- a/src/routes/admin/healthcheck/index.js +++ b/src/routes/admin/healthcheck/index.js @@ -5,7 +5,8 @@ const cors = require("@fastify/cors"); const { healthcheckGetSchema } = require("./schema"); -const accepts = ["text/plain"]; +// Cache supported media types so not having to navigate schema object each time +const accepts = Object.keys(healthcheckGetSchema.response[200].content); /** * @author Frazer Smith diff --git a/src/routes/doc/txt/index.js b/src/routes/doc/txt/index.js index 3e922a27d..c342fb4f7 100644 --- a/src/routes/doc/txt/index.js +++ b/src/routes/doc/txt/index.js @@ -8,7 +8,8 @@ const docToTxt = require("../../../plugins/doc-to-txt"); const { docToTxtPostSchema } = require("./schema"); -const accepts = ["text/plain"]; +// Cache supported media types so not having to navigate schema object each time +const accepts = Object.keys(docToTxtPostSchema.response[200].content); /** * @author Frazer Smith diff --git a/src/routes/docs/index.js b/src/routes/docs/index.js index b439bbf38..9d18163d7 100644 --- a/src/routes/docs/index.js +++ b/src/routes/docs/index.js @@ -7,7 +7,8 @@ const staticPlugin = require("@fastify/static"); const { docsGetSchema } = require("./schema"); -const accepts = ["text/html"]; +// Cache supported media types so not having to navigate schema object each time +const accepts = Object.keys(docsGetSchema.response[200].content); // Cache immutable regex as they are expensive to create and garbage collect const pathRegex = /\/redoc\.standalone\.js(?:.map)?/u; diff --git a/src/routes/docs/openapi/index.js b/src/routes/docs/openapi/index.js index eedd57f22..73c741849 100644 --- a/src/routes/docs/openapi/index.js +++ b/src/routes/docs/openapi/index.js @@ -5,6 +5,7 @@ const cors = require("@fastify/cors"); const { docsOpenapiGetSchema } = require("./schema"); +// Cache supported media types so not having to navigate schema object each time const accepts = docsOpenapiGetSchema.produces; /** diff --git a/src/routes/docx/html/index.js b/src/routes/docx/html/index.js index 45074e0db..d248d37b5 100644 --- a/src/routes/docx/html/index.js +++ b/src/routes/docx/html/index.js @@ -8,7 +8,8 @@ const docxToHtml = require("../../../plugins/docx-to-html"); const { docxToHtmlPostSchema } = require("./schema"); -const accepts = ["text/html"]; +// Cache supported media types so not having to navigate schema object each time +const accepts = Object.keys(docxToHtmlPostSchema.response[200].content); /** * @author Frazer Smith diff --git a/src/routes/docx/txt/index.js b/src/routes/docx/txt/index.js index 041b964c5..f1c1c9345 100644 --- a/src/routes/docx/txt/index.js +++ b/src/routes/docx/txt/index.js @@ -9,7 +9,8 @@ const docxToHtml = require("../../../plugins/docx-to-html"); const { docxToTxtPostSchema } = require("./schema"); -const accepts = ["text/plain"]; +// Cache supported media types so not having to navigate schema object each time +const accepts = Object.keys(docxToTxtPostSchema.response[200].content); /** * @author Frazer Smith diff --git a/src/routes/html/txt/index.js b/src/routes/html/txt/index.js index 868599a8d..19b9955be 100644 --- a/src/routes/html/txt/index.js +++ b/src/routes/html/txt/index.js @@ -7,7 +7,8 @@ const cors = require("@fastify/cors"); const { htmlToTxtPostSchema } = require("./schema"); -const accepts = ["text/plain"]; +// Cache supported media types so not having to navigate schema object each time +const accepts = Object.keys(htmlToTxtPostSchema.response[200].content); /** * @author Frazer Smith diff --git a/src/routes/pdf/html/index.js b/src/routes/pdf/html/index.js index a96bf73ae..6fc6bd4b4 100644 --- a/src/routes/pdf/html/index.js +++ b/src/routes/pdf/html/index.js @@ -8,7 +8,8 @@ const pdfToHtml = require("../../../plugins/pdf-to-html"); const { pdfToHtmlPostSchema } = require("./schema"); -const accepts = ["text/html"]; +// Cache supported media types so not having to navigate schema object each time +const accepts = Object.keys(pdfToHtmlPostSchema.response[200].content); /** * @author Frazer Smith diff --git a/src/routes/pdf/txt/index.js b/src/routes/pdf/txt/index.js index 01c3602cf..5092a71b5 100644 --- a/src/routes/pdf/txt/index.js +++ b/src/routes/pdf/txt/index.js @@ -8,7 +8,8 @@ const pdfToTxt = require("../../../plugins/pdf-to-txt"); const { pdfToTxtPostSchema } = require("./schema"); -const accepts = ["text/plain", "text/html"]; +// Cache supported media types so not having to navigate schema object each time +const accepts = Object.keys(pdfToTxtPostSchema.response[200].content); /** * @author Frazer Smith diff --git a/src/routes/rtf/html/index.js b/src/routes/rtf/html/index.js index 0e226b5c5..1fb717a2f 100644 --- a/src/routes/rtf/html/index.js +++ b/src/routes/rtf/html/index.js @@ -8,7 +8,8 @@ const rtfToHtml = require("../../../plugins/rtf-to-html"); const { rtfToHtmlPostSchema } = require("./schema"); -const accepts = ["text/html"]; +// Cache supported media types so not having to navigate schema object each time +const accepts = Object.keys(rtfToHtmlPostSchema.response[200].content); /** * @author Frazer Smith diff --git a/src/routes/rtf/txt/index.js b/src/routes/rtf/txt/index.js index 63278ca27..9da223621 100644 --- a/src/routes/rtf/txt/index.js +++ b/src/routes/rtf/txt/index.js @@ -8,7 +8,8 @@ const rtfToHtml = require("../../../plugins/rtf-to-html"); const { rtfToTxtPostSchema } = require("./schema"); -const accepts = ["text/plain"]; +// Cache supported media types so not having to navigate schema object each time +const accepts = Object.keys(rtfToTxtPostSchema.response[200].content); /** * @author Frazer Smith From c59cb9f19b19973af6201e9586b1360da7f9ac84 Mon Sep 17 00:00:00 2001 From: Frazer Smith Date: Sun, 18 Feb 2024 14:38:19 +0000 Subject: [PATCH 2/2] feat(routes/html): add support for xhtml documents --- README.md | 2 +- src/routes/html/txt/route.test.js | 41 +- src/routes/html/txt/route.test.js.snap | 75 +++ src/routes/html/txt/schema.js | 2 +- test_resources/test_files/xhtml_valid.xhtml | 575 ++++++++++++++++++++ 5 files changed, 689 insertions(+), 6 deletions(-) create mode 100644 test_resources/test_files/xhtml_valid.xhtml diff --git a/README.md b/README.md index 33962a3c1..9bbdca688 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Docsmith is a RESTful API, built using Node.js and the [Fastify](https://fastify | DOC | TXT | DOT file variant supported | | DOCX | HTML | DOCM, DOTM, and DOTX file variants supported | | DOCX | TXT | DOCM, DOTM, and DOTX file variants supported | -| HTML | TXT | | +| HTML | TXT | XHTML file variant supported | | PDF | HTML | | | PDF | TXT | Scanned documents supported using OCR | | RTF | HTML | Images are removed[^1] | diff --git a/src/routes/html/txt/route.test.js b/src/routes/html/txt/route.test.js index 1d5dbb52b..137426d50 100644 --- a/src/routes/html/txt/route.test.js +++ b/src/routes/html/txt/route.test.js @@ -1,3 +1,5 @@ +/* eslint-disable security/detect-non-literal-fs-filename -- Test files are not user-provided */ + "use strict"; const { readFile } = require("node:fs/promises"); @@ -30,14 +32,29 @@ describe("HTML-to-TXT route", () => { afterAll(async () => server.close()); - it("Returns HTML file converted to TXT", async () => { + it.each([ + { + testName: "HTML file", + filePath: "./test_resources/test_files/html_valid.html", + headers: { + "content-type": "text/html", + }, + }, + { + testName: "XHTML file", + filePath: "./test_resources/test_files/xhtml_valid.xhtml", + headers: { + "content-type": "application/xhtml+xml", + }, + }, + ])("Returns $testName converted to TXT", async ({ filePath, headers }) => { const response = await server.inject({ method: "POST", url: "/", - body: await readFile("./test_resources/test_files/html_valid.html"), + body: await readFile(filePath), headers: { accept: "application/json, text/plain", - "content-type": "text/html", + ...headers, }, }); @@ -66,7 +83,23 @@ describe("HTML-to-TXT route", () => { expect(response.statusCode).toBe(400); }); - it("Returns HTTP status code 415 if body is not a valid HTML file", async () => { + it.each([ + { + testName: "is not a valid HTML file", + body: Buffer.from("test"), + headers: { + "content-type": "text/html", + }, + }, + { + testName: "is not a valid XHTML file", + body: Buffer.from("test"), + headers: { + "content-type": "aapplication/xhtml+xml", + }, + }, + ]); + it("Returns HTTP status code 415 if body $testName", async () => { const response = await server.inject({ method: "POST", url: "/", diff --git a/src/routes/html/txt/route.test.js.snap b/src/routes/html/txt/route.test.js.snap index bb91a1378..fed296a2e 100644 --- a/src/routes/html/txt/route.test.js.snap +++ b/src/routes/html/txt/route.test.js.snap @@ -46,6 +46,81 @@ Maecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus. +In non mauris justo. Duis vehicula mi vel mi pretium, a viverra erat efficitur. Cras aliquam est ac eros varius, id iaculis dui auctor. Duis pretium neque ligula, et pulvinar mi placerat et. Nulla nec nunc sit amet nunc posuere vestibulum. Ut id neque eget tortor mattis tristique. Donec ante est, blandit sit amet tristique vel, lacinia pulvinar arcu. Pellentesque scelerisque fermentum erat, id posuere justo pulvinar ut. Cras id eros sed enim aliquam lobortis. Sed lobortis nisl ut eros efficitur tincidunt. Cras justo mi, porttitor quis mattis vel, ultricies ut purus. Ut facilisis et lacus eu cursus. + + +Cras fringilla ipsum magna, in fringilla dui commodo a. + + + + + + Lorem ipsum Lorem ipsum Lorem ipsum +1 In eleifend velit vitae libero sollicitudin euismod. Lorem +2 Cras fringilla ipsum magna, in fringilla dui commodo a. Ipsum +3 Aliquam erat volutpat. Lorem +4 Fusce vitae vestibulum velit. Lorem +5 Etiam vehicula luctus fermentum. Ipsum + + + + + +Etiam vehicula luctus fermentum. In vel metus congue, pulvinar lectus vel, fermentum dui. Maecenas ante orci, egestas ut aliquet sit amet, sagittis a magna. Aliquam ante quam, pellentesque ut dignissim quis, laoreet eget est. Aliquam erat volutpat. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Ut ullamcorper justo sapien, in cursus libero viverra eget. Vivamus auctor imperdiet urna, at pulvinar leo posuere laoreet. Suspendisse neque nisl, fringilla at iaculis scelerisque, ornare vel dolor. Ut et pulvinar nunc. Pellentesque fringilla mollis efficitur. Nullam venenatis commodo imperdiet. Morbi velit neque, semper quis lorem quis, efficitur dignissim ipsum. Ut ac lorem sed turpis imperdiet eleifend sit amet id sapien + + + + + +I am a footer" +`; + +exports[`HTML-to-TXT route Returns XHTML file converted to TXT 1`] = ` +"I am a header + +Lorem ipsum + + + + + + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. + + + + + +Vestibulum neque massa, scelerisque sit amet ligula eu, congue molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, nec lacinia nisi. Ut ac dolor vitae odio interdum condimentum. Vivamus dapibus sodales ex, vitae malesuada ipsum cursus convallis. Maecenas sed egestas nulla, ac condimentum orci. Mauris diam felis, vulputate ac suscipit et, iaculis non est. Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. Integer lacinia ante ac libero lobortis imperdiet. Nullam mollis convallis ipsum, ac accumsan nunc vehicula vitae. Nulla eget justo in felis tristique fringilla. Morbi sit amet tortor quis risus auctor condimentum. Morbi in ullamcorper elit. Nulla iaculis tellus sit amet mauris tempus fringilla. + +Maecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus. + + * Maecenas non lorem quis tellus placerat varius. + + * Nulla facilisi. + + * Aenean congue fringilla justo ut aliquam. + + * Mauris id ex erat. Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis. + + * Morbi viverra semper lorem nec molestie. + + * Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate. + + + + + + + + + + + + + + + In non mauris justo. Duis vehicula mi vel mi pretium, a viverra erat efficitur. Cras aliquam est ac eros varius, id iaculis dui auctor. Duis pretium neque ligula, et pulvinar mi placerat et. Nulla nec nunc sit amet nunc posuere vestibulum. Ut id neque eget tortor mattis tristique. Donec ante est, blandit sit amet tristique vel, lacinia pulvinar arcu. Pellentesque scelerisque fermentum erat, id posuere justo pulvinar ut. Cras id eros sed enim aliquam lobortis. Sed lobortis nisl ut eros efficitur tincidunt. Cras justo mi, porttitor quis mattis vel, ultricies ut purus. Ut facilisis et lacus eu cursus. diff --git a/src/routes/html/txt/schema.js b/src/routes/html/txt/schema.js index e16054d69..44e130f34 100644 --- a/src/routes/html/txt/schema.js +++ b/src/routes/html/txt/schema.js @@ -16,7 +16,7 @@ const htmlToTxtPostSchema = { description: "Returns the result of converting a HTML document to TXT format.", operationId: "postHtmlToTxt", - consumes: ["text/html"], + consumes: ["application/xhtml+xml", "text/html"], produces: ["application/json", "application/xml"], response: { 200: { diff --git a/test_resources/test_files/xhtml_valid.xhtml b/test_resources/test_files/xhtml_valid.xhtml new file mode 100644 index 000000000..d5984ace2 --- /dev/null +++ b/test_resources/test_files/xhtml_valid.xhtml @@ -0,0 +1,575 @@ + + + + + + + + + +
+

I am a header

+
+

+ Lorem ipsum +

+



+

+ Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac + faucibus odio. +

+



+

+ Vestibulum neque massa, scelerisque sit amet ligula eu, congue + molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, + nec lacinia nisi. Ut ac dolor vitae odio interdum + condimentum. + Vivamus dapibus sodales ex, vitae malesuada ipsum cursus + convallis. Maecenas sed egestas nulla, ac condimentum + orci. + Mauris diam felis, vulputate ac suscipit et, iaculis non est. + Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. + Integer lacinia ante ac libero lobortis imperdiet. + Nullam mollis convallis ipsum, ac accumsan nunc vehicula + vitae. + Nulla eget justo in felis tristique fringilla. Morbi sit amet + tortor quis risus auctor condimentum. Morbi in ullamcorper elit. + Nulla iaculis tellus sit amet mauris tempus fringilla. +

+

+ Maecenas mauris lectus, lobortis et purus mattis, blandit + dictum tellus. +

+ +

+ +

+



+



+



+

+ In non mauris justo. Duis vehicula mi vel mi pretium, a viverra + erat efficitur. Cras aliquam est ac eros varius, id iaculis dui + auctor. Duis pretium neque ligula, et pulvinar mi placerat et. + Nulla nec nunc sit amet nunc posuere vestibulum. Ut id neque + eget tortor mattis tristique. Donec ante est, blandit sit amet + tristique vel, lacinia pulvinar arcu. Pellentesque scelerisque + fermentum erat, id posuere justo pulvinar ut. Cras id eros sed + enim aliquam lobortis. Sed lobortis nisl ut eros efficitur + tincidunt. Cras justo mi, porttitor quis mattis vel, ultricies + ut purus. Ut facilisis et lacus eu cursus. +

+
  • +

    + Cras fringilla ipsum magna, in fringilla dui commodo a. +

    +
  • +



    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    +
    +

    +
    +

    + Lorem ipsum +

    +
    +

    + Lorem ipsum +

    +
    +

    + Lorem ipsum +

    +
    +

    + 1 +

    +
    +

    + In eleifend velit vitae libero sollicitudin + euismod. +

    +
    +

    + Lorem +

    +
    +

    +
    +

    +
    +

    + 2 +

    +
    +

    + Cras fringilla ipsum magna, in fringilla dui + commodo a. +

    +
    +

    + Ipsum +

    +
    +

    +
    +

    +
    +

    + 3 +

    +
    +

    + Aliquam erat volutpat. +

    +
    +

    + Lorem +

    +
    +

    +
    +

    +
    +

    + 4 +

    +
    +

    + Fusce vitae vestibulum velit. +

    +
    +

    + Lorem +

    +
    +

    +
    +

    +
    +

    + 5 +

    +
    +

    + Etiam vehicula luctus fermentum. +

    +
    +

    + Ipsum +

    +
    +

    +
    +

    +
    +



    +

    + Etiam vehicula luctus fermentum. In vel metus congue, pulvinar + lectus vel, fermentum dui. Maecenas ante orci, egestas ut + aliquet sit amet, sagittis a magna. Aliquam ante quam, + pellentesque ut dignissim quis, laoreet eget est. Aliquam erat + volutpat. Class aptent taciti sociosqu ad litora torquent per + conubia nostra, per inceptos himenaeos. Ut ullamcorper justo + sapien, in cursus libero viverra eget. Vivamus auctor imperdiet + urna, at pulvinar leo posuere laoreet. Suspendisse neque nisl, + fringilla at iaculis scelerisque, ornare vel dolor. Ut et + pulvinar nunc. Pellentesque fringilla mollis efficitur. Nullam + venenatis commodo imperdiet. Morbi velit neque, semper quis + lorem quis, efficitur dignissim ipsum. Ut ac lorem sed turpis + imperdiet eleifend sit amet id sapien +

    +

    +

    +

    +
    +

    I am a footer

    +
    + +