Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(routes): add doc-to-txt route #1445

Merged
merged 3 commits into from
May 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

Docsmith is a RESTful API, built using Node.js and the [Fastify](https://fastify.io/) web framework, that can convert files from:

- DOC to TXT
- DOCX to HTML
- DOCX to TXT
- PDF to HTML
Expand Down
87 changes: 85 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@
"@fastify/static": "^6.10.1",
"@fastify/swagger": "^8.3.1",
"@fastify/under-pressure": "^8.2.0",
"cfb": "^1.2.2",
"clean-css": "^5.3.2",
"cssesc": "^3.0.0",
"cssom": "^0.5.0",
Expand Down Expand Up @@ -128,6 +129,7 @@
"redoc": "^2.0.0",
"secure-json-parse": "^2.7.0",
"tesseract.js": "^4.0.5",
"upath": "^2.0.1"
"upath": "^2.0.1",
"word-extractor": "^1.0.4"
}
}
5 changes: 5 additions & 0 deletions src/config/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,11 @@ async function getConfig() {
: undefined,
},
tags: [
{
name: "DOC",
description:
"Endpoints used for the conversion of DOC documents",
},
{
name: "DOCX",
description:
Expand Down
44 changes: 44 additions & 0 deletions src/plugins/doc-to-txt/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
const fp = require("fastify-plugin");
const WordExtractor = require("word-extractor");

/**
 * @author Frazer Smith
 * @description Pre-handler plugin that uses Word-Extractor to convert Buffer containing
 * DOC file in `req.body` to TXT.
 * `req` object is decorated with `conversionResults.body` holding the converted document,
 * and the response content type is set to `text/plain; charset=utf-8`.
 * Requests whose body is not a Buffer, or whose body Word-Extractor fails to
 * convert, are rejected with an HTTP 400 Bad Request error.
 * @param {object} server - Fastify instance.
 */
async function plugin(server) {
	const wordExtractor = new WordExtractor();

	server.addHook("onRequest", async (req) => {
		// Start every request with an empty result holder
		req.conversionResults = { body: undefined };
	});

	server.addHook("preHandler", async (req, res) => {
		/**
		 * Word-Extractor's `extract()` accepts a filename as well as a Buffer,
		 * so a string body must never reach it: it would be treated as a path
		 * on the server's filesystem. A missing or non-Buffer body is a client
		 * error, same as a malformed document.
		 */
		if (!Buffer.isBuffer(req.body)) {
			throw server.httpErrors.badRequest();
		}

		try {
			const results = await wordExtractor.extract(req.body);

			// Sections are joined in a fixed order, one per line; footers are
			// requested separately so they are excluded from the other calls
			const value = `${results.getHeaders({
				includeFooters: false,
			})}\n${results.getTextboxes({
				includeHeadersAndFooters: false,
			})}\n${results.getBody()}\n${results.getEndnotes()}\n${results.getFootnotes()}\n${results.getFooters()}`.trim();

			req.conversionResults.body = value;
			res.type("text/plain; charset=utf-8");
		} catch {
			/**
			 * Word-Extractor will throw if the .doc file provided
			 * by client is malformed, thus client error code
			 */
			throw server.httpErrors.badRequest();
		}
	});
}

// Metadata handed to fastify-plugin alongside the plugin function.
// `@fastify/sensible` is listed as a dependency; the plugin reads
// `server.httpErrors` when rejecting requests.
const pluginMetadata = {
	name: "doc-to-txt",
	fastify: "4.x",
	dependencies: ["@fastify/sensible"],
};

module.exports = fp(plugin, pluginMetadata);
91 changes: 91 additions & 0 deletions src/plugins/doc-to-txt/plugin.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
const fs = require("fs/promises");
const Fastify = require("fastify");
const isHtml = require("is-html");
const sensible = require("@fastify/sensible");
const plugin = require(".");

describe("DOC-to-TXT conversion plugin", () => {
	const docMimeType = "application/msword";

	let server;

	/**
	 * Sends a POST to the test route with the DOC content type.
	 * @param {Buffer|undefined} body - Request payload; `undefined` sends no body.
	 * @returns {Promise<object>} Injected response.
	 */
	const postDoc = (body) =>
		server.inject({
			method: "POST",
			url: "/",
			headers: {
				"content-type": docMimeType,
			},
			body,
		});

	beforeAll(async () => {
		server = Fastify();

		// Hand the raw Buffer to the plugin untouched
		server.addContentTypeParser(
			docMimeType,
			{ parseAs: "buffer" },
			async (_req, payload) => payload
		);

		await server.register(sensible).register(plugin);

		// Route echoes the plugin's results as JSON so tests can inspect them
		server.post("/", (req, res) => {
			res.header("content-type", "application/json");
			res.send(req.conversionResults);
		});

		await server.ready();
	});

	afterAll(async () => {
		await server.close();
	});

	it("Converts DOC file to TXT", async () => {
		const docFile = await fs.readFile(
			"./test_resources/test_files/valid_doc.doc"
		);

		const response = await postDoc(docFile);
		const { body } = JSON.parse(response.payload);

		// String found in first heading of the test document
		expect(body).toMatch(
			"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio."
		);
		// String found at end of the test document
		expect(body).toMatch(
			/Nullam venenatis commodo imperdiet. Morbi velit neque, semper quis lorem quis, efficitur dignissim ipsum. Ut ac lorem sed turpis imperdiet eleifend sit amet id sapien$/m
		);
		expect(isHtml(body)).toBe(false);
		expect(response.statusCode).toBe(200);
	});

	// TODO: use `it.concurrent.each()` once it is no longer experimental
	const badRequestCases = [
		{ testName: "is missing" },
		{
			testName: "is not a valid DOC file",
			readFile: true,
		},
	];

	it.each(badRequestCases)(
		"Returns HTTP status code 400 if DOC file $testName",
		async ({ readFile }) => {
			// Only the "invalid file" case sends a payload
			let payload;
			if (readFile) {
				payload = await fs.readFile(
					"./test_resources/test_files/invalid_doc.doc"
				);
			}

			const response = await postDoc(payload);

			expect(JSON.parse(response.payload)).toEqual({
				error: "Bad Request",
				message: "Bad Request",
				statusCode: 400,
			});
			expect(response.statusCode).toBe(400);
		}
	);
});
Loading