diff --git a/example/example-buffer.js b/example/example-buffer.js
new file mode 100644
index 0000000..73f9ed3
--- /dev/null
+++ b/example/example-buffer.js
@@ -0,0 +1,13 @@
+var fs = require('fs');
+var assert = require('assert');
+var PDFExtract = require('../lib').PDFExtract;
+var pdfExtract = new PDFExtract();
+var buffer = fs.readFileSync('./example.pdf');
+pdfExtract.extractBuffer(buffer, {}, function(err, data) {
+  if (err) {
+    return console.log(err);
+  }
+  var expected = require('./example-output.json');
+  assert.deepEqual(data.meta, expected.meta);
+  assert.deepEqual(data.pages, expected.pages);
+});
diff --git a/example/example-output.json b/example/example-output.json
index 1aa26d2..59ba3dc 100644
--- a/example/example-output.json
+++ b/example/example-output.json
@@ -1,5 +1,4 @@
 {
-  "filename": "./example.pdf",
   "meta": {
     "info": {
       "PDFFormatVersion": "1.3",
@@ -335,5 +334,6 @@
     "numPages": 1,
     "fingerprint": "4dc91a1875a6d707aec203bb021c93a0",
     "encrypted": false
-  }
+  },
+  "filename": "./example.pdf"
 }
\ No newline at end of file
diff --git a/lib/index.js b/lib/index.js
index f51444a..bfbe4ae 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -16,76 +16,95 @@ PDFExtract.prototype.extract = function (filename, options, cb) {
    *
    * Any copyright is dedicated to the Public Domain.
    * http://creativecommons.org/publicdomain/zero/1.0/ */
+  var _this = this;
+  fs.readFile(filename, function(err, buffer) {
+    if (err) {
+      return cb(err);
+    }
+    return _this.extractBuffer(buffer, options, function(err, pdf) {
+      if (err) {
+        // Bail out on failure: `pdf` is undefined here and cb must run only once.
+        return cb(err);
+      }
+      pdf.filename = filename;
+      cb(null, pdf);
+    });
+  });
+};
 
-  fs.readFile(filename, function (err, buffer) {
-    if (err) return cb(err);
-    // Loading file from file system into typed array
-    var data = new Uint8Array(buffer);
-    var pdf = {
-      filename: filename,
-      meta: {},
-      pages: []
-    };
-    // Will be using promises to load document, pages and misc data instead of callback.
-    pdfjsLib.getDocument({data: data}).then(function (doc) {
-      var numPages = doc.numPages;
-      pdf.pdfInfo = doc.pdfInfo;
-      var lastPromise; // will be used to chain promises
-      lastPromise = doc.getMetadata().then(function (data) {
-        pdf.meta = data;
-      });
-      var loadPage = function (pageNum) {
-        return doc.getPage(pageNum).then(function (page) {
-          var viewport = page.getViewport(1.0 /* scale */);
-          var pag = {
-            pageInfo: {
-              num: pageNum,
-              scale: viewport.scale,
-              rotation: viewport.rotation,
-              offsetX: viewport.offsetX,
-              offsetY: viewport.offsetY,
-              width: viewport.width,
-              height: viewport.height,
-              fontScale: viewport.fontScale
-            }
-          };
-          pdf.pages.push(pag);
-          return page.getTextContent().then(function (content) {
-            // Content contains lots of information about the text layout and styles, but we need only strings at the moment
-            pag.content = content.items.map(function (item) {
-              var x = item.transform[4];
-              var y = pag.pageInfo.height - item.transform[5];
-              if (viewport.rotation == 90) {
-                x = item.transform[5]
-                y = item.transform[4];
-              }
-              return {
-                x: x,
-                y: y,
-                str: item.str,
-                dir: item.dir,
-                width: item.width,
-                height: item.height,
-                fontName: item.fontName
-              };
-            });
-          }).then(function () {
-            // console.log('done page parsing');
-          });
-        })
-      };
-      // Loading of the first page will wait on metadata and subsequent loadings
-      // will wait on the previous pages.
-      for (var i = 1; i <= numPages; i++) {
-        lastPromise = lastPromise.then(loadPage.bind(null, i));
-      }
-      return lastPromise;
-    }).then(function () {
-      cb(null, pdf);
-    }, function (err) {
-      cb(err);
-    });
-  });
+/**
+ * Extracts document metadata and per-page text items from PDF bytes already in memory.
+ *
+ * @param {Buffer|Uint8Array} buffer - PDF data (presumably a Buffer); wrapped in a Uint8Array for pdfjsLib.
+ * @param {Object} options - not read by this function.
+ * @param {Function} cb - called as cb(err) on failure or cb(null, pdf) with { meta, pages, pdfInfo }.
+ */
+PDFExtract.prototype.extractBuffer = function(buffer, options, cb) {
+  // Wrap the caller's bytes in a typed array, the form passed to pdfjsLib.getDocument
+  var data = new Uint8Array(buffer);
+  var pdf = {
+    meta: {},
+    pages: []
+  };
+  // Will be using promises to load document, pages and misc data instead of callback.
+  pdfjsLib.getDocument({ data }).then(function(doc) {
+    var numPages = doc.numPages;
+    pdf.pdfInfo = doc.pdfInfo;
+    var lastPromise; // will be used to chain promises
+    lastPromise = doc.getMetadata().then(function(data) {
+      pdf.meta = data;
+    });
+    var loadPage = function(pageNum) {
+      return doc.getPage(pageNum).then(function(page) {
+        var viewport = page.getViewport(1.0 /* scale */);
+        var pag = {
+          pageInfo: {
+            num: pageNum,
+            scale: viewport.scale,
+            rotation: viewport.rotation,
+            offsetX: viewport.offsetX,
+            offsetY: viewport.offsetY,
+            width: viewport.width,
+            height: viewport.height,
+            fontScale: viewport.fontScale
+          }
+        };
+        pdf.pages.push(pag);
+        return page.getTextContent().then(function(content) {
+          // Content contains lots of information about the text layout and styles, but we need only strings at the moment
+          pag.content = content.items.map(function(item) {
+            var x = item.transform[4];
+            var y = pag.pageInfo.height - item.transform[5];
+            if (viewport.rotation == 90) {
+              x = item.transform[5];
+              y = item.transform[4];
+            }
+            return {
+              x,
+              y,
+              str: item.str,
+              dir: item.dir,
+              width: item.width,
+              height: item.height,
+              fontName: item.fontName
+            };
+          });
+        }).then(function() {
+          // console.log('done page parsing');
+        });
+      });
+    };
+    // Loading of the first page will wait on metadata and subsequent loadings
+    // will wait on the previous pages.
+    for (var i = 1; i <= numPages; i++) {
+      lastPromise = lastPromise.then(loadPage.bind(null, i));
+    }
+    return lastPromise;
+  }).then(function() {
+    cb(null, pdf);
+  }, function(err) {
+    cb(err);
+  });
 };
 
 PDFExtract.utils = utils;