Skip to content

Commit

Permalink
refactor buffer-processing logic into its own method
Browse files Browse the repository at this point in the history
so you don't have to write a temp file if you've got the PDF in memory
  • Loading branch information
atbah committed Nov 17, 2017
1 parent d949fbb commit 59f988c
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 71 deletions.
13 changes: 13 additions & 0 deletions example/example-buffer.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Example: extract a PDF that is already held in memory as a Buffer (no
// temporary file needed) and compare the result with the recorded output.
var fs = require('fs');
var path = require('path');
var assert = require('assert');
var PDFExtract = require('../lib').PDFExtract;
var pdfExtract = new PDFExtract();
// Resolve the sample PDF relative to this script instead of the current
// working directory, so the example also works when it is started from
// another directory (e.g. `node example/example-buffer.js`).
var buffer = fs.readFileSync(path.join(__dirname, 'example.pdf'));
pdfExtract.extractBuffer(buffer, {}, function(err, data) {
  if (err) {
    return console.log(err);
  }
  var expected = require('./example-output.json');
  // Only `meta` and `pages` are compared; the reference output's `filename`
  // entry is not part of this check.
  assert.deepEqual(data.meta, expected.meta);
  assert.deepEqual(data.pages, expected.pages);
});
4 changes: 2 additions & 2 deletions example/example-output.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
{
"filename": "./example.pdf",
"meta": {
"info": {
"PDFFormatVersion": "1.3",
Expand Down Expand Up @@ -335,5 +334,6 @@
"numPages": 1,
"fingerprint": "4dc91a1875a6d707aec203bb021c93a0",
"encrypted": false
}
},
"filename": "./example.pdf"
}
149 changes: 80 additions & 69 deletions lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,76 +16,87 @@ PDFExtract.prototype.extract = function (filename, options, cb) {
*
* Any copyright is dedicated to the Public Domain.
* http://creativecommons.org/publicdomain/zero/1.0/ */
// Read the PDF from disk, then hand the bytes to extractBuffer() for parsing.
// `cb` follows the Node convention: cb(err) on failure, cb(null, pdf) on success.
var _this = this;
fs.readFile(filename, function(err, buffer) {
  if (err) {
    return cb(err);
  }
  return _this.extractBuffer(buffer, options, function(err, pdf) {
    if (err) {
      // Must return here: on failure `pdf` is undefined, so falling through
      // to the assignment below would throw a TypeError after cb(err) has
      // already been called.
      return cb(err);
    }
    // extractBuffer() only sees bytes, so the source path is recorded here.
    pdf.filename = filename;
    cb(null, pdf);
  });
});
};

// Reads `filename` from disk, parses it with pdfjsLib and reports the result
// through `cb`: cb(err) on any failure, cb(null, pdf) on success.
// `filename` and `cb` come from the enclosing function (not visible here).
fs.readFile(filename, function (err, buffer) {
if (err) return cb(err);
// Convert the bytes read from disk into a Uint8Array for pdfjsLib.getDocument
var data = new Uint8Array(buffer);
// Result object handed to cb; `pdfInfo` is added once the document is loaded.
var pdf = {
filename: filename,
meta: {},
pages: []
};
// Will be using promises to load document, pages and misc data instead of callback.
pdfjsLib.getDocument({data: data}).then(function (doc) {
var numPages = doc.numPages;
pdf.pdfInfo = doc.pdfInfo;
var lastPromise; // will be used to chain promises
lastPromise = doc.getMetadata().then(function (data) {
pdf.meta = data;
});
// Loads one page: records its viewport properties, then its text items.
var loadPage = function (pageNum) {
return doc.getPage(pageNum).then(function (page) {
var viewport = page.getViewport(1.0 /* scale */);
var pag = {
pageInfo: {
num: pageNum,
scale: viewport.scale,
rotation: viewport.rotation,
offsetX: viewport.offsetX,
offsetY: viewport.offsetY,
width: viewport.width,
height: viewport.height,
fontScale: viewport.fontScale
}
};
// The page entry is pushed before its text content has been loaded;
// `content` is attached to the same object afterwards.
pdf.pages.push(pag);
return page.getTextContent().then(function (content) {
// Keep only position, string, direction, size and font name of each text item.
pag.content = content.items.map(function (item) {
// Default mapping: x is transform[4], y is the page height minus transform[5].
var x = item.transform[4];
var y = pag.pageInfo.height - item.transform[5];
// For a rotation of 90 the two transform entries are used swapped and
// without the height subtraction. Other rotations use the default mapping.
if (viewport.rotation == 90) {
x = item.transform[5]
y = item.transform[4];
}
return {
x: x,
y: y,
str: item.str,
dir: item.dir,
width: item.width,
height: item.height,
fontName: item.fontName
};
});
}).then(function () {
// console.log('done page parsing');
});
})
};
// Loading of the first page will wait on metadata and subsequent loadings
// will wait on the previous pages.
for (var i = 1; i <= numPages; i++) {
lastPromise = lastPromise.then(loadPage.bind(null, i));
}
return lastPromise;
}).then(function () {
// Everything loaded: hand the assembled result to the caller.
cb(null, pdf);
}, function (err) {
// Any rejection from loading the document, metadata or pages ends up here.
cb(err);
});
});
/**
 * Extract metadata and positioned text items from a PDF held in memory.
 *
 * @param {Buffer|Uint8Array} buffer - bytes of the PDF; anything accepted by
 *   `new Uint8Array(...)`, e.g. the result of `fs.readFileSync`.
 * @param {Object} options - accepted as part of the signature; not read here.
 * @param {Function} cb - called as `cb(err)` when loading fails, otherwise as
 *   `cb(null, pdf)` where `pdf` has `meta`, `pages` and `pdfInfo`.
 */
PDFExtract.prototype.extractBuffer = function(buffer, options, cb) {
  const bytes = new Uint8Array(buffer);
  const result = {
    meta: {},
    pages: []
  };

  // Load a single page: record its viewport properties first, then attach
  // the text items once the text content has been fetched.
  const readPage = (doc, pageNum) =>
    doc.getPage(pageNum).then((page) => {
      const viewport = page.getViewport(1.0 /* scale */);
      const pageEntry = {
        pageInfo: {
          num: pageNum,
          scale: viewport.scale,
          rotation: viewport.rotation,
          offsetX: viewport.offsetX,
          offsetY: viewport.offsetY,
          width: viewport.width,
          height: viewport.height,
          fontScale: viewport.fontScale
        }
      };
      result.pages.push(pageEntry);

      return page.getTextContent().then((content) => {
        // Keep only position, string, direction, size and font of each item.
        pageEntry.content = content.items.map((item) => {
          // Only a rotation of 90 swaps the two transform entries; every
          // other rotation uses the default mapping.
          const swapAxes = viewport.rotation == 90;
          const x = swapAxes ? item.transform[5] : item.transform[4];
          const y = swapAxes
            ? item.transform[4]
            : pageEntry.pageInfo.height - item.transform[5];
          return {
            x,
            y,
            str: item.str,
            dir: item.dir,
            width: item.width,
            height: item.height,
            fontName: item.fontName
          };
        });
      });
    });

  // Promise-based loading: metadata first, then the pages one after another,
  // each waiting for the previous step to finish.
  pdfjsLib.getDocument({ data: bytes })
    .then((doc) => {
      const pageCount = doc.numPages;
      result.pdfInfo = doc.pdfInfo;

      let chain = doc.getMetadata().then((metadata) => {
        result.meta = metadata;
      });
      for (let pageNum = 1; pageNum <= pageCount; pageNum++) {
        chain = chain.then(() => readPage(doc, pageNum));
      }
      return chain;
    })
    .then(
      () => {
        cb(null, result);
      },
      (err) => {
        cb(err);
      }
    );
};

PDFExtract.utils = utils;
Expand Down

0 comments on commit 59f988c

Please sign in to comment.