-
-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathindex.js
110 lines (93 loc) · 3 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
const fs = require('fs');
const path = require('path');
const _ = require('highland');
const fastXmlParser = require('fast-xml-parser');
const yargs = require('yargs');
//const resultsPath = './CBP_FOIA_Response_OCR.json';
const pdf = require('./pdf');
const gcv = require('./gcv');
const utils = require('./utils');
/**
 * Debugging aid: prints the value it receives to stdout.
 *
 * The parameter is named `yargs` and therefore shadows the module-level
 * `yargs` import inside this function.
 *
 * @param {*} yargs - Value to print.
 * @returns {void}
 */
function test(yargs) {
  const received = yargs;
  console.log(received);
}
/**
 * Returns `fileName` with its extension swapped for `.json`.
 *
 * Only the real (trailing) extension is replaced, so a directory component
 * that happens to contain the same text (e.g. `scan.png_ocr/page.png`) is
 * left alone. A name without an extension simply gets `.json` appended.
 *
 * @param {string} fileName - Path of an image file.
 * @returns {string} The same path with a `.json` extension.
 */
function toJsonFileName(fileName) {
  const extension = path.extname(fileName);
  if (extension === '') {
    return `${fileName}.json`;
  }
  // lastIndexOf targets the extension reported by path.extname rather than
  // an earlier look-alike somewhere in the directory part of the path.
  const extensionStart = fileName.lastIndexOf(extension);
  const head = fileName.slice(0, extensionStart);
  const tail = fileName.slice(extensionStart + extension.length);
  return `${head}.json${tail}`;
}

/**
 * Handler for the `ocr <inputPDF>` command.
 *
 * Passes the PDF to `pdf.convertPDF`, writes the resulting metadata object to
 * `metadata.json` inside the directory named by `metadata.outputDir`, sends
 * the entries of `metadata.pdfImages` through the `gcv` helpers in batches of
 * five, and writes each response as JSON next to its image (same name,
 * `.json` extension). Every written file name is printed to stdout.
 *
 * @param {object} yargs - Parsed command-line arguments.
 * @param {string} yargs.inputPDF - Path of the PDF to process.
 * @param {string} [yargs.outputDir] - Output directory; defaults to the PDF's
 *   base name followed by `_ocr`.
 * @returns {void}
 */
function ocrPDF(yargs) {
  const inputFile = yargs.inputPDF;
  const outputDir = yargs.outputDir || `${path.basename(inputFile)}_ocr`;

  const pdfStream = _(pdf.convertPDF(inputFile, outputDir))
    .doto(metadata => {
      // Persist the conversion metadata before the image list is consumed.
      fs.writeFileSync(path.join(metadata.outputDir, 'metadata.json'), JSON.stringify(metadata));
    })
    .pluck('pdfImages')
    .flatten();

  // One consumer feeds the OCR requests; the observer supplies the matching
  // output file names that are zipped back onto the responses below.
  const gcvStream = pdfStream.fork();
  const imageStream = pdfStream.observe()
    .map(fileName => toJsonFileName(fileName));

  gcvStream
    .through(gcv.prepareGCV)
    .batch(5)
    .flatMap(batch => _(gcv.performGCV(batch)))
    .flatten()
    .pluck('responses')
    .flatten()
    .map(response => JSON.stringify(response))
    .zip(imageStream)
    .flatMap(result => _(writeFile(result)))
    .each(fileName => {
      console.log(`${fileName}`);
    });
}
/**
 * Reads the XML description of a converted PDF and feeds its image pages,
 * annotated with the path of their OCR JSON, through `pdf.parseGCV` and
 * `pdf.enrichPage`, then finalises the document with `endPDF()`.
 *
 * The XML is expected at `<pdf dir>/<outputDir>/<pdf name>.xml`.
 *
 * Earlier revisions read `inputFile` and `outputDir` without declaring them,
 * so any call threw a ReferenceError; they are now taken from the
 * parsed-arguments object, the same way `ocrPDF` obtains them.
 *
 * @param {object} yargs - Parsed command-line arguments.
 * @param {string} yargs.inputPDF - Path of the source PDF.
 * @param {string} [yargs.outputDir] - Directory holding the XML and OCR
 *   output; defaults to the PDF's base name followed by `_ocr`.
 * @returns {void}
 */
function recreatePDF(yargs) {
  const inputFile = yargs.inputPDF;
  const outputDir = yargs.outputDir || `${path.basename(inputFile)}_ocr`;

  const pdfPath = path.parse(inputFile);
  // `name` is the base name without its extension, so only the real
  // extension is swapped even when the same text occurs earlier in the name.
  const pdfXML = path.join(pdfPath.dir, outputDir, `${pdfPath.name}.xml`);
  const inputPDF = pdf.initializePDF(inputFile);

  _(utils.readFileAsync(pdfXML, 'utf-8'))
    .map(xml => fastXmlParser.parse(xml, {
      ignoreAttributes: false,
      attributeNamePrefix: '',
      parseAttributeValue: true,
      textNodeName: '@text'
    }))
    .pluck('pdf2xml')
    .pluck('page')
    .map(utils.hasToBeArray)
    .flatten()
    .filter(page => Object.prototype.hasOwnProperty.call(page, 'image'))
    // NOTE(review): only the first page containing an image is processed;
    // this looks like a development-time limit - confirm before relying on it.
    .take(1)
    .map(page => {
      // Point every image at the JSON file that sits beside it (same
      // directory and base name, `.json` extension).
      page.image = utils.hasToBeArray(page.image).map(image => {
        const imagePath = path.parse(image.src);
        image.ocr = path.join(imagePath.dir, `${imagePath.name}.json`);
        return image;
      });
      return page;
    })
    .flatMap(page => _(pdf.parseGCV(inputPDF, page)))
    .reduce(inputPDF, pdf.enrichPage)
    .done(() => inputPDF.endPDF());
}
/**
 * Placeholder: not implemented yet.
 *
 * @returns {void}
 */
function addGCVText() {
  // TODO: read the GCV input and add in the text accordingly.
}
/**
 * Writes one OCR result to disk via `utils.writeFileAsync`.
 *
 * @param {Array} result - Pair whose element 0 is the content to write and
 *   whose element 1 is the destination file path.
 * @returns {Promise<string>} Resolves with the destination path after the
 *   write has completed.
 */
async function writeFile(result) {
  const { 0: payload, 1: destination } = result;
  await utils.writeFileAsync(destination, payload);
  return destination;
}
// Command-line interface: registers the `ocr <inputPDF>` command (handled by
// ocrPDF) and the --outputDir / -o option, and aliases -h to --help.

// The `ocr` command needs no command-specific option setup.
const ocrCommandBuilder = () => {};

const outputDirOption = {
  alias: 'o',
  describe: 'Path to write images & JSON files to.',
  type: 'string'
};

// Per the yargs documentation, reading `.argv` at the end of the chain is
// what makes yargs parse the process arguments and dispatch the command.
yargs
  .usage('Usage: $0 <command> <inputPDF> [options]')
  .command('ocr <inputPDF>', 'run pdf to gocr to output json file', ocrCommandBuilder, ocrPDF)
  .option('outputDir', outputDirOption)
  // NOTE(review): `ocr` is registered above as a command, while describe()
  // documents an option key - confirm this line is intentional.
  .describe('ocr', 'OCR a PDF File')
  .alias('h', 'help').argv;
//ocrPDF()