From 3678dbceae3f50929b7b3748bbe6b6bd9f767bf3 Mon Sep 17 00:00:00 2001 From: Balearica Date: Wed, 14 Sep 2022 18:07:56 -0700 Subject: [PATCH 01/21] Added image preprocessing functions (rotate + save images) --- examples/browser/image-processing.html | 60 ++++++++ src/constants/imageType.js | 5 + src/createWorker.js | 9 ++ src/index.d.ts | 16 +++ src/worker-script/index.js | 136 +++++++++++++++++- .../utils/arrayBufferToBase64.js | 56 ++++++++ src/worker-script/utils/setImage.js | 6 +- 7 files changed, 282 insertions(+), 6 deletions(-) create mode 100644 examples/browser/image-processing.html create mode 100644 src/constants/imageType.js create mode 100644 src/worker-script/utils/arrayBufferToBase64.js diff --git a/examples/browser/image-processing.html b/examples/browser/image-processing.html new file mode 100644 index 000000000..cafb6d218 --- /dev/null +++ b/examples/browser/image-processing.html @@ -0,0 +1,60 @@ + + + + + + + + + + +
+
+

Input Image

+ +
+
+

Rotated, Original Color

+ +
+
+

Rotated, Grey

+ +
+
+

Rotated, Binary

+ +
+
+ + + + + \ No newline at end of file diff --git a/src/constants/imageType.js b/src/constants/imageType.js new file mode 100644 index 000000000..e21ccfe35 --- /dev/null +++ b/src/constants/imageType.js @@ -0,0 +1,5 @@ +module.exports = { + ORIGINAL: 0, + GREY: 1, + BINARY: 2, +}; diff --git a/src/createWorker.js b/src/createWorker.js index d6a5875b9..3c3b6263f 100644 --- a/src/createWorker.js +++ b/src/createWorker.js @@ -123,6 +123,14 @@ module.exports = (_options = {}) => { })) ); + const threshold = async (image, opts = {}, jobId) => ( + startJob(createJob({ + id: jobId, + action: 'threshold', + payload: { image: await loadImage(image), options: opts }, + })) + ); + const getPDF = (title = 'Tesseract OCR Result', textonly = false, jobId) => ( startJob(createJob({ id: jobId, @@ -191,6 +199,7 @@ module.exports = (_options = {}) => { initialize, setParameters, recognize, + threshold, getPDF, detect, terminate, diff --git a/src/index.d.ts b/src/index.d.ts index 2a7f265db..bcc5d33e3 100644 --- a/src/index.d.ts +++ b/src/index.d.ts @@ -22,7 +22,9 @@ declare namespace Tesseract { loadLanguage(langs?: string, jobId?: string): Promise initialize(langs?: string, oem?: OEM, jobId?: string): Promise setParameters(params: Partial, jobId?: string): Promise + getImage(type: imageType): string recognize(image: ImageLike, options?: Partial, jobId?: string): Promise + threshold(image: ImageLike, options?: Partial, jobId?: string): Promise detect(image: ImageLike, jobId?: string): Promise terminate(jobId?: string): Promise } @@ -53,6 +55,11 @@ declare namespace Tesseract { } interface RecognizeOptions { rectangle: Rectangle + saveImageOriginal: boolean + saveImageGrey: boolean + saveImageBinary: boolean + rotateAuto: boolean + rotateRadians: float } interface ConfigResult { jobId: string @@ -100,6 +107,11 @@ declare namespace Tesseract { SPARSE_TEXT = '11', SPARSE_TEXT_OSD = '12', } + const enum imageType { + ORIGINAL = 0, + GREY = 1, + BINARY = 2 + } type ImageLike = string | HTMLImageElement | HTMLCanvasElement | HTMLVideoElement | CanvasRenderingContext2D | File | Blob | ImageData | Buffer; interface Block { @@ -213,6 +225,10 @@ declare namespace Tesseract { box: string | null; unlv: string | null; sd: string | null; + imageOriginal: string; + imageGrey: string; + imageBinary: string; + rotateRadians: number; } } diff --git a/src/worker-script/index.js b/src/worker-script/index.js index 1e2cfb673..3634f1c9b 100644 --- a/src/worker-script/index.js +++ b/src/worker-script/index.js @@ -15,6 +15,9 @@ const isWebWorker = require('../utils/getEnvironment')('type') === 'webworker'; const setImage = require('./utils/setImage'); const defaultParams = require('./constants/defaultParams'); const { log, setLogging } = require('../utils/log'); +const arrayBufferToBase64 = require('./utils/arrayBufferToBase64'); +const imageType = require('../constants/imageType'); +const PSM = require('../constants/PSM'); /* * Tesseract Module returned by TesseractCore. @@ -197,14 +200,140 @@ const initialize = ({ } }; -const recognize = ({ payload: { image, options: { rectangle: rec } } }, res) => { +const getImage = (type) => { + api.WriteImage(type, '/image.png'); + const pngBuffer = TessModule.FS.readFile('/image.png'); + const pngStr = `data:image/png;base64,${arrayBufferToBase64(pngBuffer.buffer)}`; + TessModule.FS.unlink('/image.png'); + return pngStr; +}; + +const recognize = ({ + payload: { + image, options: { + rectangle: rec, saveImageOriginal, saveImageGrey, saveImageBinary, rotateAuto, rotateRadians, + }, + }, +}, res) => { try { - const ptr = setImage(TessModule, api, image); + // When the auto-rotate option is True, setImage is called with no angle, + // then the angle is calculated by Tesseract and then setImage is re-called. + // Otherwise, setImage is called once using the user-provided rotateRadiansFinal value. + let ptr; + let rotateRadiansFinal; + if (rotateAuto) { + // The angle is only detected if auto page segmentation is used + // Therefore, if this is not the mode specified by the user, it is enabled temporarily here + const psmInit = api.GetPageSegMode(); + let psmEdit = false; + if (![PSM.AUTO, PSM.AUTO_ONLY, PSM.OSD].includes(psmInit)) { + psmEdit = true; + api.SetVariable('tessedit_pageseg_mode', String(PSM.AUTO)); + } + + ptr = setImage(TessModule, api, image); + api.FindLines(); + const rotateRadiansCalc = api.GetAngle(); + + // Restore user-provided PSM setting + if (psmEdit) { + api.SetVariable('tessedit_pageseg_mode', String(psmInit)); + } + + // Small angles (<0.005 radians/~0.3 degrees) are ignored to save on runtime + if (Math.abs(rotateRadiansCalc) >= 0.005) { + rotateRadiansFinal = rotateRadiansCalc; + ptr = setImage(TessModule, api, image, rotateRadiansFinal); + } else { + // Image needs to be reset if run with different PSM setting earlier + if (psmEdit) { + ptr = setImage(TessModule, api, image); + } + rotateRadiansFinal = 0; + } + } else { + rotateRadiansFinal = rotateRadians || 0; + ptr = setImage(TessModule, api, image, rotateRadiansFinal); + } + if (typeof rec === 'object') { api.SetRectangle(rec.left, rec.top, rec.width, rec.height); } api.Recognize(null); - res.resolve(dump(TessModule, api, params)); + const result = dump(TessModule, api, params); + if (saveImageOriginal) { + result.imageOriginal = getImage(imageType.ORIGINAL); + } + if (saveImageGrey) { + result.imageGrey = getImage(imageType.GREY); + } + if (saveImageBinary) { + result.imageBinary = getImage(imageType.BINARY); + } + result.rotateRadians = rotateRadiansFinal; + res.resolve(result); + TessModule._free(ptr); + } catch (err) { + res.reject(err.toString()); + } +}; + +// `threshold` is similar to `recognize` except it skips the recognition step +// Useful for getting rotated/binarized images without running recognition +const threshold = ({ + payload: { + image, options: { + rectangle: rec, saveImageOriginal, saveImageGrey, saveImageBinary, rotateAuto, rotateRadians, + }, + }, +}, res) => { + try { + let ptr; + let rotateRadiansFinal; + if (rotateAuto) { + const psmInit = api.GetPageSegMode(); + let psmEdit = false; + if (![PSM.AUTO, PSM.AUTO_ONLY, PSM.OSD].includes(psmInit)) { + psmEdit = true; + api.SetVariable('tessedit_pageseg_mode', String(PSM.AUTO)); + } + + ptr = setImage(TessModule, api, image); + api.FindLines(); + const rotateRadiansCalc = api.GetAngle(); + + // Restore user-provided PSM setting + if (psmEdit) { + api.SetVariable('tessedit_pageseg_mode', String(psmInit)); + } + + // Small angles (<0.005 radians/~0.3 degrees) are ignored to save on runtime + if (Math.abs(rotateRadiansCalc) >= 0.005) { + rotateRadiansFinal = rotateRadiansCalc; + ptr = setImage(TessModule, api, image, rotateRadiansFinal); + } else { + rotateRadiansFinal = 0; + } + } else { + rotateRadiansFinal = rotateRadians || 0; + ptr = setImage(TessModule, api, image, rotateRadiansFinal); + } + + if (typeof rec === 'object') { + api.SetRectangle(rec.left, rec.top, rec.width, rec.height); + } + const result = {}; + if (saveImageOriginal) { + result.imageOriginal = getImage(imageType.ORIGINAL); + } + if (saveImageGrey) { + result.imageGrey = getImage(imageType.GREY); + } + if (saveImageBinary) { + result.imageBinary = getImage(imageType.BINARY); + } + result.rotateRadians = rotateRadiansFinal; + res.resolve(result); TessModule._free(ptr); } catch (err) { res.reject(err.toString()); @@ -295,6 +424,7 @@ exports.dispatchHandlers = (packet, send) => { initialize, setParameters, recognize, + threshold, getPDF, detect, terminate, diff --git a/src/worker-script/utils/arrayBufferToBase64.js b/src/worker-script/utils/arrayBufferToBase64.js new file mode 100644 index 000000000..727f6f535 --- /dev/null +++ b/src/worker-script/utils/arrayBufferToBase64.js @@ -0,0 +1,56 @@ +// Copied from https://gist.github.com/jonleighton/958841 +// Copyright 2011 Jon Leighton, MIT LICENSE + +/* eslint no-bitwise: 0 */ +module.exports = (arrayBuffer) => { + let base64 = ''; + const encodings = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'; + + const bytes = new Uint8Array(arrayBuffer); + const { byteLength } = bytes; + const byteRemainder = byteLength % 3; + const mainLength = byteLength - byteRemainder; + + let a; let b; let c; let + d; + let chunk; + + // Main loop deals with bytes in chunks of 3 + for (let i = 0; i < mainLength; i += 3) { + // Combine the three bytes into a single integer + chunk = (bytes[i] << 16) | (bytes[i + 1] << 8) | bytes[i + 2]; + + // Use bitmasks to extract 6-bit segments from the triplet + a = (chunk & 16515072) >> 18; // 16515072 = (2^6 - 1) << 18 + b = (chunk & 258048) >> 12; // 258048 = (2^6 - 1) << 12 + c = (chunk & 4032) >> 6; // 4032 = (2^6 - 1) << 6 + d = chunk & 63; // 63 = 2^6 - 1 + + // Convert the raw binary segments to the appropriate ASCII encoding + base64 += encodings[a] + encodings[b] + encodings[c] + encodings[d]; + } + + // Deal with the remaining bytes and padding + if (byteRemainder === 1) { + chunk = bytes[mainLength]; + + a = (chunk & 252) >> 2; // 252 = (2^6 - 1) << 2 + + // Set the 4 least significant bits to zero + b = (chunk & 3) << 4; // 3 = 2^2 - 1 + + base64 += `${encodings[a] + encodings[b]}==`; + } else if (byteRemainder === 2) { + chunk = (bytes[mainLength] << 8) | bytes[mainLength + 1]; + + a = (chunk & 64512) >> 10; // 64512 = (2^6 - 1) << 10 + b = (chunk & 1008) >> 4; // 1008 = (2^6 - 1) << 4 + + // Set the 2 least significant bits to zero + c = (chunk & 15) << 2; // 15 = 2^4 - 1 + + base64 += `${encodings[a] + encodings[b] + encodings[c]}=`; + } + + return base64; +}; diff --git a/src/worker-script/utils/setImage.js b/src/worker-script/utils/setImage.js index 3e0904503..7ebc194c9 100644 --- a/src/worker-script/utils/setImage.js +++ b/src/worker-script/utils/setImage.js @@ -8,7 +8,7 @@ const fileType = require('file-type'); * @function set image in tesseract for recognition * @access public */ -module.exports = (TessModule, api, image) => { +module.exports = (TessModule, api, image, angle = 0) => { const buf = Buffer.from(Array.from({ ...image, length: Object.keys(image).length })); const type = fileType(buf); let bytesPerPixel = 0; @@ -55,9 +55,9 @@ module.exports = (TessModule, api, image) => { * */ if (data === null) { - api.SetImage(pix, undefined, undefined, undefined, undefined, exif); + api.SetImage(pix, undefined, undefined, undefined, undefined, exif, angle); } else { - api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel, exif); + api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel, exif, angle); } return data === null ? pix : data; }; From 0277db26b105e9b33d0dab23edcffbb10c086afd Mon Sep 17 00:00:00 2001 From: Balearica Date: Wed, 14 Sep 2022 22:38:18 -0700 Subject: [PATCH 02/21] Updated createWorker to be async --- src/createWorker.js | 25 +++++++++++++++++++++++-- src/worker-script/index.js | 32 ++++++++++++++++++-------------- tests/FS.test.js | 5 +++-- tests/detect.test.js | 5 +++-- tests/error.test.html | 18 ++++++++++++++++++ tests/error.test.js | 29 +++++++++++++++++++++++++++++ tests/recognize.test.js | 3 ++- tests/scheduler.test.js | 2 +- 8 files changed, 97 insertions(+), 22 deletions(-) create mode 100644 tests/error.test.html create mode 100644 tests/error.test.js diff --git a/src/createWorker.js b/src/createWorker.js index 3c3b6263f..83227ec8f 100644 --- a/src/createWorker.js +++ b/src/createWorker.js @@ -15,7 +15,7 @@ const { let workerCounter = 0; -module.exports = (_options = {}) => { +module.exports = async (_options = {}) => { const id = getId('Worker', workerCounter); const { logger, @@ -27,7 +27,18 @@ module.exports = (_options = {}) => { }); const resolves = {}; const rejects = {}; + + let resReject; + let resResolve; + const res = new Promise((resolve, reject) => { + resResolve = resolve; + resReject = reject; + }); + let workerError = (event) => {resReject(event.message)}; + let worker = spawnWorker(options); + // worker.addEventListener("error", workerError); + worker.onerror = workerError; workerCounter += 1; @@ -185,7 +196,7 @@ module.exports = (_options = {}) => { } }); - return { + const resolveObj = { id, worker, setResolve, @@ -204,4 +215,14 @@ module.exports = (_options = {}) => { detect, terminate, }; + + startJob(createJob({ + id: undefined, action: 'checkWorker', + })).then(() => { + console.log("Created worker"); + // worker.removeEventListener("error", workerError); + resResolve(resolveObj)}); + + return res; + }; diff --git a/src/worker-script/index.js b/src/worker-script/index.js index 3634f1c9b..1b07899bf 100644 --- a/src/worker-script/index.js +++ b/src/worker-script/index.js @@ -57,7 +57,7 @@ const load = async ({ workerId, jobId, payload: { options: { corePath, logging } } }; -const FS = ({ workerId, payload: { method, args } }, res) => { +const FS = async ({ workerId, payload: { method, args } }, res) => { log(`[${workerId}]: FS.${method} with args ${args}`); res.resolve(TessModule.FS[method](...args)); }; @@ -159,7 +159,7 @@ res) => { } }; -const setParameters = ({ payload: { params: _params } }, res) => { +const setParameters = async ({ payload: { params: _params } }, res) => { Object.keys(_params) .filter((k) => !k.startsWith('tessjs_')) .forEach((key) => { @@ -172,7 +172,7 @@ const setParameters = ({ payload: { params: _params } }, res) => { } }; -const initialize = ({ +const initialize = async ({ workerId, payload: { langs: _langs, oem }, }, res) => { @@ -208,7 +208,7 @@ const getImage = (type) => { return pngStr; }; -const recognize = ({ +const recognize = async ({ payload: { image, options: { rectangle: rec, saveImageOriginal, saveImageGrey, saveImageBinary, rotateAuto, rotateRadians, @@ -280,7 +280,7 @@ const recognize = ({ // `threshold` is similar to `recognize` except it skips the recognition step // Useful for getting rotated/binarized images without running recognition -const threshold = ({ +const threshold = async ({ payload: { image, options: { rectangle: rec, saveImageOriginal, saveImageGrey, saveImageBinary, rotateAuto, rotateRadians, @@ -340,7 +340,7 @@ const threshold = ({ } }; -const getPDF = ({ payload: { title, textonly } }, res) => { +const getPDF = async ({ payload: { title, textonly } }, res) => { const pdfRenderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly); pdfRenderer.BeginDocument(title); pdfRenderer.AddImage(api); @@ -350,7 +350,7 @@ const getPDF = ({ payload: { title, textonly } }, res) => { res.resolve(TessModule.FS.readFile('/tesseract-ocr.pdf')); }; -const detect = ({ payload: { image } }, res) => { +const detect = async ({ payload: { image } }, res) => { try { const ptr = setImage(TessModule, api, image); const results = new TessModule.OSResults(); @@ -379,7 +379,7 @@ const detect = ({ payload: { image } }, res) => { } }; -const terminate = (_, res) => { +const terminate = async (_, res) => { try { if (api !== null) { api.End(); @@ -390,6 +390,13 @@ const terminate = (_, res) => { } }; +// Function that always resolves +// Used to confirm that worker was successfully created +const checkWorker = async (_, res) => { + res.resolve(); +}; + + /** * dispatchHandlers * @@ -416,7 +423,6 @@ exports.dispatchHandlers = (packet, send) => { latestJob = res; - try { ({ load, FS, @@ -428,11 +434,9 @@ exports.dispatchHandlers = (packet, send) => { getPDF, detect, terminate, - })[packet.action](packet, res); - } catch (err) { - /** Prepare exception to travel through postMessage */ - res.reject(err.toString()); - } + checkWorker + })[packet.action](packet, res) + .catch((err) => res.reject(err.toString())); }; /** diff --git a/tests/FS.test.js b/tests/FS.test.js index 393926f67..5fea777c6 100644 --- a/tests/FS.test.js +++ b/tests/FS.test.js @@ -1,8 +1,9 @@ const { createWorker } = Tesseract; const FS_WAIT = 500; -const worker = createWorker(OPTIONS); -before(function cb() { +let worker; +before(async function cb() { this.timeout(0); + worker = await createWorker(OPTIONS); return worker.load(); }); diff --git a/tests/detect.test.js b/tests/detect.test.js index cadef05e6..0903e91ab 100644 --- a/tests/detect.test.js +++ b/tests/detect.test.js @@ -1,7 +1,8 @@ const { createWorker } = Tesseract; -const worker = createWorker(OPTIONS); -before(function cb() { +let worker; +before(async function cb() { this.timeout(0); + worker = await createWorker(OPTIONS); return worker.load(); }); diff --git a/tests/error.test.html b/tests/error.test.html new file mode 100644 index 000000000..a25e16263 --- /dev/null +++ b/tests/error.test.html @@ -0,0 +1,18 @@ + + + + + + +
+ + + + + + + + + diff --git a/tests/error.test.js b/tests/error.test.js new file mode 100644 index 000000000..1c568636e --- /dev/null +++ b/tests/error.test.js @@ -0,0 +1,29 @@ +// const { createWorker } = Tesseract; +// const worker = createWorker(OPTIONS); +// before(function cb() { +// this.timeout(0); +// return worker.load(); +// }); + + +(IS_BROWSER ? describe : describe.skip)('Invalid paths should result in promise rejection', () => { + it('Invalid workerPath', async () => { + const OPTIONS1 = JSON.parse(JSON.stringify(OPTIONS)); + OPTIONS1.corePath = "badpath.js"; + let errorThrown; + try { + const worker = Tesseract.createWorker(OPTIONS1); + await worker.load() + errorThrown = false; + } catch (error) { + errorThrown = true; + } + + expect(errorThrown).to.equal(true); + + // expect(func).to.throwError(); + + // const ret = await (worker.load().then(() => true).catch(() => false)); + // expect(ret).to.equal(false); + }).timeout(TIMEOUT); +}); diff --git a/tests/recognize.test.js b/tests/recognize.test.js index a65f108c8..9df5127b2 100644 --- a/tests/recognize.test.js +++ b/tests/recognize.test.js @@ -1,7 +1,8 @@ const { createWorker, PSM } = Tesseract; -const worker = createWorker(OPTIONS); +let worker; before(async function cb() { this.timeout(0); + worker = await createWorker(OPTIONS); await worker.load(); await worker.loadLanguage('eng+chi_tra+osd'); }); diff --git a/tests/scheduler.test.js b/tests/scheduler.test.js index b47cc83e6..a59d516dc 100644 --- a/tests/scheduler.test.js +++ b/tests/scheduler.test.js @@ -7,7 +7,7 @@ before(async function cb() { const NUM_WORKERS = 5; console.log(`Initializing ${NUM_WORKERS} workers`); workers = await Promise.all(Array(NUM_WORKERS).fill(0).map(async () => { - const w = createWorker(OPTIONS); + const w = await createWorker(OPTIONS); await w.load(); await w.loadLanguage('eng'); await w.initialize('eng'); From b87afe9d93c2f96fcb1cac717710c631c954c5fb Mon Sep 17 00:00:00 2001 From: Balearica Date: Sat, 17 Sep 2022 13:50:08 -0700 Subject: [PATCH 03/21] Reworked createWorker to be async and throw errors per #654 --- examples/node/benchmark.js | 4 +--- examples/node/download-pdf.js | 3 +-- examples/node/recognize.js | 7 +++---- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/examples/node/benchmark.js b/examples/node/benchmark.js index 728d1d528..28d3fe725 100644 --- a/examples/node/benchmark.js +++ b/examples/node/benchmark.js @@ -2,10 +2,8 @@ const path = require('path'); const { createWorker } = require('../../'); -const worker = createWorker(); - (async () => { - await worker.load(); + const worker = await createWorker(); await worker.loadLanguage('eng'); await worker.initialize('eng'); const fileArr = ["../data/meditations.jpg", "../data/tyger.jpg", "../data/testocr.png"]; diff --git a/examples/node/download-pdf.js b/examples/node/download-pdf.js index f61e55cbe..2f7ed2490 100755 --- a/examples/node/download-pdf.js +++ b/examples/node/download-pdf.js @@ -9,8 +9,7 @@ const image = path.resolve(__dirname, (imagePath || '../../tests/assets/images/c console.log(`Recognizing ${image}`); (async () => { - const worker = createWorker(); - await worker.load(); + const worker = await createWorker(); await worker.loadLanguage('eng'); await worker.initialize('eng'); const { data: { text } } = await worker.recognize(image); diff --git a/examples/node/recognize.js b/examples/node/recognize.js index c77955e5e..082ffa0ce 100755 --- a/examples/node/recognize.js +++ b/examples/node/recognize.js @@ -6,12 +6,11 @@ const [,, imagePath] = process.argv; const image = path.resolve(__dirname, (imagePath || '../../tests/assets/images/cosmic.png')); console.log(`Recognizing ${image}`); -const worker = createWorker({ - logger: m => console.log(m), -}); (async () => { - await worker.load(); + const worker = await createWorker({ + logger: m => console.log(m), + }); await worker.loadLanguage('eng'); await worker.initialize('eng'); const { data: { text } } = await worker.recognize(image); From ca99c35d141999e9df1aded1c9df6b861691da95 Mon Sep 17 00:00:00 2001 From: Balearica Date: Sat, 17 Sep 2022 13:50:24 -0700 Subject: [PATCH 04/21] Reworked createWorker to be async and throw errors per #654 --- README.md | 3 +- docs/api.md | 32 +- docs/examples.md | 33 +- docs/faq.md | 3 +- docs/local-installation.md | 2 +- examples/browser/basic-edge.html | 18 +- examples/browser/benchmark.html | 3 +- examples/browser/download-pdf.html | 3 +- examples/browser/image-processing.html | 5 +- package-lock.json | 2887 +++++++++++------------- package.json | 4 +- src/Tesseract.js | 6 +- src/createWorker.js | 13 +- src/index.d.ts | 2 +- src/worker-script/index.js | 8 - tests/FS.test.js | 1 - tests/detect.test.js | 13 +- tests/error.test.js | 45 +- tests/recognize.test.js | 14 +- tests/scheduler.test.js | 1 - 20 files changed, 1374 insertions(+), 1722 deletions(-) diff --git a/README.md b/README.md index f945ad994..241972abc 100644 --- a/README.md +++ b/README.md @@ -46,12 +46,11 @@ Or more imperative ```javascript import { createWorker } from 'tesseract.js'; -const worker = createWorker({ +const worker = await createWorker({ logger: m => console.log(m) }); (async () => { - await worker.load(); await worker.loadLanguage('eng'); await worker.initialize('eng'); const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); diff --git a/docs/api.md b/docs/api.md index f66e35982..8e795b010 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1,7 +1,6 @@ # API - [createWorker()](#create-worker) - - [Worker.load](#worker-load) - [Worker.writeText](#worker-writeText) - [Worker.readText](#worker-readText) - [Worker.removeFile](#worker-removeFile) @@ -53,7 +52,7 @@ createWorker is a factory function that creates a tesseract worker, a worker is ```javascript const { createWorker } = Tesseract; -const worker = createWorker({ +const worker = await createWorker({ langPath: '...', logger: m => console.log(m), }); @@ -63,7 +62,6 @@ const worker = createWorker({ A Worker helps you to do the OCR related tasks, it takes few steps to setup Worker before it is fully functional. The full flow is: -- load - FS functions // optional - loadLanguauge - initialize @@ -82,23 +80,6 @@ Each function is async, so using async/await or Promise is required. When it is jobId is generated by Tesseract.js, but you can put your own when calling any of the function above. - -### Worker.load(jobId): Promise - -Worker.load() loads tesseract.js-core scripts (download from remote if not presented), it makes Web Worker/Child Process ready for next action. - -**Arguments:** - -- `jobId` Please see details above - -**Examples:** - -```javascript -(async () => { - await worker.load(); -})(); -``` - ### Worker.writeText(path, text, jobId): Promise @@ -273,8 +254,7 @@ Figures out what words are in `image`, where the words are in `image`, etc. ```javascript const { createWorker } = Tesseract; (async () => { - const worker = createWorker(); - await worker.load(); + const worker = await createWorker(); await worker.loadLanguage('eng'); await worker.initialize('eng'); const { data: { text } } = await worker.recognize(image); @@ -287,8 +267,7 @@ With rectangle ```javascript const { createWorker } = Tesseract; (async () => { - const worker = createWorker(); - await worker.load(); + const worker = await createWorker(); await worker.loadLanguage('eng'); await worker.initialize('eng'); const { data: { text } } = await worker.recognize(image, { @@ -313,8 +292,7 @@ Worker.detect() does OSD (Orientation and Script Detection) to the image instead ```javascript const { createWorker } = Tesseract; (async () => { - const worker = createWorker(); - await worker.load(); + const worker = await createWorker(); await worker.loadLanguage('eng'); await worker.initialize('eng'); const { data } = await worker.detect(image); @@ -361,7 +339,7 @@ Scheduler.addWorker() adds a worker into the worker pool inside scheduler, it is ```javascript const { createWorker, createScheduler } = Tesseract; const scheduler = createScheduler(); -const worker = createWorker(); +const worker = await createWorker(); scheduler.addWorker(worker); ``` diff --git a/docs/examples.md b/docs/examples.md index 21c93c1eb..188fa13a9 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -7,10 +7,9 @@ You can also check [examples](../examples) folder. ```javascript const { createWorker } = require('tesseract.js'); -const worker = createWorker(); +const worker = await createWorker(); (async () => { - await worker.load(); await worker.loadLanguage('eng'); await worker.initialize('eng'); const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); @@ -24,12 +23,11 @@ const worker = createWorker(); ```javascript const { createWorker } = require('tesseract.js'); -const worker = createWorker({ +const worker = await createWorker({ logger: m => console.log(m), // Add logger here }); (async () => { - await worker.load(); await worker.loadLanguage('eng'); await worker.initialize('eng'); const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); @@ -43,10 +41,9 @@ const worker = createWorker({ ```javascript const { createWorker } = require('tesseract.js'); -const worker = createWorker(); +const worker = await createWorker(); (async () => { - await worker.load(); await worker.loadLanguage('eng+chi_tra'); await worker.initialize('eng+chi_tra'); const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); @@ -59,10 +56,9 @@ const worker = createWorker(); ```javascript const { createWorker } = require('tesseract.js'); -const worker = createWorker(); +const worker = await createWorker(); (async () => { - await worker.load(); await worker.loadLanguage('eng'); await worker.initialize('eng'); await worker.setParameters({ @@ -81,10 +77,9 @@ Check here for more details of pageseg mode: https://github.com/tesseract-ocr/te ```javascript const { createWorker, PSM } = require('tesseract.js'); -const worker = createWorker(); +const worker = await createWorker(); (async () => { - await worker.load(); await worker.loadLanguage('eng'); await worker.initialize('eng'); await worker.setParameters({ @@ -110,11 +105,10 @@ Node: [download-pdf.js](../examples/node/download-pdf.js) ```javascript const { createWorker } = require('tesseract.js'); -const worker = createWorker(); +const worker = await createWorker(); const rectangle = { left: 0, top: 0, width: 500, height: 250 }; (async () => { - await worker.load(); await worker.loadLanguage('eng'); await worker.initialize('eng'); const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png', { rectangle }); @@ -128,7 +122,7 @@ const rectangle = { left: 0, top: 0, width: 500, height: 250 }; ```javascript const { createWorker } = require('tesseract.js'); -const worker = createWorker(); +const worker = await createWorker(); const rectangles = [ { left: 0, @@ -145,7 +139,6 @@ const rectangles = [ ]; (async () => { - await worker.load(); await worker.loadLanguage('eng'); await worker.initialize('eng'); const values = []; @@ -164,8 +157,8 @@ const rectangles = [ const { createWorker, createScheduler } = require('tesseract.js'); const scheduler = createScheduler(); -const worker1 = createWorker(); -const worker2 = createWorker(); +const worker1 = await createWorker(); +const worker2 = await createWorker(); const rectangles = [ { left: 0, @@ -182,8 +175,6 @@ const rectangles = [ ]; (async () => { - await worker1.load(); - await worker2.load(); await worker1.loadLanguage('eng'); await worker2.loadLanguage('eng'); await worker1.initialize('eng'); @@ -204,12 +195,10 @@ const rectangles = [ const { createWorker, createScheduler } = require('tesseract.js'); const scheduler = createScheduler(); -const worker1 = createWorker(); -const worker2 = createWorker(); +const worker1 = await createWorker(); +const worker2 = await createWorker(); (async () => { - await worker1.load(); - await worker2.load(); await worker1.loadLanguage('eng'); await worker2.loadLanguage('eng'); await worker1.initialize('eng'); diff --git a/docs/faq.md b/docs/faq.md index 8f0738e45..900ea7a18 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -19,12 +19,11 @@ Starting from 2.0.0-beta.1, you can get all these information in the final resul ```javascript import { createWorker } from 'tesseract.js'; -const worker = createWorker({ +const worker = await createWorker({ logger: m => console.log(m) }); (async () => { - await worker.load(); await worker.loadLanguage('eng'); await worker.initialize('eng'); await worker.setParameters({ diff --git a/docs/local-installation.md b/docs/local-installation.md index 6832bba0d..f3fd35b5d 100644 --- a/docs/local-installation.md +++ b/docs/local-installation.md @@ -19,7 +19,7 @@ Tesseract.recognize(image, langs, { Or ```javascript -const worker = createWorker({ +const worker = await createWorker({ workerPath: 'https://unpkg.com/tesseract.js@v2.0.0/dist/worker.min.js', langPath: 'https://tessdata.projectnaptha.com/4.0.0', corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js', diff --git a/examples/browser/basic-edge.html b/examples/browser/basic-edge.html index 11125a79a..a2bee278e 100644 --- a/examples/browser/basic-edge.html +++ b/examples/browser/basic-edge.html @@ -6,9 +6,9 @@ - - - - - - - - diff --git a/tests/error.test.js b/tests/error.test.js deleted file mode 100644 index 0b6220aa1..000000000 --- a/tests/error.test.js +++ /dev/null @@ -1,50 +0,0 @@ -// const { createWorker } = Tesseract; -// const worker = await createWorker(OPTIONS); -// before(function cb() { -// this.timeout(0); -// }); - - -(IS_BROWSER ? describe : describe.skip)('Invalid paths should result in promise rejection', () => { - it('Invalid workerPath', async () => { - const OPTIONS1 = JSON.parse(JSON.stringify(OPTIONS)); - OPTIONS1.workerPath = "badpath.js"; - let errorThrown; - // try { - // const worker = await Tesseract.createWorker(OPTIONS1); - // errorThrown = false; - // } catch (error) { - // errorThrown = true; - // } - - // Tesseract.createWorker(OPTIONS1).catch(() => errorThrown = true); - // await Tesseract.createWorker(OPTIONS1).catch(() => { - // errorThrown = true; - // }) - // const func = async () => { - - // await Tesseract.createWorker(OPTIONS1).catch(() => { - // errorThrown = true; - // }) - // return; - // }; - - // await func(); - - await (async () => { - await Tesseract.createWorker(OPTIONS1).catch((x) => { console.log("stuff") }) - // .then((x) => { throw new Error('was not supposed to succeed'); }) - // .catch((x) => { console.log("stuff") }) - return; - })(); - - - // await func().catch(() => console.log("caught")); - - // expect(errorThrown).to.equal(true); - - // expect(func).to.throwError(); - - // expect(ret).to.equal(false); - }).timeout(TIMEOUT); -}); From 622c841f33775744647d6f8a472b02d992dbea91 Mon Sep 17 00:00:00 2001 From: Balearica Date: Sat, 17 Sep 2022 20:30:41 -0700 Subject: [PATCH 08/21] Added savePDF option to recognize per #488; cleaned up code for linter --- examples/browser/download-pdf.html | 11 +++-- src/createWorker.js | 34 +++++++-------- src/index.d.ts | 12 +++-- src/worker-script/index.js | 70 ++++++++++++++++++++---------- 4 files changed, 78 insertions(+), 49 deletions(-) diff --git a/examples/browser/download-pdf.html b/examples/browser/download-pdf.html index a24030282..e562795eb 100644 --- a/examples/browser/download-pdf.html +++ b/examples/browser/download-pdf.html @@ -8,26 +8,29 @@ -