diff --git a/src/createWorker.js b/src/createWorker.js index d3568086c..f27e3fadf 100644 --- a/src/createWorker.js +++ b/src/createWorker.js @@ -127,7 +127,7 @@ module.exports = async (langs = 'eng', oem = OEM.LSTM_ONLY, _options = {}, confi cachePath: options.cachePath, cacheMethod: options.cacheMethod, gzip: options.gzip, - lstmOnly: [OEM.TESSERACT_ONLY, OEM.TESSERACT_LSTM_COMBINED].includes(currentOem) + lstmOnly: [OEM.LSTM_ONLY, OEM.TESSERACT_LSTM_COMBINED].includes(currentOem) && !options.legacyLang, }, }, @@ -159,13 +159,13 @@ module.exports = async (langs = 'eng', oem = OEM.LSTM_ONLY, _options = {}, confi // This logic fails if the user downloaded the LSTM-only English data for a language // and then uses `worker.reinitialize` to switch to the Legacy engine. // However, the correct data will still be downloaded after initialization fails - // and this can be avoided entirely + // and this can be avoided entirely if the user loads the correct data ahead of time. const langsArr = typeof langs === 'string' ? langs.split('+') : langs; - const _langs = langsArr.filter((x) => currentLangs.includes(x)); + const _langs = langsArr.filter((x) => !currentLangs.includes(x)); currentLangs.push(_langs); return loadLanguageInternal(_langs, jobId) - .then(() => initializeInternal(_langs, _oem, _config, jobId)); + .then(() => initializeInternal(langs, _oem, _config, jobId)); }; const setParameters = (params = {}, jobId) => ( diff --git a/tests/assets/traineddata/chi_tra.traineddata b/tests/assets/traineddata/chi_tra.traineddata index 5f1fe2712..daed6f590 100644 Binary files a/tests/assets/traineddata/chi_tra.traineddata and b/tests/assets/traineddata/chi_tra.traineddata differ diff --git a/tests/assets/traineddata/chi_tra.traineddata.gz b/tests/assets/traineddata/chi_tra.traineddata.gz deleted file mode 100644 index 591ece7a2..000000000 Binary files a/tests/assets/traineddata/chi_tra.traineddata.gz and /dev/null differ diff --git a/tests/assets/traineddata/eng.traineddata.gz b/tests/assets/traineddata/eng.traineddata.gz deleted file mode 100644 index d9a2dfa67..000000000 Binary files a/tests/assets/traineddata/eng.traineddata.gz and /dev/null differ diff --git a/tests/assets/traineddata/osd.traineddata.gz b/tests/assets/traineddata/osd.traineddata.gz deleted file mode 100644 index 4df91fba1..000000000 Binary files a/tests/assets/traineddata/osd.traineddata.gz and /dev/null differ diff --git a/tests/constants.js b/tests/constants.js index e3c72a845..06ebd06cf 100644 --- a/tests/constants.js +++ b/tests/constants.js @@ -2,17 +2,16 @@ const TIMEOUT = 30000; const IMAGE_PATH = 'http://localhost:3000/tests/assets/images'; const IS_BROWSER = typeof window !== 'undefined' && typeof window.document !== 'undefined'; const OPTIONS = { - cacheMethod: 'readOnly', - langPath: 'http://localhost:3000/tests/assets/traineddata', cachePath: './tests/assets/traineddata', - corePath: '../node_modules/tesseract.js-core/tesseract-core.wasm.js', + corePath: '../node_modules/tesseract.js-core', ...(IS_BROWSER ? { workerPath: '../dist/worker.min.js' } : {}), }; const SIMPLE_TEXT = 'Tesseract.js\n'; +const SIMPLE_TEXT_LEGACY = 'Tesseractjs\n'; const SIMPLE_TEXT_HALF = 'Tesse\n'; const COMSIC_TEXT = 'HellO World\nfrom beyond\nthe Cosmic Void\n'; const TESTOCR_TEXT = 'This is a lot of 12 point text to test the\nocr code and see if it works on all types\nof file format.\n\nThe quick brown dog jumped over the\nlazy fox. The quick brown dog jumped\nover the lazy fox. The quick brown dog\njumped over the lazy fox. The quick\nbrown dog jumped over the lazy fox.\n'; -const CHINESE_TEXT = '繁 體 中 文 測 試\n'; +const CHINESE_TEXT = '繁體 中 文 測試\n'; const BILL_SPACED_TEXT = 'FIRST CHEQUING\n\nLine of Credit 100,000.00 Rate 4.2000\n\nDate Description Number Debits Credits Balance\n31Jul2018 Balance Forward 99,878.08 -\n01Aug2018 Clearing Cheque 4987 36.07 99,914.15 -\n01Aug2018 Clearing Cheque 4986 60.93 99,975.08 -\n01Aug2018 Clearing Cheque 4982 800.04 100,775.12 EX\n01Aug2018 Clearing Cheque 4981 823.34 101,598.46 EX\n01Aug2018 Incoming Interac e-Transfer 1454 101,583.92 EX\n01Aug2018 Incoming Interac e-Transfer 400.00 101,183.92 EX\n01Aug2018 Assisted Deposit 3241450 68,769.42 -\n01Aug2018 Transfer out to loan 7 1,500.00 70,269.42 -\n02Aug2018 Clearing Cheque 4984 48.08 70,317.50 -\n02Aug2018 Clearing Cheque 4985 7051 70,388.01 -\n02Aug2018 Clearing Cheque 4992 500.00 70,888.01 -\n'; const SIMPLE_WHITELIST_TEXT = 'Tesses\n'; const FORMATS = ['png', 'jpg', 'bmp', 'pbm', 'webp', 'gif']; @@ -28,6 +27,7 @@ if (typeof module !== 'undefined') { SIMPLE_JPG_BASE64, CHINESE_TEXT, SIMPLE_TEXT, + SIMPLE_TEXT_LEGACY, SIMPLE_WHITELIST_TEXT, SIMPLE_TEXT_HALF, COMSIC_TEXT, diff --git a/tests/recognize.test.js b/tests/recognize.test.js index 79831b318..573783665 100644 --- a/tests/recognize.test.js +++ b/tests/recognize.test.js @@ -2,7 +2,8 @@ const { createWorker, PSM } = Tesseract; let worker; before(async function cb() { this.timeout(0); - worker = await createWorker("eng+chi_tra+osd", 1, OPTIONS); + worker = await createWorker("eng", 1, OPTIONS); + workerLegacy = await createWorker("eng", 0, OPTIONS); }); describe('recognize()', () => { @@ -29,6 +30,19 @@ describe('recognize()', () => { )); }); + describe('should recognize with Legacy OEM', () => { + [ + { format: 'png', image: SIMPLE_PNG_BASE64, ans: SIMPLE_TEXT_LEGACY }, + { format: 'jpg', image: SIMPLE_JPG_BASE64, ans: SIMPLE_TEXT_LEGACY }, + ].forEach(({ format, image, ans }) => ( + it(`recongize ${format} in base64`, async () => { + const { data: { text } } = await workerLegacy.recognize(image); + console.log(text); + expect(text).to.be(ans); + }).timeout(TIMEOUT) + )); + }); + describe('should support orientation metadata', () => { [ { name: 'simple-90.jpg', desc: 'simple', ans: SIMPLE_TEXT }, @@ -125,13 +139,30 @@ describe('recognize()', () => { }).timeout(TIMEOUT); }); - describe('should support all page seg modes', () => { + describe('should support all page seg modes (Legacy)', () => { + Object + .keys(PSM) + .map(name => ({ name, mode: PSM[name] })) + .forEach(({ name, mode }) => ( + it(`support PSM.${name} mode`, async () => { + await workerLegacy.reinitialize('eng+osd'); + await workerLegacy.setParameters({ + tessedit_pageseg_mode: mode, + }); + const { data } = await workerLegacy.recognize(`${IMAGE_PATH}/simple.png`); + expect(Object.keys(data).length).not.to.be(0); + }).timeout(TIMEOUT) + )); + }); + + describe('should support all page seg modes except for PSM.OSD_ONLY (LSTM)', () => { Object .keys(PSM) + .filter((x) => x !== 'OSD_ONLY') .map(name => ({ name, mode: PSM[name] })) .forEach(({ name, mode }) => ( it(`support PSM.${name} mode`, async () => { - await worker.reinitialize('eng'); + await worker.reinitialize('eng+osd'); await worker.setParameters({ tessedit_pageseg_mode: mode, });