Skip to content

Commit

Permalink
Fixed bugs with wrong lang data being loaded per #834 and #835
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Oct 3, 2023
1 parent a1031fd commit 0cad08a
Show file tree
Hide file tree
Showing 7 changed files with 42 additions and 11 deletions.
8 changes: 4 additions & 4 deletions src/createWorker.js
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ module.exports = async (langs = 'eng', oem = OEM.LSTM_ONLY, _options = {}, confi
cachePath: options.cachePath,
cacheMethod: options.cacheMethod,
gzip: options.gzip,
lstmOnly: [OEM.TESSERACT_ONLY, OEM.TESSERACT_LSTM_COMBINED].includes(currentOem)
lstmOnly: [OEM.LSTM_ONLY, OEM.TESSERACT_LSTM_COMBINED].includes(currentOem)
&& !options.legacyLang,
},
},
Expand Down Expand Up @@ -159,13 +159,13 @@ module.exports = async (langs = 'eng', oem = OEM.LSTM_ONLY, _options = {}, confi
// This logic fails if the user downloaded the LSTM-only English data for a language
// and then uses `worker.reinitialize` to switch to the Legacy engine.
// However, the correct data will still be downloaded after initialization fails
// and this can be avoided entirely
// and this can be avoided entirely if the user loads the correct data ahead of time.
const langsArr = typeof langs === 'string' ? langs.split('+') : langs;
const _langs = langsArr.filter((x) => currentLangs.includes(x));
const _langs = langsArr.filter((x) => !currentLangs.includes(x));
currentLangs.push(_langs);

return loadLanguageInternal(_langs, jobId)
.then(() => initializeInternal(_langs, _oem, _config, jobId));
.then(() => initializeInternal(langs, _oem, _config, jobId));
};

const setParameters = (params = {}, jobId) => (
Expand Down
Binary file modified tests/assets/traineddata/chi_tra.traineddata
Binary file not shown.
Binary file removed tests/assets/traineddata/chi_tra.traineddata.gz
Binary file not shown.
Binary file removed tests/assets/traineddata/eng.traineddata.gz
Binary file not shown.
Binary file removed tests/assets/traineddata/osd.traineddata.gz
Binary file not shown.
8 changes: 4 additions & 4 deletions tests/constants.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

37 changes: 34 additions & 3 deletions tests/recognize.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ const { createWorker, PSM } = Tesseract;
// Workers shared by every test in this file. Both are declared explicitly so
// the hook below never creates an implicit global (which throws a
// ReferenceError under strict mode / ES modules).
let worker;
let workerLegacy;

// Create the shared workers once before any test runs.
before(async function cb() {
  // A timeout of 0 disables Mocha's timeout for this hook; worker start-up
  // may need to fetch language data and can be slow.
  this.timeout(0);
  // Each worker is created exactly once. Creating a worker and then
  // overwriting the variable would leave the first instance running with no
  // handle left to terminate it.
  // `worker` uses OEM 1; it backs the "(LSTM)" tests.
  worker = await createWorker("eng", 1, OPTIONS);
  // `workerLegacy` uses OEM 0; it backs the "Legacy OEM" tests.
  workerLegacy = await createWorker("eng", 0, OPTIONS);
});

describe('recognize()', () => {
Expand All @@ -29,6 +30,19 @@ describe('recognize()', () => {
));
});

// Recognizes the simple base64 fixtures with the Legacy-engine worker and
// checks the text against the Legacy-specific expected answer.
describe('should recognize with Legacy OEM', () => {
  [
    { format: 'png', image: SIMPLE_PNG_BASE64, ans: SIMPLE_TEXT_LEGACY },
    { format: 'jpg', image: SIMPLE_JPG_BASE64, ans: SIMPLE_TEXT_LEGACY },
  ].forEach(({ format, image, ans }) => (
    it(`recognize ${format} in base64`, async () => {
      const { data: { text } } = await workerLegacy.recognize(image);
      // No console output here: the assertion already reports the actual
      // text on failure, and logging it on every run only adds noise.
      expect(text).to.be(ans);
    }).timeout(TIMEOUT)
  ));
});

describe('should support orientation metadata', () => {
[
{ name: 'simple-90.jpg', desc: 'simple', ans: SIMPLE_TEXT },
Expand Down Expand Up @@ -125,13 +139,30 @@ describe('recognize()', () => {
}).timeout(TIMEOUT);
});

describe('should support all page seg modes', () => {
// Registers one test per entry of PSM; each test runs a recognition on the
// Legacy-engine worker with that page segmentation mode selected.
describe('should support all page seg modes (Legacy)', () => {
  for (const name of Object.keys(PSM)) {
    const mode = PSM[name];

    it(`support PSM.${name} mode`, async () => {
      // The worker is re-initialized with 'eng+osd' before every run.
      // NOTE(review): presumably so the OSD-based modes have their data
      // loaded — confirm.
      await workerLegacy.reinitialize('eng+osd');
      await workerLegacy.setParameters({ tessedit_pageseg_mode: mode });

      const result = await workerLegacy.recognize(`${IMAGE_PATH}/simple.png`);
      const fieldNames = Object.keys(result.data);
      // Only checks that the result object is non-empty, not its contents.
      expect(fieldNames.length).not.to.be(0);
    }).timeout(TIMEOUT);
  }
});

describe('should support all page seg modes except for PSM.OSD_ONLY (LSTM)', () => {
Object
.keys(PSM)
.filter((x) => x !== 'OSD_ONLY')
.map(name => ({ name, mode: PSM[name] }))
.forEach(({ name, mode }) => (
it(`support PSM.${name} mode`, async () => {
await worker.reinitialize('eng');
await worker.reinitialize('eng+osd');
await worker.setParameters({
tessedit_pageseg_mode: mode,
});
Expand Down

0 comments on commit 0cad08a

Please sign in to comment.