Skip to content

Commit

Permalink
detect language mapping (#7513)
Browse files Browse the repository at this point in the history
* detect language mapping

* use pdf file with more text to be able to detect language properly

---------

Co-authored-by: Joan Gallego Girona <daneryl@gmail.com>
  • Loading branch information
Joao-vi and daneryl authored Dec 2, 2024
1 parent fc9d21f commit a9e57a4
Show file tree
Hide file tree
Showing 6 changed files with 11 additions and 7 deletions.
2 changes: 1 addition & 1 deletion app/api/files/PDF.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ class PDF extends EventEmitter {
...conversion,
...this.file,
language:
- detectLanguage(Object.values(conversion.fullTextWithoutPages).join(''), 'franc') ||
+ detectLanguage(Object.values(conversion.fullTextWithoutPages).join(''), 'ISO639_3') ||
undefined,
processed: true,
toc: [],
Expand Down
8 changes: 5 additions & 3 deletions app/api/files/specs/uploadRoutes.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,11 @@ describe('upload routes', () => {
entity: 'sharedId1',
type: 'document',
status: 'ready',
- fullText: { 1: 'Test[[1]] file[[1]]\n\n' },
+ fullText: {
+ 1: 'This[[1]] is[[1]] a[[1]] dumb[[1]] text[[1]] file[[1]] used[[1]] to[[1]] text[[1]] language[[1]] detecting,[[1]] it[[1]] should[[1]] be[[1]] detected[[1]] as[[1]] english[[1]]\n\n',
+ },
totalPages: 1,
- language: 'other',
+ language: 'eng',
filename: expect.stringMatching(/.*\.pdf/),
originalname: 'f2082bf51b6ef839690485d7153e847a.pdf',
creationDate: 1000,
Expand All @@ -114,7 +116,7 @@ describe('upload routes', () => {
type: 'thumbnail',
});

- expect(language).toBe('other');
+ expect(language).toBe('eng');
expect(mimetype).toEqual('image/jpeg');
expect(await fs.readFile(uploadsPath(filename))).toBeDefined();
});
Expand Down
Binary file modified app/api/files/specs/uploads/f2082bf51b6ef839690485d7153e847a.pdf
Binary file not shown.
4 changes: 2 additions & 2 deletions app/shared/detectLanguage.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import franc from 'franc';
- import { language } from 'shared/languagesList';
+ import { language, LanguageCode } from 'shared/languagesList';

- const detectLanguage = (text: string, purpose: 'elastic' | 'franc' | 'ISO639_1' = 'elastic') =>
+ const detectLanguage = (text: string, purpose: LanguageCode = 'elastic') =>
language(franc(text), purpose);
export { detectLanguage };
2 changes: 1 addition & 1 deletion app/shared/languagesList.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/* eslint-disable max-lines */
import { LanguageSchema } from 'shared/types/commonTypes';

- type LanguageCode = 'elastic' | 'ISO639_3' | 'ISO639_1' | 'franc';
+ type LanguageCode = 'elastic' | 'ISO639_3' | 'ISO639_1';

type LegacyElasticObject = Record<
string,
Expand Down
2 changes: 2 additions & 0 deletions app/shared/specs/languages.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ describe('languages', () => {
expect(detectLanguage('what is the colour of the white horse of santiago', 'franc')).toBe(
'eng'
);

+ expect(detectLanguage('Це перевірка', 'ISO639_3')).toBe('ukr');
});

it('should return other when the language is not supported', () => {
Expand Down

0 comments on commit a9e57a4

Please sign in to comment.