Skip to content

Commit

Permalink
7147 missing segmentations feedback (#7531)
Browse files Browse the repository at this point in the history
* Feedback on no segmentation case

Count segmented files and labeled files separately, and report a specific error message when either is missing.

* fix type errors
  • Loading branch information
daneryl authored Dec 10, 2024
1 parent e585652 commit 38c0fd8
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 19 deletions.
39 changes: 30 additions & 9 deletions app/api/services/informationextraction/InformationExtraction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ import {
getFilesForSuggestions,
propertyTypeIsWithoutExtractedMetadata,
propertyTypeIsSelectOrMultiSelect,
NoSegmentedFiles,
NoLabeledFiles,
} from 'api/services/informationextraction/getFiles';
import { Suggestions } from 'api/suggestions/suggestions';
import { IXExtractorType } from 'shared/types/extractorType';
Expand Down Expand Up @@ -403,13 +405,13 @@ class InformationExtraction {

const [extractor] = await Extractors.get({ _id: extractorId });
const serviceUrl = await this.serviceUrl();
const materialsSent = await this.materialsForModel(extractor, serviceUrl);
const [materialsSent, status] = await this.materialsForModel(extractor, serviceUrl);
if (!materialsSent) {
if (model) {
model.findingSuggestions = false;
await IXModelsModel.save(model);
}
return { status: 'error', message: 'No labeled data' };
return status || { status: 'error', message: 'No labeled data' };
}

const template = await templatesModel.getById(extractor.templates[0]);
Expand Down Expand Up @@ -508,14 +510,33 @@ class InformationExtraction {
return { status: 'error', message: 'No model found' };
};

/**
 * Collects the training files for an extractor and hands them to `sendMaterials`.
 *
 * @param extractor - extractor whose `templates` and `property` select the training files.
 * @param serviceUrl - forwarded unchanged to `sendMaterials`.
 * @returns a tuple `[sent, status?]`:
 *   - `[true]` once `sendMaterials` has completed;
 *   - `[false]` when `getFilesForTraining` resolves to an empty list;
 *   - `[false, status]` with an error status when `getFilesForTraining` throws
 *     `NoSegmentedFiles` or `NoLabeledFiles`.
 * @throws any other error raised while collecting or sending the materials.
 */
async materialsForModel(
  extractor: IXExtractorType,
  serviceUrl: string
): Promise<[boolean, { status: string; message: string }?]> {
  try {
    const files = await getFilesForTraining(extractor.templates, extractor.property);
    if (!files.length) {
      return [false];
    }
    await this.sendMaterials(files, extractor, serviceUrl);
    return [true];
  } catch (e) {
    // Only the two "nothing to train with" conditions are turned into a status;
    // everything else is an unexpected failure and must keep propagating.
    if (e instanceof NoSegmentedFiles) {
      return [
        false,
        {
          status: 'error',
          message: 'There are no documents segmented yet, please try again later',
        },
      ];
    }
    if (e instanceof NoLabeledFiles) {
      return [false, { status: 'error', message: 'No labeled data' }];
    }
    throw e;
  }
}

saveModelProcess = async (
extractorId: ObjectIdSchema,
Expand Down
55 changes: 45 additions & 10 deletions app/api/services/informationextraction/getFiles.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ const MAX_TRAINING_FILES_NUMBER = 2000;

type PropertyValue = string | Array<{ value: string; label: string }>;

/**
 * Error thrown by `getFilesForTraining` when `anyFilesSegmented` finds no
 * matching file. Carries its own `name` and a default message so it is still
 * identifiable in logs if it escapes an `instanceof` handler.
 */
class NoSegmentedFiles extends Error {
  constructor(message = 'No segmented files') {
    super(message);
    this.name = 'NoSegmentedFiles';
  }
}
/**
 * Error thrown by `getFilesForTraining` when `anyFilesLabeled` finds no
 * matching file. Carries its own `name` and a default message so it is still
 * identifiable in logs if it escapes an `instanceof` handler.
 */
class NoLabeledFiles extends Error {
  constructor(message = 'No labeled files') {
    super(message);
    this.name = 'NoLabeledFiles';
  }
}

interface FileWithAggregation {
_id: ObjectIdSchema;
segmentation: SegmentationType;
Expand Down Expand Up @@ -87,7 +90,8 @@ async function getFilesWithAggregations(files: (FileType & FileEnforcedNotUndefi

/**
 * Returns the `fileID` of every segmentation whose status is 'ready'.
 * Segmentations with a falsy `fileID` are dropped.
 */
async function getSegmentedFilesIds() {
  const segmentations = await SegmentationModel.get({ status: 'ready' }, 'fileID');
  // The filter guarantees a truthy fileID; the cast tells the compiler the same.
  return segmentations.filter(x => x.fileID).map(x => x.fileID) as ObjectIdSchema[];
}

async function getPropertyType(templates: ObjectIdSchema[], property: string) {
Expand All @@ -106,27 +110,48 @@ async function getPropertyType(templates: ObjectIdSchema[], property: string) {
return type;
}

/**
 * Tells whether at least one document file belonging to the given entities
 * matches the labeling filter.
 *
 * The filter requires `type: 'document'`, an existing `filename` and
 * `language`, and an `entity` in `entitiesFromTrainingTemplatesIds`. Unless
 * `propertyTypeIsWithoutExtractedMetadata(propertyType)` is true, the file
 * must also have an `extractedMetadata` entry named `property`.
 *
 * @returns `true` when the count of matching files is non-zero.
 */
async function anyFilesLabeled(
  property: string,
  propertyType: string,
  entitiesFromTrainingTemplatesIds: string[]
) {
  const skipExtractedMetadata = propertyTypeIsWithoutExtractedMetadata(propertyType);
  const extractedMetadataFilter = skipExtractedMetadata
    ? {}
    : { 'extractedMetadata.name': property };

  const labeledFilesCount = await filesModel.count({
    type: 'document',
    filename: { $exists: true },
    language: { $exists: true },
    entity: { $in: entitiesFromTrainingTemplatesIds },
    ...extractedMetadataFilter,
  });

  return Boolean(labeledFilesCount);
}

/**
 * Tells whether at least one document file with a ready segmentation matches
 * the filter.
 *
 * The filter requires `type: 'document'`, an existing `filename` and
 * `language`, and an `_id` among the ids returned by `getSegmentedFilesIds`.
 * Unless `propertyTypeIsWithoutExtractedMetadata(propertyType)` is true, the
 * file must also have an `extractedMetadata` entry named `property`.
 *
 * NOTE(review): unlike `fileQuery`, this filter has no `entity` constraint,
 * so files of other templates can satisfy it — confirm that is intended.
 *
 * @returns `true` when the count of matching files is non-zero.
 */
async function anyFilesSegmented(property: string, propertyType: string) {
  const skipExtractedMetadata = propertyTypeIsWithoutExtractedMetadata(propertyType);
  const extractedMetadataFilter = skipExtractedMetadata
    ? {}
    : { 'extractedMetadata.name': property };
  const segmentedIds = await getSegmentedFilesIds();

  const matches = await filesModel.count({
    type: 'document',
    filename: { $exists: true },
    language: { $exists: true },
    _id: { $in: segmentedIds },
    ...extractedMetadataFilter,
  });

  return Boolean(matches);
}

/**
 * Builds the files query used to fetch training candidates.
 *
 * The query selects document files that have a `filename` and a `language`,
 * whose `_id` is among the ids returned by `getSegmentedFilesIds`, and whose
 * `entity` is in `entitiesFromTrainingTemplatesIds`. Unless
 * `propertyTypeIsWithoutExtractedMetadata(propertyType)` is true, it also
 * requires an `extractedMetadata` entry named `property`.
 *
 * @returns the query object.
 */
async function fileQuery(
  property: string,
  propertyType: string,
  entitiesFromTrainingTemplatesIds: string[]
) {
  const needsExtractedMetadata = !propertyTypeIsWithoutExtractedMetadata(propertyType);
  const query = {
    type: 'document',
    filename: { $exists: true },
    language: { $exists: true },
    _id: { $in: await getSegmentedFilesIds() },
    entity: { $in: entitiesFromTrainingTemplatesIds },
    // Conditional spread keeps the key out of the query when it does not apply.
    ...(needsExtractedMetadata ? { 'extractedMetadata.name': property } : {}),
  };
  return query;
}

Expand Down Expand Up @@ -154,6 +179,14 @@ async function getFilesForTraining(templates: ObjectIdSchema[], property: string
.filter(x => x.sharedId)
.map(x => x.sharedId) as string[];

if (!(await anyFilesLabeled(property, propertyType, entitiesFromTrainingTemplatesIds))) {
throw new NoLabeledFiles();
}

if (!(await anyFilesSegmented(property, propertyType))) {
throw new NoSegmentedFiles();
}

const files = (await filesModel.get(
await fileQuery(property, propertyType, entitiesFromTrainingTemplatesIds),
'extractedMetadata entity language filename',
Expand Down Expand Up @@ -238,5 +271,7 @@ export {
propertyTypeIsSelectOrMultiSelect,
propertyTypeIsWithoutExtractedMetadata,
propertyTypeIsMultiValued,
NoLabeledFiles,
NoSegmentedFiles,
};
export type { FileWithAggregation };
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,10 @@ describe('InformationExtraction', () => {
id: `A${i + 1}`,
label: `A${i + 1}`,
})),
{
id: 'entityWithoutSegmentation',
label: 'entityWithoutSegmentation',
},
],
metadata: {
extractor_name: 'extractorWithRelationshipToAny',
Expand Down Expand Up @@ -482,6 +486,18 @@ describe('InformationExtraction', () => {
});
expect(relationshipModel.findingSuggestions).toBe(false);
});

// Trains the 'extractorWithoutSegmentations' fixture extractor and checks the
// returned status. Presumably the fixture file has no segmentation entry —
// verify against the fixtures' `segmentations` list.
it('should return error status (No segmented files) and stop finding suggestions, when there are no segmented files', async () => {
  const extractorId = factory.id('extractorWithoutSegmentations');
  const trainingResult = await informationExtraction.trainModel(extractorId);

  expect(trainingResult).toMatchObject({
    status: 'error',
    message: 'There are no documents segmented yet, please try again later',
  });
});
});

describe('when model is trained', () => {
Expand Down
18 changes: 18 additions & 0 deletions app/api/services/informationextraction/specs/fixtures.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ const ficturesPdfNameJ = 'documentJ.pdf';
const fixturesPdfNameK = 'documentK.pdf';
const fixturesPdfNameL = 'documentL.pdf';
const fixturesPdfNameM = 'documentM.pdf';
const pdfWithouTSegmentations = 'documentWithoutSegmentations.pdf';

const fixtures: DBFixture = {
settings: [
Expand Down Expand Up @@ -54,6 +55,7 @@ const fixtures: DBFixture = {
factory.ixExtractor('extractorWithRelationshipToAny', 'property_relationship_to_any', [
'templateToSegmentF',
]),
factory.ixExtractor('extractorWithoutSegmentations', 'title', ['templateWithoutSegmentations']),
],
entities: [
factory.entity('P1', 'relationshipPartnerTemplate', {}, { sharedId: 'P1sharedId' }),
Expand Down Expand Up @@ -137,6 +139,7 @@ const fixtures: DBFixture = {
property_empty_relationship: [],
property_relationship_to_any: [],
}),
factory.entity('entityWithoutSegmentation', 'templateWithoutSegmentations', {}),
],
files: [
factory.fileDeprecated('F1', 'A1', 'document', fixturesPdfNameA, 'other', '', [
Expand Down Expand Up @@ -193,6 +196,20 @@ const fixtures: DBFixture = {
factory.fileDeprecated('F21', 'A21', 'document', fixturesPdfNameK, 'eng'),
factory.fileDeprecated('F22', 'A22', 'document', fixturesPdfNameL, 'eng'),
factory.fileDeprecated('F23', 'A23', 'document', fixturesPdfNameM, 'eng'),
factory.document('FileWithoutSegmentations', {
language: 'eng',
filename: pdfWithouTSegmentations,
entity: 'entityWithoutSegmentation',
extractedMetadata: [
{
name: 'title',
selection: {
text: 'something',
selectionRectangles: [{ top: 0, left: 0, width: 0, height: 0, page: '1' }],
},
},
],
}),
],
segmentations: [
{
Expand Down Expand Up @@ -764,6 +781,7 @@ const fixtures: DBFixture = {
relationType: factory.idString('relatedToAny'),
}),
]),
factory.template('templateWithoutSegmentations'),
],
dictionaries: [factory.nestedThesauri('thesauri1', ['A', 'B', 'C', { 1: ['1A', '1B'] }])],
};
Expand Down

0 comments on commit 38c0fd8

Please sign in to comment.