Skip to content

Commit

Permalink
feat: deduplicate france services lieux by compatible name
Browse files Browse the repository at this point in the history
  • Loading branch information
marc-gavanier committed Oct 3, 2024
1 parent af67c8f commit 8c74f16
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 45 deletions.
60 changes: 59 additions & 1 deletion src/dedupliquer/steps/find-duplicates/find-duplicates.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ describe('find duplicates', (): void => {
]);
});

it('should not need to deduplicate when only lieu 1 has RFS typologie', (): void => {
it('should not deduplicate when only lieu 1 has RFS typologie', (): void => {
const lieux: SchemaLieuMediationNumerique[] = [
{
id: '574-mediation-numerique-hinaura',
Expand Down Expand Up @@ -177,6 +177,64 @@ describe('find duplicates', (): void => {
expect(duplicates).toStrictEqual([]);
});

// Scenario: lieu 1 has no typologie while lieu 2 is RFS. Both names match the
// "france services" name pattern, so the pair is still eligible for duplicate detection.
it('should allow deduplicate when both lieux contain "France Service" in the name', (): void => {
const lieux: SchemaLieuMediationNumerique[] = [
{
// No typologie on this lieu: only its name marks it as a France services place.
id: '574-mediation-numerique-hinaura',
nom: "France services d'Etrechy",
adresse: '26 rue Jean Moulin',
code_postal: '38000',
commune: 'Grenoble',
latitude: 45.186115,
longitude: 5.716962,
source: 'conseiller-numerique'
} as SchemaLieuMediationNumerique,
{
// Same name, address and coordinates as the first lieu, but from another source and typed RFS.
id: '2848-mediation-numerique-france-services',
nom: "France services d'Etrechy",
adresse: '26 rue Jean Moulin',
code_postal: '38000',
commune: 'Grenoble',
latitude: 45.186115,
longitude: 5.716962,
typologie: Typologie.RFS,
source: 'france-services'
} as SchemaLieuMediationNumerique
];

// NOTE(review): the second argument is presumably `allowInternalMerge` — confirm against the signature of findDuplicates.
const duplicates: CommuneDuplications[] = findDuplicates(lieux, false);

// Each lieu must list the other one as its duplicate, with every score at 100 since the data is identical.
expect(duplicates).toStrictEqual([
{
codePostal: '38000',
lieux: [
{
id: '574-mediation-numerique-hinaura',
duplicates: [
{
id: '2848-mediation-numerique-france-services',
distanceScore: 100,
nomFuzzyScore: 100,
voieFuzzyScore: 100
}
]
},
{
id: '2848-mediation-numerique-france-services',
duplicates: [
{
id: '574-mediation-numerique-hinaura',
distanceScore: 100,
nomFuzzyScore: 100,
voieFuzzyScore: 100
}
]
}
]
}
]);
});

it('should not need to deduplicate when only lieu 1 has RFS typologie in lieux to deduplicate', (): void => {
const lieuxToDeduplicate: SchemaLieuMediationNumerique[] = [
{
Expand Down
85 changes: 41 additions & 44 deletions src/dedupliquer/steps/find-duplicates/find-duplicates.ts
Original file line number Diff line number Diff line change
@@ -1,22 +1,11 @@
import { SchemaLieuMediationNumerique, Typologie } from '@gouvfr-anct/lieux-de-mediation-numerique';
import { ratio } from 'fuzzball';

export type Duplicate = {
id: string;
distanceScore: number;
nomFuzzyScore: number;
voieFuzzyScore: number;
};

export type LieuDuplications = {
id: string;
duplicates: Duplicate[];
};

export type CommuneDuplications = {
codePostal: string;
lieux: LieuDuplications[];
};
/**
 * A duplicate candidate found for a lieu: the `id` of the other lieu and three comparison scores.
 * NOTE(review): scores presumably range from 0 to 100 (the spec expects 100 for identical lieux) and
 * presumably cover coordinates, name and street respectively — confirm against the scoring code.
 */
export type Duplicate = { id: string; distanceScore: number; nomFuzzyScore: number; voieFuzzyScore: number };

/** The duplicate candidates found for one lieu, identified by its `id`. */
export type LieuDuplications = { id: string; duplicates: Duplicate[] };

/** Duplicate candidates grouped by postal code: one entry per lieu sharing the same `codePostal`. */
export type CommuneDuplications = { codePostal: string; lieux: LieuDuplications[] };

/**
 * Tells whether two lieux were provided by the same source.
 *
 * @param lieu - first lieu of the pair
 * @param lieuToDeduplicate - lieu currently being checked for duplicates
 * @returns true when both `source` fields are strictly equal
 */
const sameSource = (lieu: SchemaLieuMediationNumerique, lieuToDeduplicate: SchemaLieuMediationNumerique): boolean => {
  const { source: lieuSource } = lieu;
  const { source: sourceToDeduplicate } = lieuToDeduplicate;

  return lieuSource === sourceToDeduplicate;
};
Expand All @@ -30,8 +19,18 @@ const sameCodePostal = (lieu: SchemaLieuMediationNumerique, lieuToDeduplicate: S
/**
 * Tells whether the typologie of a lieu is one of the France Services compatible ones (RFS or PIMMS).
 * A lieu without typologie falls back to the 'NO_TYPOLOGY' placeholder, which is compared like any other value.
 *
 * @param lieu - lieu whose typologie is inspected
 * @returns true when the typologie is RFS or PIMMS
 */
const hasRFSCompatibleTypology = (lieu: SchemaLieuMediationNumerique): boolean => {
  const typologie: string = lieu.typologie ?? 'NO_TYPOLOGY';

  return typologie === `${Typologie.RFS}` || typologie === `${Typologie.PIMMS}`;
};

const compatibleTypologies = (lieu: SchemaLieuMediationNumerique, lieuToDeduplicate: SchemaLieuMediationNumerique): boolean =>
hasRFSCompatibleTypology(lieu) && hasRFSCompatibleTypology(lieuToDeduplicate)
/**
 * Matches names containing "france service" or "france services", whatever the letter case.
 * Deliberately NOT global: a `g` regex keeps a `lastIndex` between `.test()` calls, which would make
 * a shared pattern give alternating results.
 */
const FRANCE_SERVICES_NAME_PATTERN: RegExp = /france services?/iu;

/**
 * Tells whether two lieux can be compared as France Services places.
 *
 * The pair is compatible when both names contain "France service(s)", or when both lieux
 * have a France Services compatible typologie.
 *
 * @param lieu - first lieu of the pair
 * @param lieuToDeduplicate - lieu currently being checked for duplicates
 * @returns true when the pair is compatible by name or by typologie
 */
const isCompatibleForFranceServices = (
  lieu: SchemaLieuMediationNumerique,
  lieuToDeduplicate: SchemaLieuMediationNumerique
): boolean =>
  // Name check first: both operands are side-effect free, so the order does not change the result.
  (FRANCE_SERVICES_NAME_PATTERN.test(lieu.nom) && FRANCE_SERVICES_NAME_PATTERN.test(lieuToDeduplicate.nom)) ||
  (hasRFSCompatibleTypology(lieu) && hasRFSCompatibleTypology(lieuToDeduplicate));

/**
 * Applies the typologie special cases deciding whether two lieux may be compared at all.
 *
 * A pair that is France Services compatible is always accepted. Any other pair is accepted
 * only when neither lieu has the RFS typologie.
 *
 * @param lieu - first lieu of the pair
 * @param lieuToDeduplicate - lieu currently being checked for duplicates
 * @returns true when the pair may be considered for duplicate detection
 */
const compatibilitySpecialCases = (
  lieu: SchemaLieuMediationNumerique,
  lieuToDeduplicate: SchemaLieuMediationNumerique
): boolean => {
  if (isCompatibleForFranceServices(lieu, lieuToDeduplicate)) return true;

  const involvesRFS: boolean = lieuToDeduplicate.typologie === Typologie.RFS || lieu.typologie === Typologie.RFS;

  return !involvesRFS;
};

Expand All @@ -41,7 +40,7 @@ const onlyPotentialDuplicates =
sameCodePostal(lieu, lieuToDeduplicate) &&
!sameId(lieu, lieuToDeduplicate) &&
(allowInternalMerge || !sameSource(lieu, lieuToDeduplicate)) &&
compatibleTypologies(lieu, lieuToDeduplicate);
compatibilitySpecialCases(lieu, lieuToDeduplicate);

// NOTE(review): presumably the distance threshold applied to the latitude/longitude based cartesian
// distance, so expressed in degrees — the comparison site is not visible here, confirm before relying on it.
const MINIMAL_CARTESIAN_DISTANCE: 0.0004 = 0.0004 as const;

Expand All @@ -59,9 +58,9 @@ const hasDefinedCoordinates = (
/**
 * Euclidean distance between the points (x1, y1) and (x2, y2).
 * Mind the argument order: both x values come first, then both y values.
 *
 * @returns the square root of the sum of the squared differences on each axis
 */
const pythagore = (x1: number, x2: number, y1: number, y2: number): number => {
  const deltaX: number = x1 - x2;
  const deltaY: number = y1 - y2;

  /* eslint-disable-next-line no-mixed-operators */
  return Math.sqrt(deltaX ** 2 + deltaY ** 2);
};

const cartesianDistanceBetween = (lieu: SchemaLieuMediationNumerique, curentLieu: SchemaLieuMediationNumerique): number =>
hasDefinedCoordinates(lieu) && hasDefinedCoordinates(curentLieu)
? pythagore(lieu.latitude, curentLieu.latitude, lieu.longitude, curentLieu.longitude)
/**
 * Cartesian distance between two lieux, computed from their latitude and longitude.
 *
 * @param lieu - first lieu
 * @param cible - lieu to measure the distance to
 * @returns the distance, or NaN when either lieu lacks defined coordinates
 */
const cartesianDistanceBetween = (lieu: SchemaLieuMediationNumerique, cible: SchemaLieuMediationNumerique): number => {
  if (!hasDefinedCoordinates(lieu) || !hasDefinedCoordinates(cible)) return NaN;

  // pythagore expects both values of one axis first: latitudes, then longitudes.
  return pythagore(lieu.latitude, cible.latitude, lieu.longitude, cible.longitude);
};

const duplicatesWithScores =
Expand All @@ -76,6 +75,12 @@ const duplicatesWithScores =
})
);

/**
 * Builds the duplications entry of one lieu: its id and the scored duplicates found among `lieux`.
 *
 * @param lieu - lieu for which duplicates are searched
 * @param lieux - lieux to search in
 * @param allowInternalMerge - forwarded as is to the duplicates search
 * @returns the id of the lieu with its scored duplicates
 */
const toLieuDuplications = (
  lieu: SchemaLieuMediationNumerique,
  lieux: SchemaLieuMediationNumerique[],
  allowInternalMerge: boolean
): LieuDuplications => {
  const id: string = lieu.id;
  const duplicates: Duplicate[] = duplicatesWithScores(lieux)(lieu, allowInternalMerge);

  return { id, duplicates };
};

const appendCommuneDuplications =
(lieux: SchemaLieuMediationNumerique[]) =>
(
Expand All @@ -84,10 +89,7 @@ const appendCommuneDuplications =
allowInternalMerge: boolean
): CommuneDuplications[] => [
...duplications,
{
codePostal: lieuToDeduplicate.code_postal,
lieux: [{ id: lieuToDeduplicate.id, duplicates: duplicatesWithScores(lieux)(lieuToDeduplicate, allowInternalMerge) }]
}
{ codePostal: lieuToDeduplicate.code_postal, lieux: [toLieuDuplications(lieuToDeduplicate, lieux, allowInternalMerge)] }
];

const toUpdatedCommuneDuplications =
Expand All @@ -97,17 +99,14 @@ const toUpdatedCommuneDuplications =
communeDuplications.codePostal === lieu.code_postal
? {
codePostal: lieu.code_postal,
lieux: [
...duplicationsWithSameCodePostal.lieux,
{ id: lieu.id, duplicates: duplicatesWithScores(lieux)(lieu, allowInternalMerge) }
]
lieux: [...duplicationsWithSameCodePostal.lieux, toLieuDuplications(lieu, lieux, allowInternalMerge)]
}
: communeDuplications;

const withSameCodePostal =
(lieu: SchemaLieuMediationNumerique) =>
(communeDuplications: CommuneDuplications): boolean =>
communeDuplications.codePostal === lieu.code_postal;
({ codePostal }: CommuneDuplications): boolean =>
codePostal === lieu.code_postal;

const toCommunesDuplications =
(lieux: SchemaLieuMediationNumerique[], allowInternalMerge: boolean) =>
Expand All @@ -119,25 +118,23 @@ const toCommunesDuplications =
toUpdatedCommuneDuplications(lieux)(lieuToDeduplicate, duplicationsWithSameCodePostal, allowInternalMerge)
))(duplications.find(withSameCodePostal(lieuToDeduplicate)));

const onlyWithDuplicates = (lieu: LieuDuplications): boolean => lieu.duplicates.length > 0;
/** Filter predicate: keeps the lieux that have at least one duplicate. */
const onlyWithDuplicates = (lieuDuplications: LieuDuplications): boolean => {
  const duplicatesCount: number = lieuDuplications.duplicates.length;

  return duplicatesCount > 0;
};

const onlyWithoutDuplicates = (lieu: LieuDuplications): boolean => lieu.duplicates.length === 0;
/** Filter predicate: keeps the lieux that have no duplicate at all. */
const onlyWithoutDuplicates = (lieuDuplications: LieuDuplications): boolean => {
  const duplicatesCount: number = lieuDuplications.duplicates.length;

  return duplicatesCount === 0;
};

const toDuplicatesWithout =
(noDuplicatesIds: string[]) =>
(lieu: LieuDuplications): LieuDuplications => ({
id: lieu.id,
duplicates: lieu.duplicates.filter((duplicate: Duplicate): boolean => !noDuplicatesIds.includes(duplicate.id))
({ id, duplicates }: LieuDuplications): LieuDuplications => ({
id,
duplicates: duplicates.filter((duplicate: Duplicate): boolean => !noDuplicatesIds.includes(duplicate.id))
});

const toId = (lieu: LieuDuplications): string => lieu.id;

const invalidDuplicatesIds = (communeDuplications: CommuneDuplications): string[] =>
communeDuplications.lieux.filter(onlyWithoutDuplicates).map(toId);
/**
 * Lists the ids of the lieux of a commune that have no duplicate.
 *
 * @param communeDuplications - duplications found for one postal code
 * @returns the ids of the lieux whose `duplicates` list is empty, in their original order
 */
const invalidDuplicatesIds = (communeDuplications: CommuneDuplications): string[] =>
  communeDuplications.lieux.flatMap((lieu: LieuDuplications): string[] => (onlyWithoutDuplicates(lieu) ? [lieu.id] : []));

const removeLieuxFrom = (communeDuplications: CommuneDuplications, ids: string[]): CommuneDuplications => ({
codePostal: communeDuplications.codePostal,
lieux: communeDuplications.lieux.map(toDuplicatesWithout(ids)).filter(onlyWithDuplicates)
/**
 * Removes the given ids from the duplicates of every lieu of a commune, then drops the lieux
 * left without any duplicate.
 *
 * @param communeDuplications - duplications found for one postal code
 * @param ids - ids to remove from every `duplicates` list
 * @returns a new commune entry with the same postal code and the remaining lieux
 */
const removeLieuxFrom = (communeDuplications: CommuneDuplications, ids: string[]): CommuneDuplications => {
  const withoutRemovedIds: (lieu: LieuDuplications) => LieuDuplications = toDuplicatesWithout(ids);
  const remainingLieux: LieuDuplications[] = communeDuplications.lieux.map(withoutRemovedIds).filter(onlyWithDuplicates);

  return { codePostal: communeDuplications.codePostal, lieux: remainingLieux };
};

const toValidDuplicates = (communeDuplications: CommuneDuplications): CommuneDuplications =>
Expand Down

0 comments on commit 8c74f16

Please sign in to comment.