-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsimilarity.ts
79 lines (74 loc) · 3.32 KB
/
similarity.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import { getIntersection, getUnion } from './operations'
/**
* Aalso known as the Jaccard Similarity Coefficient, it is a
* mathematical formula devised by Paul Jaccard in 1912, and
* individually discovered again by T. Tanimoto in 1958.
*
* - **Formal definition**: `|(A ∩ B)|/|(A ∪ B)|`, or the size
* of the intersection of A and B, divided by the size of
* the union of A and B.
* @param a - Set A of type `<TElement>` elements
* @param b - Set B of type `<TElement>` elements
* @returns a normalized percentage from `[0, 1]` which measures
* the similarity between two sets
*/
export function getJaccardSimilarityCoefficient<TElement>(a: Set<TElement>, b: Set<TElement>): number {
if (a.size === 0 || b.size === 0) {
return 1
}
return getIntersection(a, b).size / getUnion(a, b).size
}
/**
* Implementation of the logDice algorithm, which was originally
* formulated by Pavel Rychlý. [1]
*
* - This method runs in logarithmic bilinear time, or `O(log(n*m))`,
* where `n = |A|`, and `m = |B|`.
* - [1] “A Lexicographer-Friendly Association Score - Sketchengine.eu”
* (Faculty of Informatics, Masaryk University, November 24, 2008)
* @see https://www.sketchengine.eu/wp-content/uploads/2015/03/Lexicographer-Friendly_2008.pdf
*
* @param a - Set A of type `<TElement>` elements
* @param b - Set B of type `<B>` elements
* @returns A coefficient score that can be interpreted into
* the number of collocation pairs between members of set and set B
* (as defined by §4 "logDice):
* - `14`: The theoretical maximum score, where all occurrences of A
* co-occur with B and vice-versa
* - `0`: There is less than 1 co-occurrence of AB for every 16,000 A
* or every 16,000 B
* - `< 0`: No statistical significance of AB collocation
* - The score is independent of the given corpus's size
*/
export function getLogDice<TElement>(a: Set<TElement>, b: Set<TElement>): number {
return 14 + Math.log2(getSorensenDiceCoefficient(a, b))
}
/**
* Also known as the Szymkiewicz–Simpson coefficient, it is a
* mathematical formula that computes the overlap between two sets
* by dividing the size of the intersection by the size of the
* the smallest set.
* @param a - Set A of type `<TElement>` elements
* @param b - Set B of type `<TElement>` elements
* @returns The overlap score between 2 sets
*/
export function getOverlapCoefficient<TElement>(a: Set<TElement>, b: Set<TElement>): number {
return getIntersection(a, b).size / Math.min(a.size, b.size)
}
/**
* An implementation of the Sørensen–Dice coefficient, originally
* formulated by Thorvald Sørensen (1948) and Lee Raymond Dice (1945)
* respectively.
* - **Formal definition**: `2|A ∩ B|/(|A| + |B|)`, which is twice
* the size of the intersection between A and B, divided by the sum
* of the cardinality of both sets.
* - This method runs in *bilinear time*, or `O(n * m)`, where
* `n = |B|` (size of set B) and `m = |A|` (size of set A).
* @param a - Set A of type `<TElement>` elements
* @param b - Set B of type `<TElement>` elements
* @returns A unique coefficient that scores the similarity between two sets,
* where the coefficient is part of the unit interval
*/
export function getSorensenDiceCoefficient<TElement>(a: Set<TElement>, b: Set<TElement>): number {
return (2 * getIntersection(a, b).size) / (a.size + b.size)
}