-
Notifications
You must be signed in to change notification settings - Fork 0
/
blocking.py
81 lines (68 loc) · 3.01 KB
/
blocking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import abc
class Blocking(abc.ABC):
# Any shared data strcutures or methods should be defined as part of the parent class.
# A list of shared arguments should be defined for each of the following methods and replace (or precede) *args.
# The output of each of the following methods should be defined clearly and shared between all methods implemented by members of the group.
def __init__(self, blocking_module,**kargs):
'''
Initialise object
Args:
blocking_module: String that mentions the blocking module being used.
**kargs: Dictionary containing parameters needed in blocking pipeline.
'''
self.blocking_module = blocking_module
@classmethod
@abc.abstractmethod
def read_dataset(filepath_list, *args, **kwargs):
'''
Accepts list of URI's and returns a List of Pandas DataFrames.
Args:
filepath_list: List of string path to the dataset.
*args: Additional arguments can be mentioned.
**kargs: Parameters for reading datasets.
Returns: List of Pandas DataFrames
'''
pass
@classmethod
@abc.abstractmethod
def train(dataframe, *args, **kwargs):
'''
Accepts Pandas Dataframe to train the model on and returns the trained model
Args:
dataframe: Pandas dataframe with training vectors under 'vectors' column. Row id's
will be used as node id's in the tree built by the related pairs algorithms.
*args: Empty
**kargs: Dictionary containing requisite parameters for training/building the
algorithm specified in their original papers/githubs.
Returns: Trained model
'''
pass
@classmethod
@abc.abstractmethod
def predict(model, dataframe_list, *args, **kwargs):
'''
Given a trained model and Pandas Dataframe of datasets, predict() returns a list of pairs of
related ids as a list of tuples.
Args:
model: tensorflow model used to predict.
dataframe_list: List of Pandas Dataframes.
*args: Additional arguments.
**kargs: Parameters requisite for prediciting related ids as defined by the original papers/githubs.
Returns: List of pairs of elements related in the dataset. Lists of Lists of tuples [[(id_0,0.99),(id_3,0.95)],...,[(id_5,0.97),(id_1,0.83)]].
'''
pass
@classmethod
@abc.abstractmethod
def evaluate(model, groundtruth, dataframe_list, *args, **kwargs):
'''
Given the ground truth and list of dataframes to predict the related ids for, evaluate() returns the Precision,
Recall and Reduction Ratio metrics.
Args:
model: tensorflow model used to predict.
groundtruth: String path to or Dataframe of the ground truth data.
dataframe_list: List of Pandas Dataframes with the data to predict the related id's for.
*args: If more than 1 dataset, string dataset path can be mentioned as additional arguments.
**kargs: Parameters requisite for prediciting related ids as defined by the original papers/githubs.
Returns: Precision, Recall, Reduction_Ratio
'''
pass