From 7177f0af8234b1a9d66198c7ea20a662167c64b3 Mon Sep 17 00:00:00 2001 From: Mukesh A Date: Thu, 23 Apr 2020 20:43:46 +0100 Subject: [PATCH 1/6] added data model for MetaFeatures for Data --- app.py | 13 ++++++++++--- constants.py | 2 ++ data_models/MetaFeatures.py | 27 +++++++++++++++++++++++++++ data_models/Metrics.py | 11 +++++++---- data_models/Params.py | 4 ++-- 5 files changed, 48 insertions(+), 9 deletions(-) create mode 100644 data_models/MetaFeatures.py diff --git a/app.py b/app.py index 300a85d..ac1696a 100644 --- a/app.py +++ b/app.py @@ -23,6 +23,7 @@ from data_models.Metrics import * from data_models.Params import * +from data_models.MetaFeatures import * # Create a Metric @app.route(METRIC, methods=[POST]) @@ -31,7 +32,9 @@ def add_metric(): dataset_hash = request.json['dataset_hash'].replace("\x00", "") metric_name = request.json['metric_name'] metric_value = request.json['metric_value'] - + + target_type = request.json['target_type'] + new_metric = Metric(algorithm_name, dataset_hash, metric_name, metric_value) db.session.add(new_metric) @@ -44,8 +47,12 @@ def add_metric(): db.session.add(new_params) db.session.commit() - #new_metric = Metric.query.get(new_metric.id) - print(metric_schema.dump(new_metric)) + data_meta_features = request.json['data_meta_features'] + if(data_meta_features != ""): + for feat in data_meta_features: + new_feat = MetaFeature(new_metric.id, feat['feat_name'], feat['feat_value']) + db.session.add(new_feat) + db.session.commit() return metric_schema.jsonify(new_metric) diff --git a/constants.py b/constants.py index 5cdf0c7..73ec782 100644 --- a/constants.py +++ b/constants.py @@ -1,10 +1,12 @@ # Table Names TABLE_METRIC = 'metric' TABLE_PARAM = 'params' +TABLE_META_FEATURES = 'meta_features' # Table Class Names CLASS_METRIC = 'Metric' CLASS_PARAM = 'Params' +CLASS_META_FEATURE = 'MetaFeature' # Rest Methods GET = 'GET' diff --git a/data_models/MetaFeatures.py b/data_models/MetaFeatures.py new file mode 100644 index 0000000..d1c9d6f --- /dev/null +++ b/data_models/MetaFeatures.py @@ -0,0 +1,27 @@ +from app import db, ma +from constants import TABLE_META_FEATURES + + +# Meta Feature Class/Model +class MetaFeature(db.Model): + __tablename__ = TABLE_META_FEATURES + id = db.Column(db.Integer, primary_key=True) + metric_id = db.Column(db.Integer, db.ForeignKey('metric.id'), nullable=False) + feat_name = db.Column(db.String(200)) + feat_value = db.Column(db.String(200)) + + def __init__(self, metric_id, feat_name, feat_value): + self.metric_id = metric_id + self.feat_name = feat_name + self.feat_value = feat_value + + +# Meta Feature Schema +class MetaFeatureSchema(ma.Schema): + class Meta: + fields = ('id', 'metric_id', 'feat_name', 'feat_value') + + +# Init schema +meta_feature_schema = MetaFeatureSchema() +meta_features_schema = MetaFeatureSchema(many=True) diff --git a/data_models/Metrics.py b/data_models/Metrics.py index fde9cb9..ce62814 100644 --- a/data_models/Metrics.py +++ b/data_models/Metrics.py @@ -1,8 +1,9 @@ from app import db, ma from data_models.Params import ParamSchema -from constants import TABLE_METRIC, CLASS_PARAM +from data_models.MetaFeatures import MetaFeatureSchema +from constants import TABLE_METRIC, CLASS_PARAM, CLASS_META_FEATURE -# Product Class/Model +# Metric Class/Model class Metric(db.Model): __tablename__ = TABLE_METRIC id = db.Column(db.Integer, primary_key=True) @@ -11,6 +12,7 @@ class Metric(db.Model): metric_name = db.Column(db.String(200)) metric_value = db.Column(db.Float) params = db.relationship(CLASS_PARAM, cascade = "all, delete", backref=TABLE_METRIC, lazy=True) + meta_features = db.relationship(CLASS_META_FEATURE, cascade = "all, delete", backref=TABLE_METRIC, lazy=True) def __init__(self, algorithm_name, dataset_hash, metric_name, metric_value): self.algorithm_name = algorithm_name @@ -19,11 +21,12 @@ def __init__(self, algorithm_name, dataset_hash, metric_name, metric_value): self.metric_value = metric_value -# Product Schema +# Metric Schema class MetricSchema(ma.Schema): params = ma.Nested(ParamSchema, many=True) + meta_features = ma.Nested(MetaFeatureSchema, many=True) class Meta: - fields = ('id', 'algorithm_name', 'dataset_hash', 'metric_name', 'metric_value', 'params') + fields = ('id', 'algorithm_name', 'dataset_hash', 'metric_name', 'metric_value', 'params', 'meta_features') include_fk = True diff --git a/data_models/Params.py b/data_models/Params.py index 03e8773..2da131d 100644 --- a/data_models/Params.py +++ b/data_models/Params.py @@ -2,7 +2,7 @@ from constants import TABLE_PARAM -# Product Class/Model +# Param Class/Model class Params(db.Model): __tablename__ = TABLE_PARAM id = db.Column(db.Integer, primary_key=True) @@ -16,7 +16,7 @@ def __init__(self, metric_id, param_name, param_value): self.param_value = param_value -# Product Schema +# Param Schema class ParamSchema(ma.Schema): class Meta: fields = ('id', 'metric_id', 'param_name', 'param_value') From 4cb67ad6c9495e32a7e53e6345f12c2f1346587e Mon Sep 17 00:00:00 2001 From: Mukesh A Date: Thu, 23 Apr 2020 21:04:57 +0100 Subject: [PATCH 2/6] added target_type to the metric data model --- app.py | 2 +- data_models/Metrics.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/app.py b/app.py index ac1696a..22026e3 100644 --- a/app.py +++ b/app.py @@ -35,7 +35,7 @@ def add_metric(): target_type = request.json['target_type'] - new_metric = Metric(algorithm_name, dataset_hash, metric_name, metric_value) + new_metric = Metric(algorithm_name, dataset_hash, metric_name, metric_value, target_type) db.session.add(new_metric) db.session.commit() diff --git a/data_models/Metrics.py b/data_models/Metrics.py index ce62814..95dfffb 100644 --- a/data_models/Metrics.py +++ b/data_models/Metrics.py @@ -11,14 +11,16 @@ class Metric(db.Model): dataset_hash = db.Column(db.Text) metric_name = db.Column(db.String(200)) metric_value = db.Column(db.Float) + target_type = db.Column(db.String(200)) params = db.relationship(CLASS_PARAM, cascade = "all, delete", backref=TABLE_METRIC, lazy=True) meta_features = db.relationship(CLASS_META_FEATURE, cascade = "all, delete", backref=TABLE_METRIC, lazy=True) - def __init__(self, algorithm_name, dataset_hash, metric_name, metric_value): + def __init__(self, algorithm_name, dataset_hash, metric_name, metric_value, target_type): self.algorithm_name = algorithm_name self.dataset_hash = dataset_hash self.metric_name = metric_name self.metric_value = metric_value + self.target_type = target_type # Metric Schema @@ -26,7 +28,7 @@ class MetricSchema(ma.Schema): params = ma.Nested(ParamSchema, many=True) meta_features = ma.Nested(MetaFeatureSchema, many=True) class Meta: - fields = ('id', 'algorithm_name', 'dataset_hash', 'metric_name', 'metric_value', 'params', 'meta_features') + fields = ('id', 'algorithm_name', 'dataset_hash', 'metric_name', 'metric_value', 'target_type', 'params', 'meta_features') include_fk = True From 19097c59a16ca53231b842e83496fe0da3982747 Mon Sep 17 00:00:00 2001 From: Mukesh A Date: Sat, 16 May 2020 14:01:18 +0100 Subject: [PATCH 3/6] added util code to fetch data from db and convert to df --- requirements.txt | 9 +++++++++ utils.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 utils.py diff --git a/requirements.txt b/requirements.txt index daf4367..a96429a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,16 +9,25 @@ gunicorn==20.0.4 isort==4.3.21 itsdangerous==1.1.0 Jinja2==2.10.3 +joblib==0.15.0 lazy-object-proxy==1.4.3 MarkupSafe==1.1.1 marshmallow==3.3.0 marshmallow-sqlalchemy==0.21.0 mccabe==0.6.1 +numpy==1.18.4 +pandas==1.0.3 psycopg2-binary==2.8.4 pycparser==2.19 pylint==2.4.4 +python-dateutil==2.8.1 +pytz==2020.1 +scikit-learn==0.23.0 +scipy==1.4.1 six==1.13.0 +sklearn==0.0 SQLAlchemy==1.3.12 +threadpoolctl==2.0.0 typed-ast==1.4.1 Werkzeug==0.16.0 wrapt==1.11.2 diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..f1a3a30 --- /dev/null +++ b/utils.py @@ -0,0 +1,43 @@ +import pandas as pd + +from data_models.Metrics import Metric + +TARGET_TYPE = 'Target Type' +ALGORITHM_NAME = 'Algorithm Name' +METRIC_NAME = 'Metric Name' +METRIC_VALUE = 'Metric Value' + +def get_df_from_db(): + all_metrics = Metric.query.all() + + data = {} + for metric in all_metrics: + if TARGET_TYPE not in data: + data[TARGET_TYPE] = [] + data[TARGET_TYPE].append(metric.target_type) + + for mf in metric.meta_features: + if mf.feat_name not in data: + data[mf.feat_name] = [] + data[mf.feat_name].append(mf.feat_value) + + if ALGORITHM_NAME not in data: + data[ALGORITHM_NAME] = [] + data[ALGORITHM_NAME].append(metric.algorithm_name) + + if METRIC_NAME not in data: + data[METRIC_NAME] = [] + data[METRIC_NAME].append(metric.metric_name) + + if METRIC_VALUE not in data: + data[METRIC_VALUE] = [] + data[METRIC_VALUE].append(metric.metric_value) + + df = pd.DataFrame.from_dict(data) + return df + + +def get_Xy(df): + X = df[df.columns.difference([ALGORITHM_NAME, METRIC_NAME, METRIC_VALUE])] + y = df[[ALGORITHM_NAME, METRIC_NAME, METRIC_VALUE]] + return X, y From f27b9166241af60f5e9bcbb83117219031c7ade2 Mon Sep 17 00:00:00 2001 From: Mukesh A Date: Sat, 16 May 2020 15:03:21 +0100 Subject: [PATCH 4/6] added inital kmc model and updated utils for preprocessing --- fmlearn.py | 29 +++++++++++++++++++++++++++++ utils.py | 48 +++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 68 insertions(+), 9 deletions(-) create mode 100644 fmlearn.py diff --git a/fmlearn.py b/fmlearn.py new file mode 100644 index 0000000..be0f7b3 --- /dev/null +++ b/fmlearn.py @@ -0,0 +1,29 @@ +import pandas as pd +import utils + +from sklearn.model_selection import train_test_split +from sklearn.neighbors import KNeighborsRegressor + +def kmc(): + df = utils.get_df_from_db() + + df, _ = utils.ohe_feature(df, utils.TARGET_TYPE) + df, _ = utils.label_encode_feature(df, utils.ALGORITHM_NAME) + df, _ = utils.label_encode_feature(df, utils.METRIC_NAME) + + X, y = utils.get_Xy(df) + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123) + + model = KNeighborsRegressor(n_neighbors=2) + + model.fit(X_train, y_train) + + y_pred = model.predict(X_test) + print(y_pred) + print(y) + print(y_test) + + +if __name__ == "__main__": + kmc() \ No newline at end of file diff --git a/utils.py b/utils.py index f1a3a30..53be77d 100644 --- a/utils.py +++ b/utils.py @@ -2,6 +2,8 @@ from data_models.Metrics import Metric +from sklearn import preprocessing as pp + TARGET_TYPE = 'Target Type' ALGORITHM_NAME = 'Algorithm Name' METRIC_NAME = 'Metric Name' @@ -14,25 +16,25 @@ def get_df_from_db(): for metric in all_metrics: if TARGET_TYPE not in data: data[TARGET_TYPE] = [] - data[TARGET_TYPE].append(metric.target_type) + data[TARGET_TYPE].append(str(metric.target_type)) for mf in metric.meta_features: if mf.feat_name not in data: data[mf.feat_name] = [] - data[mf.feat_name].append(mf.feat_value) - + data[mf.feat_name].append(float(mf.feat_value)) + if ALGORITHM_NAME not in data: data[ALGORITHM_NAME] = [] - data[ALGORITHM_NAME].append(metric.algorithm_name) - + data[ALGORITHM_NAME].append(str(metric.algorithm_name)) + if METRIC_NAME not in data: data[METRIC_NAME] = [] - data[METRIC_NAME].append(metric.metric_name) - + data[METRIC_NAME].append(str(metric.metric_name)) + if METRIC_VALUE not in data: data[METRIC_VALUE] = [] - data[METRIC_VALUE].append(metric.metric_value) - + data[METRIC_VALUE].append(float(metric.metric_value)) + df = pd.DataFrame.from_dict(data) return df @@ -41,3 +43,31 @@ def get_Xy(df): X = df[df.columns.difference([ALGORITHM_NAME, METRIC_NAME, METRIC_VALUE])] y = df[[ALGORITHM_NAME, METRIC_NAME, METRIC_VALUE]] return X, y + +# One Hot Encoding +def ohe_feature(df, feature): + encoder = pp.OneHotEncoder(categories='auto', sparse=False) + data = encoder.fit_transform(df[feature].values.reshape(len(df[feature]), 1)) + # creating the encoded df + ohedf = pd.DataFrame(data, columns=[feature + ': ' + str(i.strip('x0123_')) for i in encoder.get_feature_names()]) + # to drop the extra column of redundant data + ohedf.drop(ohedf.columns[len(ohedf.columns) - 1], axis=1, inplace=True) + # concat the ohe df with the original df + df = pd.concat([df, ohedf], axis=1) + # to drop the original column in the df + del df[feature] + + return df, encoder + +# Label Encoding +def label_encode_feature(df, feature): + encoder = pp.LabelEncoder() + data = encoder.fit_transform(df[feature].values.reshape(len(df[feature]), 1)) + # to drop the original column in the df + del df[feature] + # creating the encoded df + ledf = pd.DataFrame(data, columns=[feature]) + # concat the ohe df with the original df + df = pd.concat([df, ledf], axis=1) + + return df, encoder From 5fa3b63d50673b241aed5121803bf3cb6dac30b7 Mon Sep 17 00:00:00 2001 From: Mukesh A Date: Sat, 16 May 2020 15:07:08 +0100 Subject: [PATCH 5/6] updated model to ohe output features --- fmlearn.py | 6 +++--- utils.py | 7 ++++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/fmlearn.py b/fmlearn.py index be0f7b3..40f4660 100644 --- a/fmlearn.py +++ b/fmlearn.py @@ -8,11 +8,11 @@ def kmc(): df = utils.get_df_from_db() df, _ = utils.ohe_feature(df, utils.TARGET_TYPE) - df, _ = utils.label_encode_feature(df, utils.ALGORITHM_NAME) - df, _ = utils.label_encode_feature(df, utils.METRIC_NAME) - X, y = utils.get_Xy(df) + y, _ = utils.ohe_feature(y, utils.ALGORITHM_NAME, False) + y, _ = utils.ohe_feature(y, utils.METRIC_NAME, False) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123) model = KNeighborsRegressor(n_neighbors=2) diff --git a/utils.py b/utils.py index 53be77d..2628977 100644 --- a/utils.py +++ b/utils.py @@ -34,7 +34,7 @@ def get_df_from_db(): if METRIC_VALUE not in data: data[METRIC_VALUE] = [] data[METRIC_VALUE].append(float(metric.metric_value)) - + df = pd.DataFrame.from_dict(data) return df @@ -45,13 +45,14 @@ def get_Xy(df): return X, y # One Hot Encoding -def ohe_feature(df, feature): +def ohe_feature(df, feature, drop_additional_feature=True): encoder = pp.OneHotEncoder(categories='auto', sparse=False) data = encoder.fit_transform(df[feature].values.reshape(len(df[feature]), 1)) # creating the encoded df ohedf = pd.DataFrame(data, columns=[feature + ': ' + str(i.strip('x0123_')) for i in encoder.get_feature_names()]) # to drop the extra column of redundant data - ohedf.drop(ohedf.columns[len(ohedf.columns) - 1], axis=1, inplace=True) + if drop_additional_feature: + ohedf.drop(ohedf.columns[len(ohedf.columns) - 1], axis=1, inplace=True) # concat the ohe df with the original df df = pd.concat([df, ohedf], axis=1) # to drop the original column in the df From 800dcac57d7bc91cc651118691b68a27ffe22c90 Mon Sep 17 00:00:00 2001 From: Mukesh A Date: Wed, 20 May 2020 17:05:49 +0100 Subject: [PATCH 6/6] updated the data fetch util to create proper df for na records --- fmlearn.py | 16 ++++++++++------ utils.py | 9 +++++---- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/fmlearn.py b/fmlearn.py index 40f4660..1971ac8 100644 --- a/fmlearn.py +++ b/fmlearn.py @@ -6,13 +6,17 @@ def kmc(): df = utils.get_df_from_db() + df.fillna(0, inplace=True) - df, _ = utils.ohe_feature(df, utils.TARGET_TYPE) X, y = utils.get_Xy(df) - y, _ = utils.ohe_feature(y, utils.ALGORITHM_NAME, False) - y, _ = utils.ohe_feature(y, utils.METRIC_NAME, False) + # pre processing of data + X, _ = utils.ohe_feature(X, utils.TARGET_TYPE) + y, _ = utils.label_encode_feature(y, utils.ALGORITHM_NAME) + y, _ = utils.label_encode_feature(y, utils.METRIC_NAME) + + # train test split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123) model = KNeighborsRegressor(n_neighbors=2) @@ -20,10 +24,10 @@ def kmc(): model.fit(X_train, y_train) y_pred = model.predict(X_test) - print(y_pred) - print(y) - print(y_test) + print(y_test.to_string(header=False)) + y_pred = pd.DataFrame(y_pred) + print(y_pred.to_string(header=False)) if __name__ == "__main__": kmc() \ No newline at end of file diff --git a/utils.py b/utils.py index 2628977..4f9b6eb 100644 --- a/utils.py +++ b/utils.py @@ -11,9 +11,9 @@ def get_df_from_db(): all_metrics = Metric.query.all() - - data = {} + df = pd.DataFrame() for metric in all_metrics: + data = {} if TARGET_TYPE not in data: data[TARGET_TYPE] = [] data[TARGET_TYPE].append(str(metric.target_type)) @@ -34,9 +34,10 @@ def get_df_from_db(): if METRIC_VALUE not in data: data[METRIC_VALUE] = [] data[METRIC_VALUE].append(float(metric.metric_value)) + + df = df.append(pd.DataFrame.from_dict(data)) - df = pd.DataFrame.from_dict(data) - return df + return df.reset_index() def get_Xy(df):