From 7177f0af8234b1a9d66198c7ea20a662167c64b3 Mon Sep 17 00:00:00 2001
From: Mukesh A <amukesh.mk@gmail.com>
Date: Thu, 23 Apr 2020 20:43:46 +0100
Subject: [PATCH 1/6] added data model for MetaFeatures for Data

---
 app.py                      | 13 ++++++++++---
 constants.py                |  2 ++
 data_models/MetaFeatures.py | 27 +++++++++++++++++++++++++++
 data_models/Metrics.py      | 11 +++++++----
 data_models/Params.py       |  4 ++--
 5 files changed, 48 insertions(+), 9 deletions(-)
 create mode 100644 data_models/MetaFeatures.py

diff --git a/app.py b/app.py
index 300a85d..ac1696a 100644
--- a/app.py
+++ b/app.py
@@ -23,6 +23,7 @@
 
 from data_models.Metrics import *
 from data_models.Params import *
+from data_models.MetaFeatures import *
 
 # Create a Metric
 @app.route(METRIC, methods=[POST])
@@ -31,7 +32,9 @@ def add_metric():
     dataset_hash = request.json['dataset_hash'].replace("\x00", "")
     metric_name = request.json['metric_name']
     metric_value = request.json['metric_value']
-    
+
+    target_type = request.json['target_type']
+
     new_metric = Metric(algorithm_name, dataset_hash, metric_name, metric_value)
 
     db.session.add(new_metric)
@@ -44,8 +47,12 @@ def add_metric():
             db.session.add(new_params)
     db.session.commit()
 
-    #new_metric = Metric.query.get(new_metric.id)
-    print(metric_schema.dump(new_metric))
+    data_meta_features = request.json['data_meta_features']
+    if(data_meta_features != ""):
+        for feat in data_meta_features:
+            new_feat = MetaFeature(new_metric.id, feat['feat_name'], feat['feat_value'])
+            db.session.add(new_feat)
+    db.session.commit()
 
     return metric_schema.jsonify(new_metric)
 
diff --git a/constants.py b/constants.py
index 5cdf0c7..73ec782 100644
--- a/constants.py
+++ b/constants.py
@@ -1,10 +1,12 @@
 # Table Names
 TABLE_METRIC = 'metric'
 TABLE_PARAM = 'params'
+TABLE_META_FEATURES = 'meta_features'
 
 # Table Class Names
 CLASS_METRIC = 'Metric'
 CLASS_PARAM = 'Params'
+CLASS_META_FEATURE = 'MetaFeature'
 
 # Rest Methods
 GET = 'GET'
diff --git a/data_models/MetaFeatures.py b/data_models/MetaFeatures.py
new file mode 100644
index 0000000..d1c9d6f
--- /dev/null
+++ b/data_models/MetaFeatures.py
@@ -0,0 +1,27 @@
+from app import db, ma
+from constants import TABLE_META_FEATURES
+
+
+# Meta Feature Class/Model: one named feature value attached to a metric row
+class MetaFeature(db.Model):
+    __tablename__ = TABLE_META_FEATURES
+    id = db.Column(db.Integer, primary_key=True)
+    metric_id = db.Column(db.Integer, db.ForeignKey('metric.id'), nullable=False)  # required FK to metric.id
+    feat_name = db.Column(db.String(200))
+    feat_value = db.Column(db.String(200))  # stored as text (String(200)), not as a numeric column
+
+    def __init__(self, metric_id, feat_name, feat_value):
+        self.metric_id = metric_id
+        self.feat_name = feat_name
+        self.feat_value = feat_value
+
+
+# Meta Feature Schema: exposes the MetaFeature fields listed in Meta.fields
+class MetaFeatureSchema(ma.Schema):
+    class Meta:
+        fields = ('id', 'metric_id', 'feat_name', 'feat_value')
+
+
+# Init schema: one instance for a single MetaFeature, one (many=True) for a collection
+meta_feature_schema = MetaFeatureSchema()
+meta_features_schema = MetaFeatureSchema(many=True)
diff --git a/data_models/Metrics.py b/data_models/Metrics.py
index fde9cb9..ce62814 100644
--- a/data_models/Metrics.py
+++ b/data_models/Metrics.py
@@ -1,8 +1,9 @@
 from app import db, ma
 from data_models.Params import ParamSchema
-from constants import TABLE_METRIC, CLASS_PARAM
+from data_models.MetaFeatures import MetaFeatureSchema
+from constants import TABLE_METRIC, CLASS_PARAM, CLASS_META_FEATURE
 
-# Product Class/Model
+# Metric Class/Model
 class Metric(db.Model):
     __tablename__ = TABLE_METRIC
     id = db.Column(db.Integer, primary_key=True)
@@ -11,6 +12,7 @@ class Metric(db.Model):
     metric_name = db.Column(db.String(200))
     metric_value = db.Column(db.Float)
     params = db.relationship(CLASS_PARAM, cascade = "all, delete", backref=TABLE_METRIC, lazy=True)
+    meta_features = db.relationship(CLASS_META_FEATURE, cascade = "all, delete", backref=TABLE_METRIC, lazy=True)
 
     def __init__(self, algorithm_name, dataset_hash, metric_name, metric_value):
         self.algorithm_name = algorithm_name
@@ -19,11 +21,12 @@ def __init__(self, algorithm_name, dataset_hash, metric_name, metric_value):
         self.metric_value = metric_value
 
 
-# Product Schema
+# Metric Schema
 class MetricSchema(ma.Schema):
     params = ma.Nested(ParamSchema, many=True)
+    meta_features = ma.Nested(MetaFeatureSchema, many=True)
     class Meta:
-        fields = ('id', 'algorithm_name', 'dataset_hash', 'metric_name', 'metric_value', 'params')
+        fields = ('id', 'algorithm_name', 'dataset_hash', 'metric_name', 'metric_value', 'params', 'meta_features')
         include_fk = True
 
 
diff --git a/data_models/Params.py b/data_models/Params.py
index 03e8773..2da131d 100644
--- a/data_models/Params.py
+++ b/data_models/Params.py
@@ -2,7 +2,7 @@
 from constants import TABLE_PARAM
 
 
-# Product Class/Model
+# Param Class/Model
 class Params(db.Model):
     __tablename__ = TABLE_PARAM
     id = db.Column(db.Integer, primary_key=True)
@@ -16,7 +16,7 @@ def __init__(self, metric_id, param_name, param_value):
         self.param_value = param_value
 
 
-# Product Schema
+# Param Schema
 class ParamSchema(ma.Schema):
     class Meta:
         fields = ('id', 'metric_id', 'param_name', 'param_value')

From 4cb67ad6c9495e32a7e53e6345f12c2f1346587e Mon Sep 17 00:00:00 2001
From: Mukesh A <amukesh.mk@gmail.com>
Date: Thu, 23 Apr 2020 21:04:57 +0100
Subject: [PATCH 2/6] added target_type to the metric data model

---
 app.py                 | 2 +-
 data_models/Metrics.py | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/app.py b/app.py
index ac1696a..22026e3 100644
--- a/app.py
+++ b/app.py
@@ -35,7 +35,7 @@ def add_metric():
 
     target_type = request.json['target_type']
 
-    new_metric = Metric(algorithm_name, dataset_hash, metric_name, metric_value)
+    new_metric = Metric(algorithm_name, dataset_hash, metric_name, metric_value, target_type)
 
     db.session.add(new_metric)
     db.session.commit()
diff --git a/data_models/Metrics.py b/data_models/Metrics.py
index ce62814..95dfffb 100644
--- a/data_models/Metrics.py
+++ b/data_models/Metrics.py
@@ -11,14 +11,16 @@ class Metric(db.Model):
     dataset_hash = db.Column(db.Text)
     metric_name = db.Column(db.String(200))
     metric_value = db.Column(db.Float)
+    target_type = db.Column(db.String(200))
     params = db.relationship(CLASS_PARAM, cascade = "all, delete", backref=TABLE_METRIC, lazy=True)
     meta_features = db.relationship(CLASS_META_FEATURE, cascade = "all, delete", backref=TABLE_METRIC, lazy=True)
 
-    def __init__(self, algorithm_name, dataset_hash, metric_name, metric_value):
+    def __init__(self, algorithm_name, dataset_hash, metric_name, metric_value, target_type):
         self.algorithm_name = algorithm_name
         self.dataset_hash = dataset_hash
         self.metric_name = metric_name
         self.metric_value = metric_value
+        self.target_type = target_type
 
 
 # Metric Schema
@@ -26,7 +28,7 @@ class MetricSchema(ma.Schema):
     params = ma.Nested(ParamSchema, many=True)
     meta_features = ma.Nested(MetaFeatureSchema, many=True)
     class Meta:
-        fields = ('id', 'algorithm_name', 'dataset_hash', 'metric_name', 'metric_value', 'params', 'meta_features')
+        fields = ('id', 'algorithm_name', 'dataset_hash', 'metric_name', 'metric_value', 'target_type', 'params', 'meta_features')
         include_fk = True
 
 

From 19097c59a16ca53231b842e83496fe0da3982747 Mon Sep 17 00:00:00 2001
From: Mukesh A <amukesh.mk@gmail.com>
Date: Sat, 16 May 2020 14:01:18 +0100
Subject: [PATCH 3/6] added util code to fetch data from db and convert to df

---
 requirements.txt |  9 +++++++++
 utils.py         | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100644 utils.py

diff --git a/requirements.txt b/requirements.txt
index daf4367..a96429a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,16 +9,25 @@ gunicorn==20.0.4
 isort==4.3.21
 itsdangerous==1.1.0
 Jinja2==2.10.3
+joblib==0.15.0
 lazy-object-proxy==1.4.3
 MarkupSafe==1.1.1
 marshmallow==3.3.0
 marshmallow-sqlalchemy==0.21.0
 mccabe==0.6.1
+numpy==1.18.4
+pandas==1.0.3
 psycopg2-binary==2.8.4
 pycparser==2.19
 pylint==2.4.4
+python-dateutil==2.8.1
+pytz==2020.1
+scikit-learn==0.23.0
+scipy==1.4.1
 six==1.13.0
+sklearn==0.0
 SQLAlchemy==1.3.12
+threadpoolctl==2.0.0
 typed-ast==1.4.1
 Werkzeug==0.16.0
 wrapt==1.11.2
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..f1a3a30
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,43 @@
+import pandas as pd
+
+from data_models.Metrics import Metric
+
+TARGET_TYPE = 'Target Type'
+ALGORITHM_NAME = 'Algorithm Name'
+METRIC_NAME = 'Metric Name'
+METRIC_VALUE = 'Metric Value'
+
+def get_df_from_db():
+    all_metrics = Metric.query.all()
+
+    data = {}
+    for metric in all_metrics:
+        if TARGET_TYPE not in data:
+            data[TARGET_TYPE] = []
+        data[TARGET_TYPE].append(metric.target_type)
+
+        for mf in metric.meta_features:
+            if mf.feat_name not in data:
+                data[mf.feat_name] = []
+            data[mf.feat_name].append(mf.feat_value)
+        
+        if ALGORITHM_NAME not in data:
+            data[ALGORITHM_NAME] = []
+        data[ALGORITHM_NAME].append(metric.algorithm_name)
+        
+        if METRIC_NAME not in data:
+            data[METRIC_NAME] = []
+        data[METRIC_NAME].append(metric.metric_name)
+        
+        if METRIC_VALUE not in data:
+            data[METRIC_VALUE] = []
+        data[METRIC_VALUE].append(metric.metric_value)
+
+    df = pd.DataFrame.from_dict(data)
+    return df
+
+
+def get_Xy(df):
+    X = df[df.columns.difference([ALGORITHM_NAME, METRIC_NAME, METRIC_VALUE])]
+    y = df[[ALGORITHM_NAME, METRIC_NAME, METRIC_VALUE]]
+    return X, y

From f27b9166241af60f5e9bcbb83117219031c7ade2 Mon Sep 17 00:00:00 2001
From: Mukesh A <amukesh.mk@gmail.com>
Date: Sat, 16 May 2020 15:03:21 +0100
Subject: [PATCH 4/6] added initial kmc model and updated utils for
 preprocessing

---
 fmlearn.py | 29 +++++++++++++++++++++++++++++
 utils.py   | 48 +++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 68 insertions(+), 9 deletions(-)
 create mode 100644 fmlearn.py

diff --git a/fmlearn.py b/fmlearn.py
new file mode 100644
index 0000000..be0f7b3
--- /dev/null
+++ b/fmlearn.py
@@ -0,0 +1,29 @@
+import pandas as pd
+import utils
+
+from sklearn.model_selection import train_test_split
+from sklearn.neighbors import KNeighborsRegressor
+
+def kmc():
+    df = utils.get_df_from_db()
+
+    df, _ = utils.ohe_feature(df, utils.TARGET_TYPE)
+    df, _ = utils.label_encode_feature(df, utils.ALGORITHM_NAME)
+    df, _ = utils.label_encode_feature(df, utils.METRIC_NAME)
+    
+    X, y = utils.get_Xy(df)
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
+
+    model = KNeighborsRegressor(n_neighbors=2)
+
+    model.fit(X_train, y_train)
+
+    y_pred = model.predict(X_test)
+    print(y_pred)
+    print(y)
+    print(y_test)
+
+
+if __name__ == "__main__":
+    kmc()
\ No newline at end of file
diff --git a/utils.py b/utils.py
index f1a3a30..53be77d 100644
--- a/utils.py
+++ b/utils.py
@@ -2,6 +2,8 @@
 
 from data_models.Metrics import Metric
 
+from sklearn import preprocessing as pp
+
 TARGET_TYPE = 'Target Type'
 ALGORITHM_NAME = 'Algorithm Name'
 METRIC_NAME = 'Metric Name'
@@ -14,25 +16,25 @@ def get_df_from_db():
     for metric in all_metrics:
         if TARGET_TYPE not in data:
             data[TARGET_TYPE] = []
-        data[TARGET_TYPE].append(metric.target_type)
+        data[TARGET_TYPE].append(str(metric.target_type))
 
         for mf in metric.meta_features:
             if mf.feat_name not in data:
                 data[mf.feat_name] = []
-            data[mf.feat_name].append(mf.feat_value)
-        
+            data[mf.feat_name].append(float(mf.feat_value))
+
         if ALGORITHM_NAME not in data:
             data[ALGORITHM_NAME] = []
-        data[ALGORITHM_NAME].append(metric.algorithm_name)
-        
+        data[ALGORITHM_NAME].append(str(metric.algorithm_name))
+
         if METRIC_NAME not in data:
             data[METRIC_NAME] = []
-        data[METRIC_NAME].append(metric.metric_name)
-        
+        data[METRIC_NAME].append(str(metric.metric_name))
+
         if METRIC_VALUE not in data:
             data[METRIC_VALUE] = []
-        data[METRIC_VALUE].append(metric.metric_value)
-
+        data[METRIC_VALUE].append(float(metric.metric_value))
+    
     df = pd.DataFrame.from_dict(data)
     return df
 
@@ -41,3 +43,31 @@ def get_Xy(df):
     X = df[df.columns.difference([ALGORITHM_NAME, METRIC_NAME, METRIC_VALUE])]
     y = df[[ALGORITHM_NAME, METRIC_NAME, METRIC_VALUE]]
     return X, y
+
+# One Hot Encoding
+def ohe_feature(df, feature):
+    encoder = pp.OneHotEncoder(categories='auto', sparse=False)
+    data = encoder.fit_transform(df[feature].values.reshape(len(df[feature]), 1))
+    # creating the encoded df
+    ohedf = pd.DataFrame(data, columns=[feature + ': ' + str(i.strip('x0123_')) for i in encoder.get_feature_names()])
+    # to drop the extra column of redundant data
+    ohedf.drop(ohedf.columns[len(ohedf.columns) - 1], axis=1, inplace=True)
+    # concat the ohe df with the original df
+    df = pd.concat([df, ohedf], axis=1)
+    # to drop the original column in the df
+    del df[feature]
+
+    return df, encoder
+
+# Label Encoding
+def label_encode_feature(df, feature):
+    """Label-encode column `feature`; return (new df with it as last column, fitted encoder)."""
+    encoder = pp.LabelEncoder()
+    # LabelEncoder expects a 1-D array; the old (n, 1) reshape made sklearn warn
+    codes = encoder.fit_transform(df[feature].values)
+    # drop() returns a copy, so the caller's frame is no longer mutated in place
+    df = df.drop(columns=[feature])
+    # positional assignment keeps rows aligned even if the index is not 0..n-1
+    df[feature] = codes
+
+    return df, encoder

From 5fa3b63d50673b241aed5121803bf3cb6dac30b7 Mon Sep 17 00:00:00 2001
From: Mukesh A <amukesh.mk@gmail.com>
Date: Sat, 16 May 2020 15:07:08 +0100
Subject: [PATCH 5/6] updated model to ohe output features

---
 fmlearn.py | 6 +++---
 utils.py   | 7 ++++---
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/fmlearn.py b/fmlearn.py
index be0f7b3..40f4660 100644
--- a/fmlearn.py
+++ b/fmlearn.py
@@ -8,11 +8,11 @@ def kmc():
     df = utils.get_df_from_db()
 
     df, _ = utils.ohe_feature(df, utils.TARGET_TYPE)
-    df, _ = utils.label_encode_feature(df, utils.ALGORITHM_NAME)
-    df, _ = utils.label_encode_feature(df, utils.METRIC_NAME)
-    
     X, y = utils.get_Xy(df)
 
+    y, _ = utils.ohe_feature(y, utils.ALGORITHM_NAME, False)
+    y, _ = utils.ohe_feature(y, utils.METRIC_NAME, False)
+
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
 
     model = KNeighborsRegressor(n_neighbors=2)
diff --git a/utils.py b/utils.py
index 53be77d..2628977 100644
--- a/utils.py
+++ b/utils.py
@@ -34,7 +34,7 @@ def get_df_from_db():
         if METRIC_VALUE not in data:
             data[METRIC_VALUE] = []
         data[METRIC_VALUE].append(float(metric.metric_value))
-    
+
     df = pd.DataFrame.from_dict(data)
     return df
 
@@ -45,13 +45,14 @@ def get_Xy(df):
     return X, y
 
 # One Hot Encoding
-def ohe_feature(df, feature):
+def ohe_feature(df, feature, drop_additional_feature=True):
     encoder = pp.OneHotEncoder(categories='auto', sparse=False)
     data = encoder.fit_transform(df[feature].values.reshape(len(df[feature]), 1))
     # creating the encoded df
     ohedf = pd.DataFrame(data, columns=[feature + ': ' + str(i.strip('x0123_')) for i in encoder.get_feature_names()])
     # to drop the extra column of redundant data
-    ohedf.drop(ohedf.columns[len(ohedf.columns) - 1], axis=1, inplace=True)
+    if drop_additional_feature:
+        ohedf.drop(ohedf.columns[len(ohedf.columns) - 1], axis=1, inplace=True)
     # concat the ohe df with the original df
     df = pd.concat([df, ohedf], axis=1)
     # to drop the original column in the df

From 800dcac57d7bc91cc651118691b68a27ffe22c90 Mon Sep 17 00:00:00 2001
From: Mukesh A <amukesh.mk@gmail.com>
Date: Wed, 20 May 2020 17:05:49 +0100
Subject: [PATCH 6/6] updated the data fetch util to create proper df for na
 records

---
 fmlearn.py | 16 ++++++++++------
 utils.py   |  9 +++++----
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/fmlearn.py b/fmlearn.py
index 40f4660..1971ac8 100644
--- a/fmlearn.py
+++ b/fmlearn.py
@@ -6,13 +6,17 @@
 
 def kmc():
     df = utils.get_df_from_db()
+    df.fillna(0, inplace=True)
 
-    df, _ = utils.ohe_feature(df, utils.TARGET_TYPE)
     X, y = utils.get_Xy(df)
 
-    y, _ = utils.ohe_feature(y, utils.ALGORITHM_NAME, False)
-    y, _ = utils.ohe_feature(y, utils.METRIC_NAME, False)
+    # pre processing of data
+    X, _ = utils.ohe_feature(X, utils.TARGET_TYPE)
 
+    y, _ = utils.label_encode_feature(y, utils.ALGORITHM_NAME)
+    y, _ = utils.label_encode_feature(y, utils.METRIC_NAME)
+
+    # train test split
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
 
     model = KNeighborsRegressor(n_neighbors=2)
@@ -20,10 +24,10 @@ def kmc():
     model.fit(X_train, y_train)
 
     y_pred = model.predict(X_test)
-    print(y_pred)
-    print(y)
-    print(y_test)
 
+    print(y_test.to_string(header=False))
+    y_pred = pd.DataFrame(y_pred)
+    print(y_pred.to_string(header=False))
 
 if __name__ == "__main__":
     kmc()
\ No newline at end of file
diff --git a/utils.py b/utils.py
index 2628977..4f9b6eb 100644
--- a/utils.py
+++ b/utils.py
@@ -11,9 +11,9 @@
 
 def get_df_from_db():
     all_metrics = Metric.query.all()
-
-    data = {}
+    df = pd.DataFrame()
     for metric in all_metrics:
+        data = {}
         if TARGET_TYPE not in data:
             data[TARGET_TYPE] = []
         data[TARGET_TYPE].append(str(metric.target_type))
@@ -34,9 +34,10 @@ def get_df_from_db():
         if METRIC_VALUE not in data:
             data[METRIC_VALUE] = []
         data[METRIC_VALUE].append(float(metric.metric_value))
+        
+        df = df.append(pd.DataFrame.from_dict(data))
 
-    df = pd.DataFrame.from_dict(data)
-    return df
+    return df.reset_index(drop=True)  # drop=True: don't keep the old index as an 'index' feature column
 
 
 def get_Xy(df):