merge pull request #10 from mukeshmk/autosk-meta-feat

added data model for meta-features and kNN search algorithm
mukeshmk · May 20, 2020 · 2e4a606 · 2e4a606
2 parents f5bc66e + 800dcac
commit 2e4a606
Show file tree

Hide file tree

Showing 8 changed files with 169 additions and 11 deletions.
diff --git a/app.py b/app.py
@@ -23,6 +23,7 @@
 
 from data_models.Metrics import *
 from data_models.Params import *
+from data_models.MetaFeatures import *
 
 # Create a Metric
 @app.route(METRIC, methods=[POST])
@@ -31,8 +32,10 @@ def add_metric():
     dataset_hash = request.json['dataset_hash'].replace("\x00", "")
     metric_name = request.json['metric_name']
     metric_value = request.json['metric_value']
-
-    new_metric = Metric(algorithm_name, dataset_hash, metric_name, metric_value)
+
+    target_type = request.json['target_type']
+
+    new_metric = Metric(algorithm_name, dataset_hash, metric_name, metric_value, target_type)
 
     db.session.add(new_metric)
     db.session.commit()
@@ -44,8 +47,12 @@ def add_metric():
             db.session.add(new_params)
     db.session.commit()
 
-    #new_metric = Metric.query.get(new_metric.id)
-    print(metric_schema.dump(new_metric))
+    data_meta_features = request.json['data_meta_features']
+    if(data_meta_features != ""):
+        for feat in data_meta_features:
+            new_feat = MetaFeature(new_metric.id, feat['feat_name'], feat['feat_value'])
+            db.session.add(new_feat)
+    db.session.commit()
 
     return metric_schema.jsonify(new_metric)
 

diff --git a/constants.py b/constants.py
@@ -1,10 +1,12 @@
 # Table Names
 TABLE_METRIC = 'metric'
 TABLE_PARAM = 'params'
+TABLE_META_FEATURES = 'meta_features'
 
 # Table Class Names
 CLASS_METRIC = 'Metric'
 CLASS_PARAM = 'Params'
+CLASS_META_FEATURE = 'MetaFeature'
 
 # Rest Methods
 GET = 'GET'

diff --git a/data_models/MetaFeatures.py b/data_models/MetaFeatures.py
@@ -0,0 +1,27 @@
+from app import db, ma
+from constants import TABLE_META_FEATURES
+
+
+# Meta Feature Class/Model
+# One row stores a single named meta-feature value that belongs to a metric row.
+class MetaFeature(db.Model):
+    __tablename__ = TABLE_META_FEATURES
+    # Primary key of the meta-feature row.
+    id = db.Column(db.Integer, primary_key=True)
+    # Required foreign key pointing at the 'id' column of the 'metric' table.
+    metric_id = db.Column(db.Integer, db.ForeignKey('metric.id'), nullable=False)
+    # Name of the meta-feature, limited to 200 characters.
+    feat_name = db.Column(db.String(200))
+    # Value is kept as text (max 200 characters); utils.get_df_from_db converts it with float().
+    feat_value = db.Column(db.String(200))
+
+    # Stores the owning metric id together with the feature's name and value.
+    def __init__(self, metric_id, feat_name, feat_value):
+        self.metric_id = metric_id
+        self.feat_name = feat_name
+        self.feat_value = feat_value
+
+
+# Meta Feature Schema
+# Lists the MetaFeature attributes that are exposed when a row is serialized.
+class MetaFeatureSchema(ma.Schema):
+    class Meta:
+        # Same four attributes that the MetaFeature model declares as columns.
+        fields = ('id', 'metric_id', 'feat_name', 'feat_value')
+
+
+# Init schema: one instance for a single MetaFeature, one (many=True) for a collection
+meta_feature_schema = MetaFeatureSchema()
+meta_features_schema = MetaFeatureSchema(many=True)
diff --git a/data_models/Metrics.py b/data_models/Metrics.py
@@ -1,29 +1,34 @@
 from app import db, ma
 from data_models.Params import ParamSchema
-from constants import TABLE_METRIC, CLASS_PARAM
+from data_models.MetaFeatures import MetaFeatureSchema
+from constants import TABLE_METRIC, CLASS_PARAM, CLASS_META_FEATURE
 
-# Product Class/Model
+# Metric Class/Model
 class Metric(db.Model):
     __tablename__ = TABLE_METRIC
     id = db.Column(db.Integer, primary_key=True)
     algorithm_name = db.Column(db.String(200))
     dataset_hash = db.Column(db.Text)
     metric_name = db.Column(db.String(200))
     metric_value = db.Column(db.Float)
+    target_type = db.Column(db.String(200))
     params = db.relationship(CLASS_PARAM, cascade = "all, delete", backref=TABLE_METRIC, lazy=True)
+    meta_features = db.relationship(CLASS_META_FEATURE, cascade = "all, delete", backref=TABLE_METRIC, lazy=True)
 
-    def __init__(self, algorithm_name, dataset_hash, metric_name, metric_value):
+    def __init__(self, algorithm_name, dataset_hash, metric_name, metric_value, target_type):
         self.algorithm_name = algorithm_name
         self.dataset_hash = dataset_hash
         self.metric_name = metric_name
         self.metric_value = metric_value
+        self.target_type = target_type
 
 
-# Product Schema
+# Metric Schema
 class MetricSchema(ma.Schema):
     params = ma.Nested(ParamSchema, many=True)
+    meta_features = ma.Nested(MetaFeatureSchema, many=True)
     class Meta:
-        fields = ('id', 'algorithm_name', 'dataset_hash', 'metric_name', 'metric_value', 'params')
+        fields = ('id', 'algorithm_name', 'dataset_hash', 'metric_name', 'metric_value', 'target_type', 'params', 'meta_features')
         include_fk = True
 
 

diff --git a/data_models/Params.py b/data_models/Params.py
@@ -2,7 +2,7 @@
 from constants import TABLE_PARAM
 
 
-# Product Class/Model
+# Param Class/Model
 class Params(db.Model):
     __tablename__ = TABLE_PARAM
     id = db.Column(db.Integer, primary_key=True)
@@ -16,7 +16,7 @@ def __init__(self, metric_id, param_name, param_value):
         self.param_value = param_value
 
 
-# Product Schema
+# Param Schema
 class ParamSchema(ma.Schema):
     class Meta:
         fields = ('id', 'metric_id', 'param_name', 'param_value')

diff --git a/fmlearn.py b/fmlearn.py
@@ -0,0 +1,33 @@
+import pandas as pd
+import utils
+
+from sklearn.model_selection import train_test_split
+from sklearn.neighbors import KNeighborsRegressor
+
+def kmc():
+    """Fit a 2-nearest-neighbour regressor on the stored metrics and print its test predictions.
+
+    The data is loaded through ``utils.get_df_from_db``, missing values are
+    replaced with 0, the target type is one-hot encoded and the algorithm and
+    metric names are label encoded. 20% of the rows are held out
+    (``random_state=123``); the held-out targets are printed first, followed
+    by the predictions for them.
+    """
+    metrics_df = utils.get_df_from_db()
+    metrics_df.fillna(0, inplace=True)
+
+    features, targets = utils.get_Xy(metrics_df)
+
+    # pre processing of data (the fitted encoders are not needed here)
+    features = utils.ohe_feature(features, utils.TARGET_TYPE)[0]
+    for column in (utils.ALGORITHM_NAME, utils.METRIC_NAME):
+        targets = utils.label_encode_feature(targets, column)[0]
+
+    # train test split
+    split = train_test_split(features, targets, test_size=0.2, random_state=123)
+    train_X, test_X, train_y, test_y = split
+
+    regressor = KNeighborsRegressor(n_neighbors=2).fit(train_X, train_y)
+    predictions = pd.DataFrame(regressor.predict(test_X))
+
+    print(test_y.to_string(header=False))
+    print(predictions.to_string(header=False))
+
+# Run the kNN experiment only when this file is executed as a script
+if __name__ == "__main__":
+    kmc()
diff --git a/requirements.txt b/requirements.txt
@@ -9,16 +9,25 @@ gunicorn==20.0.4
 isort==4.3.21
 itsdangerous==1.1.0
 Jinja2==2.10.3
+joblib==0.15.0
 lazy-object-proxy==1.4.3
 MarkupSafe==1.1.1
 marshmallow==3.3.0
 marshmallow-sqlalchemy==0.21.0
 mccabe==0.6.1
+numpy==1.18.4
+pandas==1.0.3
 psycopg2-binary==2.8.4
 pycparser==2.19
 pylint==2.4.4
+python-dateutil==2.8.1
+pytz==2020.1
+scikit-learn==0.23.0
+scipy==1.4.1
 six==1.13.0
+sklearn==0.0
 SQLAlchemy==1.3.12
+threadpoolctl==2.0.0
 typed-ast==1.4.1
 Werkzeug==0.16.0
 wrapt==1.11.2
diff --git a/utils.py b/utils.py
@@ -0,0 +1,75 @@
+import pandas as pd
+
+from data_models.Metrics import Metric
+
+from sklearn import preprocessing as pp
+
+# Column labels of the DataFrame assembled by get_df_from_db
+TARGET_TYPE = 'Target Type'
+ALGORITHM_NAME = 'Algorithm Name'
+METRIC_NAME = 'Metric Name'
+METRIC_VALUE = 'Metric Value'
+
+def get_df_from_db():
+    """Load every stored metric into a DataFrame with one row per metric.
+
+    Each row holds the metric's target type, one column per meta-feature
+    (named after ``feat_name``, value converted with ``float``), the algorithm
+    name, the metric name and the metric value. A meta-feature that a metric
+    does not have is left as NaN in that metric's row. If a metric carries the
+    same ``feat_name`` more than once, the last value read is kept.
+
+    Returns:
+        pandas.DataFrame with a fresh 0..n-1 index and no extra ``index``
+        column; it is empty when no metrics are stored.
+
+    Raises:
+        ValueError: if a ``feat_value`` cannot be converted to ``float``.
+    """
+    rows = []
+    for metric in Metric.query.all():
+        # Insertion order fixes the column order: target type, meta-features, targets.
+        row = {TARGET_TYPE: str(metric.target_type)}
+        for mf in metric.meta_features:
+            row[mf.feat_name] = float(mf.feat_value)
+        row[ALGORITHM_NAME] = str(metric.algorithm_name)
+        row[METRIC_NAME] = str(metric.metric_name)
+        row[METRIC_VALUE] = float(metric.metric_value)
+        rows.append(row)
+
+    # Build the frame once: appending inside the loop copies the accumulated
+    # frame on every iteration, and a later reset_index() without drop=True
+    # would add a constant 'index' column that ends up among the features.
+    return pd.DataFrame(rows)
+
+
+def get_Xy(df):
+    """Split ``df`` into features and targets.
+
+    The targets are the algorithm name, metric name and metric value columns;
+    the features are every other column of ``df``.
+    """
+    target_columns = [ALGORITHM_NAME, METRIC_NAME, METRIC_VALUE]
+    feature_columns = df.columns.difference(target_columns)
+    return df[feature_columns], df[target_columns]
+
+# One Hot Encoding
+def ohe_feature(df, feature, drop_additional_feature=True):
+    """Replace column ``feature`` of ``df`` with one-hot encoded columns.
+
+    Args:
+        df: DataFrame containing the column to encode; it is not modified.
+        feature: name of the column to encode.
+        drop_additional_feature: when True, the last encoded column is
+            dropped because it is redundant given the others.
+
+    Returns:
+        A tuple ``(encoded_df, encoder)``: a new DataFrame holding the
+        remaining original columns followed by the encoded columns, named
+        ``"<feature>: <category>"``, and the fitted ``OneHotEncoder``.
+    """
+    encoder = pp.OneHotEncoder(categories='auto', sparse=False)
+    data = encoder.fit_transform(df[feature].values.reshape(-1, 1))
+    # Name the columns from the fitted categories. Stripping characters from
+    # the generated 'x0_<category>' names would also eat category text that
+    # begins or ends with x, 0-3 or '_'.
+    columns = [feature + ': ' + str(category) for category in encoder.categories_[0]]
+    # Reuse df's index so that concat pairs rows by position even when df
+    # does not carry a default 0..n-1 index.
+    ohedf = pd.DataFrame(data, columns=columns, index=df.index)
+    # to drop the extra column of redundant data
+    if drop_additional_feature:
+        ohedf = ohedf.iloc[:, :-1]
+    # swap the original column for the encoded ones
+    df = pd.concat([df.drop(columns=[feature]), ohedf], axis=1)
+
+    return df, encoder
+
+# Label Encoding
+def label_encode_feature(df, feature):
+    """Replace column ``feature`` of ``df`` with integer labels.
+
+    Args:
+        df: DataFrame containing the column to encode; it is not modified.
+        feature: name of the column to encode.
+
+    Returns:
+        A tuple ``(encoded_df, encoder)``: a new DataFrame in which the
+        encoded column, still named ``feature``, is placed after the other
+        columns, and the fitted ``LabelEncoder``.
+    """
+    encoder = pp.LabelEncoder()
+    # LabelEncoder works on 1-D input; a column vector only triggers a
+    # conversion warning before being flattened.
+    data = encoder.fit_transform(df[feature].values)
+    # creating the encoded df on df's own index so concat keeps rows aligned
+    ledf = pd.DataFrame(data, columns=[feature], index=df.index)
+    # concat the label encoded df with the original df minus the raw column;
+    # drop() returns a copy, so the caller's frame is left untouched
+    df = pd.concat([df.drop(columns=[feature]), ledf], axis=1)
+
+    return df, encoder