Skip to content

Commit

Permalink
merge pull request #10 from mukeshmk/autosk-meta-feat
Browse files Browse the repository at this point in the history
added data model for meta-features and kNN search algorithm
  • Loading branch information
mukeshmk committed May 20, 2020
2 parents f5bc66e + 800dcac commit 2e4a606
Show file tree
Hide file tree
Showing 8 changed files with 169 additions and 11 deletions.
15 changes: 11 additions & 4 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

from data_models.Metrics import *
from data_models.Params import *
from data_models.MetaFeatures import *

# Create a Metric
@app.route(METRIC, methods=[POST])
Expand All @@ -31,8 +32,10 @@ def add_metric():
dataset_hash = request.json['dataset_hash'].replace("\x00", "")
metric_name = request.json['metric_name']
metric_value = request.json['metric_value']

new_metric = Metric(algorithm_name, dataset_hash, metric_name, metric_value)

target_type = request.json['target_type']

new_metric = Metric(algorithm_name, dataset_hash, metric_name, metric_value, target_type)

db.session.add(new_metric)
db.session.commit()
Expand All @@ -44,8 +47,12 @@ def add_metric():
db.session.add(new_params)
db.session.commit()

#new_metric = Metric.query.get(new_metric.id)
print(metric_schema.dump(new_metric))
data_meta_features = request.json['data_meta_features']
if(data_meta_features != ""):
for feat in data_meta_features:
new_feat = MetaFeature(new_metric.id, feat['feat_name'], feat['feat_value'])
db.session.add(new_feat)
db.session.commit()

return metric_schema.jsonify(new_metric)

Expand Down
2 changes: 2 additions & 0 deletions constants.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
# Table Names
TABLE_METRIC = 'metric'
TABLE_PARAM = 'params'
TABLE_META_FEATURES = 'meta_features'

# Table Class Names
CLASS_METRIC = 'Metric'
CLASS_PARAM = 'Params'
CLASS_META_FEATURE = 'MetaFeature'

# Rest Methods
GET = 'GET'
Expand Down
27 changes: 27 additions & 0 deletions data_models/MetaFeatures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from app import db, ma
from constants import TABLE_META_FEATURES


# Meta Feature Class/Model
class MetaFeature(db.Model):
    """ORM model for a single named meta-feature value attached to a metric.

    Rows live in the table named by ``TABLE_META_FEATURES`` and reference
    their owning metric through the ``metric_id`` foreign key.
    """

    __tablename__ = TABLE_META_FEATURES

    # Surrogate primary key.
    id = db.Column(db.Integer, primary_key=True)
    # Owning metric; required (nullable=False).
    metric_id = db.Column(
        db.Integer,
        db.ForeignKey('metric.id'),
        nullable=False,
    )
    # Name and value are both stored as strings of at most 200 characters.
    feat_name = db.Column(db.String(200))
    feat_value = db.Column(db.String(200))

    def __init__(self, metric_id, feat_name, feat_value):
        """Create a meta-feature for ``metric_id`` with the given name/value."""
        self.metric_id, self.feat_name, self.feat_value = (
            metric_id,
            feat_name,
            feat_value,
        )


# Meta Feature Schema
class MetaFeatureSchema(ma.Schema):
    """Schema exposing the id, metric_id, feat_name and feat_value fields."""

    class Meta:
        fields = (
            'id',
            'metric_id',
            'feat_name',
            'feat_value',
        )


# Shared schema instances: one for a single object, one for a collection.
meta_feature_schema = MetaFeatureSchema()
meta_features_schema = MetaFeatureSchema(many=True)
15 changes: 10 additions & 5 deletions data_models/Metrics.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,34 @@
from app import db, ma
from data_models.Params import ParamSchema
from constants import TABLE_METRIC, CLASS_PARAM
from data_models.MetaFeatures import MetaFeatureSchema
from constants import TABLE_METRIC, CLASS_PARAM, CLASS_META_FEATURE

# Product Class/Model
# Metric Class/Model
class Metric(db.Model):
__tablename__ = TABLE_METRIC
id = db.Column(db.Integer, primary_key=True)
algorithm_name = db.Column(db.String(200))
dataset_hash = db.Column(db.Text)
metric_name = db.Column(db.String(200))
metric_value = db.Column(db.Float)
target_type = db.Column(db.String(200))
params = db.relationship(CLASS_PARAM, cascade = "all, delete", backref=TABLE_METRIC, lazy=True)
meta_features = db.relationship(CLASS_META_FEATURE, cascade = "all, delete", backref=TABLE_METRIC, lazy=True)

def __init__(self, algorithm_name, dataset_hash, metric_name, metric_value):
def __init__(self, algorithm_name, dataset_hash, metric_name, metric_value, target_type):
self.algorithm_name = algorithm_name
self.dataset_hash = dataset_hash
self.metric_name = metric_name
self.metric_value = metric_value
self.target_type = target_type


# Product Schema
# Metric Schema
class MetricSchema(ma.Schema):
params = ma.Nested(ParamSchema, many=True)
meta_features = ma.Nested(MetaFeatureSchema, many=True)
class Meta:
fields = ('id', 'algorithm_name', 'dataset_hash', 'metric_name', 'metric_value', 'params')
fields = ('id', 'algorithm_name', 'dataset_hash', 'metric_name', 'metric_value', 'target_type', 'params', 'meta_features')
include_fk = True


Expand Down
4 changes: 2 additions & 2 deletions data_models/Params.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from constants import TABLE_PARAM


# Product Class/Model
# Param Class/Model
class Params(db.Model):
__tablename__ = TABLE_PARAM
id = db.Column(db.Integer, primary_key=True)
Expand All @@ -16,7 +16,7 @@ def __init__(self, metric_id, param_name, param_value):
self.param_value = param_value


# Product Schema
# Param Schema
class ParamSchema(ma.Schema):
class Meta:
fields = ('id', 'metric_id', 'param_name', 'param_value')
Expand Down
33 changes: 33 additions & 0 deletions fmlearn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import pandas as pd
import utils

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

def kmc():
    """Fit a 2-nearest-neighbour regressor on the stored metrics and print results.

    Loads the metrics DataFrame via ``utils``, replaces missing values with 0,
    one-hot encodes the target type in X and label-encodes the algorithm and
    metric names in y, then trains on 80% of the rows and prints the held-out
    targets followed by the model's predictions for them.
    """
    frame = utils.get_df_from_db()
    frame.fillna(0, inplace=True)

    features, targets = utils.get_Xy(frame)

    # Pre-processing: the fitted encoders are not needed here, so discard them.
    features, _ = utils.ohe_feature(features, utils.TARGET_TYPE)
    for column in (utils.ALGORITHM_NAME, utils.METRIC_NAME):
        targets, _ = utils.label_encode_feature(targets, column)

    # Fixed random_state keeps the train/test split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=123
    )

    regressor = KNeighborsRegressor(n_neighbors=2)
    regressor.fit(X_train, y_train)
    predictions = pd.DataFrame(regressor.predict(X_test))

    print(y_test.to_string(header=False))
    print(predictions.to_string(header=False))


if __name__ == "__main__":
    kmc()
9 changes: 9 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,25 @@ gunicorn==20.0.4
isort==4.3.21
itsdangerous==1.1.0
Jinja2==2.10.3
joblib==0.15.0
lazy-object-proxy==1.4.3
MarkupSafe==1.1.1
marshmallow==3.3.0
marshmallow-sqlalchemy==0.21.0
mccabe==0.6.1
numpy==1.18.4
pandas==1.0.3
psycopg2-binary==2.8.4
pycparser==2.19
pylint==2.4.4
python-dateutil==2.8.1
pytz==2020.1
scikit-learn==0.23.0
scipy==1.4.1
six==1.13.0
sklearn==0.0
SQLAlchemy==1.3.12
threadpoolctl==2.0.0
typed-ast==1.4.1
Werkzeug==0.16.0
wrapt==1.11.2
75 changes: 75 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import pandas as pd

from data_models.Metrics import Metric

from sklearn import preprocessing as pp

# Column names of the DataFrame assembled by get_df_from_db(); the last three
# are the columns get_Xy() separates out as the target frame y.
TARGET_TYPE = 'Target Type'
ALGORITHM_NAME = 'Algorithm Name'
METRIC_NAME = 'Metric Name'
METRIC_VALUE = 'Metric Value'

def get_df_from_db():
    """Load every stored metric into a DataFrame with one row per metric.

    Each row holds the metric's target type (as ``str``), one float column per
    meta-feature (named after its ``feat_name``), the algorithm name, the
    metric name and the metric value.  Meta-features that a given metric does
    not have are left as NaN in its row.

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index and no leftover ``index``
        column.  An empty DataFrame is returned when there are no metrics.

    Raises:
        ValueError: if a metric has the same meta-feature name twice, or a
            meta-feature is named like one of the fixed columns.  Errors from
            the ``float()`` conversions propagate unchanged.
    """
    fixed_columns = (TARGET_TYPE, ALGORITHM_NAME, METRIC_NAME, METRIC_VALUE)

    # Collect plain dict rows and build the frame once: this avoids the
    # quadratic (and, in pandas 2, removed) DataFrame.append-in-a-loop pattern.
    rows = []
    for metric in Metric.query.all():
        row = {TARGET_TYPE: str(metric.target_type)}

        for mf in metric.meta_features:
            # A repeated or reserved name would silently overwrite a value, so
            # fail loudly instead.
            if mf.feat_name in row or mf.feat_name in fixed_columns:
                raise ValueError(
                    'duplicate or reserved meta-feature name: %r' % (mf.feat_name,)
                )
            row[mf.feat_name] = float(mf.feat_value)

        row[ALGORITHM_NAME] = str(metric.algorithm_name)
        row[METRIC_NAME] = str(metric.metric_name)
        row[METRIC_VALUE] = float(metric.metric_value)
        rows.append(row)

    # The constructor already yields a 0..n-1 index, so no reset_index() is
    # needed and no spurious 'index' column leaks into the feature set.
    return pd.DataFrame(rows)


def get_Xy(df):
    """Split ``df`` into a feature frame X and a target frame y.

    y holds the algorithm name, metric name and metric value columns; X holds
    every other column of ``df``.

    Returns:
        Tuple ``(X, y)`` of DataFrames.
    """
    target_columns = [ALGORITHM_NAME, METRIC_NAME, METRIC_VALUE]
    feature_columns = df.columns.difference(target_columns)
    return df[feature_columns], df[target_columns]

# One Hot Encoding
def ohe_feature(df, feature, drop_additional_feature=True):
    """One-hot encode column ``feature`` of ``df``.

    The original column is removed and replaced by one indicator column per
    category, named ``"<feature>: <category>"`` and appended after the
    remaining columns.  The input frame is not modified.

    Args:
        df: DataFrame containing ``feature``.
        feature: Name of the column to encode.
        drop_additional_feature: When true, drop the last indicator column,
            which is redundant given the others.

    Returns:
        Tuple ``(encoded_df, encoder)`` where ``encoder`` is the fitted
        ``OneHotEncoder``.
    """
    encoder = pp.OneHotEncoder(categories='auto')
    # Densify explicitly rather than passing a sparse/sparse_output flag, whose
    # name differs between scikit-learn releases.
    data = encoder.fit_transform(df[feature].values.reshape(-1, 1)).toarray()

    # Name columns from the fitted categories directly; stripping a character
    # set off generated names would mangle categories that start or end with
    # any of those characters.
    columns = [feature + ': ' + str(category) for category in encoder.categories_[0]]

    # Reuse df's index so the column-wise concat below aligns row for row even
    # when df does not carry a default 0..n-1 index.
    ohedf = pd.DataFrame(data, columns=columns, index=df.index)

    # to drop the extra column of redundant data
    if drop_additional_feature:
        ohedf = ohedf.iloc[:, :-1]

    # swap the original column for its encoded columns
    encoded_df = pd.concat([df.drop(columns=[feature]), ohedf], axis=1)

    return encoded_df, encoder

# Label Encoding
def label_encode_feature(df, feature):
    """Label-encode column ``feature`` of ``df``.

    The column is replaced by its integer codes and moved to the end of the
    frame.  The input frame is not modified.

    Args:
        df: DataFrame containing ``feature``.
        feature: Name of the column to encode.

    Returns:
        Tuple ``(encoded_df, encoder)`` where ``encoder`` is the fitted
        ``LabelEncoder``.
    """
    encoder = pp.LabelEncoder()
    # LabelEncoder expects a 1-D array; passing an (n, 1) column vector only
    # triggers a conversion warning.
    codes = encoder.fit_transform(df[feature].values)

    # Keep df's index so the concat aligns rows even for a non-default index.
    ledf = pd.DataFrame({feature: codes}, index=df.index)

    # drop() returns a new frame, leaving the caller's DataFrame untouched.
    remaining = df.drop(columns=[feature])

    # concat the label-encoded column with the remaining columns
    encoded_df = pd.concat([remaining, ledf], axis=1)

    return encoded_df, encoder

0 comments on commit 2e4a606

Please sign in to comment.