added data model for meta-features and kNN search algorithm #10

Merged 6 commits on May 20, 2020
app.py (15 changes: 11 additions & 4 deletions)
@@ -23,6 +23,7 @@

 from data_models.Metrics import *
 from data_models.Params import *
+from data_models.MetaFeatures import *

 # Create a Metric
 @app.route(METRIC, methods=[POST])
@@ -31,8 +32,10 @@ def add_metric():
     dataset_hash = request.json['dataset_hash'].replace("\x00", "")
     metric_name = request.json['metric_name']
     metric_value = request.json['metric_value']
-
-    new_metric = Metric(algorithm_name, dataset_hash, metric_name, metric_value)
+
+    target_type = request.json['target_type']
+
+    new_metric = Metric(algorithm_name, dataset_hash, metric_name, metric_value, target_type)

     db.session.add(new_metric)
     db.session.commit()
@@ -44,8 +47,12 @@
     db.session.add(new_params)
     db.session.commit()

-    #new_metric = Metric.query.get(new_metric.id)
-    print(metric_schema.dump(new_metric))
+    data_meta_features = request.json['data_meta_features']
+    if(data_meta_features != ""):
+        for feat in data_meta_features:
+            new_feat = MetaFeature(new_metric.id, feat['feat_name'], feat['feat_value'])
+            db.session.add(new_feat)
+            db.session.commit()

     return metric_schema.jsonify(new_metric)

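For reference, the request body that the updated add_metric endpoint expects would look roughly like the sketch below. This is a hedged illustration: the field names come from the diff above, but the route path (the value of the METRIC constant), the host, and all concrete values are assumptions, and the params portion of the payload is handled in a part of app.py not shown here.

# Hypothetical client call; '/metric' stands in for the METRIC route constant,
# which is defined elsewhere in the repository and not visible in this diff.
import requests

payload = {
    "algorithm_name": "RandomForestClassifier",
    "dataset_hash": "abc123",
    "metric_name": "accuracy",
    "metric_value": 0.87,
    "target_type": "classification",
    # each entry becomes a MetaFeature row linked to the new metric
    "data_meta_features": [
        {"feat_name": "n_rows", "feat_value": "1500"},
        {"feat_name": "n_columns", "feat_value": "12"},
    ],
}

response = requests.post("http://localhost:5000/metric", json=payload)
print(response.json())  # serialized via MetricSchema, with nested params and meta_features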
constants.py (2 changes: 2 additions & 0 deletions)
@@ -1,10 +1,12 @@
 # Table Names
 TABLE_METRIC = 'metric'
 TABLE_PARAM = 'params'
+TABLE_META_FEATURES = 'meta_features'

 # Table Class Names
 CLASS_METRIC = 'Metric'
 CLASS_PARAM = 'Params'
+CLASS_META_FEATURE = 'MetaFeature'

 # Rest Methods
 GET = 'GET'
data_models/MetaFeatures.py (27 changes: 27 additions & 0 deletions)
@@ -0,0 +1,27 @@
from app import db, ma
from constants import TABLE_META_FEATURES


# Meta Feature Class/Model
class MetaFeature(db.Model):
    __tablename__ = TABLE_META_FEATURES
    id = db.Column(db.Integer, primary_key=True)
    metric_id = db.Column(db.Integer, db.ForeignKey('metric.id'), nullable=False)
    feat_name = db.Column(db.String(200))
    feat_value = db.Column(db.String(200))

    def __init__(self, metric_id, feat_name, feat_value):
        self.metric_id = metric_id
        self.feat_name = feat_name
        self.feat_value = feat_value


# Meta Feature Schema
class MetaFeatureSchema(ma.Schema):
    class Meta:
        fields = ('id', 'metric_id', 'feat_name', 'feat_value')


# Init schema
meta_feature_schema = MetaFeatureSchema()
meta_features_schema = MetaFeatureSchema(many=True)
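As a quick illustration of the model and schema above, a MetaFeature row ties one named value to its parent metric and serializes flat. A minimal sketch, assuming an application context and an existing Metric with id 1; the ids shown are invented:

# Minimal sketch: create one meta-feature for an existing metric and dump it.
feat = MetaFeature(1, "n_rows", "1500")
db.session.add(feat)
db.session.commit()

print(meta_feature_schema.dump(feat))
# e.g. {'id': 7, 'metric_id': 1, 'feat_name': 'n_rows', 'feat_value': '1500'}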
data_models/Metrics.py (15 changes: 10 additions & 5 deletions)
@@ -1,29 +1,34 @@
 from app import db, ma
 from data_models.Params import ParamSchema
-from constants import TABLE_METRIC, CLASS_PARAM
+from data_models.MetaFeatures import MetaFeatureSchema
+from constants import TABLE_METRIC, CLASS_PARAM, CLASS_META_FEATURE

-# Product Class/Model
+# Metric Class/Model
 class Metric(db.Model):
     __tablename__ = TABLE_METRIC
     id = db.Column(db.Integer, primary_key=True)
     algorithm_name = db.Column(db.String(200))
     dataset_hash = db.Column(db.Text)
     metric_name = db.Column(db.String(200))
     metric_value = db.Column(db.Float)
+    target_type = db.Column(db.String(200))
     params = db.relationship(CLASS_PARAM, cascade = "all, delete", backref=TABLE_METRIC, lazy=True)
+    meta_features = db.relationship(CLASS_META_FEATURE, cascade = "all, delete", backref=TABLE_METRIC, lazy=True)

-    def __init__(self, algorithm_name, dataset_hash, metric_name, metric_value):
+    def __init__(self, algorithm_name, dataset_hash, metric_name, metric_value, target_type):
         self.algorithm_name = algorithm_name
         self.dataset_hash = dataset_hash
         self.metric_name = metric_name
         self.metric_value = metric_value
+        self.target_type = target_type


-# Product Schema
+# Metric Schema
 class MetricSchema(ma.Schema):
     params = ma.Nested(ParamSchema, many=True)
+    meta_features = ma.Nested(MetaFeatureSchema, many=True)
     class Meta:
-        fields = ('id', 'algorithm_name', 'dataset_hash', 'metric_name', 'metric_value', 'params')
+        fields = ('id', 'algorithm_name', 'dataset_hash', 'metric_name', 'metric_value', 'target_type', 'params', 'meta_features')
         include_fk = True
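With the nested MetaFeatureSchema in place, dumping a metric now carries its meta-features alongside its params. Roughly, the shape is as follows; every concrete value here is invented for illustration:

# Illustrative shape of metric_schema.dump(some_metric) after this change.
example_dump = {
    'id': 1,
    'algorithm_name': 'RandomForestClassifier',
    'dataset_hash': 'abc123',
    'metric_name': 'accuracy',
    'metric_value': 0.87,
    'target_type': 'classification',
    'params': [
        {'id': 1, 'metric_id': 1, 'param_name': 'n_estimators', 'param_value': '100'},
    ],
    'meta_features': [
        {'id': 1, 'metric_id': 1, 'feat_name': 'n_rows', 'feat_value': '1500'},
    ],
}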
data_models/Params.py (4 changes: 2 additions & 2 deletions)
@@ -2,7 +2,7 @@
 from constants import TABLE_PARAM


-# Product Class/Model
+# Param Class/Model
 class Params(db.Model):
     __tablename__ = TABLE_PARAM
     id = db.Column(db.Integer, primary_key=True)
@@ -16,7 +16,7 @@ def __init__(self, metric_id, param_name, param_value):
         self.param_value = param_value


-# Product Schema
+# Param Schema
 class ParamSchema(ma.Schema):
     class Meta:
         fields = ('id', 'metric_id', 'param_name', 'param_value')
fmlearn.py (33 changes: 33 additions & 0 deletions)
@@ -0,0 +1,33 @@
import pandas as pd
import utils

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

def kmc():
    df = utils.get_df_from_db()
    df.fillna(0, inplace=True)

    X, y = utils.get_Xy(df)

    # pre processing of data
    X, _ = utils.ohe_feature(X, utils.TARGET_TYPE)

    y, _ = utils.label_encode_feature(y, utils.ALGORITHM_NAME)
    y, _ = utils.label_encode_feature(y, utils.METRIC_NAME)

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

    model = KNeighborsRegressor(n_neighbors=2)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    print(y_test.to_string(header=False))
    y_pred = pd.DataFrame(y_pred)
    print(y_pred.to_string(header=False))

if __name__ == "__main__":
    kmc()
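kmc() above only prints hold-out predictions as a sanity check. To turn the kNN search into an actual recommendation for a new dataset, the fitted encoders returned by the utils helpers would have to be kept and inverted. The sketch below is not part of this PR; the function name recommend_algorithm and the shape of new_meta_features are assumptions.

import pandas as pd
import utils

from sklearn.neighbors import KNeighborsRegressor

def recommend_algorithm(new_meta_features):
    # new_meta_features maps column names of X (meta-feature names and the
    # one-hot target-type columns produced by ohe_feature) to values.
    df = utils.get_df_from_db()
    df.fillna(0, inplace=True)

    X, y = utils.get_Xy(df)
    X, _ = utils.ohe_feature(X, utils.TARGET_TYPE)
    y, algo_encoder = utils.label_encode_feature(y, utils.ALGORITHM_NAME)
    y, metric_encoder = utils.label_encode_feature(y, utils.METRIC_NAME)

    # fit on everything stored in the database
    model = KNeighborsRegressor(n_neighbors=2)
    model.fit(X, y)

    # single-row query frame with the same columns as X; unknown columns default to 0
    query = pd.DataFrame([new_meta_features]).reindex(columns=X.columns, fill_value=0)
    pred = model.predict(query)[0]

    # label_encode_feature re-appends the encoded columns, so look positions up by name
    cols = list(y.columns)
    algorithm = algo_encoder.inverse_transform([int(round(pred[cols.index(utils.ALGORITHM_NAME)]))])[0]
    metric = metric_encoder.inverse_transform([int(round(pred[cols.index(utils.METRIC_NAME)]))])[0]
    value = pred[cols.index(utils.METRIC_VALUE)]
    return algorithm, metric, value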
requirements.txt (9 changes: 9 additions & 0 deletions)
@@ -9,16 +9,25 @@ gunicorn==20.0.4
isort==4.3.21
itsdangerous==1.1.0
Jinja2==2.10.3
joblib==0.15.0
lazy-object-proxy==1.4.3
MarkupSafe==1.1.1
marshmallow==3.3.0
marshmallow-sqlalchemy==0.21.0
mccabe==0.6.1
numpy==1.18.4
pandas==1.0.3
psycopg2-binary==2.8.4
pycparser==2.19
pylint==2.4.4
python-dateutil==2.8.1
pytz==2020.1
scikit-learn==0.23.0
scipy==1.4.1
six==1.13.0
sklearn==0.0
SQLAlchemy==1.3.12
threadpoolctl==2.0.0
typed-ast==1.4.1
Werkzeug==0.16.0
wrapt==1.11.2
utils.py (75 changes: 75 additions & 0 deletions)
@@ -0,0 +1,75 @@
import pandas as pd

from data_models.Metrics import Metric

from sklearn import preprocessing as pp

TARGET_TYPE = 'Target Type'
ALGORITHM_NAME = 'Algorithm Name'
METRIC_NAME = 'Metric Name'
METRIC_VALUE = 'Metric Value'

def get_df_from_db():
    all_metrics = Metric.query.all()
    df = pd.DataFrame()
    for metric in all_metrics:
        data = {}
        if TARGET_TYPE not in data:
            data[TARGET_TYPE] = []
        data[TARGET_TYPE].append(str(metric.target_type))

        for mf in metric.meta_features:
            if mf.feat_name not in data:
                data[mf.feat_name] = []
            data[mf.feat_name].append(float(mf.feat_value))

        if ALGORITHM_NAME not in data:
            data[ALGORITHM_NAME] = []
        data[ALGORITHM_NAME].append(str(metric.algorithm_name))

        if METRIC_NAME not in data:
            data[METRIC_NAME] = []
        data[METRIC_NAME].append(str(metric.metric_name))

        if METRIC_VALUE not in data:
            data[METRIC_VALUE] = []
        data[METRIC_VALUE].append(float(metric.metric_value))

        df = df.append(pd.DataFrame.from_dict(data))

    return df.reset_index()


def get_Xy(df):
    X = df[df.columns.difference([ALGORITHM_NAME, METRIC_NAME, METRIC_VALUE])]
    y = df[[ALGORITHM_NAME, METRIC_NAME, METRIC_VALUE]]
    return X, y

# One Hot Encoding
def ohe_feature(df, feature, drop_additional_feature=True):
    encoder = pp.OneHotEncoder(categories='auto', sparse=False)
    data = encoder.fit_transform(df[feature].values.reshape(len(df[feature]), 1))
    # creating the encoded df
    ohedf = pd.DataFrame(data, columns=[feature + ': ' + str(i.strip('x0123_')) for i in encoder.get_feature_names()])
    # to drop the extra column of redundant data
    if drop_additional_feature:
        ohedf.drop(ohedf.columns[len(ohedf.columns) - 1], axis=1, inplace=True)
    # concat the ohe df with the original df
    df = pd.concat([df, ohedf], axis=1)
    # to drop the original column in the df
    del df[feature]

    return df, encoder

# Label Encoding
def label_encode_feature(df, feature):
    encoder = pp.LabelEncoder()
    data = encoder.fit_transform(df[feature].values.reshape(len(df[feature]), 1))
    # to drop the original column in the df
    del df[feature]
    # creating the encoded df
    ledf = pd.DataFrame(data, columns=[feature])
    # concat the label-encoded df with the original df
    df = pd.concat([df, ledf], axis=1)

    return df, encoder
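To make the two encoders above concrete, here is a small, self-contained example of what they do to a toy frame; the column names and values are invented, only the helper behaviour comes from utils.py, and it would need to run inside the project so that the app and model imports in utils.py resolve.

import pandas as pd
import utils

toy = pd.DataFrame({
    'Target Type': ['classification', 'regression', 'classification'],
    'n_rows': [100.0, 250.0, 400.0],
})

# one-hot encode 'Target Type'; the last dummy column is dropped by default
encoded, ohe = utils.ohe_feature(toy.copy(), utils.TARGET_TYPE)
print(encoded.columns.tolist())
# -> ['n_rows', 'Target Type: classification']

labels = pd.DataFrame({'Algorithm Name': ['svm', 'knn', 'svm']})
labels, le = utils.label_encode_feature(labels, utils.ALGORITHM_NAME)
print(labels['Algorithm Name'].tolist())  # -> [1, 0, 1]
print(le.inverse_transform([0, 1]))       # -> ['knn' 'svm']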