#Regression with Scikit-learn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
data = pd.read_csv('boston.csv')
#view data
print(data.head())
#target variable is MEDV; return numpy arrays
y = data['MEDV'].values
X = data.drop('MEDV', axis=1).values
#regress on the number of rooms (the column at index 5)
NumberRooms = X[:, 5]
#check what type each is
print(type(NumberRooms), type(y))
#both are numpy arrays
#reshape them into column vectors for scikit-learn
NumberRooms = NumberRooms.reshape(-1, 1)
y = y.reshape(-1, 1)
#let's plot
plt.scatter(NumberRooms, y)
plt.ylabel("Value of house ($1000s)")
plt.xlabel("Number of rooms")
plt.show()
#fit model
model=linear_model.LinearRegression()
model.fit(NumberRooms,y)
#plot the fitted regression line over the data
XAxis = np.linspace(NumberRooms.min(), NumberRooms.max()).reshape(-1, 1)
plt.scatter(NumberRooms, y, color='blue')
plt.plot(XAxis, model.predict(XAxis), color='black', linewidth=3)
plt.show()
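#A quick sketch to inspect the fitted line itself (uses the model fitted above;
#coef_ and intercept_ are standard scikit-learn attributes, indexed for the column-vector y):
slope = model.coef_[0][0]
intercept = model.intercept_[0]
print("Fitted line: MEDV = {:.2f} * rooms + {:.2f}".format(slope, intercept))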
#---------------------------------------------------------
#More practice with fertility rates and life expectancy
# Import numpy and pandas
import numpy as np
import pandas as pd
# Read the CSV file into a DataFrame: df
df = pd.read_csv("gapminder.csv")
# Create arrays for features and target variable
y = df['life'].values
X = df['fertility'].values
# Print the dimensions of X and y before reshaping
print("Dimensions of y before reshaping: {}".format(y.shape))
print("Dimensions of X before reshaping: {}".format(X.shape))
# Reshape X and y
y = y.reshape(-1,1)
X = X.reshape(-1,1)
# Print the dimensions of X and y after reshaping
print("Dimensions of y after reshaping: {}".format(y.shape))
print("Dimensions of X after reshaping: {}".format(X.shape))
#make a correlation heatmap
#the heatmap is generated with Seaborn's heatmap function; df.corr() computes the pairwise correlation between columns
import seaborn as sns
sns.heatmap(df.corr(), square=True, cmap='RdYlGn')
#explore the DataFrame using pandas methods such as .info(), .describe(), .head().
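#A minimal sketch of that exploration (standard pandas calls on the df loaded above):
df.info()              # column names, dtypes, and non-null counts
print(df.describe())   # summary statistics for the numeric columns
print(df.head())       # first five rows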
#---------------------------------------------------------
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
#find out how good the model is with R^2
print(model.score(X_test, y_test))
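#A minimal sketch of what .score() returns (uses y_test and y_pred from above):
#R^2 = 1 - SS_res / SS_tot
ss_res = np.sum((y_test - y_pred) ** 2)
ss_tot = np.sum((y_test - np.mean(y_test)) ** 2)
print("R^2 computed by hand: {:.4f}".format(1 - ss_res / ss_tot))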
#---------------------------------------------------------
# more regression
# Import LinearRegression
from sklearn.linear_model import LinearRegression
# Create the regressor: reg
reg = LinearRegression()
# Create the prediction space
prediction_space = np.linspace(min(X_fertility), max(X_fertility)).reshape(-1,1)
# Fit the model to the data
reg.fit(X_fertility,y)
# Compute predictions over the prediction space: y_pred
y_pred = reg.predict(prediction_space)
# Print R^2
print(reg.score(X_fertility,y))
# Plot regression line
plt.plot(prediction_space, y_pred, color='black', linewidth=3)
plt.show()
#---------------------------------------------------------
# Import necessary modules
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# Create training and test sets so that 70% train, 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)
# Create the regressor: reg_all
reg_all = LinearRegression()
# Fit the regressor to the training data
reg_all.fit(X_train,y_train)
# Predict on the test data: y_pred
y_pred = reg_all.predict(X_test)
# Compute and print R^2 and RMSE
print("R^2: {}".format(reg_all.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
print("Root Mean Squared Error: {}".format(rmse))
#---------------------------------------------------------
#Cross validation
#Motivation: model performance depends on how the data is split, so a single split may not be representative of the model's ability to generalize to unseen data
#begin by splitting the data into 5 parts, or folds
#use the first fold as the test sample and the other 4 for training
#do this for each fold
#this gives you 5 goodness-of-fit metrics to compare
#this is called 5-fold cross validation; it could be 10-fold, or k-fold in general, but more folds is more computationally expensive
from sklearn.model_selection import cross_val_score
model = linear_model.LinearRegression()
CV_RSquare = cross_val_score(model, X, y, cv=5)
print(CV_RSquare)
print(np.mean(CV_RSquare))
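#A sketch of the fold-by-fold procedure described above, written out with KFold
#so each split is explicit (cross_val_score does the same thing internally):
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
fold_scores = []
for train_idx, test_idx in kf.split(X):
    fold_model = linear_model.LinearRegression()
    fold_model.fit(X[train_idx], y[train_idx])
    fold_scores.append(fold_model.score(X[test_idx], y[test_idx]))
print(fold_scores)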
#---------------------------------------------------------
# Import the necessary modules
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
# Create a linear regression object: reg
reg = LinearRegression()
# Compute 5-fold cross-validation scores: cv_scores
cv_scores = cross_val_score(reg,X,y,cv=5)
# Print the 5-fold cross-validation scores
print(cv_scores)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores)))
#---------------------------------------------------------
#In the IPython shell, you can use %timeit to see how long 3-fold CV takes compared to 10-fold CV by running cross_val_score with cv=3 and cv=10:
# Import necessary modules
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
# Create a linear regression object: reg
reg = LinearRegression()
# Perform 3-fold CV
cvscores_3 = cross_val_score(reg,X,y,cv=3)
#in the shell, do %timeit cross_val_score(reg,X,y,cv=3)
print(np.mean(cvscores_3))
# Perform 10-fold CV
cvscores_10 = cross_val_score(reg,X,y,cv=10)
print(np.mean(cvscores_10))
#---------------------------------------------------------
#Regularization
#large coefficients can lead to overfitting, so penalize large coefficients
#ridge loss = sum of squared residuals + some constant alpha times the sum of the squared coefficients
#thus, large coefficients = large penalty
#we choose alpha, much like choosing k in kNN for classification; this is called hyperparameter tuning
#alpha (sometimes called lambda) controls model complexity
#when alpha=0, we get back to standard OLS (which can lead to overfitting)
#as alpha approaches infinity, large coefficients are heavily penalized, which produces too simple a model and leads to underfitting
from sklearn.linear_model import Ridge
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
ridge = Ridge(alpha=0.1, normalize=True)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print(ridge.score(X_test, y_test))
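#A minimal sketch of how alpha controls complexity (uses X_train, y_train from
#the split above): larger alpha shrinks the coefficients toward zero.
for a in [0.01, 0.1, 1, 10, 100]:
    r = Ridge(alpha=a)
    r.fit(X_train, y_train)
    print("alpha={}: sum of squared coefficients = {:.4f}".format(a, np.sum(r.coef_ ** 2)))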
#Lasso can be used to select the most important features of a dataset
#it tends to shrink the coefficients of less important features to 0
from sklearn.linear_model import Lasso
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
lasso = Lasso(alpha=0.1, normalize=True)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
print(lasso.score(X_test, y_test))
#Lasso for feature selection, i.e. independent variable selection
from sklearn.linear_model import Lasso
columns = data.drop('MEDV', axis=1).columns
lasso = Lasso(alpha=0.1)
lasso_coeff = lasso.fit(X, y).coef_
#plot the coefficients as a function of feature name
_ = plt.plot(range(len(columns)), lasso_coeff)
_ = plt.xticks(range(len(columns)), columns, rotation=60)
_ = plt.ylabel('Coefficients')
plt.show()
#the most important predictor is the number of rooms
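#A small sketch of lasso as a feature selector (uses columns and lasso_coeff
#from above): list the features whose coefficients were not shrunk to zero.
selected = [name for name, coef in zip(columns, lasso_coeff) if coef != 0]
print("Features kept by the lasso:", selected)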
# Import Lasso
from sklearn.linear_model import Lasso
# Instantiate a lasso regressor: lasso
lasso = Lasso(alpha=0.4, normalize=True)
# Fit the regressor to the data
lasso.fit(X, y)
# Compute and print the coefficients
lasso_coef = lasso.coef_
print(lasso_coef)
# Plot the coefficients
plt.plot(range(len(df_columns)), lasso_coef)
plt.xticks(range(len(df_columns)), df_columns.values, rotation=60)
plt.margins(0.02)
plt.show()
#-------------------------------------
#Lasso is great for feature selection, but when building regression models, Ridge regression should be your first choice.
#Recall that lasso performs regularization by adding to the loss function a penalty term: the absolute value of each coefficient multiplied by some alpha.
#This is also known as L1 regularization because the regularization term is the L1 norm of the coefficients. This is not the only way to regularize, however.
#If instead you took the sum of the squared values of the coefficients multiplied by some alpha - like in ridge regression - you would be computing the L2 norm.
#In this exercise, you will practice fitting ridge regression models over a range of different alphas and plot cross-validated R^2 scores for each,
#using this function that we have defined for you, which plots the R^2 score as well as the standard error for each alpha:
def display_plot(cv_scores, cv_scores_std):
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alpha_space, cv_scores)
    std_error = cv_scores_std / np.sqrt(10)
    ax.fill_between(alpha_space, cv_scores + std_error, cv_scores - std_error, alpha=0.2)
    ax.set_ylabel('CV Score +/- Std Error')
    ax.set_xlabel('Alpha')
    ax.axhline(np.max(cv_scores), linestyle='--', color='.5')
    ax.set_xlim([alpha_space[0], alpha_space[-1]])
    ax.set_xscale('log')
    plt.show()
# Import necessary modules
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
# Setup the array of alphas and lists to store scores
alpha_space = np.logspace(-4, 0, 50)
ridge_scores = []
ridge_scores_std = []
# Create a ridge regressor: ridge
#this is an object
ridge = Ridge(normalize=True)
# Compute scores over range of alphas
for alpha in alpha_space:
    # Specify the alpha value to use: ridge.alpha
    ridge.alpha = alpha
    # Perform 10-fold CV: ridge_cv_scores
    ridge_cv_scores = cross_val_score(ridge, X, y, cv=10)
    # Append the mean of ridge_cv_scores to ridge_scores
    ridge_scores.append(np.mean(ridge_cv_scores))
    # Append the std of ridge_cv_scores to ridge_scores_std
    ridge_scores_std.append(np.std(ridge_cv_scores))
# Display the plot
display_plot(ridge_scores, ridge_scores_std)