rnd4u-org · DROBUCshK · Jan 29, 2021 · Jan 29, 2021 · Feb 2, 2021 · Feb 2, 2021
@@ -0,0 +1,232 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "Titanic_simple.ipynb",
+      "provenance": [],
+      "collapsed_sections": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "XtClpWT-RGU4"
+      },
+      "source": [
+        "**Importing all libraries**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "QLMcjiNeO-uz"
+      },
+      "source": [
+        "import pandas as pd\n",
+        "from sklearn.ensemble import RandomForestClassifier\n",
+        "from sklearn.linear_model import LogisticRegression\n",
+        "from lightgbm import LGBMClassifier\n",
+        "from sklearn.preprocessing import StandardScaler\n",
+        "from sklearn.ensemble import VotingClassifier"
+      ],
+      "execution_count": 23,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "jL3ggg9IRFnT"
+      },
+      "source": [
+        "**Loading the data**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "FhTUIFLbQ-SS"
+      },
+      "source": [
+        "# Read the train and test CSVs with identical options; PassengerId becomes the row index.\n",
+        "train_data, test_data = (\n",
+        "    pd.read_csv(csv_path, index_col=\"PassengerId\")\n",
+        "    for csv_path in (\"/content/sample_data/train.csv\", \"/content/sample_data/test.csv\")\n",
+        ")"
+      ],
+      "execution_count": 24,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "fW5Kzh3JRVbb"
+      },
+      "source": [
+        "**Feature selection**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "tt5LFPViRaiR"
+      },
+      "source": [
+        "features = [\"Pclass\", \"Sex\", \"SibSp\", \"Parch\"]\n",
+        "X = pd.get_dummies(train_data[features])\n",
+        "# get_dummies derives its output columns from the values present in the frame it is\n",
+        "# given, so encoding train and test separately can produce different column sets or\n",
+        "# orders. Reindex the test matrix onto the training columns: categories seen only in\n",
+        "# train become all-zero columns, categories seen only in test are dropped.\n",
+        "X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)\n",
+        "y = train_data[\"Survived\"]"
+      ],
+      "execution_count": 25,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "6Bb17U40SG2y"
+      },
+      "source": [
+        "**Data Normalization**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "K1ZvNjfRR_IB",
+        "outputId": "e7397408-010b-4822-8df8-e96f7a142a9b"
+      },
+      "source": [
+        "# Fit the scaler on the training matrix only, then apply that same fitted transform\n",
+        "# to both the training and the test matrix.\n",
+        "ss = StandardScaler().fit(X)\n",
+        "X_scaled = ss.transform(X)\n",
+        "X_test_scaled = ss.transform(X_test)\n",
+        "print(X_scaled, X_test_scaled, sep=\"\\n\")"
+      ],
+      "execution_count": 26,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "[[ 0.82737724  0.43279337 -0.47367361 -0.73769513  0.73769513]\n",
+            " [-1.56610693  0.43279337 -0.47367361  1.35557354 -1.35557354]\n",
+            " [ 0.82737724 -0.4745452  -0.47367361  1.35557354 -1.35557354]\n",
+            " ...\n",
+            " [ 0.82737724  0.43279337  2.00893337  1.35557354 -1.35557354]\n",
+            " [-1.56610693 -0.4745452  -0.47367361 -0.73769513  0.73769513]\n",
+            " [ 0.82737724 -0.4745452  -0.47367361 -0.73769513  0.73769513]]\n",
+            "[[ 0.82737724 -0.4745452  -0.47367361 -0.73769513  0.73769513]\n",
+            " [ 0.82737724  0.43279337 -0.47367361  1.35557354 -1.35557354]\n",
+            " [-0.36936484 -0.4745452  -0.47367361 -0.73769513  0.73769513]\n",
+            " ...\n",
+            " [ 0.82737724 -0.4745452  -0.47367361 -0.73769513  0.73769513]\n",
+            " [ 0.82737724 -0.4745452  -0.47367361 -0.73769513  0.73769513]\n",
+            " [ 0.82737724  0.43279337  0.76762988 -0.73769513  0.73769513]]\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "j4Sdo0r_SipM"
+      },
+      "source": [
+        "**Modeling**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "LG70wCttSfdL",
+        "outputId": "e65c3bd8-7452-451e-ae77-5c8b9bf5095c"
+      },
+      "source": [
+        "# Train each base model on the scaled training matrix. fit() returns the estimator\n",
+        "# itself, so construction and training are chained.\n",
+        "rfc = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1).fit(X_scaled, y)\n",
+        "lg = LogisticRegression(C=20, solver='lbfgs', max_iter=1000, random_state=10).fit(X_scaled, y)\n",
+        "lgb = LGBMClassifier().fit(X_scaled, y)\n",
+        "# Keep the fitted LightGBM model as the cell's displayed value.\n",
+        "lgb"
+      ],
+      "execution_count": 27,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,\n",
+              "               importance_type='split', learning_rate=0.1, max_depth=-1,\n",
+              "               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,\n",
+              "               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,\n",
+              "               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,\n",
+              "               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "execution_count": 27
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ZcKY8aNXTpjv"
+      },
+      "source": [
+        "**Ensembling**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "N_w1IJ-STus2"
+      },
+      "source": [
+        "# Hard voting: the ensemble predicts the class label chosen by most base models.\n",
+        "ensemble_model = VotingClassifier(\n",
+        "    voting=\"hard\",\n",
+        "    estimators=[(\"logit\", lg), (\"rf\", rfc), (\"lgb\", lgb)],\n",
+        ")\n",
+        "\n",
+        "# fit() returns the ensemble itself, so training and prediction are chained.\n",
+        "preds = ensemble_model.fit(X_scaled, y).predict(X_test_scaled)"
+      ],
+      "execution_count": 28,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Ti0tSHUDUAUa"
+      },
+      "source": [
+        "**Output**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "bjeO0_OGUCPb"
+      },
+      "source": [
+        "# Pair each test PassengerId (the frame index) with its predicted label.\n",
+        "output = pd.DataFrame(\n",
+        "    data={'PassengerId': test_data.index, 'Survived': preds},\n",
+        ")\n",
+        "\n",
+        "# index=False leaves the DataFrame's own row index out of the CSV.\n",
+        "output.to_csv(path_or_buf='submission.csv', index=False)"
+      ],
+      "execution_count": 29,
+      "outputs": []
+    }
+  ]
+}
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+"""Titanic_simple.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/1xML6OS-AeM6VQ1qkooNVCbykEKUXWAqV
+
+**Importing all libraries**
+"""
+
+import pandas as pd
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from lightgbm import LGBMClassifier
+from sklearn.preprocessing import StandardScaler
+from sklearn.ensemble import VotingClassifier
+
+"""**Loading the data**"""
+
+train_data = pd.read_csv("/content/sample_data/train.csv", index_col="PassengerId")
+test_data = pd.read_csv("/content/sample_data/test.csv", index_col="PassengerId")
+
+"""**Feature selection**"""
+
+features = ["Pclass", "Sex", "SibSp", "Parch"]
+X = pd.get_dummies(train_data[features])
+X_test = pd.get_dummies(test_data[features])
+y = train_data["Survived"]
+
+"""**Data Normalization**"""
+
+ss = StandardScaler()
+X_scaled = ss.fit_transform(X)
+X_test_scaled = ss.transform(X_test)
+print(X_scaled)
+print(X_test_scaled)
+
+"""**Modeling**"""
+
+rfc = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
+rfc.fit(X_scaled, y)
+lg = LogisticRegression(random_state=10, max_iter=1000, C=20, solver='lbfgs')
+lg.fit(X_scaled, y)
+lgb = LGBMClassifier()
+lgb.fit(X_scaled, y)
+
+"""**Ensembling**"""
+
+ensemble_model = VotingClassifier(estimators=[
+    ("logit", lg),
+    ("rf", rfc),
+    ("lgb", lgb),
+], voting="hard")
+
+ensemble_model.fit(X_scaled, y)
+preds = ensemble_model.predict(X_test_scaled)
+
+"""**Output**"""
+
+output = pd.DataFrame({'PassengerId': test_data.index,
+                       'Survived': preds})
+
+output.to_csv('submission.csv', index=False)