diff --git a/dataAnalysis/DataCleaningForETC.ipynb b/dataAnalysis/DataCleaningForETC.ipynb new file mode 100644 index 000000000..da83d5446 --- /dev/null +++ b/dataAnalysis/DataCleaningForETC.ipynb @@ -0,0 +1,432 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Code developed by Varun Bopardikar" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# This is the notebook you can use to get the csv file containing all service requests between 2017 and 2019, along with their ElapsedTime and ElapsedDays." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np \n", + "from datetime import datetime\n", + "from datetime import date" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "#Adds ElaspedTime and ElapsedDays column. \n", + "\n", + "\n", + "def convertTime(string):\n", + " \"\"\"Converts a string to a datetime object.\n", + " \"\"\"\n", + " time = datetime.strptime(string,'%m/%d/%Y %I:%M:%S %p') \n", + " return time\n", + "\n", + "def convertDays(string):\n", + " \"\"\"\n", + " Converts string to date object. \n", + " \"\"\"\n", + " time = datetime.strptime(string[:10],'%m/%d/%Y') #Accepts time string, converts it to datetime object.\n", + " return time\n", + "\n", + "def convertFromSeconds(s): # total seconds\n", + " \"\"\" convertFromSeconds(s): Converts an integer # of seconds into a list of [days, hours, minutes, seconds]\n", + " input s: an int\n", + " \"\"\"\n", + " s = s*60\n", + " days = s // (24*60*60) # total days\n", + " s = s % (24*60*60) # remainder s\n", + " hours = s // (60*60) # total hours\n", + " s = s % (60*60) # remainder s\n", + " minutes = s // 60 # total minutes\n", + " s = s % 60 # remainder s\n", + " statement = (days, ' days') + (hours, ' hrs') +(minutes, ' mins') + (s, 'sec')\n", + " return statement\n", + " \n", + "def elapsedTime(csv2017, csv2018, csv2019):\n", + " \"\"\"\n", + " Accepts CSV files containing 2017-2019 service requests and creates new dataframe of all timestamps for both service request submission and fulfillment times.\n", + " Input CSV file in the format: r'filepath'\n", + " Data source: https://data.lacity.org/A-Well-Run-City/MyLA311-Service-Request-Data-2019/pvft-t768\n", + " \"\"\"\n", + " df = pd.concat(map(pd.read_csv, [csv2017, csv2018, csv2019]), ignore_index = True)\n", + " \n", + " hdf = df.dropna(axis=0, subset=['CreatedDate', 'ClosedDate'])\n", + "\n", + " #ElapsedTime \n", + " df1 = hdf['ClosedDate'].apply(convertTime, 0)\n", + " df2 = hdf['CreatedDate'].apply(convertTime, 0)\n", + " \n", + " hdf['ElapsedTime'] = df1 - df2 \n", + " hdf['ElapsedTime'] = hdf['ElapsedTime']/np.timedelta64(1,'m') \n", + " hdf['ElapsedTime'] = hdf['ElapsedTime'].apply(convertFromSeconds, 0) \n", + " \n", + " #ElapsedDays\n", + " df3 = hdf['CreatedDate'].apply(convertDays, 0)\n", + " df4 = hdf['ClosedDate'].apply(convertDays, 0) \n", + " hdf['ElapsedDays'] = (df4 - df3).dt.days\n", + " \n", + " #Column for Closed Dates\n", + " hdf['Just Date'] = df3\n", + " \n", + " return hdf.reset_index(drop = True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/varunbopardikar/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: DtypeWarning: Columns (20) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", + " \"\"\"Entry point for launching an IPython kernel.\n", + "/Users/varunbopardikar/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:34: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + "/Users/varunbopardikar/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:42: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "/Users/varunbopardikar/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:43: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "/Users/varunbopardikar/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:44: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "/Users/varunbopardikar/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:49: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "/Users/varunbopardikar/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:52: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" + ] + } + ], + "source": [ + "edf = elapsedTime('service2017.csv', 'service2018.csv', 'service2019.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "edf = edf[edf['ElapsedDays'] < 1100] #Some of the service requests had dates which were mislabelled, resulting in impossible ElapsedDay values. This command gets rid of those requests. " + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "edf.to_csv('fservice.csv', index = False)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/varunbopardikar/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3057: DtypeWarning: Columns (10,33) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", + " interactivity=interactivity, compiler=compiler, result=result)\n" + ] + } + ], + "source": [ + "df = pd.read_csv('fservice.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
APCActionTakenAddressAddressVerifiedAnonymousApproximateAddressAssignToCDCDMemberClosedDate...StreetNameSuffixTBMColumnTBMPageTBMRowUpdatedDateZipCodeElapsedTimeElapsedDaysJust Date
0West Los Angeles APCSR Created222 N ASHDALE PL, 90049YYNNaN5.0Paul Koretz01/01/2017 09:26:00 AM...ASHDALEPLH631.01.001/01/2017 09:26:00 AM90049(0.0, ' days', 9.0, ' hrs', 25.0, ' mins', 0.0...02017-01-01
1East Los Angeles APCSR Created1624 N ALLESANDRO ST, 90026YYNCCAC13.0Mitch O'Farrell01/12/2017 05:10:00 PM...ALLESANDROSTE594.06.001/12/2017 05:10:00 PM90026(11.0, ' days', 16.0, ' hrs', 25.0, ' mins', 0...112017-01-01
2West Los Angeles APCSR Created250 N ASHDALE AVE, 90049YYNNaN5.0Paul Koretz01/01/2017 09:27:00 AM...ASHDALEAVEH631.01.001/01/2017 09:27:00 AM90049(0.0, ' days', 8.0, ' hrs', 41.0, ' mins', 0.0...02017-01-01
3North Valley APCSR Created21824 W DEVONSHIRE ST, 91311YNNWVA12.0Mitchell Englander01/04/2017 05:33:00 PM...DEVONSHIRESTA500.04.001/04/2017 05:33:00 PM91311(3.0, ' days', 16.0, ' hrs', 32.0, ' mins', 0....32017-01-01
4North Valley APCSR Created21230 W DEVONSHIRE ST, 91311YNNWVA12.0Mitchell Englander01/04/2017 05:34:00 PM...DEVONSHIRESTB500.04.001/04/2017 05:34:00 PM91311(3.0, ' days', 16.0, ' hrs', 25.0, ' mins', 0....32017-01-01
\n", + "

5 rows × 37 columns

\n", + "
" + ], + "text/plain": [ + " APC ActionTaken Address \\\n", + "0 West Los Angeles APC SR Created 222 N ASHDALE PL, 90049 \n", + "1 East Los Angeles APC SR Created 1624 N ALLESANDRO ST, 90026 \n", + "2 West Los Angeles APC SR Created 250 N ASHDALE AVE, 90049 \n", + "3 North Valley APC SR Created 21824 W DEVONSHIRE ST, 91311 \n", + "4 North Valley APC SR Created 21230 W DEVONSHIRE ST, 91311 \n", + "\n", + " AddressVerified Anonymous ApproximateAddress AssignTo CD \\\n", + "0 Y Y N NaN 5.0 \n", + "1 Y Y N CCAC 13.0 \n", + "2 Y Y N NaN 5.0 \n", + "3 Y N N WVA 12.0 \n", + "4 Y N N WVA 12.0 \n", + "\n", + " CDMember ClosedDate ... StreetName Suffix \\\n", + "0 Paul Koretz 01/01/2017 09:26:00 AM ... ASHDALE PL \n", + "1 Mitch O'Farrell 01/12/2017 05:10:00 PM ... ALLESANDRO ST \n", + "2 Paul Koretz 01/01/2017 09:27:00 AM ... ASHDALE AVE \n", + "3 Mitchell Englander 01/04/2017 05:33:00 PM ... DEVONSHIRE ST \n", + "4 Mitchell Englander 01/04/2017 05:34:00 PM ... DEVONSHIRE ST \n", + "\n", + " TBMColumn TBMPage TBMRow UpdatedDate ZipCode \\\n", + "0 H 631.0 1.0 01/01/2017 09:26:00 AM 90049 \n", + "1 E 594.0 6.0 01/12/2017 05:10:00 PM 90026 \n", + "2 H 631.0 1.0 01/01/2017 09:27:00 AM 90049 \n", + "3 A 500.0 4.0 01/04/2017 05:33:00 PM 91311 \n", + "4 B 500.0 4.0 01/04/2017 05:34:00 PM 91311 \n", + "\n", + " ElapsedTime ElapsedDays Just Date \n", + "0 (0.0, ' days', 9.0, ' hrs', 25.0, ' mins', 0.0... 0 2017-01-01 \n", + "1 (11.0, ' days', 16.0, ' hrs', 25.0, ' mins', 0... 11 2017-01-01 \n", + "2 (0.0, ' days', 8.0, ' hrs', 41.0, ' mins', 0.0... 0 2017-01-01 \n", + "3 (3.0, ' days', 16.0, ' hrs', 32.0, ' mins', 0.... 3 2017-01-01 \n", + "4 (3.0, ' days', 16.0, ' hrs', 25.0, ' mins', 0.... 3 2017-01-01 \n", + "\n", + "[5 rows x 37 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/dataAnalysis/ETCClassifier.ipynb b/dataAnalysis/ETCClassifier.ipynb new file mode 100644 index 000000000..e7f4a19d4 --- /dev/null +++ b/dataAnalysis/ETCClassifier.ipynb @@ -0,0 +1,243 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Project led by Nikolas Papastavrou\n", + "## Code developed by Varun Bopardikar\n", + "## Data Analysis conducted by Selina Ho, Hana Ahmed" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np \n", + "from sklearn.preprocessing import LabelEncoder\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn import metrics\n", + "from datetime import datetime\n", + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn import tree\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.linear_model import LogisticRegression" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load Data" + ] + }, + { + 
"cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/varunbopardikar/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3057: DtypeWarning: Columns (10,33) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " interactivity=interactivity, compiler=compiler, result=result)\n" + ] + } + ], + "source": [ + "def gsev(val): \n", + " \"\"\"\n", + " Records whether or not a number is greater than 7. \n", + " \"\"\"\n", + " if val <= 7: \n", + " return 0\n", + " else: \n", + " return 1\n", + "\n", + "df = pd.read_csv('../../fservice.csv')\n", + "df['Just Date'] = df['Just Date'].apply(lambda x: datetime.strptime(x,'%Y-%m-%d'))\n", + "df['Seven'] = df['ElapsedDays'].apply(gsev, 0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parameters " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "c = ['Anonymous','AssignTo', 'RequestType', 'RequestSource','CD','Direction', 'ActionTaken', 'APC' ,'AddressVerified']\n", + "d = ['Latitude', 'Longitude']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feature Cleaning " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/varunbopardikar/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/_encoders.py:415: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n", + "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n", + "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n", + " warnings.warn(msg, FutureWarning)\n" + ] + } + ], + "source": [ + "#Put desired columns into dataframe, drop nulls. 
\n", + "dfn = df.filter(items = c + d + ['ElapsedDays'] + ['Seven'])\n", + "dfn = dfn.dropna()\n", + " \n", + "#Separate data into explanatory and response variables\n", + "XCAT = dfn.filter(items = c).values\n", + "XNUM = dfn.filter(items = d).values\n", + " \n", + "y = dfn['ElapsedDays'] <= 7\n", + " \n", + "#Encode cateogrical data and merge with numerical data\n", + "labelencoder_X = LabelEncoder()\n", + "for num in range(len(c)): \n", + " XCAT[:, num] = labelencoder_X.fit_transform(XCAT[:, num])\n", + " \n", + "onehotencoder = OneHotEncoder()\n", + "XCAT = onehotencoder.fit_transform(XCAT).toarray()\n", + " \n", + "X = np.concatenate((XCAT, XNUM), axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Algorithms and Hyperparameters" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "##Used Random Forest in Final Model \n", + "\n", + "gnb = GaussianNB()\n", + "dc = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = 20)\n", + "rf = RandomForestClassifier(n_estimators = 50, max_depth = 20)\n", + "lr = LogisticRegression()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Validation Set" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.9385983549336814\n", + "Precision, Recall, F1Score: (0.946896616482519, 0.9893259382317161, 0.9676463908853341, None)\n" + ] + } + ], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)\n", + "X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 0)\n", + "\n", + "#Train Model\n", + "classifier = rf\n", + "\n", + "classifier.fit(X_train, y_train)\n", + "\n", + "#Test model\n", + "y_vpred = classifier.predict(X_val)\n", + "\n", + "#Print Accuracy Function results\n", + "print(\"Accuracy:\",metrics.accuracy_score(y_val, y_vpred))\n", + "print(\"Precision, Recall, F1Score:\",metrics.precision_recall_fscore_support(y_val, y_vpred, average = 'binary'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test Set" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.9387186223709323\n", + "Precision, Recall, F1Score: (0.9468199376863904, 0.9895874917412928, 0.9677314319565967, None)\n" + ] + } + ], + "source": [ + "#Train Model\n", + "\n", + "#Test model\n", + "y_tpred = classifier.predict(X_test)\n", + "\n", + "#Print Accuracy Function results\n", + "\n", + "print(\"Accuracy:\",metrics.accuracy_score(y_test, y_tpred))\n", + "print(\"Precision, Recall, F1Score:\",metrics.precision_recall_fscore_support(y_test, y_tpred, average = 'binary'))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}