# Code developed by Varun Bopardikar
#
# Use this notebook to produce the CSV file containing all service requests
# between 2017 and 2019, along with their ElapsedTime and ElapsedDays columns.

import pandas as pd
import numpy as np
from datetime import datetime
from datetime import date

# Helpers that add the ElapsedTime and ElapsedDays columns.

# Layout of the CreatedDate / ClosedDate strings, e.g. "01/01/2017 09:26:00 AM".
_TIMESTAMP_FORMAT = '%m/%d/%Y %I:%M:%S %p'
# Layout of the leading date portion of those strings, e.g. "01/01/2017".
_DATE_FORMAT = '%m/%d/%Y'


def convertTime(string):
    """Parse a full timestamp string into a ``datetime``.

    Args:
        string: Timestamp formatted as ``MM/DD/YYYY HH:MM:SS AM|PM``.

    Returns:
        The corresponding naive ``datetime``.

    Raises:
        ValueError: If ``string`` does not match the expected format.
    """
    return datetime.strptime(string, _TIMESTAMP_FORMAT)


def convertDays(string):
    """Parse only the date portion of a timestamp string.

    The first ten characters (``MM/DD/YYYY``) are parsed and the time of day
    is discarded, so the result is a ``datetime`` at midnight (not a ``date``).

    Args:
        string: Timestamp whose first ten characters are ``MM/DD/YYYY``.

    Returns:
        A naive ``datetime`` at 00:00:00 on that day.

    Raises:
        ValueError: If the first ten characters are not a ``MM/DD/YYYY`` date.
    """
    return datetime.strptime(string[:10], _DATE_FORMAT)


def convertFromSeconds(s):
    """Break a duration into days, hours, minutes and seconds.

    NOTE: despite the name, ``s`` is a number of MINUTES. It is multiplied by
    60 before being decomposed, and ``elapsedTime`` passes minutes to it.

    Args:
        s: Duration in minutes (int or float).

    Returns:
        A flat 8-tuple alternating values and unit labels:
        ``(days, ' days', hours, ' hrs', minutes, ' mins', seconds, 'sec')``.

    Example:
        >>> convertFromSeconds(565)
        (0, ' days', 9, ' hrs', 25, ' mins', 0, 'sec')
    """
    s = s * 60                              # minutes -> seconds
    days, s = divmod(s, 24 * 60 * 60)       # whole days, remaining seconds
    hours, s = divmod(s, 60 * 60)           # whole hours, remaining seconds
    minutes, s = divmod(s, 60)              # whole minutes, remaining seconds
    return (days, ' days', hours, ' hrs', minutes, ' mins', s, 'sec')


def elapsedTime(csv2017, csv2018, csv2019):
    """Combine three yearly service-request CSVs and add elapsed-time columns.

    Rows missing either ``CreatedDate`` or ``ClosedDate`` are dropped. Three
    columns are then added:

    * ``ElapsedTime`` - closed minus created, as the tuple produced by
      ``convertFromSeconds``.
    * ``ElapsedDays`` - whole calendar days between the created date and the
      closed date (time of day ignored).
    * ``Just Date`` - the created date at midnight.

    Data source:
    https://data.lacity.org/A-Well-Run-City/MyLA311-Service-Request-Data-2019/pvft-t768

    Args:
        csv2017: Path to the 2017 CSV file, e.g. ``r'filepath'``.
        csv2018: Path to the 2018 CSV file.
        csv2019: Path to the 2019 CSV file.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: If a remaining timestamp does not match
            ``MM/DD/YYYY HH:MM:SS AM|PM``.
    """
    df = pd.concat(map(pd.read_csv, [csv2017, csv2018, csv2019]), ignore_index=True)

    # Take an explicit copy: dropna returns a slice of df, and assigning new
    # columns to that slice is what triggers SettingWithCopyWarning.
    hdf = df.dropna(axis=0, subset=['CreatedDate', 'ClosedDate']).copy()

    # Vectorized parsing instead of a per-row strptime; this also avoids
    # passing a stray positional argument to Series.apply.
    created = pd.to_datetime(hdf['CreatedDate'], format=_TIMESTAMP_FORMAT)
    closed = pd.to_datetime(hdf['ClosedDate'], format=_TIMESTAMP_FORMAT)

    # ElapsedTime: duration in minutes, then broken down per row.
    elapsed_minutes = (closed - created) / np.timedelta64(1, 'm')
    hdf['ElapsedTime'] = elapsed_minutes.apply(convertFromSeconds)

    # ElapsedDays: difference between the two calendar dates, time of day dropped.
    created_day = created.dt.normalize()
    closed_day = closed.dt.normalize()
    hdf['ElapsedDays'] = (closed_day - created_day).dt.days

    # Date-only column; note that it holds the CREATED date of each request.
    hdf['Just Date'] = created_day

    return hdf.reset_index(drop=True)
# Build the combined 2017-2019 frame with the elapsed-time columns added.
edf = elapsedTime(
    'service2017.csv',
    'service2018.csv',
    'service2019.csv',
)

# Some service requests carry mislabelled dates, which yields impossible
# ElapsedDays values; keep only the requests below the 1100-day cut-off.
edf = edf.loc[edf['ElapsedDays'].lt(1100)]

# Persist the cleaned frame without the index column.
edf.to_csv('fservice.csv', index=False)

# Read the cleaned file back for inspection.
df = pd.read_csv('fservice.csv')
\n", + " | APC | \n", + "ActionTaken | \n", + "Address | \n", + "AddressVerified | \n", + "Anonymous | \n", + "ApproximateAddress | \n", + "AssignTo | \n", + "CD | \n", + "CDMember | \n", + "ClosedDate | \n", + "... | \n", + "StreetName | \n", + "Suffix | \n", + "TBMColumn | \n", + "TBMPage | \n", + "TBMRow | \n", + "UpdatedDate | \n", + "ZipCode | \n", + "ElapsedTime | \n", + "ElapsedDays | \n", + "Just Date | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "West Los Angeles APC | \n", + "SR Created | \n", + "222 N ASHDALE PL, 90049 | \n", + "Y | \n", + "Y | \n", + "N | \n", + "NaN | \n", + "5.0 | \n", + "Paul Koretz | \n", + "01/01/2017 09:26:00 AM | \n", + "... | \n", + "ASHDALE | \n", + "PL | \n", + "H | \n", + "631.0 | \n", + "1.0 | \n", + "01/01/2017 09:26:00 AM | \n", + "90049 | \n", + "(0.0, ' days', 9.0, ' hrs', 25.0, ' mins', 0.0... | \n", + "0 | \n", + "2017-01-01 | \n", + "
1 | \n", + "East Los Angeles APC | \n", + "SR Created | \n", + "1624 N ALLESANDRO ST, 90026 | \n", + "Y | \n", + "Y | \n", + "N | \n", + "CCAC | \n", + "13.0 | \n", + "Mitch O'Farrell | \n", + "01/12/2017 05:10:00 PM | \n", + "... | \n", + "ALLESANDRO | \n", + "ST | \n", + "E | \n", + "594.0 | \n", + "6.0 | \n", + "01/12/2017 05:10:00 PM | \n", + "90026 | \n", + "(11.0, ' days', 16.0, ' hrs', 25.0, ' mins', 0... | \n", + "11 | \n", + "2017-01-01 | \n", + "
2 | \n", + "West Los Angeles APC | \n", + "SR Created | \n", + "250 N ASHDALE AVE, 90049 | \n", + "Y | \n", + "Y | \n", + "N | \n", + "NaN | \n", + "5.0 | \n", + "Paul Koretz | \n", + "01/01/2017 09:27:00 AM | \n", + "... | \n", + "ASHDALE | \n", + "AVE | \n", + "H | \n", + "631.0 | \n", + "1.0 | \n", + "01/01/2017 09:27:00 AM | \n", + "90049 | \n", + "(0.0, ' days', 8.0, ' hrs', 41.0, ' mins', 0.0... | \n", + "0 | \n", + "2017-01-01 | \n", + "
3 | \n", + "North Valley APC | \n", + "SR Created | \n", + "21824 W DEVONSHIRE ST, 91311 | \n", + "Y | \n", + "N | \n", + "N | \n", + "WVA | \n", + "12.0 | \n", + "Mitchell Englander | \n", + "01/04/2017 05:33:00 PM | \n", + "... | \n", + "DEVONSHIRE | \n", + "ST | \n", + "A | \n", + "500.0 | \n", + "4.0 | \n", + "01/04/2017 05:33:00 PM | \n", + "91311 | \n", + "(3.0, ' days', 16.0, ' hrs', 32.0, ' mins', 0.... | \n", + "3 | \n", + "2017-01-01 | \n", + "
4 | \n", + "North Valley APC | \n", + "SR Created | \n", + "21230 W DEVONSHIRE ST, 91311 | \n", + "Y | \n", + "N | \n", + "N | \n", + "WVA | \n", + "12.0 | \n", + "Mitchell Englander | \n", + "01/04/2017 05:34:00 PM | \n", + "... | \n", + "DEVONSHIRE | \n", + "ST | \n", + "B | \n", + "500.0 | \n", + "4.0 | \n", + "01/04/2017 05:34:00 PM | \n", + "91311 | \n", + "(3.0, ' days', 16.0, ' hrs', 25.0, ' mins', 0.... | \n", + "3 | \n", + "2017-01-01 | \n", + "
5 rows × 37 columns
\n", + "