From 85f569aa803b60350578efbf87ab91f4bdef4581 Mon Sep 17 00:00:00 2001 From: Mark Roth Date: Thu, 28 Mar 2024 09:27:02 -0400 Subject: [PATCH] Add citibike example --- example/Citibike.ipynb | 925 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 925 insertions(+) create mode 100644 example/Citibike.ipynb diff --git a/example/Citibike.ipynb b/example/Citibike.ipynb new file mode 100644 index 0000000..243c443 --- /dev/null +++ b/example/Citibike.ipynb @@ -0,0 +1,925 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ef4ba8c0-2b1e-4dca-96a9-22ff2173b09f", + "metadata": {}, + "source": [ + "# Demo: Adjusted Gross Income vs. Citibike Usage Analysis using Memento\n", + "\n", + "Note: This is not a scientific analysis and is simply a demonstration of a typical research scenario and how Memento might be used to organize a notebook conducting similar research.\n", + "\n", + "
Disclaimer:\n", + "

This document is being distributed for informational and educational purposes only and is not an offer to sell or the solicitation of an offer to buy any securities or other instruments. The information contained herein is not intended to provide, and should not be relied upon for, investment advice. The views expressed herein are not necessarily the views of Two Sigma Investments, LP or any of its affiliates (collectively, “Two Sigma”). Such views reflect the assumptions of the author(s) of the document and are subject to change without notice. The document may employ data derived from third-party sources. No representation is made by Two Sigma as to the accuracy of such information and the use of such information in no way implies an endorsement of the source of such information or its validity.

\n", + "\n", + "

The copyrights and/or trademarks in some of the images, logos or other material used herein may be owned by entities other than Two Sigma. If so, such copyrights and/or trademarks are most likely owned by the entity that created the material and are used purely for identification and comment as fair use under international copyright and/or trademark laws. Use of such image, copyright or trademark does not imply any association with such organization (or endorsement of such organization) by Two Sigma, nor vice versa.

\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "6dfa4249-f92e-4bb5-9494-ff7818ea7ef6", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## Init and Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 192, + "id": "d72cca6a-6276-4672-9508-c7537d709fa0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: boto3 in /home/mroth/data/projects/twosigma/memento/venv/memento/lib/python3.10/site-packages (1.34.71)\n", + "Requirement already satisfied: matplotlib in /home/mroth/data/projects/twosigma/memento/venv/memento/lib/python3.10/site-packages (3.8.3)\n", + "Requirement already satisfied: geopy in /home/mroth/data/projects/twosigma/memento/venv/memento/lib/python3.10/site-packages (2.4.1)\n", + "Collecting scipy\n", + " Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.4/60.4 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: botocore<1.35.0,>=1.34.71 in /home/mroth/data/projects/twosigma/memento/venv/memento/lib/python3.10/site-packages (from boto3) (1.34.71)\n", + "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /home/mroth/data/projects/twosigma/memento/venv/memento/lib/python3.10/site-packages (from boto3) (1.0.1)\n", + "Requirement already satisfied: s3transfer<0.11.0,>=0.10.0 in /home/mroth/data/projects/twosigma/memento/venv/memento/lib/python3.10/site-packages (from boto3) (0.10.1)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /home/mroth/data/projects/twosigma/memento/venv/memento/lib/python3.10/site-packages (from matplotlib) (1.2.0)\n", + "Requirement already satisfied: cycler>=0.10 in /home/mroth/data/projects/twosigma/memento/venv/memento/lib/python3.10/site-packages (from matplotlib) (0.12.1)\n", + 
"Requirement already satisfied: fonttools>=4.22.0 in /home/mroth/data/projects/twosigma/memento/venv/memento/lib/python3.10/site-packages (from matplotlib) (4.50.0)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /home/mroth/data/projects/twosigma/memento/venv/memento/lib/python3.10/site-packages (from matplotlib) (1.4.5)\n", + "Requirement already satisfied: numpy<2,>=1.21 in /home/mroth/data/projects/twosigma/memento/venv/memento/lib/python3.10/site-packages (from matplotlib) (1.26.4)\n", + "Requirement already satisfied: packaging>=20.0 in /home/mroth/data/projects/twosigma/memento/venv/memento/lib/python3.10/site-packages (from matplotlib) (24.0)\n", + "Requirement already satisfied: pillow>=8 in /home/mroth/data/projects/twosigma/memento/venv/memento/lib/python3.10/site-packages (from matplotlib) (10.2.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /home/mroth/data/projects/twosigma/memento/venv/memento/lib/python3.10/site-packages (from matplotlib) (3.1.2)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /home/mroth/data/projects/twosigma/memento/venv/memento/lib/python3.10/site-packages (from matplotlib) (2.9.0.post0)\n", + "Requirement already satisfied: geographiclib<3,>=1.52 in /home/mroth/data/projects/twosigma/memento/venv/memento/lib/python3.10/site-packages (from geopy) (2.0)\n", + "Requirement already satisfied: urllib3!=2.2.0,<3,>=1.25.4 in /home/mroth/data/projects/twosigma/memento/venv/memento/lib/python3.10/site-packages (from botocore<1.35.0,>=1.34.71->boto3) (2.2.1)\n", + "Requirement already satisfied: six>=1.5 in /home/mroth/data/projects/twosigma/memento/venv/memento/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n", + "Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.4/38.4 MB\u001b[0m \u001b[31m33.1 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hInstalling collected packages: scipy\n", + "Successfully installed scipy-1.12.0\n" + ] + } + ], + "source": [ + "# sudo apt-get install graphviz\n", + "!pip install boto3 matplotlib geopy scipy" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "id": "3abfdc86-14e3-4b40-a460-b9607b7e030a", + "metadata": {}, + "outputs": [], + "source": [ + "from twosigma.memento import memento_function\n", + "import boto3\n", + "from io import BytesIO, StringIO\n", + "import zipfile\n", + "import pandas as pd\n", + "import matplotlib\n", + "import requests\n", + "from geopy.geocoders import Nominatim\n", + "import matplotlib.pyplot as plt\n", + "from scipy import stats" + ] + }, + { + "cell_type": "markdown", + "id": "a9af39a1-229a-4538-89a5-640d6d41317b", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## Citibike Trip Data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3ba02267-2f7d-4fed-9868-d3b3a073f0b9", + "metadata": {}, + "outputs": [], + "source": [ + "@memento_function\n", + "def ingest_citibike(year: int):\n", + " bucket = \"tripdata\"\n", + " s3_client = boto3.client(\"s3\")\n", + " zip_filename = f\"{year}-citibike-tripdata.zip\"\n", + " response = s3_client.get_object(Bucket=bucket, Key=zip_filename)\n", + " object_data = response['Body'].read()\n", + " return object_data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "dabae2f4-2662-4879-be95-67a40f850d3a", + "metadata": {}, + "outputs": [], + "source": [ + "def list_citibike_csvs(zip: zipfile.ZipFile) -> list[str]:\n", + " return sorted([file for file in zip.namelist() if file.endswith(\"_1.csv\")], key=lambda f: f[f.rindex(\"/\")+1:])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "100b9534-359b-41c2-a5f7-4fc18f9d1c21", + "metadata": {}, + "outputs": [], + "source": [ + "@memento_function\n", + "def citibike_tripdata_csv(year: int, month: 
int) -> str:\n", + " yyyymm = f\"{year}{month:02}\"\n", + " data = ingest_citibike(year)\n", + " zip = zipfile.ZipFile(BytesIO(data))\n", + " name_list = list_citibike_csvs(zip)\n", + " names_for_month = [name for name in name_list if yyyymm in name]\n", + " if len(names_for_month) >= 1:\n", + " name = names_for_month[0]\n", + " with zip.open(name) as f:\n", + " return f.read().decode(\"utf-8\")\n", + " else:\n", + " return None\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d4d88f7c-45f1-4837-b83a-627842ef9d12", + "metadata": {}, + "outputs": [], + "source": [ + "@memento_function\n", + "def normalize_citibike_tripdata(year: int, month: int = None) -> pd.DataFrame:\n", + " if month is None:\n", + " all_dfs = [normalize_citibike_tripdata(year, m) for m in range(1, 13)]\n", + " all_dfs = [df for df in all_dfs if df is not None]\n", + " return pd.concat(all_dfs)\n", + " else:\n", + " return pd.read_csv(StringIO(citibike_tripdata_csv(year, month)))" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "b7fba306-4b7a-4dba-ad17-8e0d7101dc96", + "metadata": {}, + "outputs": [], + "source": [ + "@memento_function\n", + "def zip_by_lat_long(lat: float, long: float) -> int:\n", + " geolocator = Nominatim(user_agent=\"twosigma_memento_example\")\n", + " location = geolocator.reverse((lat, long))\n", + " if location.raw.get('address', {}).get('postcode'):\n", + " return location.raw['address']['postcode']\n", + " else:\n", + " return None" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "6ddbac5a-2375-428f-8b6d-78db6180f5c5", + "metadata": {}, + "outputs": [], + "source": [ + "@memento_function\n", + "def start_usage(year: int) -> pd.DataFrame:\n", + " df = normalize_citibike_tripdata(2020)\n", + " df = df \\\n", + " .groupby(\"start station id\") \\\n", + " .agg(\n", + " count=(\"start station id\", \"size\"),\n", + " lat=(\"start station latitude\", \"first\"),\n", + " long=(\"start station longitude\", 
\"first\")\n", + " ) \\\n", + " .reset_index() \\\n", + " .drop(\"start station id\", axis=1)\n", + " \n", + " # df is now (count, lat, long). Look up zip code\n", + " df[\"zipcode\"] = df.apply(lambda row: zip_by_lat_long(row[\"lat\"], row[\"long\"]), axis=1)\n", + " return df\n", + "\n", + "\n", + "@memento_function\n", + "def end_usage(year: int) -> pd.DataFrame:\n", + " df = normalize_citibike_tripdata(2020)\n", + " df = df \\\n", + " .groupby(\"end station id\") \\\n", + " .agg(\n", + " count=(\"end station id\", \"size\"),\n", + " lat=(\"end station latitude\", \"first\"),\n", + " long=(\"end station longitude\", \"first\")\n", + " ) \\\n", + " .reset_index() \\\n", + " .drop(\"end station id\", axis=1)\n", + " \n", + " # df is now (count, lat, long). Look up zip code\n", + " df[\"zipcode\"] = df.apply(lambda row: zip_by_lat_long(row[\"lat\"], row[\"long\"]), axis=1)\n", + " return df\n" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "id": "624eb259-621f-4819-b925-13d009f913ed", + "metadata": {}, + "outputs": [], + "source": [ + "@memento_function\n", + "def citibike_activity_by_zip(year: int) -> pd.DataFrame:\n", + " \"\"\"Rides that started or ended in each zip code, over the course of the year\"\"\"\n", + " df_start = start_usage(year)\n", + " df_end = end_usage(year)\n", + " d1 = df_start.drop([\"lat\", \"long\"], axis=1).groupby(\"zipcode\").sum().reset_index()\n", + " d2 = df_end.drop([\"lat\", \"long\"], axis=1).groupby(\"zipcode\").sum().reset_index()\n", + " d3 = d1.set_index(\"zipcode\").add(d2.set_index(\"zipcode\"), fill_value=0).reset_index()\n", + " d3[\"zipcode\"] = d3[\"zipcode\"].astype(int)\n", + " return d3" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "id": "65a60690-226d-4dcf-9a03-94fc991e55e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "%3\n", + "\n", + "\n", + "\n", + "e1cd35ee\n", + "\n", + "citibike_activity_by_zip\n", + "\n", 
+ "\n", + "\n", + "7f68612d\n", + "\n", + "end_usage\n", + "\n", + "\n", + "\n", + "e1cd35ee->7f68612d\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "ccc846ef\n", + "\n", + "start_usage\n", + "\n", + "\n", + "\n", + "e1cd35ee->ccc846ef\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "2c30b248\n", + "\n", + "normalize_citibike_tripdata\n", + "\n", + "\n", + "\n", + "7f68612d->2c30b248\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "93592f4d\n", + "\n", + "zip_by_lat_long\n", + "\n", + "\n", + "\n", + "7f68612d->93592f4d\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "ccc846ef->2c30b248\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "ccc846ef->93592f4d\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "1ed63db4\n", + "\n", + "citibike_tripdata_csv\n", + "\n", + "\n", + "\n", + "2c30b248->1ed63db4\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "2d9f38ea\n", + "\n", + "ingest_citibike\n", + "\n", + "\n", + "\n", + "1ed63db4->2d9f38ea\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "citibike_activity_by_zip.dependencies()" + ] + }, + { + "cell_type": "markdown", + "id": "2867e3ad-794b-4eac-9054-6647eeadec6f", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## IRS Data by Zip Code" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "id": "a413e29e-15a6-4f04-b67c-727f81611b72", + "metadata": {}, + "outputs": [], + "source": [ + "@memento_function\n", + "def ingest_irs_gov_zip(year: int) -> bytes:\n", + " url = f\"https://www.irs.gov/pub/irs-soi/{year-2000}zpallnoagi.csv\"\n", + " response = requests.get(url)\n", + " return response.content" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "id": "c5d813cd-9782-48ea-88b6-a21ad954a4a9", + "metadata": {}, + "outputs": [], + "source": [ + "@memento_function\n", + "def normalize_irs_gov_zip(year: int) -> pd.DataFrame:\n", + " data = 
ingest_irs_gov_zip(year)\n", + " csv = data.decode(\"utf-8\")\n", + " df = pd.read_csv(StringIO(csv))\n", + " df.columns = [col.lower() for col in df.columns]\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 178, + "id": "313bf04e-6656-4c83-9661-18524c9be083", + "metadata": {}, + "outputs": [], + "source": [ + "@memento_function\n", + "def irs_agi_per_person_by_zip(year: int) -> pd.DataFrame:\n", + " \"\"\"Compute mean AGI per person in each ZIP\"\"\"\n", + " df = normalize_irs_gov_zip(year)\n", + " # (a00100: AGI, n1: count)\n", + " df[\"agi\"] = df[\"a00100\"] * 1000.0 / df[\"n1\"]\n", + " return df[[\"zipcode\", \"agi\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "id": "10904a66-d5a2-4b0d-b7bf-915f3af94cf5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "%3\n", + "\n", + "\n", + "\n", + "2cf646b3\n", + "\n", + "irs_agi_per_person_by_zip\n", + "\n", + "\n", + "\n", + "01594205\n", + "\n", + "normalize_irs_gov_zip\n", + "\n", + "\n", + "\n", + "2cf646b3->01594205\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "fc7ceed1\n", + "\n", + "ingest_irs_gov_zip\n", + "\n", + "\n", + "\n", + "01594205->fc7ceed1\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 172, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "irs_agi_per_person_by_zip.dependencies()" + ] + }, + { + "cell_type": "markdown", + "id": "464c8390-88e0-48c2-b021-3679d6f5403e", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## Join" + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "id": "edad20e6-3382-4483-850f-3872cc997685", + "metadata": {}, + "outputs": [], + "source": [ + "@memento_function\n", + "def citibike_usage_and_agi_by_zip(year: int) -> pd.DataFrame:\n", + " df_citibike = citibike_activity_by_zip(2020) \n", + " df_irs = irs_agi_per_person_by_zip(2020)\n", + " 
df = df_citibike.rename(columns={\"count\": \"citibike_usage\"}).merge(df_irs, on=\"zipcode\", how=\"inner\")\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "id": "da6b1631-f5aa-4d3e-9c79-c162f570771e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "%3\n", + "\n", + "\n", + "\n", + "2ea77d09\n", + "\n", + "citibike_usage_and_agi_by_zip\n", + "\n", + "\n", + "\n", + "e1cd35ee\n", + "\n", + "citibike_activity_by_zip\n", + "\n", + "\n", + "\n", + "2ea77d09->e1cd35ee\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "2cf646b3\n", + "\n", + "irs_agi_per_person_by_zip\n", + "\n", + "\n", + "\n", + "2ea77d09->2cf646b3\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "7f68612d\n", + "\n", + "end_usage\n", + "\n", + "\n", + "\n", + "e1cd35ee->7f68612d\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "ccc846ef\n", + "\n", + "start_usage\n", + "\n", + "\n", + "\n", + "e1cd35ee->ccc846ef\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "01594205\n", + "\n", + "normalize_irs_gov_zip\n", + "\n", + "\n", + "\n", + "2cf646b3->01594205\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "2c30b248\n", + "\n", + "normalize_citibike_tripdata\n", + "\n", + "\n", + "\n", + "7f68612d->2c30b248\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "93592f4d\n", + "\n", + "zip_by_lat_long\n", + "\n", + "\n", + "\n", + "7f68612d->93592f4d\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "ccc846ef->2c30b248\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "ccc846ef->93592f4d\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "1ed63db4\n", + "\n", + "citibike_tripdata_csv\n", + "\n", + "\n", + "\n", + "2c30b248->1ed63db4\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "2d9f38ea\n", + "\n", + "ingest_citibike\n", + "\n", + "\n", + "\n", + "1ed63db4->2d9f38ea\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "fc7ceed1\n", + "\n", + "ingest_irs_gov_zip\n", + "\n", + "\n", + "\n", + "01594205->fc7ceed1\n", + "\n", + "\n", + "\n", + 
"\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 186, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "citibike_usage_and_agi_by_zip.dependencies()" + ] + }, + { + "cell_type": "markdown", + "id": "ced52459-bb32-4bf6-a8f2-20285a473110", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## Plot" + ] + }, + { + "cell_type": "code", + "execution_count": 187, + "id": "ddcd59f2-6a3d-49a8-9315-42f5e756c6b4", + "metadata": {}, + "outputs": [], + "source": [ + "df = citibike_usage_and_agi_by_zip(year=2020)" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "id": "650f954a-5498-4ecc-bafc-d74163526611", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
zipcodecitibike_usageagi
070873.042344.191546
1730245.0165762.133241
273047.050591.104553
3730611.059873.759398
4730717.063170.202952
............
8811238249581.0114466.614810
891136936586.038412.689585
90113737126.036881.548056
911138558067.050903.892801
92114151511.070225.906736
\n", + "

93 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " zipcode citibike_usage agi\n", + "0 7087 3.0 42344.191546\n", + "1 7302 45.0 165762.133241\n", + "2 7304 7.0 50591.104553\n", + "3 7306 11.0 59873.759398\n", + "4 7307 17.0 63170.202952\n", + ".. ... ... ...\n", + "88 11238 249581.0 114466.614810\n", + "89 11369 36586.0 38412.689585\n", + "90 11373 7126.0 36881.548056\n", + "91 11385 58067.0 50903.892801\n", + "92 11415 1511.0 70225.906736\n", + "\n", + "[93 rows x 3 columns]" + ] + }, + "execution_count": 188, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 202, + "id": "38eef90d-c762-4561-9273-3f0fb766376f", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(8, 6))\n", + "x = df[\"agi\"]\n", + "y = df[\"citibike_usage\"]\n", + "slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)\n", + "trend_y = slope * x + intercept\n", + "plt.scatter(x, y)\n", + "plt.plot(x, trend_y, color=\"red\")\n", + "plt.xlabel(\"Mean Adjusted Gross Income (USD) for ZIP Code\")\n", + "plt.ylabel(\"Citibike Trip Starts + Ends\")\n", + "plt.title(\"ZIP Code Mean Income vs. Citibike Trips with Trendline\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 203, + "id": "6e440644-ca0d-4ae6-8acc-9d88dc7c466c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.556482518196366 138727.27634896868 0.3509066738711846 0.0005634972170450429 0.15567018690300002\n" + ] + } + ], + "source": [ + "print(slope, intercept, r_value, p_value, std_err)" + ] + }, + { + "cell_type": "markdown", + "id": "ae8c02c5-0ffa-4eba-b246-2cc6e3f287a4", + "metadata": {}, + "source": [ + "There is a weak but statistically significant positive correlation (r = 0.35, p < 0.001) between a ZIP code's mean AGI and the number of Citibike trips started or ended in that ZIP code.\n", + "\n", + "Citibike usage tends to be higher in ZIP codes with higher mean incomes, which could indicate that wealthier neighborhoods benefit more from Citibike." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "memento", + "language": "python", + "name": "memento" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}