diff --git a/rt_segment_speeds/12_speeds.ipynb b/rt_segment_speeds/12_speeds.ipynb index 677f7b36f..b3ea92af5 100644 --- a/rt_segment_speeds/12_speeds.ipynb +++ b/rt_segment_speeds/12_speeds.ipynb @@ -2,26 +2,33 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "id": "2c7feec3-aa18-42ab-94b9-cab4be608152", "metadata": {}, "outputs": [], "source": [ - "import _speed_utils as speed_utils\n", "import datetime\n", + "import _speed_utils as speed_utils\n", "import _threshold_utils as threshold_utils\n", "import altair as alt\n", "import dask.dataframe as dd\n", "import geopandas as gpd\n", "import pandas as pd\n", "from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs\n", - "from segment_speed_utils.project_vars import analysis_date\n", + "from segment_speed_utils.project_vars import (\n", + " COMPILED_CACHED_VIEWS,\n", + " PROJECT_CRS,\n", + " SEGMENT_GCS,\n", + " analysis_date,\n", + " CONFIG_PATH\n", + ")\n", + "from scripts import A1_sjoin_vp_segments\n", "from shared_utils import calitp_color_palette as cp" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "id": "0108ae4a-4518-4487-85f7-a5faa3e9cbf6", "metadata": {}, "outputs": [], @@ -34,15 +41,12 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "1e9f79a4-5921-4e8c-82c5-3b414f677cf8", - "metadata": { - "tags": [] - }, + "execution_count": 38, + "id": "80014c5e-695d-4280-89cd-4e7e2bb3d302", + "metadata": {}, "outputs": [], "source": [ - "# Flag\n", - "# routes_many_stops_df, routes_many_stops_list = speed_utils.find_shapes_with_many_stops(analysis_date)" + "# alt.data_transformers.disable_max_rows()" ] }, { @@ -55,2569 +59,1954 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "c3ab6f3f-2982-4466-aa76-06c7a235c62e", + "execution_count": 39, + "id": "2f0c5f4f-f419-42a8-8527-7060ed412092", "metadata": {}, "outputs": [], "source": [ - "avg_speeds = pd.read_parquet(\n", - " f\"{speed_utils.GCS_PATH}avg_speeds_stop_segments_{analysis_date}.parquet\"\n", - ").drop(columns=[\"geometry\", \"geometry_arrowized\", \"district\", \"district_name\"])" + "def merge_all_speeds(analysis_date:str) -> pd.DataFrame:\n", + " \"\"\"\n", + " Merge avg_speeds_stop_segments and\n", + " speed_stops parquets.\n", + " \n", + " Args:\n", + " date: analysis date\n", + " \"\"\"\n", + " # Open up avg speeds\n", + " avg_speeds = pd.read_parquet(f\"{SEGMENT_GCS}avg_speeds_stop_segments_{analysis_date}.parquet\")\n", + " avg_speeds = avg_speeds.drop(columns=[\"geometry\", \"district\", \"district_name\"])\n", + " # Filter for all day flags\n", + " avg_speeds = avg_speeds[avg_speeds.time_of_day == 'all_day'].reset_index(drop = True)\n", + " \n", + " # Open up speeds\n", + " speeds = pd.read_parquet(f\"{SEGMENT_GCS}speeds_stop_segments_{analysis_date}\")\n", + " \n", + " merge_cols = ['gtfs_dataset_key','shape_array_key', 'stop_sequence']\n", + " m1 = pd.merge(avg_speeds, speeds, on = merge_cols, how = 'inner')\n", + " \n", + " m1 = m1.drop_duplicates().reset_index(drop = True)\n", + " \n", + " return m1" ] }, { "cell_type": "code", - "execution_count": null, - "id": "8185a464-5ca2-43bb-89f6-062ee01b5e2d", + "execution_count": 40, + "id": "84ac97bf-ee4f-4d85-b523-8a36823f9d9a", "metadata": {}, "outputs": [], "source": [ - "speeds = pd.read_parquet(f\"{speed_utils.GCS_PATH}speeds_stop_segments_{analysis_date}\")" + "m1 = merge_all_speeds(analysis_date)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "01e1b2ad-edb7-4021-825f-612f070db139", + "execution_count": 41, + "id": "68950ae7-4061-47d6-ac48-5eac0b1f29c0", "metadata": {}, "outputs": [], "source": [ - "avg_speeds.sample()" + "# m1.shape" ] }, { "cell_type": "code", - "execution_count": null, - "id": "4623eaa7-4594-4155-859d-af997094c3de", + "execution_count": 42, + "id": "b04dfb8b-7476-49df-873a-cea75dc61763", "metadata": {}, "outputs": [], "source": [ - "speeds.sample()" + "\n", + "# Picked 4 random routes\n", + "sample_0_keys = [\n", + " \"0fb4f3627996269dc7075276d3b69e36\",\n", + " \"07c9a47264a43d8d0d16ef7109e8fd68\",\n", + " \"106d979b9a9e6338827a8e1c145e69fd\",\n", + " \"000624bd8453dbe4f2eb2765b04bcb98\",\n", + "]" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "bbc7804a-550a-40fa-a25f-7694b057c9b7", + "cell_type": "markdown", + "id": "898e3546-5298-4c4f-87d0-ee1d1a10f07d", "metadata": {}, - "outputs": [], "source": [ - "merge_cols = [\"gtfs_dataset_key\", \"shape_array_key\", \"stop_sequence\"]\n", - "merge1 = pd.merge(avg_speeds, speeds, on=merge_cols, how=\"inner\")" + "### Categorize" ] }, { "cell_type": "code", - "execution_count": null, - "id": "34c64e59-0379-4edf-a87f-ec621c0b668b", + "execution_count": 43, + "id": "e81e59fd-cc2f-408e-9148-1a1055425fc4", "metadata": {}, "outputs": [], "source": [ - "merge1.sample()" + "def categorize_by_percentile_pandas(\n", + " df: pd.DataFrame, column_percentile: str, column_str: str\n", + ") -> pd.DataFrame:\n", + "\n", + " # Find percentiles\n", + " p5 = df[column_percentile].quantile(0.05).astype(float)\n", + " p95 = df[column_percentile].quantile(0.95).astype(float)\n", + " \n", + " def rate(row):\n", + " if ((row[column_percentile] >= 0) and (row[column_percentile] <= p5)):\n", + " return f\"{column_str} is low\"\n", + " elif (row[column_percentile] >= p95):\n", + " return f\"{column_str} is high\"\n", + " else:\n", + " return f\"{column_str} is avg\"\n", + " \n", + " # Apply flags\n", + " df[f\"{column_str}cat\"] = df.apply(lambda x: rate(x), axis=1)\n", + " \n", + " # Clean\n", + " df[f\"{column_str}cat\"] = df[f\"{column_str}cat\"].str.replace(\"_\", \"\")\n", + "\n", + " print(f\"Done with {column_str}\")\n", + " \n", + " return df " ] }, { "cell_type": "code", - "execution_count": null, - "id": "7fa72f28-fd46-4fd9-9f4a-120812a482da", + "execution_count": 44, + "id": "0dfb836d-f919-4f2b-a0d1-9e4a4713ba8a", "metadata": {}, "outputs": [], "source": [ - "segments_file = \"stop_segments\"" + "# df1 = categorize_by_percentile_pandas(subset, \"meters_elapsed\", \"meters_\")" ] }, { "cell_type": "code", - "execution_count": null, - "id": "77f1e38e-6dc4-4e79-9eed-569375a133fe", + "execution_count": 45, + "id": "9f84205d-93db-49f3-be99-6b5014f7faeb", "metadata": {}, "outputs": [], "source": [ - "stop_segments = pd.read_parquet(\n", - " f\"{speed_utils.GCS_PATH}{segments_file}_{analysis_date}.parquet\"\n", - ").drop(columns=[\"geometry\", \"geometry_arrowized\"])" + "# df1.head()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "cdaf396a-49d0-4da6-a3f8-3080f1f838b0", + "execution_count": 46, + "id": "b0d2184f-8a44-4489-a1b4-2be8317142f1", "metadata": {}, "outputs": [], "source": [ - "stop_segments.sample()" + "# df2 = categorize_by_percentile_pandas(df1, \"sec_elapsed\", \"sec_\")" ] }, { "cell_type": "code", - "execution_count": null, - "id": "b994dc1c-2a11-4376-9d43-f61db486e6eb", + "execution_count": 47, + "id": "940fb010-0dff-465e-bf8d-87dd3f4ba101", "metadata": {}, "outputs": [], "source": [ - "# pd.merge(stop_segments, merge1, on = ['gtfs_dataset_key','shape_array_key','stop_sequence','loop_or_inlining'], how = \"inner\", indicator = True)[['_merge']].value_counts()" + "# df2.head()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "e510e28b-a179-41d7-b738-e50edb26d878", + "execution_count": 48, + "id": "9d38d541-5c9c-4d31-8986-9c3928eb2f59", "metadata": {}, "outputs": [], "source": [ - "merge1.shape" + "def categorize_meters_speeds_pandas()-> pd.DataFrame:\n", + " start = datetime.datetime.now()\n", + " print(start)\n", + " \n", + " df = merge_all_speeds(analysis_date)\n", + " \n", + " # Categorize\n", + " df1 = categorize_by_percentile_pandas(df, \"meters_elapsed\", \"meters_\")\n", + " df2 = categorize_by_percentile_pandas(df1, \"sec_elapsed\", \"sec_\")\n", + " \n", + " # Find size of categories\n", + " print(df2.groupby(['sec_cat','meters_cat']).size())\n", + "\n", + " # Filter out for only meters that are low or seconds that are high\n", + " df2 = df2[(df2.meters_cat == 'meters is low') | (df2.sec_cat == 'sec is high')].reset_index(drop = True)\n", + " print(f\"{len(df2)} rows left after filtering for rows with either high seconds OR low meters\") \n", + " \n", + " def flag_round(row):\n", + " if (row[\"meters_elapsed\"] == 0) & (row[\"sec_elapsed\"] == 0):\n", + " return \"division by 0\"\n", + " elif row[\"meters_cat\"] == \"meters is low\":\n", + " return \"meters too low\"\n", + " elif row[\"sec_cat\"] == \"sec is high\":\n", + " return \"seconds too high\"\n", + " else:\n", + " return \"ok\"\n", + " \n", + " df2[\"flag\"] = df2.apply(lambda x: flag_round(x), axis=1)\n", + " print(df2.flag.value_counts())\n", + " \n", + " # Filter out for only division by 0 \n", + " df3 = df2[(df2.flag == 'division by 0')].reset_index(drop = True)\n", + " \n", + " end = datetime.datetime.now()\n", + " print(f\"Took {end-start}\")\n", + " return df3" ] }, { "cell_type": "code", - "execution_count": null, - "id": "6483a59e-1d44-4fa9-b057-8ce5230126c2", + "execution_count": 49, + "id": "7201d5e3-f765-4e5d-9bbd-aa6a336bcc0a", "metadata": {}, "outputs": [], "source": [ - "# m1 = speed_utils.merge_all_speeds(analysis_date)" + "# subset = m1[m1.shape_array_key.isin(sample_0_keys)].reset_index()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "e9d651da-95f8-425b-9a7c-28781b70a595", + "execution_count": 50, + "id": "2c5107cb-c574-449b-95b6-fb205f38502e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-06-30 10:15:15.239284\n", + "Done with meters_\n", + "Done with sec_\n", + "sec_cat meters_cat \n", + "sec is avg meters is avg 2415102\n", + " meters is high 70745\n", + " meters is low 139528\n", + "sec is high meters is avg 57245\n", + " meters is high 83074\n", + " meters is low 13695\n", + "sec is low meters is low 296973\n", + "dtype: int64\n", + "590515 rows left after filtering for rows with either high seconds OR low meters\n", + "division by 0 296973\n", + "meters too low 153223\n", + "seconds too high 140319\n", + "Name: flag, dtype: int64\n", + "Took 0:02:29.450038\n" + ] + } + ], "source": [ - "# m1.sample()" + "m2 = categorize_meters_speeds_pandas()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "20c1eeac-34e6-417c-9354-31cfc3ea9096", + "execution_count": 51, + "id": "2d1bf90c-d9ed-4861-a1be-23f356165a4c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "division by 0 296973\n", + "Name: flag, dtype: int64" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# m1.groupby(['shape_array_key']).agg({'trip_id':'nunique'}).sort_values(['trip_id'], ascending = False).head(30)" + "m2.flag.value_counts()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "48b197ec-add3-46d3-a2bf-ff91c1ff15be", - "metadata": {}, - "outputs": [], - "source": [ - "merge1.shape_array_key.unique()" - ] - }, - { - "cell_type": "markdown", - "id": "801c89ce-0e7f-4758-a38a-201cc843ef28", + "execution_count": 52, + "id": "ce0fbb35-f81e-4343-92d2-4382d2173dbd", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2779389" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#### A few routes" + "len(m1)-len(m2)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "8ca57e6f-fb38-4381-ac83-cb9dc9fabdf4", + "execution_count": 53, + "id": "508f1411-4328-4b80-a029-0ae516107ed0", "metadata": {}, - "outputs": [], - "source": [ - "test_shapes = [\n", - " \"00093e1c28352239174c92c4f07a483b\",\n", - " \"001254fc8105d01a8064046249c0ceba\",\n", - " \"00b40413c13a48046de6e2338aee0410\",\n", - " \"e3c5ed2c6fa6cd5c5cd57d46aeb3cd8e\",\n", - " \"efa0f969b4499620b80c9b82170e2e60\",\n", - " \"00093e1c28352239174c92c4f07a483b\",\n", - " \"001254fc8105d01a8064046249c0ceba\",\n", - " \"6388c0be232f0c745df85d66689a6db0\",\n", - " \"d8b0826e923620f7b7cd74be090de936\",\n", - " \"e7012e8847c179f713daee0f158233e4\",\n", - " \"11d91cab41cde51a6d4f623b9cba867c\"\n", - "]" + "outputs": [ + { + "data": { + "text/plain": [ + "296973" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(m2)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "9fe08d2a-b874-4439-aa5b-a52de58cad09", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(45357, 72067)" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m2.trip_id.nunique(), m1.trip_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "8e6d31ab-46a7-4e20-bb2f-9cac1a2d672d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2155 routes flagged'" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "f\"{m1.shape_array_key.nunique() - m2.shape_array_key.nunique()} routes flagged\"" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "647fad46-7f9b-4ce2-a26a-1ea69d02daee", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(63, 76)" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m2._gtfs_dataset_name.nunique(), m1._gtfs_dataset_name.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "0f4e4b50-081d-4516-81cf-d5bdfb5d469f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
shape_array_key
loop_or_inlining
03970
1867
\n", + "
" + ], + "text/plain": [ + " shape_array_key\n", + "loop_or_inlining \n", + "0 3970\n", + "1 867" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m1.groupby([\"loop_or_inlining\"]).agg({\"shape_array_key\": \"nunique\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "83036ccc-7339-42c2-b1f7-183734253c21", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
shape_array_key
loop_or_inlining
02682
\n", + "
" + ], + "text/plain": [ + " shape_array_key\n", + "loop_or_inlining \n", + "0 2682" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m2.groupby([\"loop_or_inlining\"]).agg({\"shape_array_key\": \"nunique\"})" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "6858f9a8-2136-4aab-a099-25907b6ef7ef", + "cell_type": "markdown", + "id": "4486cd7c-31d7-4420-ac67-f9783676ede8", "metadata": {}, - "outputs": [], "source": [ - " few_routes = merge1.loc[merge1.shape_array_key.isin(test_shapes)].reset_index(drop=True)" + "#### See how many trips for a shape ID have problematic rows\n" ] }, { "cell_type": "code", - "execution_count": null, - "id": "913ac9e5-41ed-43c3-86ad-2b13b141d17c", - "metadata": {}, + "execution_count": 59, + "id": "468be3c9-7a24-4f01-84fd-31c137bc45e8", + "metadata": { + "scrolled": true, + "tags": [] + }, "outputs": [], "source": [ - "# few_routes = merge1.copy()" + "# Number of trips that have at least one row that was divided by 0 \n", + "# for this shape array key\n", + "df1 = m2.groupby(['shape_array_key']).agg({'trip_id':'nunique'}).rename(columns = {'trip_id':'trips_with_zero'}).reset_index()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "602983ba-8f2e-41cb-9eb8-96e8ede9f58a", - "metadata": {}, + "execution_count": 60, + "id": "4350f540-8f6b-4fb0-8b16-836245c0e44c", + "metadata": { + "scrolled": true, + "tags": [] + }, "outputs": [], "source": [ - "few_routes.shape" + "# Original number of trips\n", + "df2 = m1.groupby(['shape_array_key']).agg({'trip_id':'nunique'}).rename(columns = {'trip_id':'all_trips'}).reset_index()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "f294ff32-b025-4037-9ebc-cefe6dca00b9", + "execution_count": 61, + "id": "ac68bdf7-26a0-4679-9a35-26f8a670018a", "metadata": {}, "outputs": [], "source": [ - "few_routes.trip_id.nunique()" + "df3 = pd.merge(df1, df2, how = \"inner\", on = 'shape_array_key')" ] }, { "cell_type": "code", - "execution_count": null, - "id": "b4f56307-6624-488d-8688-7a9f1e47ff65", + "execution_count": 62, + "id": "81d443cc-122f-46f1-87ec-dbdc74e0ca6a", "metadata": {}, "outputs": [], "source": [ - "def categorize_by_percentile(df, column_percentile: str, column_str: str):\n", - "\n", - " agg1 = (\n", - " df.groupby([\"shape_array_key\", \"stop_sequence\"])[column_percentile]\n", - " .describe(percentiles=[0.15, 0.5, 0.95])\n", - " .reset_index()\n", - " .add_prefix(column_str)\n", - " )\n", - "\n", - " merge1 = pd.merge(\n", - " df,\n", - " agg1,\n", - " how=\"inner\",\n", - " left_on=[\"shape_array_key\", \"stop_sequence\"],\n", - " right_on=[\n", - " f\"{column_str}shape_array_key\",\n", - " f\"{column_str}stop_sequence\",\n", - " ],\n", - " )\n", - "\n", - " def percentile(row):\n", - "\n", - " if row[column_percentile] == row[f\"{column_str}mean\"]:\n", - " return f\"{column_str} elapsed avg\"\n", - " elif row[column_percentile] == row[f\"{column_str}50%\"]:\n", - " return f\"{column_str} elapsed avg\"\n", - " elif row[column_percentile] <= row[f\"{column_str}15%\"]:\n", - " return f\"{column_str} elapsed low\"\n", - " elif row[column_percentile] == 0:\n", - " return f\"{column_str} elapsed is 0\"\n", - " elif (\n", - " row[f\"{column_str}15%\"] < row[column_percentile] <= row[f\"{column_str}95%\"]\n", - " ):\n", - " return f\"{column_str} elapsed avg\"\n", - "\n", - " elif row[column_percentile] > row[f\"{column_str}95%\"]:\n", - " return f\"{column_str} elapsed high\"\n", - "\n", - " else:\n", - " return \"other\"\n", - "\n", - " merge1[f\"{column_str}cat\"] = merge1.apply(lambda x: percentile(x), axis=1)\n", - " print(f\"Done with {column_str}\")\n", - " return merge1" + "df3['percent_of_trips_with_problematic_rows'] = df3.trips_with_zero/df3.all_trips * 100" ] }, { "cell_type": "code", - "execution_count": null, - "id": "d80535fb-8648-4216-918f-76e0484ba3ea", + "execution_count": 63, + "id": "314d9baf-de0e-460a-8c29-4504ba94cfa6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "count 2682.00\n", + "mean 82.86\n", + "std 26.65\n", + "min 1.52\n", + "25% 75.00\n", + "50% 100.00\n", + "75% 100.00\n", + "max 100.00\n", + "Name: percent_of_trips_with_problematic_rows, dtype: float64" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "def categorize_meters_speeds(df):\n", - " start = datetime.datetime.now()\n", - " print(f\"Begin: {start}\")\n", - " df.speed_mph = df.speed_mph.fillna(0)\n", - " df = categorize_by_percentile(df, \"meters_elapsed\", \"meters_\")\n", - " df = categorize_by_percentile(df, \"sec_elapsed\", \"seconds_\")\n", - " df = categorize_by_percentile(df, \"speed_mph\", \"speed_\")\n", - " df = df.rename(columns={\"speed_cat\": \"speed_flags\"})\n", - " end = datetime.datetime.now()\n", - " print(f\"Finish: {end}\")\n", - " return df" + "df3['percent_of_trips_with_problematic_rows'].describe()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "4676866e-674a-4561-bc4f-55dc2dcc4769", + "execution_count": 64, + "id": "5de3efe6-2233-4251-93a8-1f8dd6fb2dae", "metadata": { "tags": [] }, "outputs": [], "source": [ - "few_routes_cat = categorize_meters_speeds(few_routes)" + "# df3.sample(5)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "1f8fa537-8807-4544-8568-7d999ca9ecac", + "cell_type": "markdown", + "id": "a399d982-e400-43fa-b13f-fecafaa27262", "metadata": {}, - "outputs": [], "source": [ - "# few_routes_cat.columns" + "### Investigate \n", + "#### Stage3: \"vp_pared_stops\"/A3_loop_inlining\n", + "* Rewrite this part to filter read_parquet with the shape array and whatnot" ] }, { "cell_type": "code", - "execution_count": null, - "id": "1e6db506-6e09-49c7-84a1-02160178573d", + "execution_count": 65, + "id": "a2a705af-b588-463b-b6ce-f999b2050208", "metadata": {}, "outputs": [], "source": [ - "subset = [\n", - " \"stop_sequence\",\n", - " \"speed_flags\",\n", - " \"speed_mph\",\n", - " \"speed_15%\",\n", - " \"speed_50%\",\n", - " \"speed_95%\",\n", - " \"meters_cat\",\n", - " \"meters_elapsed\",\n", - " \"meters_mean\",\n", - " \"meters_15%\",\n", - " \"meters_50%\",\n", - " \"meters_95%\",\n", - " \"seconds_cat\",\n", - " \"sec_elapsed\",\n", - " \"seconds_mean\",\n", - " \"seconds_15%\",\n", - " \"seconds_50%\",\n", - " \"seconds_95%\",\n", - " \"gtfs_dataset_key\",\n", - "]" + "def load_vp_stage3(flagged_df:pd.DataFrame, date:str) -> pd.DataFrame:\n", + " \n", + " # Subset the dataframe and use it to filter out for only the values of interest\n", + " shape_array_keys = flagged_df.shape_array_key.unique().tolist()\n", + " stop_seq = flagged_df.stop_sequence.unique().tolist() \n", + " trip_id = flagged_df.trip_id.unique().tolist() \n", + " gtfs_dataset_key = flagged_df.gtfs_dataset_key.unique().tolist() \n", + " \n", + " #flagged_df = flagged_df[['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key']]\n", + " vp = pd.read_parquet(f\"{SEGMENT_GCS}vp_pared_stops_{date}\",\n", + " filters = [[('shape_array_key', \"in\", shape_array_keys),\n", + " ('stop_sequence', 'in', stop_seq), \n", + " ('trip_id', 'in', trip_id), \n", + " ('gtfs_dataset_key', 'in', gtfs_dataset_key)]],)\n", + " \n", + " # Merge to filter\n", + " vp2 = pd.merge(flagged_df, vp, how = \"inner\", on = ['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key'])\n", + " \n", + " return vp2" ] }, { "cell_type": "code", - "execution_count": null, - "id": "f5ae87ad-a3a4-4630-ac8c-b7ce711c2fb7", + "execution_count": 66, + "id": "1e36c5fc-ab3f-4129-97f9-ad9472b7d32a", "metadata": {}, "outputs": [], "source": [ - "few_routes_cat.speed_flags.value_counts()" + "vp2 = load_vp_stage3(subset, analysis_date)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "83ede9e9-a76d-4e57-9fac-4cc2ce1af0c5", + "execution_count": 67, + "id": "1d6fe654-40ca-4758-bc2c-316e33d1a9d1", "metadata": {}, "outputs": [], "source": [ - "few_routes_cat.speed_flags.value_counts() / len(few_routes) * 100" + "# vp = pd.read_parquet(f\"{SEGMENT_GCS}vp_pared_stops_{analysis_date}\")" ] }, { "cell_type": "code", - "execution_count": null, - "id": "e016eb5a-0039-4063-ade4-c871e01c8a16", - "metadata": {}, + "execution_count": 68, + "id": "148e75f1-08dd-44c8-8179-319164d8e020", + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "few_routes_cat.groupby([\"speed_flags\", \"meters_cat\", \"seconds_cat\",]).agg(\n", - " {\"trip_id\": \"count\"}\n", - ").reset_index().sort_values([\"trip_id\"], ascending=False)" + "# Check out stop sequences for the trip below that have division by 0\n", + "# subset[subset.trip_id == \"1088383\"].stop_sequence.unique()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "87860e54-fd6e-42d1-a38f-613fea4a77e9", - "metadata": {}, + "execution_count": 69, + "id": "b4350206-c237-44a3-abce-f8f38cde8117", + "metadata": { + "scrolled": true, + "tags": [] + }, "outputs": [], "source": [ - "# 65d9589130415c685b89f4f7c2d8bd7e 65" + "# Stop sequences that were flagged as division by 0\n", + "# vp2[vp2.trip_id == \"1088383\"].sort_values(['trip_id', 'stop_sequence','location_timestamp_local'])" ] }, { "cell_type": "code", - "execution_count": null, - "id": "bc6df377-9c23-4b01-80de-8c977b797c47", - "metadata": {}, + "execution_count": 70, + "id": "aa1e56d1-ec07-436c-8763-7bcf3dcbf7d4", + "metadata": { + "scrolled": true, + "tags": [] + }, "outputs": [], "source": [ - "# few_routes_cat[few_routes_cat.speed_flags == \"average\"][subset].sample(3)" + "# All the stop sequences for this trip, even those that are ok\n", + "# vp_pared[vp_pared.trip_id == \"1088383\"].sort_values(['trip_id', 'stop_sequence','location_timestamp_local'])" ] }, { "cell_type": "code", - "execution_count": null, - "id": "49c58012-004e-4c47-becf-20e7f18895d3", - "metadata": {}, + "execution_count": 71, + "id": "22e42aae-9281-4040-ab8c-6a10b93f6cf4", + "metadata": { + "scrolled": true, + "tags": [] + }, "outputs": [], "source": [ - "# few_routes_cat[(few_routes_cat.meters_cat == \"meters_ elapsed avg\") & (few_routes_cat.seconds_cat == \"seconds_ elapsed avg\") & (few_routes_cat.speed_flags == \"speed low\")][subset]" + "# All the stop sequences for this trip, even those that are ok\n", + "# vp_pared[vp_pared.trip_id == \"1088383\"].sort_values(['location_timestamp_local','stop_sequence',])" ] }, { "cell_type": "code", - "execution_count": null, - "id": "54f38944-af7a-47bf-a799-882899964c6a", + "execution_count": 72, + "id": "0f21f08f-d4eb-4bbd-94d3-f4b031e97cf4", "metadata": {}, "outputs": [], "source": [ - "# few_routes_cat[(few_routes_cat.meters_cat == \"meters_ elapsed low\") & (few_routes_cat.seconds_cat == \"seconds_ elapsed avg\") & (few_routes_cat.speed_flags == \"speed low\")][subset].sample(3)" + "def stage3_repeated_timestamps(stage3_df:pd.DataFrame)-> pd.DataFrame:\n", + " \"\"\"\n", + " Look at how many times a time stamp is repeated a route-trip-location.\n", + " Each of these 3 combos should have a different time for each \n", + " stop sequence or else the vehicle is not changing locations.\n", + " \"\"\"\n", + " agg = (stage3_df\n", + " .groupby(['shape_array_key','trip_id', 'location_timestamp_local'])\n", + " .agg({'stop_sequence':'nunique'})\n", + " .reset_index()\n", + " .rename(columns = {'stop_sequence':'number_of_repeated_timestamps'})\n", + " )\n", + " \n", + " # Only keep timestamps that are repeated more than once\n", + " agg = (agg[agg.number_of_repeated_timestamps > 1]).reset_index(drop = True)\n", + "\n", + " return agg" ] }, { "cell_type": "code", - "execution_count": null, - "id": "8b28c22b-9b7c-41ab-b3cc-36661c8439e5", + "execution_count": 73, + "id": "5ce07566-c1f0-4fa7-9550-2fa07b98dba8", "metadata": {}, "outputs": [], "source": [ - "# few_routes_cat[(few_routes_cat.meters_cat == \"meters_ elapsed high\") & (few_routes_cat.seconds_cat == \"seconds_ elapsed low\") & (few_routes_cat.speed_flags == \"speed high\")][subset].sample(3)" + "def stage3_repeated_locations(stage3_df:pd.DataFrame):\n", + " \"\"\"\n", + " Look at how many times a time stamp is repeated for a stop-trip-route combo.\n", + " Each of these 3 combos should have a different location for each \n", + " stop sequence or else the vehicle is not changing locations.\n", + " \"\"\"\n", + " # Concat x and y into a string\n", + " stage3_df['pair'] = stage3_df.x.astype(str) + '/' + vp2.y.astype(str)\n", + " \n", + " # Count number of different stops that reference the same location\n", + " agg = (stage3_df\n", + " .groupby(['shape_array_key','trip_id','pair'])\n", + " .agg({'stop_sequence':'nunique'})\n", + " .reset_index()\n", + " .sort_values('stop_sequence', ascending = False)\n", + " .rename(columns = {'stop_sequence':'number_of_repeated_locs'}) \n", + " )\n", + "\n", + " # Only keep locations that are repeated more than once\n", + " agg = agg[agg.number_of_repeated_locs != 1].reset_index(drop = True)\n", + " \n", + " return agg" ] }, { "cell_type": "code", - "execution_count": null, - "id": "0c00506f-00ed-4660-9c88-aa11bc925fd2", + "execution_count": 74, + "id": "66e83169-2b4a-4912-bc0e-1a0b3e8deea6", "metadata": {}, "outputs": [], "source": [ - "# few_routes_cat[(few_routes_cat.meters_cat == \"meters_ elapsed high\") & (few_routes_cat.seconds_cat == \"seconds_ elapsed high\") & (few_routes_cat.speed_flags == \"speed low\")][subset].sample(3)" + "def flag_stage3(flagged_df:pd.DataFrame, date:str) -> pd.DataFrame:\n", + " \"\"\"\n", + " Flag the errors in stage3\n", + " \"\"\"\n", + " start = datetime.datetime.now()\n", + " print(start)\n", + " \n", + " # Relevant rows from Vehicle Positions\n", + " vp = load_vp_stage3(flagged_df, date)\n", + " \n", + " # Find repeated timestamps.\n", + " multi_timestamps = stage3_repeated_timestamps(vp)\n", + " \n", + " # Find repeated locations\n", + " multi_locs = stage3_repeated_locations(vp)\n", + " \n", + " # Merge\n", + " timestamps_merge_cols = ['shape_array_key','trip_id','location_timestamp_local']\n", + " loc_merge_cols = ['shape_array_key','trip_id','pair']\n", + " \n", + " # Want everything found in vehicle positions, so do left merges\n", + " m1 = (vp\n", + " .merge(multi_timestamps, how=\"left\", on= timestamps_merge_cols)\n", + " .merge(multi_locs, how=\"left\", on=loc_merge_cols)\n", + " )\n", + " \n", + " drop_cols = ['vp_idx','x','y','hour','activity_date',]\n", + " m1 = m1.drop(columns = drop_cols)\n", + " \n", + " # Flag\n", + " def flag(row):\n", + " if (row[\"number_of_repeated_timestamps\"] > 1) & (row[\"number_of_repeated_locs\"] > 1):\n", + " return \"repeated timestamps & locations\"\n", + " elif (row[\"number_of_repeated_timestamps\"] > 1):\n", + " return \"repeated timestamps\"\n", + " elif (row[\"number_of_repeated_locs\"] > 1):\n", + " return \"repeated locations\"\n", + " else:\n", + " return \"check in stage 2\"\n", + " \n", + " m1[\"stage3_flag\"] = m1.apply(lambda x: flag(x), axis=1)\n", + " \n", + " print(m1.stage3_flag.value_counts())\n", + " \n", + " check_in_stage2 = m1[m1.stage3_flag == \"check in stage 2\"]\n", + " print(f\"Have to check {len(check_in_stage2)/len(m1) * 100} % of rows in stage 2\")\n", + " \n", + " end = datetime.datetime.now()\n", + " print(f\"Took {end-start}\")\n", + " return m1" ] }, { "cell_type": "code", - "execution_count": null, - "id": "17bceddf-6c73-466d-8a8c-115370ab3301", + "execution_count": 75, + "id": "cab32ef3-cc66-40ce-aa19-59631734f539", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-06-30 10:17:51.135694\n", + "check in stage 2 538914\n", + "repeated timestamps 54883\n", + "repeated timestamps & locations 107\n", + "repeated locations 42\n", + "Name: stage3_flag, dtype: int64\n", + "Have to check 90.73451121819154 % of rows in stage 2\n", + "Took 0:00:27.583738\n" + ] + } + ], "source": [ - "# few_routes_cat[(few_routes_cat.meters_cat == \"meters_ elapsed avg\") & (few_routes_cat.seconds_cat == \"seconds_ elapsed high\") & (few_routes_cat.speed_flags == \"speed low\")][subset].sample(3)" + "m3 = flag_stage3(m2, analysis_date)" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "1cca329c-14bc-4ad5-9465-1a63ca53df49", + "metadata": {}, + "outputs": [], + "source": [ + "m3 = m3[m3.stage3_flag == \"check in stage 2\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "93e87778-edef-4d62-98aa-a4241f177892", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(538914, 29)" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m3.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "21799f42-873e-41bd-b764-42cc297686a6", + "metadata": {}, + "outputs": [], + "source": [ + "sort_cols = ['trip_id', 'shape_array_key', 'stop_sequence']" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "id": "17ac977c-d220-414e-be9f-540eec051e06", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
shape_array_keygtfs_dataset_keytrip_idn_trips
296389809fd4704a18ae0ad64f8170e0167b565222fe2cf728fd3f16b2ff51e133fe8c183-oeiebzuc1162
295464805fef558a9bf81d57143cab635b27b1c0e3039da063db95ebabd3fe4ee611a411083276_M11159
396202ac5104538290bb7c7d14b926884e6efac0e3039da063db95ebabd3fe4ee611a411060883_M11157
527457e5ec67542d6f30fa38fdcf2f63c90109c0e3039da063db95ebabd3fe4ee611a411083144_M11156
1166043928b30e00772c10a38c11ea12ad78695222fe2cf728fd3f16b2ff51e133fe8c183-0rjkhjagy150
555304edc5ab1a2be1d269306161ce38e0b2adc0e3039da063db95ebabd3fe4ee611a411042148_M11138
\n", + "
" + ], + "text/plain": [ + " shape_array_key gtfs_dataset_key \\\n", + "296389 809fd4704a18ae0ad64f8170e0167b56 5222fe2cf728fd3f16b2ff51e133fe8c \n", + "295464 805fef558a9bf81d57143cab635b27b1 c0e3039da063db95ebabd3fe4ee611a4 \n", + "396202 ac5104538290bb7c7d14b926884e6efa c0e3039da063db95ebabd3fe4ee611a4 \n", + "527457 e5ec67542d6f30fa38fdcf2f63c90109 c0e3039da063db95ebabd3fe4ee611a4 \n", + "116604 3928b30e00772c10a38c11ea12ad7869 5222fe2cf728fd3f16b2ff51e133fe8c \n", + "555304 edc5ab1a2be1d269306161ce38e0b2ad c0e3039da063db95ebabd3fe4ee611a4 \n", + "\n", + " trip_id n_trips \n", + "296389 183-oeiebzuc1 162 \n", + "295464 11083276_M11 159 \n", + "396202 11060883_M11 157 \n", + "527457 11083144_M11 156 \n", + "116604 183-0rjkhjagy 150 \n", + "555304 11042148_M11 138 " + ] + }, + "execution_count": 149, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Find routes with the most trips\n", + "(m3\n", + " .sort_values(['n_trips'], ascending = False)\n", + " .drop_duplicates(['shape_array_key'])\n", + " [['shape_array_key','gtfs_dataset_key', 'trip_id', 'n_trips']]\n", + " .head(6)\n", + ")" ] }, { "cell_type": "code", - "execution_count": null, - "id": "b8248dd3-c7de-4655-a8a2-9f7b486d01c1", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "# few_routes_cat[(few_routes_cat.shape_array_key == \"d8b0826e923620f7b7cd74be090de936\") & (few_routes_cat.stop_sequence == 1)][subset]" + "execution_count": 90, + "id": "3869ed7a-a951-4ed0-bfa9-bbdba7177790", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "n_trips\n", + "140 40\n", + "146 28\n", + "150 28\n", + "147 24\n", + "148 18\n", + "152 18\n", + "149 16\n", + "151 16\n", + "158 12\n", + "157 8\n", + "154 6\n", + "156 6\n", + "159 4\n", + "160 2\n", + "161 2\n", + "162 2\n", + "dtype: int64" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m3[m3.shape_array_key == \"809fd4704a18ae0ad64f8170e0167b56\"][['n_trips']].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "a6c3db80-c4bf-4264-873c-c9912cdc9dc5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "n_trips\n", + "140 50\n", + "54 22\n", + "125 20\n", + "132 14\n", + "126 12\n", + "145 10\n", + "136 8\n", + "141 6\n", + "142 6\n", + "155 4\n", + "158 4\n", + "144 4\n", + "151 4\n", + "143 4\n", + "159 4\n", + "153 2\n", + "156 2\n", + "157 2\n", + "dtype: int64" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m3[m3.shape_array_key == \"805fef558a9bf81d57143cab635b27b1\"][['n_trips']].value_counts()" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "56d7c123-5002-4ba5-916c-ed5d9097126c", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], + "cell_type": "markdown", + "id": "4b1876cf-9e8b-4c30-8723-2226133b8e01", + "metadata": {}, "source": [ - "# few_routes_cat[(few_routes_cat.stop_sequence == 65) & (few_routes_cat.gtfs_dataset_key == \"65d9589130415c685b89f4f7c2d8bd7e\")][subset].sort_values(by = ['speed_mph'])" + "#### Stage2: \"vp_stop_segment\"/A1_sjoin_vp_segments\n" ] }, { "cell_type": "code", - "execution_count": null, - "id": "963353ae-52ab-4ca4-bef5-b3649b6b74c3", + "execution_count": 92, + "id": "0a469849-f903-44e4-9d2a-4f3775270a52", "metadata": {}, "outputs": [], "source": [ - "def flag(row):\n", - "\n", - " # Ok rows\n", - " # If distance and time are average, flag as average\n", - " if (row[\"meters_cat\"] == \"meters_ elapsed avg\") & (row[\"seconds_cat\"] == \"seconds_ elapsed avg\"):\n", - " return \"ok\"\n", - " # If MPH is average, flag as average\n", - " elif row[\"speed_flags\"] == \"speed_ elapsed avg\":\n", - " return \"ok\"\n", - "\n", - " # Zero rows\n", - " elif ((row[\"speed_mph\"] == 0) | (row[\"sec_elapsed\"] == 0) | (row[\"meters_elapsed\"] == 0)):\n", - " return \"low\"\n", - "\n", - " # Tag as high\n", - " elif row[\"speed_flags\"] == \"speed_ elapsed high\":\n", - " return \"high\"\n", - "\n", - " # Tag as low\n", - " elif row[\"speed_flags\"] == \"speed_ elapsed low\":\n", - " return \"low\"\n", - "\n", - " else:\n", - " return \"other\"" + "# Select one route to look at\n", + "test_route = \"3928b30e00772c10a38c11ea12ad7869\"" ] }, { "cell_type": "code", - "execution_count": null, - "id": "e9994532-e658-480e-b009-8ad7ef6392b5", + "execution_count": 93, + "id": "6e946c68-3476-459d-a869-77ac37b5fb07", "metadata": {}, "outputs": [], "source": [ - "few_routes_cat[\"unusual_flag\"] = few_routes_cat.apply(lambda x: flag(x), axis=1)" + "test_gtfs_key = \"5222fe2cf728fd3f16b2ff51e133fe8c\"" ] }, { "cell_type": "code", - "execution_count": null, - "id": "f652b637-2682-4b00-895d-d1809bab7d12", + "execution_count": 94, + "id": "b4fa40bf-387c-4301-ba13-2bd16b15cd24", "metadata": {}, "outputs": [], "source": [ - "len(few_routes_cat) == len(merge1)" + "test_trip = '183-0rjkhjagy'" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "d7340f9e-62ae-4684-9415-a8d9189fb3f9", + "cell_type": "markdown", + "id": "ef18eb20-a43e-4d32-80eb-7f902116a944", "metadata": {}, - "outputs": [], "source": [ - "few_routes_cat.unusual_flag.value_counts() / len(few_routes_cat) * 100" + "#### Look at export file" ] }, { "cell_type": "code", - "execution_count": null, - "id": "40c0937e-de60-4508-a2aa-056209649c4b", + "execution_count": 95, + "id": "6397dc45-c271-4057-a0d8-1962846d4f94", "metadata": {}, "outputs": [], "source": [ - "few_routes_cat.unusual_flag.value_counts()" + "def import_stage_2(date:str, route:str, stop_sequence:str):\n", + " df = pd.read_parquet(\n", + " f\"{SEGMENT_GCS}vp_sjoin/vp_stop_segment_{date}\",\n", + " filters = [[('shape_array_key', \"==\", route),\n", + " ('stop_sequence', \"==\", stop_sequence)]],\n", + " )\n", + " return df" ] }, { "cell_type": "code", - "execution_count": null, - "id": "347a8e6b-162a-4079-b258-22017bad83e9", + "execution_count": 96, + "id": "fe8f800a-f180-4495-a387-0367528823ba", "metadata": {}, "outputs": [], "source": [ - "subset2 = [\"unusual_flag\", \"_gtfs_dataset_name\"] + subset" + "# stg2 = import_stage_2(analysis_date, test_route, test_sequence)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "9978f816-a208-4dac-9eb8-154ba9e58d6b", + "cell_type": "markdown", + "id": "4b5dec8d-c4f5-49dd-9a11-1b10ff30fb55", "metadata": { - "scrolled": true, "tags": [] }, - "outputs": [], - "source": [ - "high_df = few_routes_cat[few_routes_cat.unusual_flag == \"high\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3335a25d-8447-49b1-b885-ede8234ec16a", - "metadata": {}, - "outputs": [], "source": [ - "low_df = few_routes_cat[few_routes_cat.unusual_flag == \"low\"]" + "#### Look at vp trips -> import unique trips" ] }, { "cell_type": "code", - "execution_count": null, - "id": "21432512-a578-4541-bd5c-132e3985e062", + "execution_count": 97, + "id": "ade9e07f-0b55-4561-96d1-6fd6adec0f1a", "metadata": {}, "outputs": [], "source": [ - "stc_3 = few_routes_cat[\n", - " (few_routes_cat.stop_sequence == 3)\n", - " & (\n", - " few_routes_cat._gtfs_dataset_name\n", - " == \"Bay Area 511 Santa Clara Transit VehiclePositions\"\n", + "def import_unique_trips(gtfs_key:str, trip: str, route:str):\n", + " vp_trips = A1_sjoin_vp_segments.add_grouping_col_to_vp(\n", + " f\"vp_usable_{analysis_date}\",\n", + " analysis_date,\n", + " [\"shape_array_key\"]\n", " )\n", - "]" + " \n", + " # Filter to just one trip/route/operator\n", + " df = vp_trips[(vp_trips.gtfs_dataset_key == gtfs_key)\n", + " & (vp_trips.shape_array_key == route)\n", + " & (vp_trips.trip_id == trip)].reset_index(drop = True)\n", + " return df\n" ] }, { "cell_type": "code", - "execution_count": null, - "id": "44a2debc-f188-4411-95a7-c9f822ea7f3c", + "execution_count": 98, + "id": "c8003044-b7e4-477e-9395-fa881a2fa2b3", "metadata": {}, "outputs": [], "source": [ - "test1 = few_routes_cat[\n", - " (few_routes_cat.speed_flags == \"speed_ elapsed avg\")\n", - " & (few_routes_cat.meters_cat == \"meters_ elapsed low\")\n", - " & (few_routes_cat.seconds_cat == \"seconds_ elapsed low\")\n", - "]" + "# unique_trips = import_unique_trips(test_gtfs_key, test_trip, test_route)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "5f817b1c-f215-486c-ac3c-e75bc8fb9393", + "cell_type": "markdown", + "id": "52ce333b-9f75-4c9b-a1af-130c93786f94", "metadata": {}, - "outputs": [], "source": [ - "# test1[subset2].sample(100)" + "#### Look at vehicle positions" ] }, { "cell_type": "code", - "execution_count": null, - "id": "f7b6bb03-dc37-478b-a1d5-ab5f2e59a556", - "metadata": { - "scrolled": true, - "tags": [] - }, + "execution_count": 99, + "id": "ac6e78c3-694d-4297-8db1-f0f4d6faadbf", + "metadata": {}, "outputs": [], "source": [ - "# high_df[subset2].sample(10)" + "def import_vehicle_positions(unique_trips:pd.DataFrame, gtfs_key:str, trip_id:str)-> gpd.GeoDataFrame:\n", + " vp = helpers.import_vehicle_positions(\n", + " SEGMENT_GCS,\n", + " f\"vp_usable_{analysis_date}/\",\n", + " filters = [[(\"gtfs_dataset_key\", \"==\", gtfs_key),\n", + " ('trip_id', '==', trip_id)]],\n", + " columns = [\"gtfs_dataset_key\", \"trip_id\", \n", + " \"vp_idx\", \"x\", \"y\"],\n", + " partitioned = True\n", + " )\n", + " \n", + " vp = vp.compute()\n", + " vp = vp.merge(unique_trips, on = [\"gtfs_dataset_key\", \"trip_id\"],\n", + " how = \"inner\"\n", + " )\n", + " \n", + " vp_gdf = gpd.GeoDataFrame(\n", + " vp, \n", + " geometry = gpd.points_from_xy(vp.x, vp.y, crs = \"EPSG:4326\")\n", + " ).to_crs(PROJECT_CRS).drop(columns = [\"x\", \"y\"])\n", + " \n", + " return vp_gdf" ] }, { "cell_type": "code", - "execution_count": null, - "id": "58b91d5d-9f9a-47e9-a5fe-b7f7e0e64587", + "execution_count": 100, + "id": "b47ea0cb-6031-4d98-b963-efbef949d169", "metadata": {}, "outputs": [], "source": [ - "metro_62 = few_routes_cat[\n", - " (few_routes_cat.stop_sequence == 62)\n", - " & (few_routes_cat._gtfs_dataset_name == \"LA Metro Bus Vehicle Positions\")\n", - "]" + "#vehicle_positions = import_vehicle_positions(unique_trips, test_gtfs_key, test_trip)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "43ff6e38-5084-463f-a444-68e3bffdd7cc", - "metadata": { - "scrolled": true, - "tags": [] - }, + "execution_count": 101, + "id": "f0f96480-8328-43ed-9add-0a74b533fc8d", + "metadata": {}, "outputs": [], "source": [ - "stop_16 = few_routes_cat[\n", - " (few_routes_cat.stop_sequence == 16)\n", - " & (few_routes_cat._gtfs_dataset_name == \"Bay Area 511 Muni VehiclePositions\")\n", - "]" + "#len(vehicle_positions)" ] }, { "cell_type": "markdown", - "id": "e01d88e4-a3e2-43f1-9591-afe83e8a92cf", + "id": "896e00be-27f2-43b7-9cb4-69d61a061af0", "metadata": {}, "source": [ - "#### Should filter even further." + "#### Look at segments" ] }, { "cell_type": "code", - "execution_count": null, - "id": "bdd2b274-87a7-4306-8494-f65416ac88fb", + "execution_count": 102, + "id": "17bb5083-ace2-4e2d-8400-3bf948625909", "metadata": {}, "outputs": [], "source": [ - "high_low_zero = few_routes_cat[\n", - " few_routes_cat.unusual_flag.isin([\"high\", \"low\"])\n", - "].reset_index()" + "def import_segments(flagged_df: pd.DataFrame, route:str, gtfs_key:str) -> gpd.GeoDataFrame:\n", + " \n", + " # Load in ALL segments, flag them.\n", + " gdf = gpd.read_parquet(f\"{SEGMENT_GCS}stop_segments_{analysis_date}.parquet\",\n", + " filters = [[(\"shape_array_key\", \"==\", route),\n", + " (\"gtfs_dataset_key\", \"==\", gtfs_key),\n", + " ]]).to_crs(PROJECT_CRS)\n", + " \n", + " gdf[\"geometry_buffered\"] = gdf.geometry.buffer(35)\n", + " gdf = gdf.set_geometry('geometry_buffered')\n", + " \n", + " # Distinguish between \"correct\" and \"incorrect\" seq\n", + " # A sequence can be incorrect even if just one row is \"divided by 0\"\n", + " incorrect_segments = flagged_df[(flagged_df.shape_array_key == route) & (flagged_df.gtfs_dataset_key == gtfs_key)]\n", + " incorrect_segments_list = incorrect_segments.stop_sequence.unique().tolist()\n", + " incorrect_segments_filtered = gdf[gdf.stop_sequence.isin(incorrect_segments_list)].reset_index(drop = True)\n", + " incorrect_segments_filtered['flag'] = 'contains 0m/0sec'\n", + " \n", + " # Filter for correct segments\n", + " correct_segments = flagged_df[~flagged_df.stop_sequence.isin(incorrect_segments_list)]\n", + " correct_segments_list = correct_segments.stop_sequence.unique().tolist()\n", + " correct_segments_filtered = gdf[gdf.stop_sequence.isin(correct_segments_list)].reset_index(drop = True)\n", + " correct_segments_filtered['flag'] = 'does not contain 0m/0sec'\n", + " \n", + " final = pd.concat([correct_segments_filtered, incorrect_segments_filtered])\n", + " \n", + " return final" ] }, { "cell_type": "code", - "execution_count": null, - "id": "ebea8b6f-a011-4996-be52-9e10ec1f8342", + "execution_count": 103, + "id": "9f3a302a-f604-49fe-ae9b-ee8db85466de", "metadata": {}, "outputs": [], "source": [ - "few_routes_cat.shape" + "flagged_segments = import_segments(m3, test_route, test_gtfs_key)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "987d3500-7594-4d12-a9d0-45d2c19999d9", + "execution_count": 104, + "id": "e8a14cfd-38b8-4326-9eac-a711f1a189e8", "metadata": {}, "outputs": [], "source": [ - "high_low_zero.shape" + "#segments = A1_sjoin_vp_segments.import_segments_and_buffer(\n", + " # f\"stop_segments_{analysis_date}\",\n", + "# 35,\n", + " # [\"shape_array_key\", \"stop_sequence\"]+ [\"seg_idx\", \"geometry\"]\n", + "#)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "630e745f-dd71-4cc2-bad6-e17c666900a8", + "execution_count": 105, + "id": "3c08c38e-7419-4ff6-b74f-5f15615e52c4", "metadata": {}, "outputs": [], "source": [ - "len(high_low_zero.drop_duplicates())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "daa82644-dc46-409a-83ab-a86ea996c356", - "metadata": {}, - "outputs": [], - "source": [ - "len(few_routes_cat)-len(high_low_zero)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8b912560-11ed-4aad-bcc3-ffb9e7966c24", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "# To plot\n", - "# all_trips = one_route3.melt(id_vars=[ '_gtfs_dataset_name','shape_array_key','trip_id', 'stop_sequence','gtfs_dataset_key','loop_or_inlining',\n", - "#'n_trips'], value_vars=['avg_speed_mph','speed_mph','p20_speed_mph', 'p80_speed_mph'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bc09e6d6-9811-4c50-9f61-e00af00f0b83", - "metadata": {}, - "outputs": [], - "source": [ - "# all_trips = all_trips.drop_duplicates(subset = [ '_gtfs_dataset_name','shape_array_key','stop_sequence','gtfs_dataset_key','variable','value']).reset_index(drop = True)" + "# segments = segments.compute()" ] }, { "cell_type": "markdown", - "id": "cde431f9-10ad-484f-b954-dd3c13a6e683", - "metadata": {}, - "source": [ - "#### Other ideas\n", - "* Show which stops are excluded from flags\n", - "* Show how many stops are dropped\n", - "* Show % of stops that were flagged compared to total stops." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aae6b6c0-ce47-4ca1-b104-68b39ebcf2ca", - "metadata": {}, - "outputs": [], - "source": [ - "high_low_zero2 = high_low_zero.melt(\n", - " id_vars=[\n", - " \"_gtfs_dataset_name\",\n", - " \"shape_array_key\",\n", - " \"trip_id\",\n", - " \"stop_sequence\",\n", - " \"gtfs_dataset_key\",\n", - " \"loop_or_inlining\",\n", - " \"n_trips\",\n", - " \"meters_cat\",\n", - " \"seconds_cat\",\n", - " \"unusual_flag\",\n", - " ],\n", - " value_vars=[\"avg_speed_mph\", \"speed_mph\", \"p20_speed_mph\", \"p80_speed_mph\"],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "767abd20-d030-42d3-b85f-6d3023d69b8a", + "id": "5a688361-e7a9-40f0-877d-3693be99a960", "metadata": {}, - "outputs": [], "source": [ - "high_low_zero2 = high_low_zero2.drop_duplicates(\n", - " subset=[\n", - " \"loop_or_inlining\",\n", - " \"_gtfs_dataset_name\",\n", - " \"shape_array_key\",\n", - " \"stop_sequence\",\n", - " \"gtfs_dataset_key\",\n", - " \"variable\",\n", - " \"value\",\n", - " ]\n", - ").reset_index(drop=True)" + "#### Stops kept: last and first" ] }, { "cell_type": "code", - "execution_count": null, - "id": "52e7ee5a-a40e-423e-ba4b-dea14de17982", + "execution_count": 106, + "id": "9ea744bf-8019-4b7c-988f-f95196b56435", "metadata": {}, "outputs": [], "source": [ - "high_low_zero2.shape" + "def find_first_last_points(route:str, trip:str, gtfs_key:str):\n", + " df = pd.read_parquet(f\"{SEGMENT_GCS}vp_pared_stops_{analysis_date}\",\n", + " filters = [[('shape_array_key', \"==\", route),\n", + " \n", + " ('trip_id', \"==\", trip), \n", + " ('gtfs_dataset_key', '==', gtfs_key)]],)\n", + " \n", + " gdf = gpd.GeoDataFrame(\n", + " df, \n", + " geometry = gpd.points_from_xy(df.x, df.y, crs = \"EPSG:4326\")\n", + " ).to_crs(PROJECT_CRS).drop(columns = [\"x\", \"y\"])\n", + " \n", + " gdf = gdf[['geometry','stop_sequence']]\n", + " \n", + " return gdf" ] }, { "cell_type": "code", - "execution_count": null, - "id": "229c6d49-58e1-45bd-839f-b03f7e8cdd4a", + "execution_count": 107, + "id": "7c7aa90f-3e80-472e-b61d-94afd6c0ec01", "metadata": {}, "outputs": [], "source": [ - "def stops_info(original: pd.DataFrame, filtered_unusual_stops: pd.DataFrame):\n", - "\n", - " subset = [\"_gtfs_dataset_name\", \"gtfs_dataset_key\", \"shape_array_key\"]\n", - "\n", - " def aggregate(df, total_trip_column_name: str):\n", - " agg = (\n", - " df.groupby(subset)\n", - " .agg({\"stop_sequence\": \"count\"})\n", - " .reset_index()\n", - " .rename(columns={\"stop_sequence\": total_trip_column_name})\n", - " )\n", - "\n", - " return agg\n", - "\n", - " total_unusual_stops = aggregate(filtered_unusual_stops, \"total_unusual_stops\")\n", - " total_stops = aggregate(original, \"total_stops\")\n", - "\n", - " # Merge them\n", - " merge1 = pd.merge(total_unusual_stops, total_stops, on=subset, how=\"inner\")\n", - "\n", - " # Add some columns\n", - " merge1[\"percent_of_unusual_stops\"] = ((merge1.total_unusual_stops / merge1.total_stops) * 100).astype(int)\n", - " \n", - " merge1[\"Percentage of Unusual Stops\"] = \"% of Unusual Stops: \" + merge1.percent_of_unusual_stops.astype(str)\n", - "\n", - " # Add dropdown menu\n", - " merge1[\"Dropdown Menu\"] = merge1._gtfs_dataset_name + \" \" + merge1.shape_array_key\n", - "\n", - " # Clean\n", - " merge1 = threshold_utils.pre_clean(merge1)\n", - "\n", - " return merge1" + "# first_last = find_first_last_points(test_route, test_trip, test_gtfs_key)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "4a22aba2-00ee-4f11-bcf5-c3001cc34b9f", + "execution_count": 108, + "id": "5634169a-f26b-4174-b46a-3aa872bc1bdb", "metadata": {}, "outputs": [], "source": [ - "# Do not use melted version of the dataframe for second argument\n", - "stop_info = stops_info(merge1, high_low_zero)" + "# len(first_last)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "23e7f746-b1b5-402f-92fd-dbc74840e013", + "cell_type": "markdown", + "id": "9aba7f4e-2b1a-4f1b-aaf2-1bc7bb3cf221", "metadata": {}, - "outputs": [], "source": [ - "merge1.shape_array_key.nunique(), high_low_zero.shape_array_key.nunique()" + "#### Sjoin " ] }, { "cell_type": "code", - "execution_count": null, - "id": "7049c7e9-85bc-43c3-8f0c-7111fabbe649", + "execution_count": 109, + "id": "d535d059-efd9-49dd-9759-8663679ad5e1", "metadata": {}, "outputs": [], "source": [ - "stop_info.shape" + "def sjoin_vp_segments(segments: gpd.GeoDataFrame, vp_gdf: gpd.GeoDataFrame):\n", + " vp_in_seg = gpd.sjoin(\n", + " vp_gdf,\n", + " segments,\n", + " how = \"inner\",\n", + " predicate = \"within\"\n", + " )\n", + " \n", + " \n", + " return vp_in_seg" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "c4fc643f-a156-419e-ad35-8ebc6da5d075", + "cell_type": "markdown", + "id": "1aeb604b-249b-41e7-be71-fe8d3205e54a", "metadata": { "tags": [] }, - "outputs": [], - "source": [ - "stop_info.sort_values(['Percent Of Unusual Stops'], ascending = False).head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a25f067-956e-46a7-aa7a-5abf57e662f6", - "metadata": {}, - "outputs": [], "source": [ - "# Clean\n", - "high_low_zero2 = threshold_utils.pre_clean(high_low_zero2)" + "#### Mapping" ] }, { "cell_type": "code", - "execution_count": null, - "id": "b5eea02c-05ec-4707-a06d-9de1864e8fbe", + "execution_count": 124, + "id": "1a785180-38bb-4d33-a96e-3385c84ed2f1", "metadata": {}, "outputs": [], "source": [ - "# Add dropdown menu\n", - "high_low_zero2[\"Dropdown Menu\"] = (\n", - " high_low_zero2[\"Gtfs Dataset Name\"] + \" \" + high_low_zero2[\"Shape Array Key\"]\n", - ")" + "def display_maps(all_points: gpd.GeoDataFrame, \n", + " first_last_points: gpd.GeoDataFrame,\n", + " segments: gpd.GeoDataFrame,\n", + " sjoin_results: gpd.GeoDataFrame):\n", + " \n", + " base1 = segments.explore('flag', cmap= 'tab10', height = 400, width = 600, name = 'segments')\n", + " all_points_map = all_points.explore(m = base1, color = 'red',style_kwds = {'weight':6}, name= 'points')\n", + " \n", + " print('ALL POINTS')\n", + " display(all_points_map) \n", + " \n", + " \n", + " # Right left geo\n", + " sjoin_points = sjoin_results.set_geometry('geometry_left')\n", + " sjoin_segments = sjoin_results.set_geometry('geometry_right')\n", + " sjoin_segments.geometry_right = sjoin_segments.geometry_right.buffer(35)\n", + " base3 = sjoin_segments.explore('flag', cmap= 'tab10', height = 400, width = 600, name = 'segments')\n", + " sjoin_map = sjoin_points.explore(m = base3, color = 'orange',style_kwds = {'weight':6}, name= 'points')\n", + " \n", + " print('SJOIN')\n", + " display(sjoin_map)\n", + " \n", + " base2 = segments.explore('flag', cmap= 'tab10', height = 400, width = 600, name = 'segments')\n", + " first_last_map = first_last_points.explore(m = base2, color = 'pink',style_kwds = {'weight':6},height = 400, width = 600,)\n", + " \n", + " print('FIRST AND LAST')\n", + " display(first_last_map)\n", + " " ] }, { "cell_type": "code", - "execution_count": null, - "id": "571b471f-4a66-474f-8900-c3eaffde441e", + "execution_count": 125, + "id": "bab6bfc4-3d08-46fe-be33-6179ac5df34d", "metadata": {}, "outputs": [], "source": [ - "high_low_zero2[\"Route Type\"] = \"Route Type: \" + high_low_zero2[\n", - " \"Loop Or Inlining\"\n", - "].astype(str)" + "# display_maps(vehicle_positions,first_last,flagged_segments)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "072f39d9-4bf9-4efc-a393-24f096cecf7e", + "cell_type": "markdown", + "id": "f7ed9dc6-80ce-46f9-ae59-a5ed2bbcad50", "metadata": {}, - "outputs": [], "source": [ - "def alt_dropdown(df, col_for_dropdown: str, dropdown_menu_title: str):\n", - " # Create dropdown menu\n", - " # Exclude \"none\" operators which are only scheduled data\n", - " df = df.loc[df[col_for_dropdown] != \"None\"][[col_for_dropdown]]\n", - " dropdown_list = df[col_for_dropdown].unique().tolist()\n", + "#### Function\n", "\n", - " # Show only first operator by default\n", - " initialize_first_op = sorted(dropdown_list)[0]\n", - " input_dropdown = alt.binding_select(\n", - " options=sorted(dropdown_list), name=dropdown_menu_title\n", - " )\n", + "Previous tried routes\n", + "test_route = \"106d979b9a9e6338827a8e1c145e69fd\"\n", + "test_sequence = 39\n", + "test_gtfs_key = \"db56b50ab86b5f7a4ae2fc2dd9889bbe\"\n", + "test_trip = '1088405'\n", "\n", - " selection = alt.selection_single(\n", - " name=dropdown_menu_title,\n", - " fields=[col_for_dropdown],\n", - " bind=input_dropdown,\n", - " init={col_for_dropdown: initialize_first_op},\n", - " )\n", + "test_route2 = \"0fb4f3627996269dc7075276d3b69e36\"\n", + "test_gtfs_key2 = \"a4f6fd5552107e05fe9743ac7cce2c55\"\n", + "test_trip2 = \"16939095\"\n", "\n", - " return selection" + "test_route3 = \"07c9a47264a43d8d0d16ef7109e8fd68\"\n", + "test_gtfs_key3 = \"db56b50ab86b5f7a4ae2fc2dd9889bbe\"\n", + "test_trip3 = \"1089348\"" ] }, { "cell_type": "code", - "execution_count": null, - "id": "b7b429be-057c-4692-927e-92107b015ae6", - "metadata": {}, - "outputs": [], - "source": [ - "selection_test = alt_dropdown(high_low_zero2, \"Dropdown Menu\", \"Route\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2189ef37-baa1-4447-802b-f21362e73e03", + "execution_count": 126, + "id": "509bc2c0-14f3-4021-baf1-6d89a2409a79", "metadata": {}, "outputs": [], "source": [ - "alt.data_transformers.enable('default', max_rows=800000)" + "def stage2_trouble_shooting(flagged_df:pd.DataFrame,\n", + " date:str, \n", + " route:str, \n", + " trip:str, \n", + " gtfs_key:str):\n", + " unique_trips = import_unique_trips(gtfs_key, trip, route)\n", + " \n", + " # Find all recorded vps\n", + " vehicle_positions = import_vehicle_positions(unique_trips, gtfs_key, trip)\n", + " \n", + " # Flag segments, whether one row contains 1+ 0/0 division or not\n", + " flagged_segments = import_segments(flagged_df, route, gtfs_key)\n", + " \n", + " # Find first and last pt kept\n", + " first_last = find_first_last_points(route, trip, gtfs_key)\n", + " \n", + " # Sjoin \n", + " sjoin_results = sjoin_vp_segments(flagged_segments,vehicle_positions)\n", + " \n", + " # Display maps\n", + " display_maps(vehicle_positions,first_last,flagged_segments,sjoin_results)\n", + " " ] }, { - "cell_type": "code", - "execution_count": null, - "id": "80014c5e-695d-4280-89cd-4e7e2bb3d302", + "cell_type": "markdown", + "id": "056dc4ec-dde7-4f5a-bcd6-8ebd5b9d6982", "metadata": {}, - "outputs": [], "source": [ - "alt.data_transformers.disable_max_rows()" + "#### Example Trip 1" ] }, { "cell_type": "code", - "execution_count": null, - "id": "4c975495-e436-4914-ace3-3f3c361a2c66", - "metadata": {}, + "execution_count": 127, + "id": "2bd26525-b824-4b1a-a2bd-817b2207e3fe", + "metadata": { + "scrolled": true, + "tags": [] + }, "outputs": [], "source": [ - "high_low_zero2.columns" + "# subset[(subset.stop_sequence == test_sequence) & (subset.shape_array_key == test_route)]" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "7a4ae095-d010-46b5-80b2-0bbe948f249f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ALL POINTS\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SJOIN\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FIRST AND LAST\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "stage2_trouble_shooting(flagged_df= m3,\n", + " date = analysis_date,\n", + " route = test_route,\n", + " trip = test_trip,\n", + " gtfs_key = test_gtfs_key)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "e185065f-1d65-4720-9e72-a229c98fd1bd", + "cell_type": "markdown", + "id": "38c6d374-155d-4fcc-985a-7c892eaecb46", "metadata": {}, - "outputs": [], "source": [ - "len(high_low_zero2[['Route Type']].drop_duplicates())" + "#### Example Trip 2" ] }, { "cell_type": "code", - "execution_count": null, - "id": "aa0fad8b-b49e-48be-8070-adaf6e63d541", + "execution_count": 129, + "id": "268c5f17-df4a-4d84-b3b0-d76a2727bcf7", "metadata": {}, "outputs": [], "source": [ - "# https://github.com/altair-viz/altair/issues/1168\n", - "title = (\n", - " alt.Chart(high_low_zero2)\n", - " .mark_text(dy=-40, size=15, fontWeight=\"normal\")\n", - " .encode(\n", - " text=\"Route Type:N\",\n", - " )\n", - " .add_selection(selection_test)\n", - " .transform_filter(selection_test)\n", - ")" + "test_route2 = \"805fef558a9bf81d57143cab635b27b1\"\n", + "test_gtfs_key2 = \"c0e3039da063db95ebabd3fe4ee611a4\"\n", + "test_trip2 = \"11083276_M11\"" ] }, { "cell_type": "code", - "execution_count": null, - "id": "1042a774-165c-4f7e-bfc9-c4d4980bd29b", + "execution_count": 141, + "id": "c0914a84-b24d-442f-aaca-acf401b9209c", "metadata": {}, "outputs": [], "source": [ - "total_stops_altair = (\n", - " alt.Chart(stop_info)\n", - " .mark_text(dy=-40, size=15, fontWeight=\"normal\")\n", - " .encode(\n", - " text=\"Percentage Of Unusual Stops:N\",\n", - " )\n", - " .add_selection(selection_test)\n", - " .transform_filter(selection_test)\n", - ")" + "# m1[(m1.stop_sequence == 17) & (m1.shape_array_key == test_route2)]" ] }, { "cell_type": "code", - "execution_count": null, - "id": "bbc75adc-739d-43b7-b29a-2018f98966f4", + "execution_count": 131, + "id": "deaa6fdf-37b8-49ee-97f2-46f74d41a449", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ALL POINTS\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SJOIN\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FIRST AND LAST\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "total_stops_altair" + "stage2_trouble_shooting(flagged_df= m3,\n", + " date = analysis_date,\n", + " route = test_route2,\n", + " trip = test_trip2,\n", + " gtfs_key = test_gtfs_key2)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "6067e93b-3519-45fc-b027-11cbcc82d80f", + "cell_type": "markdown", + "id": "27d10dab-c7b0-4ddd-b70b-b3c6b7b3e579", "metadata": {}, - "outputs": [], "source": [ - "main_chart = (\n", - " threshold_utils.chart_size(\n", - " alt.Chart(high_low_zero2)\n", - " .mark_tick(\n", - " size=15,\n", - " thickness=5,\n", - " )\n", - " .encode(\n", - " x=\"Stop Sequence:N\",\n", - " y=\"Value:Q\",\n", - " color=alt.Color(\n", - " \"Variable:N\", scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS)\n", - " ),\n", - " tooltip=high_low_zero2.columns.tolist(),\n", - " )\n", - " .interactive(),\n", - " 1100,\n", - " 400,\n", - " )\n", - " .add_selection(selection_test)\n", - " .transform_filter(selection_test)\n", - ")" + "#### Example Trip 3" ] }, { "cell_type": "code", - "execution_count": null, - "id": "c71654d2-48f1-4cfb-a3a2-d22bc85d97a8", + "execution_count": 150, + "id": "f951ac97-43af-452f-9cb7-d40f71c114c9", "metadata": {}, "outputs": [], "source": [ - "main_chart" + "test_route3 = \"edc5ab1a2be1d269306161ce38e0b2ad\"\n", + "test_gtfs_key3 = \"c0e3039da063db95ebabd3fe4ee611a4\"\n", + "test_trip3 = \"11042148_M11\"" ] }, { "cell_type": "code", - "execution_count": null, - "id": "c77e2503-e211-48cc-a220-d96f82ab72df", + "execution_count": 151, + "id": "26e85057-05a9-4606-af4a-7be3e08ae2a2", "metadata": {}, "outputs": [], "source": [ - "(title & total_stops_altair | main_chart)" + "# subset[(subset.stop_sequence == 34) & (subset.shape_array_key == test_route3)]" ] }, { "cell_type": "code", - "execution_count": null, - "id": "3e9e3056-137a-451e-a566-52e085499407", - "metadata": {}, - "outputs": [], - "source": [ - "print('hi')" - ] - }, - { - "cell_type": "markdown", - "id": "5816ef70-bc10-4b87-b587-73f3789c5674", + "execution_count": 152, + "id": "91d07c20-9d78-4eea-8b9c-293df8ade5a3", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ALL POINTS\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SJOIN\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FIRST AND LAST\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "### Charts \n", - "Test with a few routes first\n", - "* Create new col that rounds up speed for plotting purposes only." + "stage2_trouble_shooting(flagged_df= m3,\n", + " date = analysis_date,\n", + " route = test_route3,\n", + " trip = test_trip3,\n", + " gtfs_key = test_gtfs_key3)" ] }, { "cell_type": "markdown", - "id": "c1b099f7-c8e3-4c37-a70d-e765763448d7", - "metadata": { - "tags": [] - }, - "source": [ - "#### Manipulate DF for charts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a6401179-2b91-444c-8c80-b97659f4225e", - "metadata": {}, - "outputs": [], - "source": [ - "m1 =" - ] - }, + "id": "06df58c1-c7b7-4769-bd8d-696e337eefb3", + "metadata": {}, + "source": [ + "### Stage1: \"vp_usable\"" + ] + }, { - "cell_type": "code", - "execution_count": null, - "id": "576c28b2-8c36-48a2-b312-9fc54010f7b5", + "cell_type": "code", + "execution_count": 153, + "id": "b9eab37f-0569-4f07-9113-87200b0c7dfd", "metadata": {}, - "outputs": [], + "outputs": [], + "source": [ + "# What's the diff between stop segments normal/special/and without any notation?\n", + "usable = pd.read_parquet(f\"{SEGMENT_GCS}vp_usable_{analysis_date}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "id": "deb486c8-a800-485e-8a46-d994af1c0074", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_gtfs_dataset_nametrip_idlocation_timestamplocation_timestamp_localactivity_datehourxyvp_idxgtfs_dataset_key
8658254San Diego Vehicle Positions168484052023-04-12 15:37:42+00:002023-04-12 08:37:422023-04-128-117.1432.798658254a4f6fd5552107e05fe9743ac7cce2c55
\n", + "
" + ], + "text/plain": [ + " _gtfs_dataset_name trip_id location_timestamp \\\n", + "8658254 San Diego Vehicle Positions 16848405 2023-04-12 15:37:42+00:00 \n", + "\n", + " location_timestamp_local activity_date hour x y vp_idx \\\n", + "8658254 2023-04-12 08:37:42 2023-04-12 8 -117.14 32.79 8658254 \n", + "\n", + " gtfs_dataset_key \n", + "8658254 a4f6fd5552107e05fe9743ac7cce2c55 " + ] + }, + "execution_count": 154, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "test1 = m1.melt(\n", - " id_vars=[\n", - " \"_gtfs_dataset_name\",\n", - " \"shape_array_key\",\n", - " \"trip_id\",\n", - " \"sorted_stop_seq\",\n", - " \"gtfs_dataset_key\",\n", - " \"loop_or_inlining\",\n", - " \"n_trips\",\n", - " ],\n", - " value_vars=[\"avg_speed_mph\", \"speed_mph\", \"p20_speed_mph\", \"p80_speed_mph\"],\n", - ")" + "usable.sample()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "2b55cb75-8ee8-477c-95bc-439d2ee65962", + "execution_count": 155, + "id": "813ae4db-0fef-4f10-9408-7284fc531ed2", "metadata": {}, "outputs": [], "source": [ - "test1.shape" - ] + "m_cols2 = ['gtfs_dataset_key',\n", + " 'trip_id']" + ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "29dc19fd-0d8b-430d-a78a-a1c947263ef0", + { + "cell_type": "code", + "execution_count": 156, + "id": "d08fa8db-f3a3-43f2-a763-a39cacc9cf9c", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'subset_for_merge2' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[156], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msubset_for_merge2\u001b[49m\u001b[38;5;241m.\u001b[39mhead()\n", + "\u001b[0;31mNameError\u001b[0m: name 'subset_for_merge2' is not defined" + ] + } + ], + "source": [ + "subset_for_merge2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68a59632-cfbd-43fd-aca4-a502e400a854", "metadata": { "scrolled": true, "tags": [] }, "outputs": [], "source": [ - "# test1[test1.shape_array_key == \"29d2bbdbeaec1d6888800f85bebf6e33\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e2c00548-7615-409c-a7cc-7db5f16c9a88", - "metadata": {}, - "outputs": [], - "source": [ - "# Only need average speed/p20 speed/p80 to show up once for each stop sequence-operator-shape array\n", - "test2 = test1.drop_duplicates(\n", - " subset=[\n", - " \"_gtfs_dataset_name\",\n", - " \"shape_array_key\",\n", - " \"sorted_stop_seq\",\n", - " \"gtfs_dataset_key\",\n", - " \"variable\",\n", - " \"value\",\n", - " ]\n", - ").reset_index(drop=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "436aca32-61c9-4488-8fae-26fe66688851", - "metadata": {}, - "outputs": [], - "source": [ - "# test2.to_csv(\"./speeds.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "be4bac6e-2d9f-4d64-a835-22bb4f3c32f5", - "metadata": {}, - "outputs": [], - "source": [ - "test2.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5723b75c-0cc2-42e7-95e0-17b444971ee2", - "metadata": {}, - "outputs": [], - "source": [ - "other = [\n", - " \"cf688717cf0cd8dac0e6d1f12f9c7333\",\n", - " \"6f39f818c9a0c5496cd1c8bd1aa11e67\",\n", - " \"3de4482ec32ba0f2edb451d3528b5a5e\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7b08d810-9e33-4a4f-b600-b5c8de9bdef6", - "metadata": {}, - "outputs": [], - "source": [ - "# Take out routes that have over 85 stops\n", - "# subset = test2[~test2.shape_array_key.isin(routes_many_stops_list)].reset_index(drop = True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "600d9ed7-524e-4af6-b90b-81fd89cf8c02", - "metadata": {}, - "outputs": [], - "source": [ - "subset = test2[\n", - " test2.shape_array_key.isin(\n", - " [\n", - " \"29d2bbdbeaec1d6888800f85bebf6e33\",\n", - " \"754c5b012195800c38dc58e72e4f482e\",\n", - " \"e3c5ed2c6fa6cd5c5cd57d46aeb3cd8e\",\n", - " ]\n", - " )\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a92360af-5dfb-43e2-b89c-87c8b8268665", - "metadata": {}, - "outputs": [], - "source": [ - "subset = threshold_utils.pre_clean(subset)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3a4c7ac2-4d6f-4702-a20b-0de69e0c86d4", - "metadata": {}, - "outputs": [], - "source": [ - "subset.columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52df69f0-7083-4551-b5f5-5dd88f30a69a", - "metadata": {}, - "outputs": [], - "source": [ - "subset.sample()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4f0d0a5e-186e-4b7e-b307-6e6326c3e747", - "metadata": {}, - "outputs": [], - "source": [ - "subset[\"Route\"] = subset[\"Gtfs Dataset Name\"] + \" \" + subset[\"Shape Array Key\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a8d96219-49d0-4251-9b4b-a6968542424d", - "metadata": {}, - "outputs": [], - "source": [ - "subset = subset.rename(columns={\"Value\": \"Speed\"})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b1155355-b4f6-41c2-beb1-f839d9d46027", - "metadata": {}, - "outputs": [], - "source": [ - "subset[\"Speed_Int\"] = subset.Speed.fillna(0).astype(int)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6f695b90-4157-42bd-b9cf-aaa4d71773ef", - "metadata": {}, - "outputs": [], - "source": [ - "subset[\"Route Type\"] = \"Loop or Inlining: \" + subset[\"Loop Or Inlining\"].astype(str)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0834a228-c47d-4760-ae13-aca72784e747", - "metadata": {}, - "outputs": [], - "source": [ - "# subset['Rounded Speed'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "67cb1df7-6337-478d-bd9c-3fbc0a3f698f", - "metadata": {}, - "outputs": [], - "source": [ - "def speed(row):\n", - " # If partner is none, return Unknown.\n", - " if row.Speed_Int == 0:\n", - " return 0\n", - " elif 0 < row.Speed_Int < 6:\n", - " return 5\n", - " elif 5 < row.Speed_Int < 11:\n", - " return 10\n", - " elif 10 < row.Speed_Int < 16:\n", - " return 15\n", - " elif 15 < row.Speed_Int < 21:\n", - " return 20\n", - " elif 20 < row.Speed_Int < 26:\n", - " return 25\n", - " elif 25 < row.Speed_Int < 31:\n", - " return 30\n", - " else:\n", - " return 35" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "de056258-240b-4aff-a3f9-aabe5330a9e0", - "metadata": {}, - "outputs": [], - "source": [ - "# Apply the function\n", - "subset[\"Rounded Speed\"] = subset.apply(speed, axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "afc90938-3fbd-4235-b6ab-68cfa445f0b6", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "# subset[['Rounded Speed', 'Speed', 'Speed_Int']]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9a39e842-70f6-4ff6-84b7-44b4dea48a40", - "metadata": {}, - "outputs": [], - "source": [ - "subset.Variable = subset.Variable.str.title().str.replace(\"_\", \" \")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a95e775c-4cff-44b6-a861-1a17efb5fbf2", - "metadata": {}, - "outputs": [], - "source": [ - "# One df for the actual speeds\n", - "subset_speedmph = subset[subset.Variable == \"Speed Mph\"].reset_index(drop=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "447aec9f-1a4a-408c-bc16-22c67746c262", - "metadata": {}, - "outputs": [], - "source": [ - "# One df for the percentiles\n", - "subset_other = subset[subset.Variable != \"Speed Mph\"].reset_index(drop=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2fd6ee4-c841-44fd-8659-0ee2bc8a926d", - "metadata": {}, - "outputs": [], - "source": [ - "selection_test = alt_dropdown(subset, \"Route\", \"Operator/Shape Array\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "27a02401-dc45-493b-baad-e4cb84b6db46", - "metadata": {}, - "outputs": [], - "source": [ - "title = title.add_selection(selection_test).transform_filter(selection_test)" - ] - }, - { - "cell_type": "markdown", - "id": "c4904d76-f74e-4fd5-84e1-cd3c4476d010", - "metadata": {}, - "source": [ - "#### Scatterplot" - ] - }, - { - "cell_type": "markdown", - "id": "9c8ae725-71da-4c21-a575-969d14a0aa17", - "metadata": {}, - "source": [ - "#### Jitter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e8c0fb00-e9a0-47c1-abc5-cea7a806a6df", - "metadata": {}, - "outputs": [], - "source": [ - "def create_jitter_plot(df):\n", - "\n", - " # title_op = df['Gtfs Dataset Name'].iloc[0].replace('VehiclePositions','').strip()\n", - " # inline = df['Loop Or Inlining'].iloc[0]\n", - " chart1 = (\n", - " alt.Chart(df, width=0.5)\n", - " .mark_circle(size=100)\n", - " .encode(\n", - " x=alt.X(\n", - " \"jitter:Q\",\n", - " title=None,\n", - " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n", - " scale=alt.Scale(),\n", - " ),\n", - " y=alt.Y(\n", - " \"Rounded Speed:Q\",\n", - " scale=alt.Scale(domain=[0, 50]),\n", - " title=\"Speed (MPH)\",\n", - " axis=alt.Axis(\n", - " labelAngle=360,\n", - " grid=False,\n", - " ),\n", - " ),\n", - " color=alt.Color(\n", - " \"Variable:N\",\n", - " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n", - " ),\n", - " tooltip=df.columns.tolist(),\n", - " column=alt.Column(\n", - " \"Sorted Stop Seq:N\",\n", - " header=alt.Header(\n", - " labelAngle=360,\n", - " titleOrient=\"top\",\n", - " labelOrient=\"top\",\n", - " labelAlign=\"right\",\n", - " labelPadding=2,\n", - " ),\n", - " ),\n", - " )\n", - " .transform_calculate(\n", - " # Generate Gaussian jitter with a Box-Muller transform\n", - " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", - " )\n", - " .properties(title=\"Speeds by Operator-Shape Array\")\n", - " )\n", - "\n", - " chart1 = threshold_utils.chart_size(chart1, 75, 200)\n", - "\n", - " return chart1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19b97ca6-17c9-4540-acfe-9a4caf740ef4", - "metadata": {}, - "outputs": [], - "source": [ - "chart1 = (\n", - " create_jitter_plot(subset_speedmph)\n", - " .add_selection(selection_test)\n", - " .transform_filter(selection_test)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c13568df-5e19-4f0a-a9fe-c3edeb0a0073", - "metadata": {}, - "outputs": [], - "source": [ - "chart2 = (\n", - " alt.Chart(subset_other, width=0.5)\n", - " .mark_circle(size=200)\n", - " .encode(\n", - " x=alt.X(\n", - " \"jitter:Q\",\n", - " title=None,\n", - " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n", - " scale=alt.Scale(),\n", - " ),\n", - " y=alt.Y(\n", - " \"Rounded Speed:Q\",\n", - " title=\"Speed (MPH)\",\n", - " scale=alt.Scale(domain=[0, 50]),\n", - " axis=alt.Axis(grid=False),\n", - " ),\n", - " color=alt.Color(\n", - " \"Variable:N\",\n", - " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n", - " ),\n", - " tooltip=subset_other.columns.tolist(),\n", - " column=alt.Column(\n", - " \"Sorted Stop Seq:N\",\n", - " header=alt.Header(\n", - " labelAngle=360,\n", - " title=None,\n", - " titleOrient=\"top\",\n", - " labelOrient=\"top\",\n", - " labelAlign=\"right\",\n", - " labelPadding=2,\n", - " ),\n", - " ),\n", - " )\n", - " .transform_calculate(\n", - " # Generate Gaussian jitter with a Box-Muller transform\n", - " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a2bda8b-d48c-44f3-a928-08aca894c565", - "metadata": {}, - "outputs": [], - "source": [ - "chart2 = threshold_utils.chart_size(chart2, 75, 200)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dd20293d-7c43-42c4-b053-de945860b6f0", - "metadata": {}, - "outputs": [], - "source": [ - "chart2 = chart2.add_selection(selection_test).transform_filter(selection_test)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "44dc6896-e95d-42df-bbd5-f2bb2c2a2cc6", - "metadata": {}, - "outputs": [], - "source": [ - "title = threshold_utils.chart_size(title, 20, 20)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d651c8c6-179c-4157-a671-11006cb419df", - "metadata": {}, - "outputs": [], - "source": [ - "alt.data_transformers.enable(\"default\", max_rows=None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8e753711-8506-4939-8d9d-22566a641988", - "metadata": {}, - "outputs": [], - "source": [ - "title & (chart1.interactive() & chart2.interactive())" - ] - }, - { - "cell_type": "markdown", - "id": "9745f078-1999-4e5c-9ddc-5eba135f55ab", - "metadata": {}, - "source": [ - "### Draft" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "64011717-6707-4a7a-8eec-9bdb0861bab6", - "metadata": {}, - "outputs": [], - "source": [ - "def meter_elapsed_categories(row):\n", - " lower_end = row[\"meters_mean\"] - row[\"meters_std\"]\n", - " higher_end = row[\"meters_mean\"] + row[\"meters_std\"]\n", - " if row[\"meters_elapsed\"] == row[\"meters_mean\"]:\n", - " return \"distance elapsed is average\"\n", - " elif row[\"meters_elapsed\"] <= lower_end:\n", - " return \"distance lapsed on lower end\"\n", - " elif row[\"meters_elapsed\"] >= higher_end:\n", - " return \"distance lapsed on higher end\"\n", - " elif lower_end < row[\"meters_elapsed\"] < higher_end:\n", - " return \"distance elapsed is average\"\n", - " else:\n", - " return \"other\"\n", - "\n", - "\n", - "def seconds_elapsed_categories(row):\n", - " lower_end = row[\"secs_mean\"] - row[\"secs_std\"]\n", - " higher_end = row[\"secs_mean\"] + row[\"secs_std\"]\n", - " if row[\"sec_elapsed\"] == row[\"secs_mean\"]:\n", - " return \"secs elapsed is average\"\n", - " elif row[\"sec_elapsed\"] <= lower_end:\n", - " return \"secs lapsed on lower end\"\n", - " elif row[\"sec_elapsed\"] >= higher_end:\n", - " return \"secs lapsed on higher end\"\n", - " elif lower_end < row[\"sec_elapsed\"] < higher_end:\n", - " return \"secs elapsed is average\"\n", - " else:\n", - " return \"other\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa26aa94-6e1c-46ef-8701-3f4a849faa7d", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"def mph_categories(row):\n", - " if (row[\"speed_mph\"] <= row[\"p20_speed_mph\"]):\n", - " return \"speed low\"\n", - " elif (row[\"p20_speed_mph\"] < row[\"speed_mph\"] < row[\"p80_speed_mph\"]):\n", - " return \"speed average\"\n", - " elif (row[\"speed_mph\"] >= row[\"p80_speed_mph\"]):\n", - " return \"speed high\"\n", - " elif (row[\"speed_mph\"] == 0):\n", - " return \"speed is 0\"\n", - " else:\n", - " return \"other\"\n", - " \"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e784cd39-b73e-478a-8b3b-458506c31a18", - "metadata": {}, - "outputs": [], - "source": [ - "def flag(row):\n", - "\n", - " # Ok rows\n", - " # If distance and time are average, flag as average\n", - " if (row[\"meters_cat\"] == \"meters_ elapsed avg\") & (\n", - " row[\"seconds_cat\"] == \"seconds_ elapsed avg\"\n", - " ):\n", - " return \"ok\"\n", - " # If MPH is average, flag as average\n", - " elif row[\"speed_flags\"] == \"speed_ elapsed avg\":\n", - " return \"ok\"\n", - "\n", - " # Zero rows\n", - " elif (\n", - " (row[\"speed_mph\"] == 0)\n", - " | (row[\"sec_elapsed\"] == 0)\n", - " | (row[\"meters_elapsed\"] == 0)\n", - " ):\n", - " return \"low\"\n", - "\n", - " # If meters and seconds are high, flag as average\n", - " # elif ((row[\"meters_cat\"] == \"meters_ elapsed high\") & (row[\"seconds_cat\"] == \"seconds_ elapsed high\")):\n", - " # return \"ok\"\n", - " # If meters and seconds are low, flag as average\n", - " # elif ((row[\"meters_cat\"] == \"meters_ elapsed low\") & (row[\"seconds_cat\"] == \"seconds_ elapsed low\")):\n", - " # return \"ok\"\n", - "\n", - " # Tag as high\n", - " # elif ((row[\"meters_cat\"] != \"meters_ elapsed avg\") & (row[\"seconds_cat\"] != \"seconds_ elapsed avg\") & (row[\"speed_flags\"] == \"speed_ elapsed high\")):\n", - " # return \"high\"\n", - " # elif ((row[\"seconds_cat\"] == \"seconds_ elapsed low\") & (row[\"speed_flags\"] == \"speed_ elapsed high\")):\n", - " # return \"high\"\n", - " # elif ((row[\"meters_cat\"] == \"meters_ elapsed high\") & (row[\"speed_flags\"] == \"speed_ elapsed high\")):\n", - " # return \"high\"\n", - "\n", - " # Tag as low\n", - " elif (\n", - " (row[\"meters_cat\"] != \"meters_ elapsed avg\")\n", - " & (row[\"seconds_cat\"] != \"seconds_ elapsed avg\")\n", - " & (row[\"speed_flags\"] == \"speed_ elapsed low\")\n", - " ):\n", - " return \"low\"\n", - " elif (row[\"seconds_cat\"] == \"seconds_ elapsed high\") & (\n", - " row[\"speed_flags\"] == \"speed_ elapsed low\"\n", - " ):\n", - " return \"low\"\n", - " elif (row[\"meters_cat\"] == \"meters_ elapsed avg\") & (\n", - " row[\"speed_flags\"] == \"speed_ elapsed low\"\n", - " ):\n", - " return \"high\"\n", - "\n", - " else:\n", - " return \"other\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05d629ed-abfd-477e-b87b-7e321a6d0831", - "metadata": {}, - "outputs": [], - "source": [ - "def speed_categories(row):\n", - " \"\"\"\n", - " Stricter thresholds for speed categories.\n", - " Just because a speed is below the 25th or\n", - " above the 75th percentile doesn't mean it\n", - " should be flagged. Take into account how far away\n", - " it is from that.\n", - " \"\"\"\n", - " # lower_end = (row[\"speed_mean\"] - row[\"speed_std\"])\n", - " # higher_end = (row[\"speed_mean\"] + row[\"speed_std\"])\n", - " if row[\"speed_mph\"] == row[\"avg_speed_mph\"]:\n", - " return \"average\"\n", - " elif row[\"speed_mph\"] <= lower_end:\n", - " return \"speed low\"\n", - " elif row[\"speed_mph\"] >= higher_end:\n", - " return \"speed high\"\n", - " elif (row[\"speed_mph\"] == 0) | (row[\"speed_mph\"] == None):\n", - " return \"speed is 0\"\n", - " else:\n", - " return \"average\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "86a5c48c-6720-41a8-b626-94305506ab3a", - "metadata": {}, - "outputs": [], - "source": [ - "# Determine if an agency has a small, medium, or large fleet size.\n", - "def categorize_by_percentile(df, column_percentile: str):\n", - "\n", - " # Get percentiles in objects for total vehicle.\n", - " p75 = df[column_percentile].quantile(0.75).astype(float)\n", - " p25 = df[column_percentile].quantile(0.25).astype(float)\n", - " p50 = df[column_percentile].quantile(0.50).astype(float)\n", - "\n", - " def percentile(row):\n", - " if row[column_percentile] <= p25:\n", - " return f\"{column_percentile}: low\"\n", - " elif (p25 < row[column_percentile]) and (row[column_percentile] <= p75):\n", - " return f\"{column_percentile}: average\"\n", - " elif row[column_percentile] > p75:\n", - " return f\"{column_percentile}: high\"\n", - " else:\n", - " return \"other\"\n", - "\n", - " df[f\"{column_percentile}_cat\"] = df.apply(lambda x: percentile(x), axis=1)\n", - "\n", - " return df\n", - "\n", - "\n", - "def categorize_all(df):\n", - "\n", - " # Hold results\n", - " final = pd.DataFrame()\n", - "\n", - " for column in [\"meters_elapsed\", \"sec_elapsed\"]:\n", - " for shape_array_key in df.shape_array_key.tolist():\n", - " for stop in df.stop_sequence.tolist():\n", - " filtered = df[\n", - " (df.shape_array_key == shape_array_key) & (df.stop_sequence == stop)\n", - " ].reset_index()\n", - " categorized = categorize_by_percentile(filtered, column)\n", - " final = pd.concat([final, categorized], axis=0)\n", - " print(f\"done for {column}/{stop}\")\n", - "\n", - " return final" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1bb82075-6269-472e-8582-27a7640f0aa5", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4f25a17e-cea1-4bff-888d-d8f37dc31775", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "p25 = troubleshoot.total_stops.quantile(0.25).astype(float)\n", - "p50 = troubleshoot.total_stops.quantile(0.50).astype(float)\n", - "p75 = troubleshoot.total_stops.quantile(0.75).astype(float)\n", - "p95 = troubleshoot.total_stops.quantile(0.95).astype(float)\n", - "p99 = troubleshoot.total_stops.quantile(0.99).astype(float)\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0e676189-df19-4630-8dc6-3c4fc8c60e16", - "metadata": {}, - "outputs": [], - "source": [ - "def stop_categories1(row):\n", - " if (row.total_stops > 0) and (row.total_stops <= p25):\n", - " return \"25th <= 17 stops\"\n", - " elif (row.total_stops > p25) and (row.total_stops <= p75):\n", - " return \"50th <= 30 stops\"\n", - " elif (row.total_stops > p75) and (row.total_stops <= p95):\n", - " return \"75th <= 50 stops\"\n", - " elif (row.total_stops > p95) and (row.total_stops <= p99):\n", - " return \"95th <= 85 stops\"\n", - " elif row.total_stops >= p95:\n", - " return \"99th >= 203 stops\"\n", - " else:\n", - " return \"other\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af72c1c9-df07-4e7f-973f-f0c07223b7a4", - "metadata": {}, - "outputs": [], - "source": [ - "def create_jitter_plot(df):\n", - "\n", - " title_op = df[\"Gtfs Dataset Name\"].iloc[0].replace(\"VehiclePositions\", \"\").strip()\n", - " inline = df[\"Loop Or Inlining\"].iloc[0]\n", - "\n", - " chart1 = (\n", - " alt.Chart(df, width=0.5)\n", - " .mark_circle(size=100)\n", - " .encode(\n", - " x=alt.X(\n", - " \"jitter:Q\",\n", - " title=None,\n", - " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n", - " scale=alt.Scale(),\n", - " ),\n", - " y=alt.Y(\"Rounded Speed:Q\", axis=alt.Axis(labelAngle=360)),\n", - " color=alt.Color(\n", - " \"Variable:N\",\n", - " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n", - " ),\n", - " tooltip=df.columns.tolist(),\n", - " column=alt.Column(\n", - " \"Stop Sequence:N\",\n", - " header=alt.Header(\n", - " labelAngle=360,\n", - " titleOrient=\"top\",\n", - " labelOrient=\"bottom\",\n", - " labelAlign=\"right\",\n", - " labelPadding=2,\n", - " ),\n", - " ),\n", - " )\n", - " .transform_calculate(\n", - " # Generate Gaussian jitter with a Box-Muller transform\n", - " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", - " )\n", - " .properties(title=f\"{title_op} - Route Type {inline}\")\n", - " )\n", - "\n", - " chart1 = threshold_utils.chart_size(chart1, 40, 250)\n", - "\n", - " return chart1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18ebd177-efc1-4bdc-9fac-bf947db810a4", - "metadata": {}, - "outputs": [], - "source": [ - "chart2 = (\n", - " alt.Chart(anaheim_test, width=0.5)\n", - " .mark_circle(size=100)\n", - " .encode(\n", - " x=alt.X(\n", - " \"jitter:Q\",\n", - " title=None,\n", - " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n", - " scale=alt.Scale(),\n", - " ),\n", - " y=alt.Y(\"rounded_speed:Q\", axis=alt.Axis(labelAngle=-90)),\n", - " color=alt.Color(\n", - " \"variable:N\",\n", - " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n", - " ),\n", - " tooltip=anaheim_test.columns.tolist(),\n", - " column=alt.Column(\n", - " \"stop_sequence:N\",\n", - " header=alt.Header(\n", - " labelAngle=360,\n", - " titleOrient=\"top\",\n", - " labelOrient=\"bottom\",\n", - " labelAlign=\"right\",\n", - " labelPadding=2,\n", - " ),\n", - " ),\n", - " )\n", - " .transform_calculate(\n", - " # Generate Gaussian jitter with a Box-Muller transform\n", - " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", - " )\n", - " .configure_facet(spacing=0)\n", - " .configure_view(stroke=None)\n", - " .properties(title=\"Trip Duration by RT Category\")\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7287305-20d8-434f-9756-42b18cf172a4", - "metadata": {}, - "outputs": [], - "source": [ - "chart2 = threshold_utils.chart_size(chart2, 80, 300)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3f9e9258-b954-45bf-bd7a-17822c9c607a", - "metadata": {}, - "outputs": [], - "source": [ - "chart2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea1d8514-ae9c-42e6-b59a-49a44fecbc98", - "metadata": {}, - "outputs": [], - "source": [ - "chart1 = (\n", - " alt.Chart(anaheim_test_speedmph, width=0.5)\n", - " .mark_circle(size=100)\n", - " .encode(\n", - " x=alt.X(\n", - " \"jitter:Q\",\n", - " title=None,\n", - " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n", - " scale=alt.Scale(),\n", - " ),\n", - " y=alt.Y(\"rounded_speed:Q\", axis=alt.Axis(labelAngle=-90)),\n", - " color=alt.Color(\n", - " \"stop_sequence:N\",\n", - " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n", - " ),\n", - " tooltip=anaheim_test.columns.tolist(),\n", - " column=alt.Column(\n", - " \"stop_sequence:N\",\n", - " header=alt.Header(\n", - " labelAngle=360,\n", - " titleOrient=\"top\",\n", - " labelOrient=\"bottom\",\n", - " labelAlign=\"right\",\n", - " labelPadding=2,\n", - " ),\n", - " ),\n", - " )\n", - " .transform_calculate(\n", - " # Generate Gaussian jitter with a Box-Muller transform\n", - " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", - " )\n", - " .configure_facet(spacing=0)\n", - " .configure_view(stroke=None)\n", - " .properties(title=f\"Loop/Inling:{anaheim_test_speedmph.loop_or_inlining.iloc[0]}\")\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4e1eea88-640b-4e96-b2d5-07cd2f19afdf", - "metadata": {}, - "outputs": [], - "source": [ - "chart1 = threshold_utils.chart_size(chart1, 80, 300)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c54559c6-8fb0-4109-8e7d-84c1ecb6497e", - "metadata": {}, - "outputs": [], - "source": [ - "chart1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "79375476-00a1-4b85-a532-26889fc465e4", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0e5c052e-cccd-4f75-be65-9658cf7616b1", - "metadata": {}, - "outputs": [], - "source": [ - "def create_dot_plot2(\n", - " df,\n", - " col_for_dots: str,\n", - " x_axis_col: str,\n", - " y_axis_col: str,\n", - " tooltip_cols: list,\n", - " chart_title: str,\n", - "):\n", - "\n", - " chart = (\n", - " alt.Chart(df)\n", - " .mark_circle(opacity=1, size=100)\n", - " .transform_window(id=\"rank()\", groupby=[col_for_dots])\n", - " .encode(\n", - " alt.X(\n", - " f\"{x_axis_col}:O\",\n", - " sort=\"descending\",\n", - " axis=alt.Axis(ticks=False, grid=True),\n", - " ),\n", - " alt.Y(f\"{y_axis_col}:N\"),\n", - " color=alt.Color(\n", - " f\"{col_for_dots}:N\",\n", - " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n", - " legend=None,\n", - " ),\n", - " tooltip=tooltip_cols,\n", - " )\n", - " .properties(title=chart_title)\n", - " )\n", - "\n", - " return chart" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e38644ab-7fb8-47b1-b692-512b55f05625", - "metadata": {}, - "outputs": [], - "source": [ - "chart3 = create_dot_plot1(\n", - " anaheim_test_other,\n", - " \"variable\",\n", - " \"stop_sequence\",\n", - " \"rounded_speed\",\n", - " anaheim_test_other.columns.tolist(),\n", - " \"Percentile/Average\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2b6df39b-b074-494d-8f5e-65fe05356f4b", - "metadata": {}, - "outputs": [], - "source": [ - "chart3 = threshold_utils.chart_size(chart3, 650, 300)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2c565de-3c55-4827-a5bb-b48c6d49d4c1", - "metadata": {}, - "outputs": [], - "source": [ - "chart4 = create_dot_plot2(\n", - " anaheim_test_speedmph,\n", - " \"variable\",\n", - " \"stop_sequence\",\n", - " \"rounded_speed\",\n", - " anaheim_test_speedmph.columns.tolist(),\n", - " \"Speed per Trip\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "29b3b7b1-8f44-4a8d-97c3-ff934250e874", - "metadata": {}, - "outputs": [], - "source": [ - "chart4 = threshold_utils.chart_size(chart4, 650, 300)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "54146c88-15df-49e2-9de6-18f2666d9f9f", - "metadata": {}, - "outputs": [], - "source": [ - "chart4" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3ccbc087-f799-4429-b6c7-a9c1f38f021d", - "metadata": {}, - "outputs": [], - "source": [ - "chart3 + chart4" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9829d749-580c-4440-9397-4562192417b7", - "metadata": {}, - "outputs": [], - "source": [ - "chart7 = (\n", - " alt.Chart(anaheim_test_other, width=0.5)\n", - " .mark_circle(size=100)\n", - " .encode(\n", - " x=alt.X(\n", - " \"jitter:Q\",\n", - " title=None,\n", - " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n", - " scale=alt.Scale(),\n", - " ),\n", - " y=alt.Y(\"rounded_speed:Q\", axis=alt.Axis(labelAngle=-90)),\n", - " color=alt.Color(\n", - " \"variable:N\",\n", - " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n", - " ),\n", - " tooltip=anaheim_test.columns.tolist(),\n", - " column=alt.Column(\n", - " \"stop_sequence:N\",\n", - " header=alt.Header(\n", - " labelAngle=-90,\n", - " titleOrient=\"top\",\n", - " labelOrient=\"bottom\",\n", - " labelAlign=\"right\",\n", - " labelPadding=2,\n", - " ),\n", - " ),\n", - " )\n", - " .transform_calculate(\n", - " # Generate Gaussian jitter with a Box-Muller transform\n", - " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", - " )\n", - " .configure_facet(spacing=0)\n", - " .configure_view(stroke=None)\n", - " .properties(title=\"Trip Duration by RT Category\")\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "454980fc-e6ab-4d53-b844-120a075b8034", - "metadata": {}, - "outputs": [], - "source": [ - "chart7 = threshold_utils.chart_size(chart7, 80, 300)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "58e57fb2-1c3b-47e1-8bb6-d4941f0ee985", - "metadata": {}, - "outputs": [], - "source": [ - "chart8 = (\n", - " alt.Chart(anaheim_test_other, width=0.5)\n", - " .mark_circle(size=100)\n", - " .encode(\n", - " x=alt.X(\n", - " \"jitter:Q\",\n", - " title=None,\n", - " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n", - " scale=alt.Scale(),\n", - " ),\n", - " y=alt.Y(\"rounded_speed:Q\", axis=alt.Axis(labelAngle=-90)),\n", - " color=alt.Color(\n", - " \"variable:N\",\n", - " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n", - " ),\n", - " tooltip=anaheim_test.columns.tolist(),\n", - " column=alt.Column(\n", - " \"stop_sequence:N\",\n", - " header=alt.Header(\n", - " labelAngle=-90,\n", - " titleOrient=\"top\",\n", - " labelOrient=\"bottom\",\n", - " labelAlign=\"right\",\n", - " labelPadding=2,\n", - " ),\n", - " ),\n", - " )\n", - " .transform_calculate(\n", - " # Generate Gaussian jitter with a Box-Muller transform\n", - " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", - " )\n", - " .properties(title=\"Trip Duration by RT Category\")\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2ce0879-f45f-4c66-9b1a-870a243a1260", - "metadata": {}, - "outputs": [], - "source": [ - "chart9 = (\n", - " alt.Chart(anaheim_test_speedmph, width=0.5)\n", - " .mark_circle(size=100)\n", - " .encode(\n", - " x=alt.X(\n", - " \"jitter:Q\",\n", - " title=None,\n", - " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n", - " scale=alt.Scale(),\n", - " ),\n", - " y=alt.Y(\"rounded_speed:Q\", axis=alt.Axis(labelAngle=-90)),\n", - " color=alt.Color(\n", - " \"variable:N\",\n", - " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n", - " ),\n", - " tooltip=anaheim_test.columns.tolist(),\n", - " column=alt.Column(\n", - " \"stop_sequence:N\",\n", - " header=alt.Header(\n", - " labelAngle=360,\n", - " titleOrient=\"top\",\n", - " labelOrient=\"bottom\",\n", - " labelAlign=\"right\",\n", - " labelPadding=2,\n", - " ),\n", - " ),\n", - " )\n", - " .transform_calculate(\n", - " # Generate Gaussian jitter with a Box-Muller transform\n", - " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", - " )\n", - " .properties(title=f\"Loop/Inling:{anaheim_test_speedmph.loop_or_inlining.iloc[0]}\")\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "04e3ce9a-a5ab-4aeb-a7d0-4ead6738324b", - "metadata": {}, - "outputs": [], - "source": [ - "chart8" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "04d2026e-c040-45ea-aab6-2c09c2211632", - "metadata": {}, - "outputs": [], - "source": [ - "chart9 | chart8" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "edb11b29-7038-4a5d-9e44-4d3b88e6cc88", - "metadata": {}, - "outputs": [], - "source": [ - "# pip install altair==5.0.0rc3\n", - "chart5 = (\n", - " alt.Chart(anaheim_test_speedmph, title=\"Normally distributed jitter\")\n", - " .mark_circle(size=50)\n", - " .encode(\n", - " y=\"rounded_speed:Q\",\n", - " x=\"stop_sequence:N\",\n", - " yOffset=\"jitter:Q\",\n", - " color=alt.Color(\"stop_sequence:Q\").legend(None),\n", - " )\n", - " .transform_calculate(\n", - " # Generate Gaussian jitter with a Box-Muller transform\n", - " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9bbcc880-7866-495c-9a68-3b7ca6c00c4a", - "metadata": {}, - "outputs": [], - "source": [ - "chart5 = threshold_utils.chart_size(chart5, 650, 300)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a765f55e-73e2-4bae-9326-98ed1bbfadaf", - "metadata": {}, - "outputs": [], - "source": [ - "chart5" - ] - }, - { - "cell_type": "markdown", - "id": "c911a105-147a-4b8a-a741-5b67a8cf710a", - "metadata": { - "tags": [] - }, - "source": [ - "#### Look at one trip" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f02f2c0b-f5fc-4452-b39b-bb1a443bc727", - "metadata": {}, - "outputs": [], - "source": [ - "# foothill_og = speed_stops2[speed_stops2.trip_id == \"t604-b2791-sl5\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "173615fa-c942-4d85-9ca8-64852b706d1f", - "metadata": {}, - "outputs": [], - "source": [ - "# len(foothill_og)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28521af6-c34e-4b20-9831-3177722b9b46", - "metadata": {}, - "outputs": [], - "source": [ - "# foothill_og.stop_sequence.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1136e4c7-2e5b-492d-8ae6-299a04164ac3", - "metadata": {}, - "outputs": [], - "source": [ - "# foothill_og.stop_sequence.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9cee6f4c-8786-478a-97ff-702be25d0788", - "metadata": {}, - "outputs": [], - "source": [ - "# foothill_og.sort_values('stop_sequence').head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "97ec3ff5-bcda-4edd-9ba6-50821923dd98", - "metadata": {}, - "outputs": [], - "source": [ - "# foothill_renumbered_stop_seq = m2[m2.trip_id == \"t604-b2791-sl5\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f2733f91-b890-4818-a649-d79bd6f9a16a", - "metadata": {}, - "outputs": [], - "source": [ - "# foothill_renumbered_stop_seq['Test Stop Sequence'].describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9b04672-2b10-4d06-b84a-1fd92a6a78ac", - "metadata": {}, - "outputs": [], - "source": [ - "# foothill_renumbered_stop_seq.sort_values('stop_sequence').head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9581ead1-5d91-44a4-a9d3-299445d55056", - "metadata": {}, - "outputs": [], - "source": [ - "# len(troubleshoot)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f50eb55-9373-4004-b23b-8129db734c1b", - "metadata": {}, - "outputs": [], - "source": [ - "# Number of test stops should match stop sequence...\n", - "# troubleshoot['sequences_are_equal'] = troubleshoot['Test Stop Sequence'] - troubleshoot['stop_sequence']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "46843106-2e31-46c9-9220-2f12a3e6a4fb", - "metadata": {}, - "outputs": [], - "source": [ - "# troubleshoot['sequences_are_equal'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "34d189fa-632c-40d7-a793-e576645e879e", - "metadata": {}, - "outputs": [], - "source": [ - "# Look at this trip id in the original df\n", - "# og_trip = speed_stops2[speed_stops2.trip_id == \"t640-b15FF1-sl5\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "34b1e2f4-fe20-4029-9ec1-e5cbaaf73178", - "metadata": {}, - "outputs": [], - "source": [ - "# Look at this trip id in the manipulated df\n", - "# new_trip = m2[m2.trip_id == \"t640-b15FF1-sl5\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "81190a32-907b-4a5d-818a-ab5c2740dbc3", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# og_trip.shape, og_trip.stop_sequence.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ae8d06ad-3016-4909-b766-c370ef074aae", - "metadata": {}, - "outputs": [], - "source": [ - "# new_trip.shape, new_trip.stop_sequence.nunique()" + "# m2[m2.trip_id == '1350']" ] } ],