diff --git a/rt_segment_speeds/12_speeds.ipynb b/rt_segment_speeds/12_speeds.ipynb
index 677f7b36f..b3ea92af5 100644
--- a/rt_segment_speeds/12_speeds.ipynb
+++ b/rt_segment_speeds/12_speeds.ipynb
@@ -2,26 +2,33 @@
"cells": [
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 36,
"id": "2c7feec3-aa18-42ab-94b9-cab4be608152",
"metadata": {},
"outputs": [],
"source": [
- "import _speed_utils as speed_utils\n",
"import datetime\n",
+ "import _speed_utils as speed_utils\n",
"import _threshold_utils as threshold_utils\n",
"import altair as alt\n",
"import dask.dataframe as dd\n",
"import geopandas as gpd\n",
"import pandas as pd\n",
"from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs\n",
- "from segment_speed_utils.project_vars import analysis_date\n",
+ "from segment_speed_utils.project_vars import (\n",
+ " COMPILED_CACHED_VIEWS,\n",
+ " PROJECT_CRS,\n",
+ " SEGMENT_GCS,\n",
+ " analysis_date,\n",
+ " CONFIG_PATH\n",
+ ")\n",
+ "from scripts import A1_sjoin_vp_segments\n",
"from shared_utils import calitp_color_palette as cp"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 37,
"id": "0108ae4a-4518-4487-85f7-a5faa3e9cbf6",
"metadata": {},
"outputs": [],
@@ -34,15 +41,12 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "1e9f79a4-5921-4e8c-82c5-3b414f677cf8",
- "metadata": {
- "tags": []
- },
+ "execution_count": 38,
+ "id": "80014c5e-695d-4280-89cd-4e7e2bb3d302",
+ "metadata": {},
"outputs": [],
"source": [
- "# Flag\n",
- "# routes_many_stops_df, routes_many_stops_list = speed_utils.find_shapes_with_many_stops(analysis_date)"
+ "# alt.data_transformers.disable_max_rows()"
]
},
{
@@ -55,2569 +59,1954 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "c3ab6f3f-2982-4466-aa76-06c7a235c62e",
+ "execution_count": 39,
+ "id": "2f0c5f4f-f419-42a8-8527-7060ed412092",
"metadata": {},
"outputs": [],
"source": [
- "avg_speeds = pd.read_parquet(\n",
- " f\"{speed_utils.GCS_PATH}avg_speeds_stop_segments_{analysis_date}.parquet\"\n",
- ").drop(columns=[\"geometry\", \"geometry_arrowized\", \"district\", \"district_name\"])"
+ "def merge_all_speeds(analysis_date:str) -> pd.DataFrame:\n",
+ " \"\"\"\n",
+ " Merge the avg_speeds_stop_segments and\n",
+ " speeds_stop_segments parquets.\n",
+ " \n",
+ " Args:\n",
+ " analysis_date: date string used in the parquet file names\n",
+ " \"\"\"\n",
+ " # Open up avg speeds\n",
+ " avg_speeds = pd.read_parquet(f\"{SEGMENT_GCS}avg_speeds_stop_segments_{analysis_date}.parquet\")\n",
+ " avg_speeds = avg_speeds.drop(columns=[\"geometry\", \"district\", \"district_name\"])\n",
+ " # Filter for all day flags\n",
+ " avg_speeds = avg_speeds[avg_speeds.time_of_day == 'all_day'].reset_index(drop = True)\n",
+ " \n",
+ " # Open up speeds\n",
+ " speeds = pd.read_parquet(f\"{SEGMENT_GCS}speeds_stop_segments_{analysis_date}\")\n",
+ " \n",
+ " merge_cols = ['gtfs_dataset_key','shape_array_key', 'stop_sequence']\n",
+ " m1 = pd.merge(avg_speeds, speeds, on = merge_cols, how = 'inner')\n",
+ " \n",
+ " m1 = m1.drop_duplicates().reset_index(drop = True)\n",
+ " \n",
+ " return m1"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "8185a464-5ca2-43bb-89f6-062ee01b5e2d",
+ "execution_count": 40,
+ "id": "84ac97bf-ee4f-4d85-b523-8a36823f9d9a",
"metadata": {},
"outputs": [],
"source": [
- "speeds = pd.read_parquet(f\"{speed_utils.GCS_PATH}speeds_stop_segments_{analysis_date}\")"
+ "m1 = merge_all_speeds(analysis_date)"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "01e1b2ad-edb7-4021-825f-612f070db139",
+ "execution_count": 41,
+ "id": "68950ae7-4061-47d6-ac48-5eac0b1f29c0",
"metadata": {},
"outputs": [],
"source": [
- "avg_speeds.sample()"
+ "# m1.shape"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "4623eaa7-4594-4155-859d-af997094c3de",
+ "execution_count": 42,
+ "id": "b04dfb8b-7476-49df-873a-cea75dc61763",
"metadata": {},
"outputs": [],
"source": [
- "speeds.sample()"
+ "\n",
+ "# Picked 4 random routes\n",
+ "sample_0_keys = [\n",
+ " \"0fb4f3627996269dc7075276d3b69e36\",\n",
+ " \"07c9a47264a43d8d0d16ef7109e8fd68\",\n",
+ " \"106d979b9a9e6338827a8e1c145e69fd\",\n",
+ " \"000624bd8453dbe4f2eb2765b04bcb98\",\n",
+ "]"
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "bbc7804a-550a-40fa-a25f-7694b057c9b7",
+ "cell_type": "markdown",
+ "id": "898e3546-5298-4c4f-87d0-ee1d1a10f07d",
"metadata": {},
- "outputs": [],
"source": [
- "merge_cols = [\"gtfs_dataset_key\", \"shape_array_key\", \"stop_sequence\"]\n",
- "merge1 = pd.merge(avg_speeds, speeds, on=merge_cols, how=\"inner\")"
+ "### Categorize"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "34c64e59-0379-4edf-a87f-ec621c0b668b",
+ "execution_count": 43,
+ "id": "e81e59fd-cc2f-408e-9148-1a1055425fc4",
"metadata": {},
"outputs": [],
"source": [
- "merge1.sample()"
+ "def categorize_by_percentile_pandas(\n",
+ " df: pd.DataFrame, column_percentile: str, column_str: str\n",
+ ") -> pd.DataFrame:\n",
+ "\n",
+ " # Find percentiles\n",
+ " p5 = df[column_percentile].quantile(0.05).astype(float)\n",
+ " p95 = df[column_percentile].quantile(0.95).astype(float)\n",
+ " \n",
+ " def rate(row):\n",
+ " if ((row[column_percentile] >= 0) and (row[column_percentile] <= p5)):\n",
+ " return f\"{column_str} is low\"\n",
+ " elif (row[column_percentile] >= p95):\n",
+ " return f\"{column_str} is high\"\n",
+ " else:\n",
+ " return f\"{column_str} is avg\"\n",
+ " \n",
+ " # Apply flags\n",
+ " df[f\"{column_str}cat\"] = df.apply(lambda x: rate(x), axis=1)\n",
+ " \n",
+ " # Clean\n",
+ " df[f\"{column_str}cat\"] = df[f\"{column_str}cat\"].str.replace(\"_\", \"\")\n",
+ "\n",
+ " print(f\"Done with {column_str}\")\n",
+ " \n",
+ " return df "
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "7fa72f28-fd46-4fd9-9f4a-120812a482da",
+ "execution_count": 44,
+ "id": "0dfb836d-f919-4f2b-a0d1-9e4a4713ba8a",
"metadata": {},
"outputs": [],
"source": [
- "segments_file = \"stop_segments\""
+ "# df1 = categorize_by_percentile_pandas(subset, \"meters_elapsed\", \"meters_\")"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "77f1e38e-6dc4-4e79-9eed-569375a133fe",
+ "execution_count": 45,
+ "id": "9f84205d-93db-49f3-be99-6b5014f7faeb",
"metadata": {},
"outputs": [],
"source": [
- "stop_segments = pd.read_parquet(\n",
- " f\"{speed_utils.GCS_PATH}{segments_file}_{analysis_date}.parquet\"\n",
- ").drop(columns=[\"geometry\", \"geometry_arrowized\"])"
+ "# df1.head()"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "cdaf396a-49d0-4da6-a3f8-3080f1f838b0",
+ "execution_count": 46,
+ "id": "b0d2184f-8a44-4489-a1b4-2be8317142f1",
"metadata": {},
"outputs": [],
"source": [
- "stop_segments.sample()"
+ "# df2 = categorize_by_percentile_pandas(df1, \"sec_elapsed\", \"sec_\")"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "b994dc1c-2a11-4376-9d43-f61db486e6eb",
+ "execution_count": 47,
+ "id": "940fb010-0dff-465e-bf8d-87dd3f4ba101",
"metadata": {},
"outputs": [],
"source": [
- "# pd.merge(stop_segments, merge1, on = ['gtfs_dataset_key','shape_array_key','stop_sequence','loop_or_inlining'], how = \"inner\", indicator = True)[['_merge']].value_counts()"
+ "# df2.head()"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "e510e28b-a179-41d7-b738-e50edb26d878",
+ "execution_count": 48,
+ "id": "9d38d541-5c9c-4d31-8986-9c3928eb2f59",
"metadata": {},
"outputs": [],
"source": [
- "merge1.shape"
+ "def categorize_meters_speeds_pandas()-> pd.DataFrame:\n",
+ " start = datetime.datetime.now()\n",
+ " print(start)\n",
+ " \n",
+ " df = merge_all_speeds(analysis_date)\n",
+ " \n",
+ " # Categorize\n",
+ " df1 = categorize_by_percentile_pandas(df, \"meters_elapsed\", \"meters_\")\n",
+ " df2 = categorize_by_percentile_pandas(df1, \"sec_elapsed\", \"sec_\")\n",
+ " \n",
+ " # Find size of categories\n",
+ " print(df2.groupby(['sec_cat','meters_cat']).size())\n",
+ "\n",
+ " # Keep only rows where meters are low or seconds are high\n",
+ " df2 = df2[(df2.meters_cat == 'meters is low') | (df2.sec_cat == 'sec is high')].reset_index(drop = True)\n",
+ " print(f\"{len(df2)} rows left after filtering for rows with either high seconds OR low meters\") \n",
+ " \n",
+ " def flag_round(row):\n",
+ " if (row[\"meters_elapsed\"] == 0) & (row[\"sec_elapsed\"] == 0):\n",
+ " return \"division by 0\"\n",
+ " elif row[\"meters_cat\"] == \"meters is low\":\n",
+ " return \"meters too low\"\n",
+ " elif row[\"sec_cat\"] == \"sec is high\":\n",
+ " return \"seconds too high\"\n",
+ " else:\n",
+ " return \"ok\"\n",
+ " \n",
+ " df2[\"flag\"] = df2.apply(lambda x: flag_round(x), axis=1)\n",
+ " print(df2.flag.value_counts())\n",
+ " \n",
+ " # Keep only the rows flagged as division by 0\n",
+ " df3 = df2[(df2.flag == 'division by 0')].reset_index(drop = True)\n",
+ " \n",
+ " end = datetime.datetime.now()\n",
+ " print(f\"Took {end-start}\")\n",
+ " return df3"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "6483a59e-1d44-4fa9-b057-8ce5230126c2",
+ "execution_count": 49,
+ "id": "7201d5e3-f765-4e5d-9bbd-aa6a336bcc0a",
"metadata": {},
"outputs": [],
"source": [
- "# m1 = speed_utils.merge_all_speeds(analysis_date)"
+ "# subset = m1[m1.shape_array_key.isin(sample_0_keys)].reset_index()"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "e9d651da-95f8-425b-9a7c-28781b70a595",
+ "execution_count": 50,
+ "id": "2c5107cb-c574-449b-95b6-fb205f38502e",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-06-30 10:15:15.239284\n",
+ "Done with meters_\n",
+ "Done with sec_\n",
+ "sec_cat meters_cat \n",
+ "sec is avg meters is avg 2415102\n",
+ " meters is high 70745\n",
+ " meters is low 139528\n",
+ "sec is high meters is avg 57245\n",
+ " meters is high 83074\n",
+ " meters is low 13695\n",
+ "sec is low meters is low 296973\n",
+ "dtype: int64\n",
+ "590515 rows left after filtering for rows with either high seconds OR low meters\n",
+ "division by 0 296973\n",
+ "meters too low 153223\n",
+ "seconds too high 140319\n",
+ "Name: flag, dtype: int64\n",
+ "Took 0:02:29.450038\n"
+ ]
+ }
+ ],
"source": [
- "# m1.sample()"
+ "m2 = categorize_meters_speeds_pandas()"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "20c1eeac-34e6-417c-9354-31cfc3ea9096",
+ "execution_count": 51,
+ "id": "2d1bf90c-d9ed-4861-a1be-23f356165a4c",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "division by 0 296973\n",
+ "Name: flag, dtype: int64"
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# m1.groupby(['shape_array_key']).agg({'trip_id':'nunique'}).sort_values(['trip_id'], ascending = False).head(30)"
+ "m2.flag.value_counts()"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "48b197ec-add3-46d3-a2bf-ff91c1ff15be",
- "metadata": {},
- "outputs": [],
- "source": [
- "merge1.shape_array_key.unique()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "801c89ce-0e7f-4758-a38a-201cc843ef28",
+ "execution_count": 52,
+ "id": "ce0fbb35-f81e-4343-92d2-4382d2173dbd",
"metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2779389"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "#### A few routes"
+ "len(m1)-len(m2)"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "8ca57e6f-fb38-4381-ac83-cb9dc9fabdf4",
+ "execution_count": 53,
+ "id": "508f1411-4328-4b80-a029-0ae516107ed0",
"metadata": {},
- "outputs": [],
- "source": [
- "test_shapes = [\n",
- " \"00093e1c28352239174c92c4f07a483b\",\n",
- " \"001254fc8105d01a8064046249c0ceba\",\n",
- " \"00b40413c13a48046de6e2338aee0410\",\n",
- " \"e3c5ed2c6fa6cd5c5cd57d46aeb3cd8e\",\n",
- " \"efa0f969b4499620b80c9b82170e2e60\",\n",
- " \"00093e1c28352239174c92c4f07a483b\",\n",
- " \"001254fc8105d01a8064046249c0ceba\",\n",
- " \"6388c0be232f0c745df85d66689a6db0\",\n",
- " \"d8b0826e923620f7b7cd74be090de936\",\n",
- " \"e7012e8847c179f713daee0f158233e4\",\n",
- " \"11d91cab41cde51a6d4f623b9cba867c\"\n",
- "]"
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "296973"
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(m2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "id": "9fe08d2a-b874-4439-aa5b-a52de58cad09",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(45357, 72067)"
+ ]
+ },
+ "execution_count": 54,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "m2.trip_id.nunique(), m1.trip_id.nunique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "id": "8e6d31ab-46a7-4e20-bb2f-9cac1a2d672d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'2155 routes flagged'"
+ ]
+ },
+ "execution_count": 55,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "f\"{m1.shape_array_key.nunique() - m2.shape_array_key.nunique()} routes flagged\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "647fad46-7f9b-4ce2-a26a-1ea69d02daee",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(63, 76)"
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "m2._gtfs_dataset_name.nunique(), m1._gtfs_dataset_name.nunique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "id": "0f4e4b50-081d-4516-81cf-d5bdfb5d469f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " shape_array_key | \n",
+ "
\n",
+ " \n",
+ " loop_or_inlining | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 3970 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 867 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " shape_array_key\n",
+ "loop_or_inlining \n",
+ "0 3970\n",
+ "1 867"
+ ]
+ },
+ "execution_count": 57,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "m1.groupby([\"loop_or_inlining\"]).agg({\"shape_array_key\": \"nunique\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "id": "83036ccc-7339-42c2-b1f7-183734253c21",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " shape_array_key | \n",
+ "
\n",
+ " \n",
+ " loop_or_inlining | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2682 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " shape_array_key\n",
+ "loop_or_inlining \n",
+ "0 2682"
+ ]
+ },
+ "execution_count": 58,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "m2.groupby([\"loop_or_inlining\"]).agg({\"shape_array_key\": \"nunique\"})"
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "6858f9a8-2136-4aab-a099-25907b6ef7ef",
+ "cell_type": "markdown",
+ "id": "4486cd7c-31d7-4420-ac67-f9783676ede8",
"metadata": {},
- "outputs": [],
"source": [
- " few_routes = merge1.loc[merge1.shape_array_key.isin(test_shapes)].reset_index(drop=True)"
+ "#### See how many trips for a shape ID have problematic rows\n"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "913ac9e5-41ed-43c3-86ad-2b13b141d17c",
- "metadata": {},
+ "execution_count": 59,
+ "id": "468be3c9-7a24-4f01-84fd-31c137bc45e8",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
"outputs": [],
"source": [
- "# few_routes = merge1.copy()"
+ "# Number of trips that have at least one row that was divided by 0 \n",
+ "# for this shape array key\n",
+ "df1 = m2.groupby(['shape_array_key']).agg({'trip_id':'nunique'}).rename(columns = {'trip_id':'trips_with_zero'}).reset_index()"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "602983ba-8f2e-41cb-9eb8-96e8ede9f58a",
- "metadata": {},
+ "execution_count": 60,
+ "id": "4350f540-8f6b-4fb0-8b16-836245c0e44c",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
"outputs": [],
"source": [
- "few_routes.shape"
+ "# Original number of trips\n",
+ "df2 = m1.groupby(['shape_array_key']).agg({'trip_id':'nunique'}).rename(columns = {'trip_id':'all_trips'}).reset_index()"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "f294ff32-b025-4037-9ebc-cefe6dca00b9",
+ "execution_count": 61,
+ "id": "ac68bdf7-26a0-4679-9a35-26f8a670018a",
"metadata": {},
"outputs": [],
"source": [
- "few_routes.trip_id.nunique()"
+ "df3 = pd.merge(df1, df2, how = \"inner\", on = 'shape_array_key')"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "b4f56307-6624-488d-8688-7a9f1e47ff65",
+ "execution_count": 62,
+ "id": "81d443cc-122f-46f1-87ec-dbdc74e0ca6a",
"metadata": {},
"outputs": [],
"source": [
- "def categorize_by_percentile(df, column_percentile: str, column_str: str):\n",
- "\n",
- " agg1 = (\n",
- " df.groupby([\"shape_array_key\", \"stop_sequence\"])[column_percentile]\n",
- " .describe(percentiles=[0.15, 0.5, 0.95])\n",
- " .reset_index()\n",
- " .add_prefix(column_str)\n",
- " )\n",
- "\n",
- " merge1 = pd.merge(\n",
- " df,\n",
- " agg1,\n",
- " how=\"inner\",\n",
- " left_on=[\"shape_array_key\", \"stop_sequence\"],\n",
- " right_on=[\n",
- " f\"{column_str}shape_array_key\",\n",
- " f\"{column_str}stop_sequence\",\n",
- " ],\n",
- " )\n",
- "\n",
- " def percentile(row):\n",
- "\n",
- " if row[column_percentile] == row[f\"{column_str}mean\"]:\n",
- " return f\"{column_str} elapsed avg\"\n",
- " elif row[column_percentile] == row[f\"{column_str}50%\"]:\n",
- " return f\"{column_str} elapsed avg\"\n",
- " elif row[column_percentile] <= row[f\"{column_str}15%\"]:\n",
- " return f\"{column_str} elapsed low\"\n",
- " elif row[column_percentile] == 0:\n",
- " return f\"{column_str} elapsed is 0\"\n",
- " elif (\n",
- " row[f\"{column_str}15%\"] < row[column_percentile] <= row[f\"{column_str}95%\"]\n",
- " ):\n",
- " return f\"{column_str} elapsed avg\"\n",
- "\n",
- " elif row[column_percentile] > row[f\"{column_str}95%\"]:\n",
- " return f\"{column_str} elapsed high\"\n",
- "\n",
- " else:\n",
- " return \"other\"\n",
- "\n",
- " merge1[f\"{column_str}cat\"] = merge1.apply(lambda x: percentile(x), axis=1)\n",
- " print(f\"Done with {column_str}\")\n",
- " return merge1"
+ "df3['percent_of_trips_with_problematic_rows'] = df3.trips_with_zero/df3.all_trips * 100"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "d80535fb-8648-4216-918f-76e0484ba3ea",
+ "execution_count": 63,
+ "id": "314d9baf-de0e-460a-8c29-4504ba94cfa6",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "count 2682.00\n",
+ "mean 82.86\n",
+ "std 26.65\n",
+ "min 1.52\n",
+ "25% 75.00\n",
+ "50% 100.00\n",
+ "75% 100.00\n",
+ "max 100.00\n",
+ "Name: percent_of_trips_with_problematic_rows, dtype: float64"
+ ]
+ },
+ "execution_count": 63,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "def categorize_meters_speeds(df):\n",
- " start = datetime.datetime.now()\n",
- " print(f\"Begin: {start}\")\n",
- " df.speed_mph = df.speed_mph.fillna(0)\n",
- " df = categorize_by_percentile(df, \"meters_elapsed\", \"meters_\")\n",
- " df = categorize_by_percentile(df, \"sec_elapsed\", \"seconds_\")\n",
- " df = categorize_by_percentile(df, \"speed_mph\", \"speed_\")\n",
- " df = df.rename(columns={\"speed_cat\": \"speed_flags\"})\n",
- " end = datetime.datetime.now()\n",
- " print(f\"Finish: {end}\")\n",
- " return df"
+ "df3['percent_of_trips_with_problematic_rows'].describe()"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "4676866e-674a-4561-bc4f-55dc2dcc4769",
+ "execution_count": 64,
+ "id": "5de3efe6-2233-4251-93a8-1f8dd6fb2dae",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
- "few_routes_cat = categorize_meters_speeds(few_routes)"
+ "# df3.sample(5)"
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "1f8fa537-8807-4544-8568-7d999ca9ecac",
+ "cell_type": "markdown",
+ "id": "a399d982-e400-43fa-b13f-fecafaa27262",
"metadata": {},
- "outputs": [],
"source": [
- "# few_routes_cat.columns"
+ "### Investigate \n",
+ "#### Stage3: \"vp_pared_stops\"/A3_loop_inlining\n",
+ "* Rewrite this part so that `read_parquet` is filtered by shape_array_key and the other key columns (stop_sequence, trip_id, gtfs_dataset_key)"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "1e6db506-6e09-49c7-84a1-02160178573d",
+ "execution_count": 65,
+ "id": "a2a705af-b588-463b-b6ce-f999b2050208",
"metadata": {},
"outputs": [],
"source": [
- "subset = [\n",
- " \"stop_sequence\",\n",
- " \"speed_flags\",\n",
- " \"speed_mph\",\n",
- " \"speed_15%\",\n",
- " \"speed_50%\",\n",
- " \"speed_95%\",\n",
- " \"meters_cat\",\n",
- " \"meters_elapsed\",\n",
- " \"meters_mean\",\n",
- " \"meters_15%\",\n",
- " \"meters_50%\",\n",
- " \"meters_95%\",\n",
- " \"seconds_cat\",\n",
- " \"sec_elapsed\",\n",
- " \"seconds_mean\",\n",
- " \"seconds_15%\",\n",
- " \"seconds_50%\",\n",
- " \"seconds_95%\",\n",
- " \"gtfs_dataset_key\",\n",
- "]"
+ "def load_vp_stage3(flagged_df:pd.DataFrame, date:str) -> pd.DataFrame:\n",
+ " \n",
+ " # Subset the dataframe and use it to filter out for only the values of interest\n",
+ " shape_array_keys = flagged_df.shape_array_key.unique().tolist()\n",
+ " stop_seq = flagged_df.stop_sequence.unique().tolist() \n",
+ " trip_id = flagged_df.trip_id.unique().tolist() \n",
+ " gtfs_dataset_key = flagged_df.gtfs_dataset_key.unique().tolist() \n",
+ " \n",
+ " #flagged_df = flagged_df[['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key']]\n",
+ " vp = pd.read_parquet(f\"{SEGMENT_GCS}vp_pared_stops_{date}\",\n",
+ " filters = [[('shape_array_key', \"in\", shape_array_keys),\n",
+ " ('stop_sequence', 'in', stop_seq), \n",
+ " ('trip_id', 'in', trip_id), \n",
+ " ('gtfs_dataset_key', 'in', gtfs_dataset_key)]],)\n",
+ " \n",
+ " # Merge to filter\n",
+ " vp2 = pd.merge(flagged_df, vp, how = \"inner\", on = ['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key'])\n",
+ " \n",
+ " return vp2"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "f5ae87ad-a3a4-4630-ac8c-b7ce711c2fb7",
+ "execution_count": 66,
+ "id": "1e36c5fc-ab3f-4129-97f9-ad9472b7d32a",
"metadata": {},
"outputs": [],
"source": [
- "few_routes_cat.speed_flags.value_counts()"
+ "vp2 = load_vp_stage3(subset, analysis_date)"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "83ede9e9-a76d-4e57-9fac-4cc2ce1af0c5",
+ "execution_count": 67,
+ "id": "1d6fe654-40ca-4758-bc2c-316e33d1a9d1",
"metadata": {},
"outputs": [],
"source": [
- "few_routes_cat.speed_flags.value_counts() / len(few_routes) * 100"
+ "# vp = pd.read_parquet(f\"{SEGMENT_GCS}vp_pared_stops_{analysis_date}\")"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "e016eb5a-0039-4063-ade4-c871e01c8a16",
- "metadata": {},
+ "execution_count": 68,
+ "id": "148e75f1-08dd-44c8-8179-319164d8e020",
+ "metadata": {
+ "tags": []
+ },
"outputs": [],
"source": [
- "few_routes_cat.groupby([\"speed_flags\", \"meters_cat\", \"seconds_cat\",]).agg(\n",
- " {\"trip_id\": \"count\"}\n",
- ").reset_index().sort_values([\"trip_id\"], ascending=False)"
+ "# Check out stop sequences for the trip below that have division by 0\n",
+ "# subset[subset.trip_id == \"1088383\"].stop_sequence.unique()"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "87860e54-fd6e-42d1-a38f-613fea4a77e9",
- "metadata": {},
+ "execution_count": 69,
+ "id": "b4350206-c237-44a3-abce-f8f38cde8117",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
"outputs": [],
"source": [
- "# 65d9589130415c685b89f4f7c2d8bd7e 65"
+ "# Stop sequences that were flagged as division by 0\n",
+ "# vp2[vp2.trip_id == \"1088383\"].sort_values(['trip_id', 'stop_sequence','location_timestamp_local'])"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "bc6df377-9c23-4b01-80de-8c977b797c47",
- "metadata": {},
+ "execution_count": 70,
+ "id": "aa1e56d1-ec07-436c-8763-7bcf3dcbf7d4",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
"outputs": [],
"source": [
- "# few_routes_cat[few_routes_cat.speed_flags == \"average\"][subset].sample(3)"
+ "# All the stop sequences for this trip, even those that are ok\n",
+ "# vp_pared[vp_pared.trip_id == \"1088383\"].sort_values(['trip_id', 'stop_sequence','location_timestamp_local'])"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "49c58012-004e-4c47-becf-20e7f18895d3",
- "metadata": {},
+ "execution_count": 71,
+ "id": "22e42aae-9281-4040-ab8c-6a10b93f6cf4",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
"outputs": [],
"source": [
- "# few_routes_cat[(few_routes_cat.meters_cat == \"meters_ elapsed avg\") & (few_routes_cat.seconds_cat == \"seconds_ elapsed avg\") & (few_routes_cat.speed_flags == \"speed low\")][subset]"
+ "# All the stop sequences for this trip, even those that are ok\n",
+ "# vp_pared[vp_pared.trip_id == \"1088383\"].sort_values(['location_timestamp_local','stop_sequence',])"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "54f38944-af7a-47bf-a799-882899964c6a",
+ "execution_count": 72,
+ "id": "0f21f08f-d4eb-4bbd-94d3-f4b031e97cf4",
"metadata": {},
"outputs": [],
"source": [
- "# few_routes_cat[(few_routes_cat.meters_cat == \"meters_ elapsed low\") & (few_routes_cat.seconds_cat == \"seconds_ elapsed avg\") & (few_routes_cat.speed_flags == \"speed low\")][subset].sample(3)"
+ "def stage3_repeated_timestamps(stage3_df:pd.DataFrame)-> pd.DataFrame:\n",
+ " \"\"\"\n",
+ " Count how many stop sequences share the same timestamp for each\n",
+ " shape_array_key-trip_id-timestamp combo. Each stop sequence should have\n",
+ " a different timestamp; otherwise no time elapsed between those stop sequences.\n",
+ " \"\"\"\n",
+ " agg = (stage3_df\n",
+ " .groupby(['shape_array_key','trip_id', 'location_timestamp_local'])\n",
+ " .agg({'stop_sequence':'nunique'})\n",
+ " .reset_index()\n",
+ " .rename(columns = {'stop_sequence':'number_of_repeated_timestamps'})\n",
+ " )\n",
+ " \n",
+ " # Only keep timestamps that are repeated more than once\n",
+ " agg = (agg[agg.number_of_repeated_timestamps > 1]).reset_index(drop = True)\n",
+ "\n",
+ " return agg"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "8b28c22b-9b7c-41ab-b3cc-36661c8439e5",
+ "execution_count": 73,
+ "id": "5ce07566-c1f0-4fa7-9550-2fa07b98dba8",
"metadata": {},
"outputs": [],
"source": [
- "# few_routes_cat[(few_routes_cat.meters_cat == \"meters_ elapsed high\") & (few_routes_cat.seconds_cat == \"seconds_ elapsed low\") & (few_routes_cat.speed_flags == \"speed high\")][subset].sample(3)"
+ "def stage3_repeated_locations(stage3_df:pd.DataFrame):\n",
+ " \"\"\"\n",
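+ " Count how many stop sequences share the same location (x/y pair) for each\n",
+ " shape_array_key-trip_id combo. Each stop sequence should have a different\n",
+ " location; otherwise the vehicle is not changing locations.\n",
+ " NOTE(review): the 'pair' column below takes y from the notebook-level vp2,\n",
+ " not from stage3_df -- confirm this is intended.\n",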
+ " \"\"\"\n",
+ " # Concat x and y into a string\n",
+ " stage3_df['pair'] = stage3_df.x.astype(str) + '/' + vp2.y.astype(str)\n",
+ " \n",
+ " # Count number of different stops that reference the same location\n",
+ " agg = (stage3_df\n",
+ " .groupby(['shape_array_key','trip_id','pair'])\n",
+ " .agg({'stop_sequence':'nunique'})\n",
+ " .reset_index()\n",
+ " .sort_values('stop_sequence', ascending = False)\n",
+ " .rename(columns = {'stop_sequence':'number_of_repeated_locs'}) \n",
+ " )\n",
+ "\n",
+ " # Only keep locations that are repeated more than once\n",
+ " agg = agg[agg.number_of_repeated_locs != 1].reset_index(drop = True)\n",
+ " \n",
+ " return agg"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "0c00506f-00ed-4660-9c88-aa11bc925fd2",
+ "execution_count": 74,
+ "id": "66e83169-2b4a-4912-bc0e-1a0b3e8deea6",
"metadata": {},
"outputs": [],
"source": [
- "# few_routes_cat[(few_routes_cat.meters_cat == \"meters_ elapsed high\") & (few_routes_cat.seconds_cat == \"seconds_ elapsed high\") & (few_routes_cat.speed_flags == \"speed low\")][subset].sample(3)"
+ "def flag_stage3(flagged_df:pd.DataFrame, date:str) -> pd.DataFrame:\n",
+ " \"\"\"\n",
+ " Flag the errors in stage3\n",
+ " \"\"\"\n",
+ " start = datetime.datetime.now()\n",
+ " print(start)\n",
+ " \n",
+ " # Relevant rows from Vehicle Positions\n",
+ " vp = load_vp_stage3(flagged_df, date)\n",
+ " \n",
+ " # Find repeated timestamps.\n",
+ " multi_timestamps = stage3_repeated_timestamps(vp)\n",
+ " \n",
+ " # Find repeated locations\n",
+ " multi_locs = stage3_repeated_locations(vp)\n",
+ " \n",
+ " # Merge\n",
+ " timestamps_merge_cols = ['shape_array_key','trip_id','location_timestamp_local']\n",
+ " loc_merge_cols = ['shape_array_key','trip_id','pair']\n",
+ " \n",
+ " # Want everything found in vehicle positions, so do left merges\n",
+ " m1 = (vp\n",
+ " .merge(multi_timestamps, how=\"left\", on= timestamps_merge_cols)\n",
+ " .merge(multi_locs, how=\"left\", on=loc_merge_cols)\n",
+ " )\n",
+ " \n",
+ " drop_cols = ['vp_idx','x','y','hour','activity_date',]\n",
+ " m1 = m1.drop(columns = drop_cols)\n",
+ " \n",
+ " # Flag\n",
+ " def flag(row):\n",
+ " if (row[\"number_of_repeated_timestamps\"] > 1) & (row[\"number_of_repeated_locs\"] > 1):\n",
+ " return \"repeated timestamps & locations\"\n",
+ " elif (row[\"number_of_repeated_timestamps\"] > 1):\n",
+ " return \"repeated timestamps\"\n",
+ " elif (row[\"number_of_repeated_locs\"] > 1):\n",
+ " return \"repeated locations\"\n",
+ " else:\n",
+ " return \"check in stage 2\"\n",
+ " \n",
+ " m1[\"stage3_flag\"] = m1.apply(lambda x: flag(x), axis=1)\n",
+ " \n",
+ " print(m1.stage3_flag.value_counts())\n",
+ " \n",
+ " check_in_stage2 = m1[m1.stage3_flag == \"check in stage 2\"]\n",
+ " print(f\"Have to check {len(check_in_stage2)/len(m1) * 100} % of rows in stage 2\")\n",
+ " \n",
+ " end = datetime.datetime.now()\n",
+ " print(f\"Took {end-start}\")\n",
+ " return m1"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "17bceddf-6c73-466d-8a8c-115370ab3301",
+ "execution_count": 75,
+ "id": "cab32ef3-cc66-40ce-aa19-59631734f539",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2023-06-30 10:17:51.135694\n",
+ "check in stage 2 538914\n",
+ "repeated timestamps 54883\n",
+ "repeated timestamps & locations 107\n",
+ "repeated locations 42\n",
+ "Name: stage3_flag, dtype: int64\n",
+ "Have to check 90.73451121819154 % of rows in stage 2\n",
+ "Took 0:00:27.583738\n"
+ ]
+ }
+ ],
"source": [
- "# few_routes_cat[(few_routes_cat.meters_cat == \"meters_ elapsed avg\") & (few_routes_cat.seconds_cat == \"seconds_ elapsed high\") & (few_routes_cat.speed_flags == \"speed low\")][subset].sample(3)"
+ "m3 = flag_stage3(m2, analysis_date)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "id": "1cca329c-14bc-4ad5-9465-1a63ca53df49",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "m3 = m3[m3.stage3_flag == \"check in stage 2\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "id": "93e87778-edef-4d62-98aa-a4241f177892",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(538914, 29)"
+ ]
+ },
+ "execution_count": 77,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "m3.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "id": "21799f42-873e-41bd-b764-42cc297686a6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sort_cols = ['trip_id', 'shape_array_key', 'stop_sequence']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 149,
+ "id": "17ac977c-d220-414e-be9f-540eec051e06",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " shape_array_key | \n",
+ " gtfs_dataset_key | \n",
+ " trip_id | \n",
+ " n_trips | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 296389 | \n",
+ " 809fd4704a18ae0ad64f8170e0167b56 | \n",
+ " 5222fe2cf728fd3f16b2ff51e133fe8c | \n",
+ " 183-oeiebzuc1 | \n",
+ " 162 | \n",
+ "
\n",
+ " \n",
+ " 295464 | \n",
+ " 805fef558a9bf81d57143cab635b27b1 | \n",
+ " c0e3039da063db95ebabd3fe4ee611a4 | \n",
+ " 11083276_M11 | \n",
+ " 159 | \n",
+ "
\n",
+ " \n",
+ " 396202 | \n",
+ " ac5104538290bb7c7d14b926884e6efa | \n",
+ " c0e3039da063db95ebabd3fe4ee611a4 | \n",
+ " 11060883_M11 | \n",
+ " 157 | \n",
+ "
\n",
+ " \n",
+ " 527457 | \n",
+ " e5ec67542d6f30fa38fdcf2f63c90109 | \n",
+ " c0e3039da063db95ebabd3fe4ee611a4 | \n",
+ " 11083144_M11 | \n",
+ " 156 | \n",
+ "
\n",
+ " \n",
+ " 116604 | \n",
+ " 3928b30e00772c10a38c11ea12ad7869 | \n",
+ " 5222fe2cf728fd3f16b2ff51e133fe8c | \n",
+ " 183-0rjkhjagy | \n",
+ " 150 | \n",
+ "
\n",
+ " \n",
+ " 555304 | \n",
+ " edc5ab1a2be1d269306161ce38e0b2ad | \n",
+ " c0e3039da063db95ebabd3fe4ee611a4 | \n",
+ " 11042148_M11 | \n",
+ " 138 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " shape_array_key gtfs_dataset_key \\\n",
+ "296389 809fd4704a18ae0ad64f8170e0167b56 5222fe2cf728fd3f16b2ff51e133fe8c \n",
+ "295464 805fef558a9bf81d57143cab635b27b1 c0e3039da063db95ebabd3fe4ee611a4 \n",
+ "396202 ac5104538290bb7c7d14b926884e6efa c0e3039da063db95ebabd3fe4ee611a4 \n",
+ "527457 e5ec67542d6f30fa38fdcf2f63c90109 c0e3039da063db95ebabd3fe4ee611a4 \n",
+ "116604 3928b30e00772c10a38c11ea12ad7869 5222fe2cf728fd3f16b2ff51e133fe8c \n",
+ "555304 edc5ab1a2be1d269306161ce38e0b2ad c0e3039da063db95ebabd3fe4ee611a4 \n",
+ "\n",
+ " trip_id n_trips \n",
+ "296389 183-oeiebzuc1 162 \n",
+ "295464 11083276_M11 159 \n",
+ "396202 11060883_M11 157 \n",
+ "527457 11083144_M11 156 \n",
+ "116604 183-0rjkhjagy 150 \n",
+ "555304 11042148_M11 138 "
+ ]
+ },
+ "execution_count": 149,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# For each shape_array_key, keep its row with the highest n_trips; show the top 6\n",
+ "(m3\n",
+ " .sort_values(['n_trips'], ascending = False)\n",
+ " .drop_duplicates(['shape_array_key'])\n",
+ " [['shape_array_key','gtfs_dataset_key', 'trip_id', 'n_trips']]\n",
+ " .head(6)\n",
+ ")"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "b8248dd3-c7de-4655-a8a2-9f7b486d01c1",
- "metadata": {
- "scrolled": true,
- "tags": []
- },
- "outputs": [],
- "source": [
- "# few_routes_cat[(few_routes_cat.shape_array_key == \"d8b0826e923620f7b7cd74be090de936\") & (few_routes_cat.stop_sequence == 1)][subset]"
+ "execution_count": 90,
+ "id": "3869ed7a-a951-4ed0-bfa9-bbdba7177790",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "n_trips\n",
+ "140 40\n",
+ "146 28\n",
+ "150 28\n",
+ "147 24\n",
+ "148 18\n",
+ "152 18\n",
+ "149 16\n",
+ "151 16\n",
+ "158 12\n",
+ "157 8\n",
+ "154 6\n",
+ "156 6\n",
+ "159 4\n",
+ "160 2\n",
+ "161 2\n",
+ "162 2\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 90,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "m3[m3.shape_array_key == \"809fd4704a18ae0ad64f8170e0167b56\"][['n_trips']].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "id": "a6c3db80-c4bf-4264-873c-c9912cdc9dc5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "n_trips\n",
+ "140 50\n",
+ "54 22\n",
+ "125 20\n",
+ "132 14\n",
+ "126 12\n",
+ "145 10\n",
+ "136 8\n",
+ "141 6\n",
+ "142 6\n",
+ "155 4\n",
+ "158 4\n",
+ "144 4\n",
+ "151 4\n",
+ "143 4\n",
+ "159 4\n",
+ "153 2\n",
+ "156 2\n",
+ "157 2\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 91,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "m3[m3.shape_array_key == \"805fef558a9bf81d57143cab635b27b1\"][['n_trips']].value_counts()"
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "56d7c123-5002-4ba5-916c-ed5d9097126c",
- "metadata": {
- "scrolled": true,
- "tags": []
- },
- "outputs": [],
+ "cell_type": "markdown",
+ "id": "4b1876cf-9e8b-4c30-8723-2226133b8e01",
+ "metadata": {},
"source": [
- "# few_routes_cat[(few_routes_cat.stop_sequence == 65) & (few_routes_cat.gtfs_dataset_key == \"65d9589130415c685b89f4f7c2d8bd7e\")][subset].sort_values(by = ['speed_mph'])"
+ "#### Stage2: \"vp_stop_segment\"/A1_sjoin_vp_segments\n"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "963353ae-52ab-4ca4-bef5-b3649b6b74c3",
+ "execution_count": 92,
+ "id": "0a469849-f903-44e4-9d2a-4f3775270a52",
"metadata": {},
"outputs": [],
"source": [
- "def flag(row):\n",
- "\n",
- " # Ok rows\n",
- " # If distance and time are average, flag as average\n",
- " if (row[\"meters_cat\"] == \"meters_ elapsed avg\") & (row[\"seconds_cat\"] == \"seconds_ elapsed avg\"):\n",
- " return \"ok\"\n",
- " # If MPH is average, flag as average\n",
- " elif row[\"speed_flags\"] == \"speed_ elapsed avg\":\n",
- " return \"ok\"\n",
- "\n",
- " # Zero rows\n",
- " elif ((row[\"speed_mph\"] == 0) | (row[\"sec_elapsed\"] == 0) | (row[\"meters_elapsed\"] == 0)):\n",
- " return \"low\"\n",
- "\n",
- " # Tag as high\n",
- " elif row[\"speed_flags\"] == \"speed_ elapsed high\":\n",
- " return \"high\"\n",
- "\n",
- " # Tag as low\n",
- " elif row[\"speed_flags\"] == \"speed_ elapsed low\":\n",
- " return \"low\"\n",
- "\n",
- " else:\n",
- " return \"other\""
+ "# Select one route to look at\n",
+ "test_route = \"3928b30e00772c10a38c11ea12ad7869\""
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "e9994532-e658-480e-b009-8ad7ef6392b5",
+ "execution_count": 93,
+ "id": "6e946c68-3476-459d-a869-77ac37b5fb07",
"metadata": {},
"outputs": [],
"source": [
- "few_routes_cat[\"unusual_flag\"] = few_routes_cat.apply(lambda x: flag(x), axis=1)"
+ "test_gtfs_key = \"5222fe2cf728fd3f16b2ff51e133fe8c\""
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "f652b637-2682-4b00-895d-d1809bab7d12",
+ "execution_count": 94,
+ "id": "b4fa40bf-387c-4301-ba13-2bd16b15cd24",
"metadata": {},
"outputs": [],
"source": [
- "len(few_routes_cat) == len(merge1)"
+ "test_trip = '183-0rjkhjagy'"
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "d7340f9e-62ae-4684-9415-a8d9189fb3f9",
+ "cell_type": "markdown",
+ "id": "ef18eb20-a43e-4d32-80eb-7f902116a944",
"metadata": {},
- "outputs": [],
"source": [
- "few_routes_cat.unusual_flag.value_counts() / len(few_routes_cat) * 100"
+ "#### Look at export file"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "40c0937e-de60-4508-a2aa-056209649c4b",
+ "execution_count": 95,
+ "id": "6397dc45-c271-4057-a0d8-1962846d4f94",
"metadata": {},
"outputs": [],
"source": [
- "few_routes_cat.unusual_flag.value_counts()"
+ "def import_stage_2(date:str, route:str, stop_sequence:str):\n",
+ " df = pd.read_parquet(\n",
+ " f\"{SEGMENT_GCS}vp_sjoin/vp_stop_segment_{date}\",\n",
+ " filters = [[('shape_array_key', \"==\", route),\n",
+ " ('stop_sequence', \"==\", stop_sequence)]],\n",
+ " )\n",
+ " return df"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "347a8e6b-162a-4079-b258-22017bad83e9",
+ "execution_count": 96,
+ "id": "fe8f800a-f180-4495-a387-0367528823ba",
"metadata": {},
"outputs": [],
"source": [
- "subset2 = [\"unusual_flag\", \"_gtfs_dataset_name\"] + subset"
+ "# stg2 = import_stage_2(analysis_date, test_route, test_sequence)"
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "9978f816-a208-4dac-9eb8-154ba9e58d6b",
+ "cell_type": "markdown",
+ "id": "4b5dec8d-c4f5-49dd-9a11-1b10ff30fb55",
"metadata": {
- "scrolled": true,
"tags": []
},
- "outputs": [],
- "source": [
- "high_df = few_routes_cat[few_routes_cat.unusual_flag == \"high\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "3335a25d-8447-49b1-b885-ede8234ec16a",
- "metadata": {},
- "outputs": [],
"source": [
- "low_df = few_routes_cat[few_routes_cat.unusual_flag == \"low\"]"
+ "#### Look at vp trips -> import unique trips"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "21432512-a578-4541-bd5c-132e3985e062",
+ "execution_count": 97,
+ "id": "ade9e07f-0b55-4561-96d1-6fd6adec0f1a",
"metadata": {},
"outputs": [],
"source": [
- "stc_3 = few_routes_cat[\n",
- " (few_routes_cat.stop_sequence == 3)\n",
- " & (\n",
- " few_routes_cat._gtfs_dataset_name\n",
- " == \"Bay Area 511 Santa Clara Transit VehiclePositions\"\n",
+ "def import_unique_trips(gtfs_key:str, trip: str, route:str):\n",
+ " vp_trips = A1_sjoin_vp_segments.add_grouping_col_to_vp(\n",
+ " f\"vp_usable_{analysis_date}\",\n",
+ " analysis_date,\n",
+ " [\"shape_array_key\"]\n",
" )\n",
- "]"
+ " \n",
+ " # Filter to just one trip/route/operator\n",
+ " df = vp_trips[(vp_trips.gtfs_dataset_key == gtfs_key)\n",
+ " & (vp_trips.shape_array_key == route)\n",
+ " & (vp_trips.trip_id == trip)].reset_index(drop = True)\n",
+ " return df\n"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "44a2debc-f188-4411-95a7-c9f822ea7f3c",
+ "execution_count": 98,
+ "id": "c8003044-b7e4-477e-9395-fa881a2fa2b3",
"metadata": {},
"outputs": [],
"source": [
- "test1 = few_routes_cat[\n",
- " (few_routes_cat.speed_flags == \"speed_ elapsed avg\")\n",
- " & (few_routes_cat.meters_cat == \"meters_ elapsed low\")\n",
- " & (few_routes_cat.seconds_cat == \"seconds_ elapsed low\")\n",
- "]"
+ "# unique_trips = import_unique_trips(test_gtfs_key, test_trip, test_route)"
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "5f817b1c-f215-486c-ac3c-e75bc8fb9393",
+ "cell_type": "markdown",
+ "id": "52ce333b-9f75-4c9b-a1af-130c93786f94",
"metadata": {},
- "outputs": [],
"source": [
- "# test1[subset2].sample(100)"
+ "#### Look at vehicle positions"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "f7b6bb03-dc37-478b-a1d5-ab5f2e59a556",
- "metadata": {
- "scrolled": true,
- "tags": []
- },
+ "execution_count": 99,
+ "id": "ac6e78c3-694d-4297-8db1-f0f4d6faadbf",
+ "metadata": {},
"outputs": [],
"source": [
- "# high_df[subset2].sample(10)"
+ "def import_vehicle_positions(unique_trips:pd.DataFrame, gtfs_key:str, trip_id:str)-> gpd.GeoDataFrame:\n",
+ " vp = helpers.import_vehicle_positions(\n",
+ " SEGMENT_GCS,\n",
+ " f\"vp_usable_{analysis_date}/\",\n",
+ " filters = [[(\"gtfs_dataset_key\", \"==\", gtfs_key),\n",
+ " ('trip_id', '==', trip_id)]],\n",
+ " columns = [\"gtfs_dataset_key\", \"trip_id\", \n",
+ " \"vp_idx\", \"x\", \"y\"],\n",
+ " partitioned = True\n",
+ " )\n",
+ " \n",
+ " vp = vp.compute()\n",
+ " vp = vp.merge(unique_trips, on = [\"gtfs_dataset_key\", \"trip_id\"],\n",
+ " how = \"inner\"\n",
+ " )\n",
+ " \n",
+ " vp_gdf = gpd.GeoDataFrame(\n",
+ " vp, \n",
+ " geometry = gpd.points_from_xy(vp.x, vp.y, crs = \"EPSG:4326\")\n",
+ " ).to_crs(PROJECT_CRS).drop(columns = [\"x\", \"y\"])\n",
+ " \n",
+ " return vp_gdf"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "58b91d5d-9f9a-47e9-a5fe-b7f7e0e64587",
+ "execution_count": 100,
+ "id": "b47ea0cb-6031-4d98-b963-efbef949d169",
"metadata": {},
"outputs": [],
"source": [
- "metro_62 = few_routes_cat[\n",
- " (few_routes_cat.stop_sequence == 62)\n",
- " & (few_routes_cat._gtfs_dataset_name == \"LA Metro Bus Vehicle Positions\")\n",
- "]"
+ "#vehicle_positions = import_vehicle_positions(unique_trips, test_gtfs_key, test_trip)"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "43ff6e38-5084-463f-a444-68e3bffdd7cc",
- "metadata": {
- "scrolled": true,
- "tags": []
- },
+ "execution_count": 101,
+ "id": "f0f96480-8328-43ed-9add-0a74b533fc8d",
+ "metadata": {},
"outputs": [],
"source": [
- "stop_16 = few_routes_cat[\n",
- " (few_routes_cat.stop_sequence == 16)\n",
- " & (few_routes_cat._gtfs_dataset_name == \"Bay Area 511 Muni VehiclePositions\")\n",
- "]"
+ "#len(vehicle_positions)"
]
},
{
"cell_type": "markdown",
- "id": "e01d88e4-a3e2-43f1-9591-afe83e8a92cf",
+ "id": "896e00be-27f2-43b7-9cb4-69d61a061af0",
"metadata": {},
"source": [
- "#### Should filter even further."
+ "#### Look at segments"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "bdd2b274-87a7-4306-8494-f65416ac88fb",
+ "execution_count": 102,
+ "id": "17bb5083-ace2-4e2d-8400-3bf948625909",
"metadata": {},
"outputs": [],
"source": [
- "high_low_zero = few_routes_cat[\n",
- " few_routes_cat.unusual_flag.isin([\"high\", \"low\"])\n",
- "].reset_index()"
+ "def import_segments(flagged_df: pd.DataFrame, route:str, gtfs_key:str) -> gpd.GeoDataFrame:\n",
+ " \n",
+ " # Load every segment for this route and operator (filtered at read time), then flag them.\n",
+ " gdf = gpd.read_parquet(f\"{SEGMENT_GCS}stop_segments_{analysis_date}.parquet\",\n",
+ " filters = [[(\"shape_array_key\", \"==\", route),\n",
+ " (\"gtfs_dataset_key\", \"==\", gtfs_key),\n",
+ " ]]).to_crs(PROJECT_CRS)\n",
+ " \n",
+ " gdf[\"geometry_buffered\"] = gdf.geometry.buffer(35)\n",
+ " gdf = gdf.set_geometry('geometry_buffered')\n",
+ " \n",
+ " # Distinguish between \"correct\" and \"incorrect\" seq\n",
+ " # A sequence can be incorrect even if just one row is \"divided by 0\"\n",
+ " incorrect_segments = flagged_df[(flagged_df.shape_array_key == route) & (flagged_df.gtfs_dataset_key == gtfs_key)]\n",
+ " incorrect_segments_list = incorrect_segments.stop_sequence.unique().tolist()\n",
+ " incorrect_segments_filtered = gdf[gdf.stop_sequence.isin(incorrect_segments_list)].reset_index(drop = True)\n",
+ " incorrect_segments_filtered['flag'] = 'contains 0m/0sec'\n",
+ " \n",
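+ " # Filter for correct segments. NOTE(review): flagged_df is not narrowed to route/gtfs_key here, unlike incorrect_segments above - confirm this is intended.\n",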
+ " correct_segments = flagged_df[~flagged_df.stop_sequence.isin(incorrect_segments_list)]\n",
+ " correct_segments_list = correct_segments.stop_sequence.unique().tolist()\n",
+ " correct_segments_filtered = gdf[gdf.stop_sequence.isin(correct_segments_list)].reset_index(drop = True)\n",
+ " correct_segments_filtered['flag'] = 'does not contain 0m/0sec'\n",
+ " \n",
+ " final = pd.concat([correct_segments_filtered, incorrect_segments_filtered])\n",
+ " \n",
+ " return final"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "ebea8b6f-a011-4996-be52-9e10ec1f8342",
+ "execution_count": 103,
+ "id": "9f3a302a-f604-49fe-ae9b-ee8db85466de",
"metadata": {},
"outputs": [],
"source": [
- "few_routes_cat.shape"
+ "flagged_segments = import_segments(m3, test_route, test_gtfs_key)"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "987d3500-7594-4d12-a9d0-45d2c19999d9",
+ "execution_count": 104,
+ "id": "e8a14cfd-38b8-4326-9eac-a711f1a189e8",
"metadata": {},
"outputs": [],
"source": [
- "high_low_zero.shape"
+ "#segments = A1_sjoin_vp_segments.import_segments_and_buffer(\n",
+ " # f\"stop_segments_{analysis_date}\",\n",
+ "# 35,\n",
+ " # [\"shape_array_key\", \"stop_sequence\"]+ [\"seg_idx\", \"geometry\"]\n",
+ "#)"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "630e745f-dd71-4cc2-bad6-e17c666900a8",
+ "execution_count": 105,
+ "id": "3c08c38e-7419-4ff6-b74f-5f15615e52c4",
"metadata": {},
"outputs": [],
"source": [
- "len(high_low_zero.drop_duplicates())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "daa82644-dc46-409a-83ab-a86ea996c356",
- "metadata": {},
- "outputs": [],
- "source": [
- "len(few_routes_cat)-len(high_low_zero)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8b912560-11ed-4aad-bcc3-ffb9e7966c24",
- "metadata": {
- "scrolled": true,
- "tags": []
- },
- "outputs": [],
- "source": [
- "# To plot\n",
- "# all_trips = one_route3.melt(id_vars=[ '_gtfs_dataset_name','shape_array_key','trip_id', 'stop_sequence','gtfs_dataset_key','loop_or_inlining',\n",
- "#'n_trips'], value_vars=['avg_speed_mph','speed_mph','p20_speed_mph', 'p80_speed_mph'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "bc09e6d6-9811-4c50-9f61-e00af00f0b83",
- "metadata": {},
- "outputs": [],
- "source": [
- "# all_trips = all_trips.drop_duplicates(subset = [ '_gtfs_dataset_name','shape_array_key','stop_sequence','gtfs_dataset_key','variable','value']).reset_index(drop = True)"
+ "# segments = segments.compute()"
]
},
{
"cell_type": "markdown",
- "id": "cde431f9-10ad-484f-b954-dd3c13a6e683",
- "metadata": {},
- "source": [
- "#### Other ideas\n",
- "* Show which stops are excluded from flags\n",
- "* Show how many stops are dropped\n",
- "* Show % of stops that were flagged compared to total stops."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "aae6b6c0-ce47-4ca1-b104-68b39ebcf2ca",
- "metadata": {},
- "outputs": [],
- "source": [
- "high_low_zero2 = high_low_zero.melt(\n",
- " id_vars=[\n",
- " \"_gtfs_dataset_name\",\n",
- " \"shape_array_key\",\n",
- " \"trip_id\",\n",
- " \"stop_sequence\",\n",
- " \"gtfs_dataset_key\",\n",
- " \"loop_or_inlining\",\n",
- " \"n_trips\",\n",
- " \"meters_cat\",\n",
- " \"seconds_cat\",\n",
- " \"unusual_flag\",\n",
- " ],\n",
- " value_vars=[\"avg_speed_mph\", \"speed_mph\", \"p20_speed_mph\", \"p80_speed_mph\"],\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "767abd20-d030-42d3-b85f-6d3023d69b8a",
+ "id": "5a688361-e7a9-40f0-877d-3693be99a960",
"metadata": {},
- "outputs": [],
"source": [
- "high_low_zero2 = high_low_zero2.drop_duplicates(\n",
- " subset=[\n",
- " \"loop_or_inlining\",\n",
- " \"_gtfs_dataset_name\",\n",
- " \"shape_array_key\",\n",
- " \"stop_sequence\",\n",
- " \"gtfs_dataset_key\",\n",
- " \"variable\",\n",
- " \"value\",\n",
- " ]\n",
- ").reset_index(drop=True)"
+ "#### Stops kept: last and first"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "52e7ee5a-a40e-423e-ba4b-dea14de17982",
+ "execution_count": 106,
+ "id": "9ea744bf-8019-4b7c-988f-f95196b56435",
"metadata": {},
"outputs": [],
"source": [
- "high_low_zero2.shape"
+ "def find_first_last_points(route:str, trip:str, gtfs_key:str):\n",
+ " df = pd.read_parquet(f\"{SEGMENT_GCS}vp_pared_stops_{analysis_date}\",\n",
+ " filters = [[('shape_array_key', \"==\", route),\n",
+ " \n",
+ " ('trip_id', \"==\", trip), \n",
+ " ('gtfs_dataset_key', '==', gtfs_key)]],)\n",
+ " \n",
+ " gdf = gpd.GeoDataFrame(\n",
+ " df, \n",
+ " geometry = gpd.points_from_xy(df.x, df.y, crs = \"EPSG:4326\")\n",
+ " ).to_crs(PROJECT_CRS).drop(columns = [\"x\", \"y\"])\n",
+ " \n",
+ " gdf = gdf[['geometry','stop_sequence']]\n",
+ " \n",
+ " return gdf"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "229c6d49-58e1-45bd-839f-b03f7e8cdd4a",
+ "execution_count": 107,
+ "id": "7c7aa90f-3e80-472e-b61d-94afd6c0ec01",
"metadata": {},
"outputs": [],
"source": [
- "def stops_info(original: pd.DataFrame, filtered_unusual_stops: pd.DataFrame):\n",
- "\n",
- " subset = [\"_gtfs_dataset_name\", \"gtfs_dataset_key\", \"shape_array_key\"]\n",
- "\n",
- " def aggregate(df, total_trip_column_name: str):\n",
- " agg = (\n",
- " df.groupby(subset)\n",
- " .agg({\"stop_sequence\": \"count\"})\n",
- " .reset_index()\n",
- " .rename(columns={\"stop_sequence\": total_trip_column_name})\n",
- " )\n",
- "\n",
- " return agg\n",
- "\n",
- " total_unusual_stops = aggregate(filtered_unusual_stops, \"total_unusual_stops\")\n",
- " total_stops = aggregate(original, \"total_stops\")\n",
- "\n",
- " # Merge them\n",
- " merge1 = pd.merge(total_unusual_stops, total_stops, on=subset, how=\"inner\")\n",
- "\n",
- " # Add some columns\n",
- " merge1[\"percent_of_unusual_stops\"] = ((merge1.total_unusual_stops / merge1.total_stops) * 100).astype(int)\n",
- " \n",
- " merge1[\"Percentage of Unusual Stops\"] = \"% of Unusual Stops: \" + merge1.percent_of_unusual_stops.astype(str)\n",
- "\n",
- " # Add dropdown menu\n",
- " merge1[\"Dropdown Menu\"] = merge1._gtfs_dataset_name + \" \" + merge1.shape_array_key\n",
- "\n",
- " # Clean\n",
- " merge1 = threshold_utils.pre_clean(merge1)\n",
- "\n",
- " return merge1"
+ "# first_last = find_first_last_points(test_route, test_trip, test_gtfs_key)"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "4a22aba2-00ee-4f11-bcf5-c3001cc34b9f",
+ "execution_count": 108,
+ "id": "5634169a-f26b-4174-b46a-3aa872bc1bdb",
"metadata": {},
"outputs": [],
"source": [
- "# Do not use melted version of the dataframe for second argument\n",
- "stop_info = stops_info(merge1, high_low_zero)"
+ "# len(first_last)"
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "23e7f746-b1b5-402f-92fd-dbc74840e013",
+ "cell_type": "markdown",
+ "id": "9aba7f4e-2b1a-4f1b-aaf2-1bc7bb3cf221",
"metadata": {},
- "outputs": [],
"source": [
- "merge1.shape_array_key.nunique(), high_low_zero.shape_array_key.nunique()"
+ "#### Sjoin "
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "7049c7e9-85bc-43c3-8f0c-7111fabbe649",
+ "execution_count": 109,
+ "id": "d535d059-efd9-49dd-9759-8663679ad5e1",
"metadata": {},
"outputs": [],
"source": [
- "stop_info.shape"
+ "def sjoin_vp_segments(segments: gpd.GeoDataFrame, vp_gdf: gpd.GeoDataFrame):\n",
+ " vp_in_seg = gpd.sjoin(\n",
+ " vp_gdf,\n",
+ " segments,\n",
+ " how = \"inner\",\n",
+ " predicate = \"within\"\n",
+ " )\n",
+ " \n",
+ " \n",
+ " return vp_in_seg"
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "c4fc643f-a156-419e-ad35-8ebc6da5d075",
+ "cell_type": "markdown",
+ "id": "1aeb604b-249b-41e7-be71-fe8d3205e54a",
"metadata": {
"tags": []
},
- "outputs": [],
- "source": [
- "stop_info.sort_values(['Percent Of Unusual Stops'], ascending = False).head(10)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8a25f067-956e-46a7-aa7a-5abf57e662f6",
- "metadata": {},
- "outputs": [],
"source": [
- "# Clean\n",
- "high_low_zero2 = threshold_utils.pre_clean(high_low_zero2)"
+ "#### Mapping"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "b5eea02c-05ec-4707-a06d-9de1864e8fbe",
+ "execution_count": 124,
+ "id": "1a785180-38bb-4d33-a96e-3385c84ed2f1",
"metadata": {},
"outputs": [],
"source": [
- "# Add dropdown menu\n",
- "high_low_zero2[\"Dropdown Menu\"] = (\n",
- " high_low_zero2[\"Gtfs Dataset Name\"] + \" \" + high_low_zero2[\"Shape Array Key\"]\n",
- ")"
+ "def display_maps(all_points: gpd.GeoDataFrame, \n",
+ " first_last_points: gpd.GeoDataFrame,\n",
+ " segments: gpd.GeoDataFrame,\n",
+ " sjoin_results: gpd.GeoDataFrame):\n",
+ " \n",
+ " base1 = segments.explore('flag', cmap= 'tab10', height = 400, width = 600, name = 'segments')\n",
+ " all_points_map = all_points.explore(m = base1, color = 'red',style_kwds = {'weight':6}, name= 'points')\n",
+ " \n",
+ " print('ALL POINTS')\n",
+ " display(all_points_map) \n",
+ " \n",
+ " \n",
+ " # Use geometry_left for the points layer and geometry_right (re-buffered by 35) for the segments layer\n",
+ " sjoin_points = sjoin_results.set_geometry('geometry_left')\n",
+ " sjoin_segments = sjoin_results.set_geometry('geometry_right')\n",
+ " sjoin_segments.geometry_right = sjoin_segments.geometry_right.buffer(35)\n",
+ " base3 = sjoin_segments.explore('flag', cmap= 'tab10', height = 400, width = 600, name = 'segments')\n",
+ " sjoin_map = sjoin_points.explore(m = base3, color = 'orange',style_kwds = {'weight':6}, name= 'points')\n",
+ " \n",
+ " print('SJOIN')\n",
+ " display(sjoin_map)\n",
+ " \n",
+ " base2 = segments.explore('flag', cmap= 'tab10', height = 400, width = 600, name = 'segments')\n",
+ " first_last_map = first_last_points.explore(m = base2, color = 'pink',style_kwds = {'weight':6},height = 400, width = 600,)\n",
+ " \n",
+ " print('FIRST AND LAST')\n",
+ " display(first_last_map)\n",
+ " "
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "571b471f-4a66-474f-8900-c3eaffde441e",
+ "execution_count": 125,
+ "id": "bab6bfc4-3d08-46fe-be33-6179ac5df34d",
"metadata": {},
"outputs": [],
"source": [
- "high_low_zero2[\"Route Type\"] = \"Route Type: \" + high_low_zero2[\n",
- " \"Loop Or Inlining\"\n",
- "].astype(str)"
+ "# display_maps(vehicle_positions,first_last,flagged_segments)"
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "072f39d9-4bf9-4efc-a393-24f096cecf7e",
+ "cell_type": "markdown",
+ "id": "f7ed9dc6-80ce-46f9-ae59-a5ed2bbcad50",
"metadata": {},
- "outputs": [],
"source": [
- "def alt_dropdown(df, col_for_dropdown: str, dropdown_menu_title: str):\n",
- " # Create dropdown menu\n",
- " # Exclude \"none\" operators which are only scheduled data\n",
- " df = df.loc[df[col_for_dropdown] != \"None\"][[col_for_dropdown]]\n",
- " dropdown_list = df[col_for_dropdown].unique().tolist()\n",
+ "#### Function\n",
"\n",
- " # Show only first operator by default\n",
- " initialize_first_op = sorted(dropdown_list)[0]\n",
- " input_dropdown = alt.binding_select(\n",
- " options=sorted(dropdown_list), name=dropdown_menu_title\n",
- " )\n",
+ "Previously tried routes:\n",
+ "test_route = \"106d979b9a9e6338827a8e1c145e69fd\"\n",
+ "test_sequence = 39\n",
+ "test_gtfs_key = \"db56b50ab86b5f7a4ae2fc2dd9889bbe\"\n",
+ "test_trip = '1088405'\n",
"\n",
- " selection = alt.selection_single(\n",
- " name=dropdown_menu_title,\n",
- " fields=[col_for_dropdown],\n",
- " bind=input_dropdown,\n",
- " init={col_for_dropdown: initialize_first_op},\n",
- " )\n",
+ "test_route2 = \"0fb4f3627996269dc7075276d3b69e36\"\n",
+ "test_gtfs_key2 = \"a4f6fd5552107e05fe9743ac7cce2c55\"\n",
+ "test_trip2 = \"16939095\"\n",
"\n",
- " return selection"
+ "test_route3 = \"07c9a47264a43d8d0d16ef7109e8fd68\"\n",
+ "test_gtfs_key3 = \"db56b50ab86b5f7a4ae2fc2dd9889bbe\"\n",
+ "test_trip3 = \"1089348\""
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "b7b429be-057c-4692-927e-92107b015ae6",
- "metadata": {},
- "outputs": [],
- "source": [
- "selection_test = alt_dropdown(high_low_zero2, \"Dropdown Menu\", \"Route\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2189ef37-baa1-4447-802b-f21362e73e03",
+ "execution_count": 126,
+ "id": "509bc2c0-14f3-4021-baf1-6d89a2409a79",
"metadata": {},
"outputs": [],
"source": [
- "alt.data_transformers.enable('default', max_rows=800000)"
+ "def stage2_trouble_shooting(flagged_df:pd.DataFrame,\n",
+ " date:str, \n",
+ " route:str, \n",
+ " trip:str, \n",
+ " gtfs_key:str):\n",
+ " unique_trips = import_unique_trips(gtfs_key, trip, route)\n",
+ " \n",
+ " # Find all recorded vps\n",
+ " vehicle_positions = import_vehicle_positions(unique_trips, gtfs_key, trip)\n",
+ " \n",
+ " # Flag each segment by whether any of its rows contains a 0m/0sec division\n",
+ " flagged_segments = import_segments(flagged_df, route, gtfs_key)\n",
+ " \n",
+ " # Find first and last pt kept\n",
+ " first_last = find_first_last_points(route, trip, gtfs_key)\n",
+ " \n",
+ " # Sjoin \n",
+ " sjoin_results = sjoin_vp_segments(flagged_segments,vehicle_positions)\n",
+ " \n",
+ " # Display maps\n",
+ " display_maps(vehicle_positions,first_last,flagged_segments,sjoin_results)\n",
+ " "
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "80014c5e-695d-4280-89cd-4e7e2bb3d302",
+ "cell_type": "markdown",
+ "id": "056dc4ec-dde7-4f5a-bcd6-8ebd5b9d6982",
"metadata": {},
- "outputs": [],
"source": [
- "alt.data_transformers.disable_max_rows()"
+ "#### Example Trip 1"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "4c975495-e436-4914-ace3-3f3c361a2c66",
- "metadata": {},
+ "execution_count": 127,
+ "id": "2bd26525-b824-4b1a-a2bd-817b2207e3fe",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
"outputs": [],
"source": [
- "high_low_zero2.columns"
+ "# subset[(subset.stop_sequence == test_sequence) & (subset.shape_array_key == test_route)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 128,
+ "id": "7a4ae095-d010-46b5-80b2-0bbe948f249f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ALL POINTS\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Make this Notebook Trusted to load map: File -> Trust Notebook
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "SJOIN\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Make this Notebook Trusted to load map: File -> Trust Notebook
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "FIRST AND LAST\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Make this Notebook Trusted to load map: File -> Trust Notebook
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "stage2_trouble_shooting(flagged_df= m3,\n",
+ " date = analysis_date,\n",
+ " route = test_route,\n",
+ " trip = test_trip,\n",
+ " gtfs_key = test_gtfs_key)"
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "e185065f-1d65-4720-9e72-a229c98fd1bd",
+ "cell_type": "markdown",
+ "id": "38c6d374-155d-4fcc-985a-7c892eaecb46",
"metadata": {},
- "outputs": [],
"source": [
- "len(high_low_zero2[['Route Type']].drop_duplicates())"
+ "#### Example Trip 2"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "aa0fad8b-b49e-48be-8070-adaf6e63d541",
+ "execution_count": 129,
+ "id": "268c5f17-df4a-4d84-b3b0-d76a2727bcf7",
"metadata": {},
"outputs": [],
"source": [
- "# https://github.com/altair-viz/altair/issues/1168\n",
- "title = (\n",
- " alt.Chart(high_low_zero2)\n",
- " .mark_text(dy=-40, size=15, fontWeight=\"normal\")\n",
- " .encode(\n",
- " text=\"Route Type:N\",\n",
- " )\n",
- " .add_selection(selection_test)\n",
- " .transform_filter(selection_test)\n",
- ")"
+ "test_route2 = \"805fef558a9bf81d57143cab635b27b1\"\n",
+ "test_gtfs_key2 = \"c0e3039da063db95ebabd3fe4ee611a4\"\n",
+ "test_trip2 = \"11083276_M11\""
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "1042a774-165c-4f7e-bfc9-c4d4980bd29b",
+ "execution_count": 141,
+ "id": "c0914a84-b24d-442f-aaca-acf401b9209c",
"metadata": {},
"outputs": [],
"source": [
- "total_stops_altair = (\n",
- " alt.Chart(stop_info)\n",
- " .mark_text(dy=-40, size=15, fontWeight=\"normal\")\n",
- " .encode(\n",
- " text=\"Percentage Of Unusual Stops:N\",\n",
- " )\n",
- " .add_selection(selection_test)\n",
- " .transform_filter(selection_test)\n",
- ")"
+ "# m1[(m1.stop_sequence == 17) & (m1.shape_array_key == test_route2)]"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "bbc75adc-739d-43b7-b29a-2018f98966f4",
+ "execution_count": 131,
+ "id": "deaa6fdf-37b8-49ee-97f2-46f74d41a449",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ALL POINTS\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Make this Notebook Trusted to load map: File -> Trust Notebook
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "SJOIN\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Make this Notebook Trusted to load map: File -> Trust Notebook
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "FIRST AND LAST\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Make this Notebook Trusted to load map: File -> Trust Notebook
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
- "total_stops_altair"
+ "stage2_trouble_shooting(flagged_df= m3,\n",
+ " date = analysis_date,\n",
+ " route = test_route2,\n",
+ " trip = test_trip2,\n",
+ " gtfs_key = test_gtfs_key2)"
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "6067e93b-3519-45fc-b027-11cbcc82d80f",
+ "cell_type": "markdown",
+ "id": "27d10dab-c7b0-4ddd-b70b-b3c6b7b3e579",
"metadata": {},
- "outputs": [],
"source": [
- "main_chart = (\n",
- " threshold_utils.chart_size(\n",
- " alt.Chart(high_low_zero2)\n",
- " .mark_tick(\n",
- " size=15,\n",
- " thickness=5,\n",
- " )\n",
- " .encode(\n",
- " x=\"Stop Sequence:N\",\n",
- " y=\"Value:Q\",\n",
- " color=alt.Color(\n",
- " \"Variable:N\", scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS)\n",
- " ),\n",
- " tooltip=high_low_zero2.columns.tolist(),\n",
- " )\n",
- " .interactive(),\n",
- " 1100,\n",
- " 400,\n",
- " )\n",
- " .add_selection(selection_test)\n",
- " .transform_filter(selection_test)\n",
- ")"
+ "#### Example Trip 3"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "c71654d2-48f1-4cfb-a3a2-d22bc85d97a8",
+ "execution_count": 150,
+ "id": "f951ac97-43af-452f-9cb7-d40f71c114c9",
"metadata": {},
"outputs": [],
"source": [
- "main_chart"
+ "test_route3 = \"edc5ab1a2be1d269306161ce38e0b2ad\"\n",
+ "test_gtfs_key3 = \"c0e3039da063db95ebabd3fe4ee611a4\"\n",
+ "test_trip3 = \"11042148_M11\""
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "c77e2503-e211-48cc-a220-d96f82ab72df",
+ "execution_count": 151,
+ "id": "26e85057-05a9-4606-af4a-7be3e08ae2a2",
"metadata": {},
"outputs": [],
"source": [
- "(title & total_stops_altair | main_chart)"
+ "# subset[(subset.stop_sequence == 34) & (subset.shape_array_key == test_route3)]"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "3e9e3056-137a-451e-a566-52e085499407",
- "metadata": {},
- "outputs": [],
- "source": [
- "print('hi')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5816ef70-bc10-4b87-b587-73f3789c5674",
+ "execution_count": 152,
+ "id": "91d07c20-9d78-4eea-8b9c-293df8ade5a3",
"metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ALL POINTS\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Make this Notebook Trusted to load map: File -> Trust Notebook
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "SJOIN\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Make this Notebook Trusted to load map: File -> Trust Notebook
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "FIRST AND LAST\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Make this Notebook Trusted to load map: File -> Trust Notebook
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
- "### Charts \n",
- "Test with a few routes first\n",
- "* Create new col that rounds up speed for plotting purposes only."
+ "stage2_trouble_shooting(flagged_df= m3,\n",
+ " date = analysis_date,\n",
+ " route = test_route3,\n",
+ " trip = test_trip3,\n",
+ " gtfs_key = test_gtfs_key3)"
]
},
{
"cell_type": "markdown",
- "id": "c1b099f7-c8e3-4c37-a70d-e765763448d7",
- "metadata": {
- "tags": []
- },
- "source": [
- "#### Manipulate DF for charts"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a6401179-2b91-444c-8c80-b97659f4225e",
- "metadata": {},
- "outputs": [],
- "source": [
- "m1 ="
- ]
- },
+ "id": "06df58c1-c7b7-4769-bd8d-696e337eefb3",
+ "metadata": {},
+ "source": [
+ "### Stage1: \"vp_usable\""
+ ]
+ },
{
- "cell_type": "code",
- "execution_count": null,
- "id": "576c28b2-8c36-48a2-b312-9fc54010f7b5",
+ "cell_type": "code",
+ "execution_count": 153,
+ "id": "b9eab37f-0569-4f07-9113-87200b0c7dfd",
"metadata": {},
- "outputs": [],
+ "outputs": [],
+ "source": [
+ "# What's the diff between stop segments normal/special/and without any notation?\n",
+ "usable = pd.read_parquet(f\"{SEGMENT_GCS}vp_usable_{analysis_date}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 154,
+ "id": "deb486c8-a800-485e-8a46-d994af1c0074",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " _gtfs_dataset_name | \n",
+ " trip_id | \n",
+ " location_timestamp | \n",
+ " location_timestamp_local | \n",
+ " activity_date | \n",
+ " hour | \n",
+ " x | \n",
+ " y | \n",
+ " vp_idx | \n",
+ " gtfs_dataset_key | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 8658254 | \n",
+ " San Diego Vehicle Positions | \n",
+ " 16848405 | \n",
+ " 2023-04-12 15:37:42+00:00 | \n",
+ " 2023-04-12 08:37:42 | \n",
+ " 2023-04-12 | \n",
+ " 8 | \n",
+ " -117.14 | \n",
+ " 32.79 | \n",
+ " 8658254 | \n",
+ " a4f6fd5552107e05fe9743ac7cce2c55 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " _gtfs_dataset_name trip_id location_timestamp \\\n",
+ "8658254 San Diego Vehicle Positions 16848405 2023-04-12 15:37:42+00:00 \n",
+ "\n",
+ " location_timestamp_local activity_date hour x y vp_idx \\\n",
+ "8658254 2023-04-12 08:37:42 2023-04-12 8 -117.14 32.79 8658254 \n",
+ "\n",
+ " gtfs_dataset_key \n",
+ "8658254 a4f6fd5552107e05fe9743ac7cce2c55 "
+ ]
+ },
+ "execution_count": 154,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "test1 = m1.melt(\n",
- " id_vars=[\n",
- " \"_gtfs_dataset_name\",\n",
- " \"shape_array_key\",\n",
- " \"trip_id\",\n",
- " \"sorted_stop_seq\",\n",
- " \"gtfs_dataset_key\",\n",
- " \"loop_or_inlining\",\n",
- " \"n_trips\",\n",
- " ],\n",
- " value_vars=[\"avg_speed_mph\", \"speed_mph\", \"p20_speed_mph\", \"p80_speed_mph\"],\n",
- ")"
+ "usable.sample()"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "2b55cb75-8ee8-477c-95bc-439d2ee65962",
+ "execution_count": 155,
+ "id": "813ae4db-0fef-4f10-9408-7284fc531ed2",
"metadata": {},
"outputs": [],
"source": [
- "test1.shape"
- ]
+ "m_cols2 = ['gtfs_dataset_key',\n",
+ " 'trip_id']"
+ ]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "29dc19fd-0d8b-430d-a78a-a1c947263ef0",
+ {
+ "cell_type": "code",
+ "execution_count": 156,
+ "id": "d08fa8db-f3a3-43f2-a763-a39cacc9cf9c",
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "NameError",
+ "evalue": "name 'subset_for_merge2' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[156], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msubset_for_merge2\u001b[49m\u001b[38;5;241m.\u001b[39mhead()\n",
+ "\u001b[0;31mNameError\u001b[0m: name 'subset_for_merge2' is not defined"
+ ]
+ }
+ ],
+ "source": [
+ "subset_for_merge2.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "68a59632-cfbd-43fd-aca4-a502e400a854",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
- "# test1[test1.shape_array_key == \"29d2bbdbeaec1d6888800f85bebf6e33\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e2c00548-7615-409c-a7cc-7db5f16c9a88",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Only need average speed/p20 speed/p80 to show up once for each stop sequence-operator-shape array\n",
- "test2 = test1.drop_duplicates(\n",
- " subset=[\n",
- " \"_gtfs_dataset_name\",\n",
- " \"shape_array_key\",\n",
- " \"sorted_stop_seq\",\n",
- " \"gtfs_dataset_key\",\n",
- " \"variable\",\n",
- " \"value\",\n",
- " ]\n",
- ").reset_index(drop=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "436aca32-61c9-4488-8fae-26fe66688851",
- "metadata": {},
- "outputs": [],
- "source": [
- "# test2.to_csv(\"./speeds.csv\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "be4bac6e-2d9f-4d64-a835-22bb4f3c32f5",
- "metadata": {},
- "outputs": [],
- "source": [
- "test2.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "5723b75c-0cc2-42e7-95e0-17b444971ee2",
- "metadata": {},
- "outputs": [],
- "source": [
- "other = [\n",
- " \"cf688717cf0cd8dac0e6d1f12f9c7333\",\n",
- " \"6f39f818c9a0c5496cd1c8bd1aa11e67\",\n",
- " \"3de4482ec32ba0f2edb451d3528b5a5e\",\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7b08d810-9e33-4a4f-b600-b5c8de9bdef6",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Take out routes that have over 85 stops\n",
- "# subset = test2[~test2.shape_array_key.isin(routes_many_stops_list)].reset_index(drop = True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "600d9ed7-524e-4af6-b90b-81fd89cf8c02",
- "metadata": {},
- "outputs": [],
- "source": [
- "subset = test2[\n",
- " test2.shape_array_key.isin(\n",
- " [\n",
- " \"29d2bbdbeaec1d6888800f85bebf6e33\",\n",
- " \"754c5b012195800c38dc58e72e4f482e\",\n",
- " \"e3c5ed2c6fa6cd5c5cd57d46aeb3cd8e\",\n",
- " ]\n",
- " )\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a92360af-5dfb-43e2-b89c-87c8b8268665",
- "metadata": {},
- "outputs": [],
- "source": [
- "subset = threshold_utils.pre_clean(subset)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "3a4c7ac2-4d6f-4702-a20b-0de69e0c86d4",
- "metadata": {},
- "outputs": [],
- "source": [
- "subset.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "52df69f0-7083-4551-b5f5-5dd88f30a69a",
- "metadata": {},
- "outputs": [],
- "source": [
- "subset.sample()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4f0d0a5e-186e-4b7e-b307-6e6326c3e747",
- "metadata": {},
- "outputs": [],
- "source": [
- "subset[\"Route\"] = subset[\"Gtfs Dataset Name\"] + \" \" + subset[\"Shape Array Key\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a8d96219-49d0-4251-9b4b-a6968542424d",
- "metadata": {},
- "outputs": [],
- "source": [
- "subset = subset.rename(columns={\"Value\": \"Speed\"})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b1155355-b4f6-41c2-beb1-f839d9d46027",
- "metadata": {},
- "outputs": [],
- "source": [
- "subset[\"Speed_Int\"] = subset.Speed.fillna(0).astype(int)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6f695b90-4157-42bd-b9cf-aaa4d71773ef",
- "metadata": {},
- "outputs": [],
- "source": [
- "subset[\"Route Type\"] = \"Loop or Inlining: \" + subset[\"Loop Or Inlining\"].astype(str)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0834a228-c47d-4760-ae13-aca72784e747",
- "metadata": {},
- "outputs": [],
- "source": [
- "# subset['Rounded Speed'].unique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "67cb1df7-6337-478d-bd9c-3fbc0a3f698f",
- "metadata": {},
- "outputs": [],
- "source": [
- "def speed(row):\n",
- " # If partner is none, return Unknown.\n",
- " if row.Speed_Int == 0:\n",
- " return 0\n",
- " elif 0 < row.Speed_Int < 6:\n",
- " return 5\n",
- " elif 5 < row.Speed_Int < 11:\n",
- " return 10\n",
- " elif 10 < row.Speed_Int < 16:\n",
- " return 15\n",
- " elif 15 < row.Speed_Int < 21:\n",
- " return 20\n",
- " elif 20 < row.Speed_Int < 26:\n",
- " return 25\n",
- " elif 25 < row.Speed_Int < 31:\n",
- " return 30\n",
- " else:\n",
- " return 35"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "de056258-240b-4aff-a3f9-aabe5330a9e0",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Apply the function\n",
- "subset[\"Rounded Speed\"] = subset.apply(speed, axis=1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "afc90938-3fbd-4235-b6ab-68cfa445f0b6",
- "metadata": {
- "scrolled": true,
- "tags": []
- },
- "outputs": [],
- "source": [
- "# subset[['Rounded Speed', 'Speed', 'Speed_Int']]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9a39e842-70f6-4ff6-84b7-44b4dea48a40",
- "metadata": {},
- "outputs": [],
- "source": [
- "subset.Variable = subset.Variable.str.title().str.replace(\"_\", \" \")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a95e775c-4cff-44b6-a861-1a17efb5fbf2",
- "metadata": {},
- "outputs": [],
- "source": [
- "# One df for the actual speeds\n",
- "subset_speedmph = subset[subset.Variable == \"Speed Mph\"].reset_index(drop=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "447aec9f-1a4a-408c-bc16-22c67746c262",
- "metadata": {},
- "outputs": [],
- "source": [
- "# One df for the percentiles\n",
- "subset_other = subset[subset.Variable != \"Speed Mph\"].reset_index(drop=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c2fd6ee4-c841-44fd-8659-0ee2bc8a926d",
- "metadata": {},
- "outputs": [],
- "source": [
- "selection_test = alt_dropdown(subset, \"Route\", \"Operator/Shape Array\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "27a02401-dc45-493b-baad-e4cb84b6db46",
- "metadata": {},
- "outputs": [],
- "source": [
- "title = title.add_selection(selection_test).transform_filter(selection_test)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c4904d76-f74e-4fd5-84e1-cd3c4476d010",
- "metadata": {},
- "source": [
- "#### Scatterplot"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9c8ae725-71da-4c21-a575-969d14a0aa17",
- "metadata": {},
- "source": [
- "#### Jitter"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e8c0fb00-e9a0-47c1-abc5-cea7a806a6df",
- "metadata": {},
- "outputs": [],
- "source": [
- "def create_jitter_plot(df):\n",
- "\n",
- " # title_op = df['Gtfs Dataset Name'].iloc[0].replace('VehiclePositions','').strip()\n",
- " # inline = df['Loop Or Inlining'].iloc[0]\n",
- " chart1 = (\n",
- " alt.Chart(df, width=0.5)\n",
- " .mark_circle(size=100)\n",
- " .encode(\n",
- " x=alt.X(\n",
- " \"jitter:Q\",\n",
- " title=None,\n",
- " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n",
- " scale=alt.Scale(),\n",
- " ),\n",
- " y=alt.Y(\n",
- " \"Rounded Speed:Q\",\n",
- " scale=alt.Scale(domain=[0, 50]),\n",
- " title=\"Speed (MPH)\",\n",
- " axis=alt.Axis(\n",
- " labelAngle=360,\n",
- " grid=False,\n",
- " ),\n",
- " ),\n",
- " color=alt.Color(\n",
- " \"Variable:N\",\n",
- " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n",
- " ),\n",
- " tooltip=df.columns.tolist(),\n",
- " column=alt.Column(\n",
- " \"Sorted Stop Seq:N\",\n",
- " header=alt.Header(\n",
- " labelAngle=360,\n",
- " titleOrient=\"top\",\n",
- " labelOrient=\"top\",\n",
- " labelAlign=\"right\",\n",
- " labelPadding=2,\n",
- " ),\n",
- " ),\n",
- " )\n",
- " .transform_calculate(\n",
- " # Generate Gaussian jitter with a Box-Muller transform\n",
- " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n",
- " )\n",
- " .properties(title=\"Speeds by Operator-Shape Array\")\n",
- " )\n",
- "\n",
- " chart1 = threshold_utils.chart_size(chart1, 75, 200)\n",
- "\n",
- " return chart1"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "19b97ca6-17c9-4540-acfe-9a4caf740ef4",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart1 = (\n",
- " create_jitter_plot(subset_speedmph)\n",
- " .add_selection(selection_test)\n",
- " .transform_filter(selection_test)\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c13568df-5e19-4f0a-a9fe-c3edeb0a0073",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart2 = (\n",
- " alt.Chart(subset_other, width=0.5)\n",
- " .mark_circle(size=200)\n",
- " .encode(\n",
- " x=alt.X(\n",
- " \"jitter:Q\",\n",
- " title=None,\n",
- " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n",
- " scale=alt.Scale(),\n",
- " ),\n",
- " y=alt.Y(\n",
- " \"Rounded Speed:Q\",\n",
- " title=\"Speed (MPH)\",\n",
- " scale=alt.Scale(domain=[0, 50]),\n",
- " axis=alt.Axis(grid=False),\n",
- " ),\n",
- " color=alt.Color(\n",
- " \"Variable:N\",\n",
- " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n",
- " ),\n",
- " tooltip=subset_other.columns.tolist(),\n",
- " column=alt.Column(\n",
- " \"Sorted Stop Seq:N\",\n",
- " header=alt.Header(\n",
- " labelAngle=360,\n",
- " title=None,\n",
- " titleOrient=\"top\",\n",
- " labelOrient=\"top\",\n",
- " labelAlign=\"right\",\n",
- " labelPadding=2,\n",
- " ),\n",
- " ),\n",
- " )\n",
- " .transform_calculate(\n",
- " # Generate Gaussian jitter with a Box-Muller transform\n",
- " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n",
- " )\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2a2bda8b-d48c-44f3-a928-08aca894c565",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart2 = threshold_utils.chart_size(chart2, 75, 200)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "dd20293d-7c43-42c4-b053-de945860b6f0",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart2 = chart2.add_selection(selection_test).transform_filter(selection_test)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "44dc6896-e95d-42df-bbd5-f2bb2c2a2cc6",
- "metadata": {},
- "outputs": [],
- "source": [
- "title = threshold_utils.chart_size(title, 20, 20)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "d651c8c6-179c-4157-a671-11006cb419df",
- "metadata": {},
- "outputs": [],
- "source": [
- "alt.data_transformers.enable(\"default\", max_rows=None)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8e753711-8506-4939-8d9d-22566a641988",
- "metadata": {},
- "outputs": [],
- "source": [
- "title & (chart1.interactive() & chart2.interactive())"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9745f078-1999-4e5c-9ddc-5eba135f55ab",
- "metadata": {},
- "source": [
- "### Draft"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "64011717-6707-4a7a-8eec-9bdb0861bab6",
- "metadata": {},
- "outputs": [],
- "source": [
- "def meter_elapsed_categories(row):\n",
- " lower_end = row[\"meters_mean\"] - row[\"meters_std\"]\n",
- " higher_end = row[\"meters_mean\"] + row[\"meters_std\"]\n",
- " if row[\"meters_elapsed\"] == row[\"meters_mean\"]:\n",
- " return \"distance elapsed is average\"\n",
- " elif row[\"meters_elapsed\"] <= lower_end:\n",
- " return \"distance lapsed on lower end\"\n",
- " elif row[\"meters_elapsed\"] >= higher_end:\n",
- " return \"distance lapsed on higher end\"\n",
- " elif lower_end < row[\"meters_elapsed\"] < higher_end:\n",
- " return \"distance elapsed is average\"\n",
- " else:\n",
- " return \"other\"\n",
- "\n",
- "\n",
- "def seconds_elapsed_categories(row):\n",
- " lower_end = row[\"secs_mean\"] - row[\"secs_std\"]\n",
- " higher_end = row[\"secs_mean\"] + row[\"secs_std\"]\n",
- " if row[\"sec_elapsed\"] == row[\"secs_mean\"]:\n",
- " return \"secs elapsed is average\"\n",
- " elif row[\"sec_elapsed\"] <= lower_end:\n",
- " return \"secs lapsed on lower end\"\n",
- " elif row[\"sec_elapsed\"] >= higher_end:\n",
- " return \"secs lapsed on higher end\"\n",
- " elif lower_end < row[\"sec_elapsed\"] < higher_end:\n",
- " return \"secs elapsed is average\"\n",
- " else:\n",
- " return \"other\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "fa26aa94-6e1c-46ef-8701-3f4a849faa7d",
- "metadata": {},
- "outputs": [],
- "source": [
- "\"\"\"def mph_categories(row):\n",
- " if (row[\"speed_mph\"] <= row[\"p20_speed_mph\"]):\n",
- " return \"speed low\"\n",
- " elif (row[\"p20_speed_mph\"] < row[\"speed_mph\"] < row[\"p80_speed_mph\"]):\n",
- " return \"speed average\"\n",
- " elif (row[\"speed_mph\"] >= row[\"p80_speed_mph\"]):\n",
- " return \"speed high\"\n",
- " elif (row[\"speed_mph\"] == 0):\n",
- " return \"speed is 0\"\n",
- " else:\n",
- " return \"other\"\n",
- " \"\"\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e784cd39-b73e-478a-8b3b-458506c31a18",
- "metadata": {},
- "outputs": [],
- "source": [
- "def flag(row):\n",
- "\n",
- " # Ok rows\n",
- " # If distance and time are average, flag as average\n",
- " if (row[\"meters_cat\"] == \"meters_ elapsed avg\") & (\n",
- " row[\"seconds_cat\"] == \"seconds_ elapsed avg\"\n",
- " ):\n",
- " return \"ok\"\n",
- " # If MPH is average, flag as average\n",
- " elif row[\"speed_flags\"] == \"speed_ elapsed avg\":\n",
- " return \"ok\"\n",
- "\n",
- " # Zero rows\n",
- " elif (\n",
- " (row[\"speed_mph\"] == 0)\n",
- " | (row[\"sec_elapsed\"] == 0)\n",
- " | (row[\"meters_elapsed\"] == 0)\n",
- " ):\n",
- " return \"low\"\n",
- "\n",
- " # If meters and seconds are high, flag as average\n",
- " # elif ((row[\"meters_cat\"] == \"meters_ elapsed high\") & (row[\"seconds_cat\"] == \"seconds_ elapsed high\")):\n",
- " # return \"ok\"\n",
- " # If meters and seconds are low, flag as average\n",
- " # elif ((row[\"meters_cat\"] == \"meters_ elapsed low\") & (row[\"seconds_cat\"] == \"seconds_ elapsed low\")):\n",
- " # return \"ok\"\n",
- "\n",
- " # Tag as high\n",
- " # elif ((row[\"meters_cat\"] != \"meters_ elapsed avg\") & (row[\"seconds_cat\"] != \"seconds_ elapsed avg\") & (row[\"speed_flags\"] == \"speed_ elapsed high\")):\n",
- " # return \"high\"\n",
- " # elif ((row[\"seconds_cat\"] == \"seconds_ elapsed low\") & (row[\"speed_flags\"] == \"speed_ elapsed high\")):\n",
- " # return \"high\"\n",
- " # elif ((row[\"meters_cat\"] == \"meters_ elapsed high\") & (row[\"speed_flags\"] == \"speed_ elapsed high\")):\n",
- " # return \"high\"\n",
- "\n",
- " # Tag as low\n",
- " elif (\n",
- " (row[\"meters_cat\"] != \"meters_ elapsed avg\")\n",
- " & (row[\"seconds_cat\"] != \"seconds_ elapsed avg\")\n",
- " & (row[\"speed_flags\"] == \"speed_ elapsed low\")\n",
- " ):\n",
- " return \"low\"\n",
- " elif (row[\"seconds_cat\"] == \"seconds_ elapsed high\") & (\n",
- " row[\"speed_flags\"] == \"speed_ elapsed low\"\n",
- " ):\n",
- " return \"low\"\n",
- " elif (row[\"meters_cat\"] == \"meters_ elapsed avg\") & (\n",
- " row[\"speed_flags\"] == \"speed_ elapsed low\"\n",
- " ):\n",
- " return \"high\"\n",
- "\n",
- " else:\n",
- " return \"other\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "05d629ed-abfd-477e-b87b-7e321a6d0831",
- "metadata": {},
- "outputs": [],
- "source": [
- "def speed_categories(row):\n",
- " \"\"\"\n",
- " Stricter thresholds for speed categories.\n",
- " Just because a speed is below the 25th or\n",
- " above the 75th percentile doesn't mean it\n",
- " should be flagged. Take into account how far away\n",
- " it is from that.\n",
- " \"\"\"\n",
- " # lower_end = (row[\"speed_mean\"] - row[\"speed_std\"])\n",
- " # higher_end = (row[\"speed_mean\"] + row[\"speed_std\"])\n",
- " if row[\"speed_mph\"] == row[\"avg_speed_mph\"]:\n",
- " return \"average\"\n",
- " elif row[\"speed_mph\"] <= lower_end:\n",
- " return \"speed low\"\n",
- " elif row[\"speed_mph\"] >= higher_end:\n",
- " return \"speed high\"\n",
- " elif (row[\"speed_mph\"] == 0) | (row[\"speed_mph\"] == None):\n",
- " return \"speed is 0\"\n",
- " else:\n",
- " return \"average\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "86a5c48c-6720-41a8-b626-94305506ab3a",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Determine if an agency has a small, medium, or large fleet size.\n",
- "def categorize_by_percentile(df, column_percentile: str):\n",
- "\n",
- " # Get percentiles in objects for total vehicle.\n",
- " p75 = df[column_percentile].quantile(0.75).astype(float)\n",
- " p25 = df[column_percentile].quantile(0.25).astype(float)\n",
- " p50 = df[column_percentile].quantile(0.50).astype(float)\n",
- "\n",
- " def percentile(row):\n",
- " if row[column_percentile] <= p25:\n",
- " return f\"{column_percentile}: low\"\n",
- " elif (p25 < row[column_percentile]) and (row[column_percentile] <= p75):\n",
- " return f\"{column_percentile}: average\"\n",
- " elif row[column_percentile] > p75:\n",
- " return f\"{column_percentile}: high\"\n",
- " else:\n",
- " return \"other\"\n",
- "\n",
- " df[f\"{column_percentile}_cat\"] = df.apply(lambda x: percentile(x), axis=1)\n",
- "\n",
- " return df\n",
- "\n",
- "\n",
- "def categorize_all(df):\n",
- "\n",
- " # Hold results\n",
- " final = pd.DataFrame()\n",
- "\n",
- " for column in [\"meters_elapsed\", \"sec_elapsed\"]:\n",
- " for shape_array_key in df.shape_array_key.tolist():\n",
- " for stop in df.stop_sequence.tolist():\n",
- " filtered = df[\n",
- " (df.shape_array_key == shape_array_key) & (df.stop_sequence == stop)\n",
- " ].reset_index()\n",
- " categorized = categorize_by_percentile(filtered, column)\n",
- " final = pd.concat([final, categorized], axis=0)\n",
- " print(f\"done for {column}/{stop}\")\n",
- "\n",
- " return final"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1bb82075-6269-472e-8582-27a7640f0aa5",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4f25a17e-cea1-4bff-888d-d8f37dc31775",
- "metadata": {},
- "outputs": [],
- "source": [
- "\"\"\"\n",
- "p25 = troubleshoot.total_stops.quantile(0.25).astype(float)\n",
- "p50 = troubleshoot.total_stops.quantile(0.50).astype(float)\n",
- "p75 = troubleshoot.total_stops.quantile(0.75).astype(float)\n",
- "p95 = troubleshoot.total_stops.quantile(0.95).astype(float)\n",
- "p99 = troubleshoot.total_stops.quantile(0.99).astype(float)\n",
- "\"\"\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0e676189-df19-4630-8dc6-3c4fc8c60e16",
- "metadata": {},
- "outputs": [],
- "source": [
- "def stop_categories1(row):\n",
- " if (row.total_stops > 0) and (row.total_stops <= p25):\n",
- " return \"25th <= 17 stops\"\n",
- " elif (row.total_stops > p25) and (row.total_stops <= p75):\n",
- " return \"50th <= 30 stops\"\n",
- " elif (row.total_stops > p75) and (row.total_stops <= p95):\n",
- " return \"75th <= 50 stops\"\n",
- " elif (row.total_stops > p95) and (row.total_stops <= p99):\n",
- " return \"95th <= 85 stops\"\n",
- " elif row.total_stops >= p95:\n",
- " return \"99th >= 203 stops\"\n",
- " else:\n",
- " return \"other\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "af72c1c9-df07-4e7f-973f-f0c07223b7a4",
- "metadata": {},
- "outputs": [],
- "source": [
- "def create_jitter_plot(df):\n",
- "\n",
- " title_op = df[\"Gtfs Dataset Name\"].iloc[0].replace(\"VehiclePositions\", \"\").strip()\n",
- " inline = df[\"Loop Or Inlining\"].iloc[0]\n",
- "\n",
- " chart1 = (\n",
- " alt.Chart(df, width=0.5)\n",
- " .mark_circle(size=100)\n",
- " .encode(\n",
- " x=alt.X(\n",
- " \"jitter:Q\",\n",
- " title=None,\n",
- " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n",
- " scale=alt.Scale(),\n",
- " ),\n",
- " y=alt.Y(\"Rounded Speed:Q\", axis=alt.Axis(labelAngle=360)),\n",
- " color=alt.Color(\n",
- " \"Variable:N\",\n",
- " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n",
- " ),\n",
- " tooltip=df.columns.tolist(),\n",
- " column=alt.Column(\n",
- " \"Stop Sequence:N\",\n",
- " header=alt.Header(\n",
- " labelAngle=360,\n",
- " titleOrient=\"top\",\n",
- " labelOrient=\"bottom\",\n",
- " labelAlign=\"right\",\n",
- " labelPadding=2,\n",
- " ),\n",
- " ),\n",
- " )\n",
- " .transform_calculate(\n",
- " # Generate Gaussian jitter with a Box-Muller transform\n",
- " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n",
- " )\n",
- " .properties(title=f\"{title_op} - Route Type {inline}\")\n",
- " )\n",
- "\n",
- " chart1 = threshold_utils.chart_size(chart1, 40, 250)\n",
- "\n",
- " return chart1"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "18ebd177-efc1-4bdc-9fac-bf947db810a4",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart2 = (\n",
- " alt.Chart(anaheim_test, width=0.5)\n",
- " .mark_circle(size=100)\n",
- " .encode(\n",
- " x=alt.X(\n",
- " \"jitter:Q\",\n",
- " title=None,\n",
- " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n",
- " scale=alt.Scale(),\n",
- " ),\n",
- " y=alt.Y(\"rounded_speed:Q\", axis=alt.Axis(labelAngle=-90)),\n",
- " color=alt.Color(\n",
- " \"variable:N\",\n",
- " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n",
- " ),\n",
- " tooltip=anaheim_test.columns.tolist(),\n",
- " column=alt.Column(\n",
- " \"stop_sequence:N\",\n",
- " header=alt.Header(\n",
- " labelAngle=360,\n",
- " titleOrient=\"top\",\n",
- " labelOrient=\"bottom\",\n",
- " labelAlign=\"right\",\n",
- " labelPadding=2,\n",
- " ),\n",
- " ),\n",
- " )\n",
- " .transform_calculate(\n",
- " # Generate Gaussian jitter with a Box-Muller transform\n",
- " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n",
- " )\n",
- " .configure_facet(spacing=0)\n",
- " .configure_view(stroke=None)\n",
- " .properties(title=\"Trip Duration by RT Category\")\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a7287305-20d8-434f-9756-42b18cf172a4",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart2 = threshold_utils.chart_size(chart2, 80, 300)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "3f9e9258-b954-45bf-bd7a-17822c9c607a",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart2"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ea1d8514-ae9c-42e6-b59a-49a44fecbc98",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart1 = (\n",
- " alt.Chart(anaheim_test_speedmph, width=0.5)\n",
- " .mark_circle(size=100)\n",
- " .encode(\n",
- " x=alt.X(\n",
- " \"jitter:Q\",\n",
- " title=None,\n",
- " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n",
- " scale=alt.Scale(),\n",
- " ),\n",
- " y=alt.Y(\"rounded_speed:Q\", axis=alt.Axis(labelAngle=-90)),\n",
- " color=alt.Color(\n",
- " \"stop_sequence:N\",\n",
- " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n",
- " ),\n",
- " tooltip=anaheim_test.columns.tolist(),\n",
- " column=alt.Column(\n",
- " \"stop_sequence:N\",\n",
- " header=alt.Header(\n",
- " labelAngle=360,\n",
- " titleOrient=\"top\",\n",
- " labelOrient=\"bottom\",\n",
- " labelAlign=\"right\",\n",
- " labelPadding=2,\n",
- " ),\n",
- " ),\n",
- " )\n",
- " .transform_calculate(\n",
- " # Generate Gaussian jitter with a Box-Muller transform\n",
- " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n",
- " )\n",
- " .configure_facet(spacing=0)\n",
- " .configure_view(stroke=None)\n",
- " .properties(title=f\"Loop/Inling:{anaheim_test_speedmph.loop_or_inlining.iloc[0]}\")\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4e1eea88-640b-4e96-b2d5-07cd2f19afdf",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart1 = threshold_utils.chart_size(chart1, 80, 300)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c54559c6-8fb0-4109-8e7d-84c1ecb6497e",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart1"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "79375476-00a1-4b85-a532-26889fc465e4",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0e5c052e-cccd-4f75-be65-9658cf7616b1",
- "metadata": {},
- "outputs": [],
- "source": [
- "def create_dot_plot2(\n",
- " df,\n",
- " col_for_dots: str,\n",
- " x_axis_col: str,\n",
- " y_axis_col: str,\n",
- " tooltip_cols: list,\n",
- " chart_title: str,\n",
- "):\n",
- "\n",
- " chart = (\n",
- " alt.Chart(df)\n",
- " .mark_circle(opacity=1, size=100)\n",
- " .transform_window(id=\"rank()\", groupby=[col_for_dots])\n",
- " .encode(\n",
- " alt.X(\n",
- " f\"{x_axis_col}:O\",\n",
- " sort=\"descending\",\n",
- " axis=alt.Axis(ticks=False, grid=True),\n",
- " ),\n",
- " alt.Y(f\"{y_axis_col}:N\"),\n",
- " color=alt.Color(\n",
- " f\"{col_for_dots}:N\",\n",
- " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n",
- " legend=None,\n",
- " ),\n",
- " tooltip=tooltip_cols,\n",
- " )\n",
- " .properties(title=chart_title)\n",
- " )\n",
- "\n",
- " return chart"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e38644ab-7fb8-47b1-b692-512b55f05625",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart3 = create_dot_plot1(\n",
- " anaheim_test_other,\n",
- " \"variable\",\n",
- " \"stop_sequence\",\n",
- " \"rounded_speed\",\n",
- " anaheim_test_other.columns.tolist(),\n",
- " \"Percentile/Average\",\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2b6df39b-b074-494d-8f5e-65fe05356f4b",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart3 = threshold_utils.chart_size(chart3, 650, 300)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c2c565de-3c55-4827-a5bb-b48c6d49d4c1",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart4 = create_dot_plot2(\n",
- " anaheim_test_speedmph,\n",
- " \"variable\",\n",
- " \"stop_sequence\",\n",
- " \"rounded_speed\",\n",
- " anaheim_test_speedmph.columns.tolist(),\n",
- " \"Speed per Trip\",\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "29b3b7b1-8f44-4a8d-97c3-ff934250e874",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart4 = threshold_utils.chart_size(chart4, 650, 300)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "54146c88-15df-49e2-9de6-18f2666d9f9f",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart4"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "3ccbc087-f799-4429-b6c7-a9c1f38f021d",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart3 + chart4"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9829d749-580c-4440-9397-4562192417b7",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart7 = (\n",
- " alt.Chart(anaheim_test_other, width=0.5)\n",
- " .mark_circle(size=100)\n",
- " .encode(\n",
- " x=alt.X(\n",
- " \"jitter:Q\",\n",
- " title=None,\n",
- " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n",
- " scale=alt.Scale(),\n",
- " ),\n",
- " y=alt.Y(\"rounded_speed:Q\", axis=alt.Axis(labelAngle=-90)),\n",
- " color=alt.Color(\n",
- " \"variable:N\",\n",
- " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n",
- " ),\n",
- " tooltip=anaheim_test.columns.tolist(),\n",
- " column=alt.Column(\n",
- " \"stop_sequence:N\",\n",
- " header=alt.Header(\n",
- " labelAngle=-90,\n",
- " titleOrient=\"top\",\n",
- " labelOrient=\"bottom\",\n",
- " labelAlign=\"right\",\n",
- " labelPadding=2,\n",
- " ),\n",
- " ),\n",
- " )\n",
- " .transform_calculate(\n",
- " # Generate Gaussian jitter with a Box-Muller transform\n",
- " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n",
- " )\n",
- " .configure_facet(spacing=0)\n",
- " .configure_view(stroke=None)\n",
- " .properties(title=\"Trip Duration by RT Category\")\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "454980fc-e6ab-4d53-b844-120a075b8034",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart7 = threshold_utils.chart_size(chart7, 80, 300)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "58e57fb2-1c3b-47e1-8bb6-d4941f0ee985",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart8 = (\n",
- " alt.Chart(anaheim_test_other, width=0.5)\n",
- " .mark_circle(size=100)\n",
- " .encode(\n",
- " x=alt.X(\n",
- " \"jitter:Q\",\n",
- " title=None,\n",
- " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n",
- " scale=alt.Scale(),\n",
- " ),\n",
- " y=alt.Y(\"rounded_speed:Q\", axis=alt.Axis(labelAngle=-90)),\n",
- " color=alt.Color(\n",
- " \"variable:N\",\n",
- " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n",
- " ),\n",
- " tooltip=anaheim_test.columns.tolist(),\n",
- " column=alt.Column(\n",
- " \"stop_sequence:N\",\n",
- " header=alt.Header(\n",
- " labelAngle=-90,\n",
- " titleOrient=\"top\",\n",
- " labelOrient=\"bottom\",\n",
- " labelAlign=\"right\",\n",
- " labelPadding=2,\n",
- " ),\n",
- " ),\n",
- " )\n",
- " .transform_calculate(\n",
- " # Generate Gaussian jitter with a Box-Muller transform\n",
- " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n",
- " )\n",
- " .properties(title=\"Trip Duration by RT Category\")\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c2ce0879-f45f-4c66-9b1a-870a243a1260",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart9 = (\n",
- " alt.Chart(anaheim_test_speedmph, width=0.5)\n",
- " .mark_circle(size=100)\n",
- " .encode(\n",
- " x=alt.X(\n",
- " \"jitter:Q\",\n",
- " title=None,\n",
- " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n",
- " scale=alt.Scale(),\n",
- " ),\n",
- " y=alt.Y(\"rounded_speed:Q\", axis=alt.Axis(labelAngle=-90)),\n",
- " color=alt.Color(\n",
- " \"variable:N\",\n",
- " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n",
- " ),\n",
- " tooltip=anaheim_test.columns.tolist(),\n",
- " column=alt.Column(\n",
- " \"stop_sequence:N\",\n",
- " header=alt.Header(\n",
- " labelAngle=360,\n",
- " titleOrient=\"top\",\n",
- " labelOrient=\"bottom\",\n",
- " labelAlign=\"right\",\n",
- " labelPadding=2,\n",
- " ),\n",
- " ),\n",
- " )\n",
- " .transform_calculate(\n",
- " # Generate Gaussian jitter with a Box-Muller transform\n",
- " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n",
- " )\n",
- " .properties(title=f\"Loop/Inling:{anaheim_test_speedmph.loop_or_inlining.iloc[0]}\")\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "04e3ce9a-a5ab-4aeb-a7d0-4ead6738324b",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart8"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "04d2026e-c040-45ea-aab6-2c09c2211632",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart9 | chart8"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "edb11b29-7038-4a5d-9e44-4d3b88e6cc88",
- "metadata": {},
- "outputs": [],
- "source": [
- "# pip install altair==5.0.0rc3\n",
- "chart5 = (\n",
- " alt.Chart(anaheim_test_speedmph, title=\"Normally distributed jitter\")\n",
- " .mark_circle(size=50)\n",
- " .encode(\n",
- " y=\"rounded_speed:Q\",\n",
- " x=\"stop_sequence:N\",\n",
- " yOffset=\"jitter:Q\",\n",
- " color=alt.Color(\"stop_sequence:Q\").legend(None),\n",
- " )\n",
- " .transform_calculate(\n",
- " # Generate Gaussian jitter with a Box-Muller transform\n",
- " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n",
- " )\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9bbcc880-7866-495c-9a68-3b7ca6c00c4a",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart5 = threshold_utils.chart_size(chart5, 650, 300)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a765f55e-73e2-4bae-9326-98ed1bbfadaf",
- "metadata": {},
- "outputs": [],
- "source": [
- "chart5"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c911a105-147a-4b8a-a741-5b67a8cf710a",
- "metadata": {
- "tags": []
- },
- "source": [
- "#### Look at one trip"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f02f2c0b-f5fc-4452-b39b-bb1a443bc727",
- "metadata": {},
- "outputs": [],
- "source": [
- "# foothill_og = speed_stops2[speed_stops2.trip_id == \"t604-b2791-sl5\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "173615fa-c942-4d85-9ca8-64852b706d1f",
- "metadata": {},
- "outputs": [],
- "source": [
- "# len(foothill_og)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "28521af6-c34e-4b20-9831-3177722b9b46",
- "metadata": {},
- "outputs": [],
- "source": [
- "# foothill_og.stop_sequence.nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1136e4c7-2e5b-492d-8ae6-299a04164ac3",
- "metadata": {},
- "outputs": [],
- "source": [
- "# foothill_og.stop_sequence.describe()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9cee6f4c-8786-478a-97ff-702be25d0788",
- "metadata": {},
- "outputs": [],
- "source": [
- "# foothill_og.sort_values('stop_sequence').head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "97ec3ff5-bcda-4edd-9ba6-50821923dd98",
- "metadata": {},
- "outputs": [],
- "source": [
- "# foothill_renumbered_stop_seq = m2[m2.trip_id == \"t604-b2791-sl5\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f2733f91-b890-4818-a649-d79bd6f9a16a",
- "metadata": {},
- "outputs": [],
- "source": [
- "# foothill_renumbered_stop_seq['Test Stop Sequence'].describe()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c9b04672-2b10-4d06-b84a-1fd92a6a78ac",
- "metadata": {},
- "outputs": [],
- "source": [
- "# foothill_renumbered_stop_seq.sort_values('stop_sequence').head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9581ead1-5d91-44a4-a9d3-299445d55056",
- "metadata": {},
- "outputs": [],
- "source": [
- "# len(troubleshoot)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1f50eb55-9373-4004-b23b-8129db734c1b",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Number of test stops should match stop sequence...\n",
- "# troubleshoot['sequences_are_equal'] = troubleshoot['Test Stop Sequence'] - troubleshoot['stop_sequence']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "46843106-2e31-46c9-9220-2f12a3e6a4fb",
- "metadata": {},
- "outputs": [],
- "source": [
- "# troubleshoot['sequences_are_equal'].value_counts()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "34d189fa-632c-40d7-a793-e576645e879e",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Look at this trip id in the original df\n",
- "# og_trip = speed_stops2[speed_stops2.trip_id == \"t640-b15FF1-sl5\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "34b1e2f4-fe20-4029-9ec1-e5cbaaf73178",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Look at this trip id in the manipulated df\n",
- "# new_trip = m2[m2.trip_id == \"t640-b15FF1-sl5\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "81190a32-907b-4a5d-818a-ab5c2740dbc3",
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "# og_trip.shape, og_trip.stop_sequence.nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "ae8d06ad-3016-4909-b766-c370ef074aae",
- "metadata": {},
- "outputs": [],
- "source": [
- "# new_trip.shape, new_trip.stop_sequence.nunique()"
+ "# m2[m2.trip_id == '1350']"
]
}
],