From 723a577c99f06d4858bebeb4d58d0b0468c683f1 Mon Sep 17 00:00:00 2001 From: amandaha8 Date: Tue, 13 Jun 2023 00:10:59 +0000 Subject: [PATCH 1/9] corrected tagging categories --- rt_segment_speeds/12_speeds.ipynb | 693 +++++++++++------------------- 1 file changed, 253 insertions(+), 440 deletions(-) diff --git a/rt_segment_speeds/12_speeds.ipynb b/rt_segment_speeds/12_speeds.ipynb index 677f7b36f..e214d27a5 100644 --- a/rt_segment_speeds/12_speeds.ipynb +++ b/rt_segment_speeds/12_speeds.ipynb @@ -15,7 +15,7 @@ "import geopandas as gpd\n", "import pandas as pd\n", "from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs\n", - "from segment_speed_utils.project_vars import analysis_date\n", + "from segment_speed_utils.project_vars import analysis_date,SEGMENT_GCS, COMPILED_CACHED_VIEWS, PROJECT_CRS\n", "from shared_utils import calitp_color_palette as cp" ] }, @@ -35,14 +35,11 @@ { "cell_type": "code", "execution_count": null, - "id": "1e9f79a4-5921-4e8c-82c5-3b414f677cf8", - "metadata": { - "tags": [] - }, + "id": "80014c5e-695d-4280-89cd-4e7e2bb3d302", + "metadata": {}, "outputs": [], "source": [ - "# Flag\n", - "# routes_many_stops_df, routes_many_stops_list = speed_utils.find_shapes_with_many_stops(analysis_date)" + "alt.data_transformers.disable_max_rows()" ] }, { @@ -53,6 +50,16 @@ "### Merging" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "701ea573-bac3-453a-ae63-0e6f8ccf0033", + "metadata": {}, + "outputs": [], + "source": [ + "analysis_date" + ] + }, { "cell_type": "code", "execution_count": null, @@ -116,48 +123,6 @@ "merge1.sample()" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "7fa72f28-fd46-4fd9-9f4a-120812a482da", - "metadata": {}, - "outputs": [], - "source": [ - "segments_file = \"stop_segments\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "77f1e38e-6dc4-4e79-9eed-569375a133fe", - "metadata": {}, - "outputs": [], - "source": [ - "stop_segments = 
pd.read_parquet(\n", - " f\"{speed_utils.GCS_PATH}{segments_file}_{analysis_date}.parquet\"\n", - ").drop(columns=[\"geometry\", \"geometry_arrowized\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cdaf396a-49d0-4da6-a3f8-3080f1f838b0", - "metadata": {}, - "outputs": [], - "source": [ - "stop_segments.sample()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b994dc1c-2a11-4376-9d43-f61db486e6eb", - "metadata": {}, - "outputs": [], - "source": [ - "# pd.merge(stop_segments, merge1, on = ['gtfs_dataset_key','shape_array_key','stop_sequence','loop_or_inlining'], how = \"inner\", indicator = True)[['_merge']].value_counts()" - ] - }, { "cell_type": "code", "execution_count": null, @@ -168,46 +133,6 @@ "merge1.shape" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "6483a59e-1d44-4fa9-b057-8ce5230126c2", - "metadata": {}, - "outputs": [], - "source": [ - "# m1 = speed_utils.merge_all_speeds(analysis_date)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e9d651da-95f8-425b-9a7c-28781b70a595", - "metadata": {}, - "outputs": [], - "source": [ - "# m1.sample()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20c1eeac-34e6-417c-9354-31cfc3ea9096", - "metadata": {}, - "outputs": [], - "source": [ - "# m1.groupby(['shape_array_key']).agg({'trip_id':'nunique'}).sort_values(['trip_id'], ascending = False).head(30)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "48b197ec-add3-46d3-a2bf-ff91c1ff15be", - "metadata": {}, - "outputs": [], - "source": [ - "merge1.shape_array_key.unique()" - ] - }, { "cell_type": "markdown", "id": "801c89ce-0e7f-4758-a38a-201cc843ef28", @@ -224,17 +149,12 @@ "outputs": [], "source": [ "test_shapes = [\n", - " \"00093e1c28352239174c92c4f07a483b\",\n", - " \"001254fc8105d01a8064046249c0ceba\",\n", - " \"00b40413c13a48046de6e2338aee0410\",\n", - " \"e3c5ed2c6fa6cd5c5cd57d46aeb3cd8e\",\n", - " 
\"efa0f969b4499620b80c9b82170e2e60\",\n", - " \"00093e1c28352239174c92c4f07a483b\",\n", - " \"001254fc8105d01a8064046249c0ceba\",\n", - " \"6388c0be232f0c745df85d66689a6db0\",\n", - " \"d8b0826e923620f7b7cd74be090de936\",\n", - " \"e7012e8847c179f713daee0f158233e4\",\n", - " \"11d91cab41cde51a6d4f623b9cba867c\"\n", + " '000624bd8453dbe4f2eb2765b04bcb98',\n", + " '000cf9d06f53da9b54fdd44e6d5eff27',\n", + " '00255a44e09390ac3381f7184a18f0c1', \n", + " 'ffeb8a6113e0fdcd18e95257bb5be9cb',\n", + " 'fffa6a34b26647eae8aea4172c83eba1',\n", + " 'fffe83eb37f2bc4982a4cc10ef8cb2d9'\n", "]" ] }, @@ -245,19 +165,29 @@ "metadata": {}, "outputs": [], "source": [ - " few_routes = merge1.loc[merge1.shape_array_key.isin(test_shapes)].reset_index(drop=True)" + "few_routes = merge1.loc[merge1.shape_array_key.isin(test_shapes)].reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": null, - "id": "913ac9e5-41ed-43c3-86ad-2b13b141d17c", + "id": "7dae3fdc-d8c0-4e50-8e11-5af4804c9a76", "metadata": {}, "outputs": [], "source": [ "# few_routes = merge1.copy()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9cb974d-08fe-49a6-ba7f-f29fd53c771e", + "metadata": {}, + "outputs": [], + "source": [ + "few_routes.shape_array_key.nunique()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -289,11 +219,11 @@ "\n", " agg1 = (\n", " df.groupby([\"shape_array_key\", \"stop_sequence\"])[column_percentile]\n", - " .describe(percentiles=[0.15, 0.5, 0.95])\n", + " .describe(percentiles=[0.05, 0.95])\n", " .reset_index()\n", " .add_prefix(column_str)\n", " )\n", - "\n", + " print('done grouping')\n", " merge1 = pd.merge(\n", " df,\n", " agg1,\n", @@ -304,20 +234,16 @@ " f\"{column_str}stop_sequence\",\n", " ],\n", " )\n", - "\n", + " print('done merging')\n", " def percentile(row):\n", "\n", " if row[column_percentile] == row[f\"{column_str}mean\"]:\n", " return f\"{column_str} elapsed avg\"\n", - " elif row[column_percentile] == row[f\"{column_str}50%\"]:\n", - 
" return f\"{column_str} elapsed avg\"\n", - " elif row[column_percentile] <= row[f\"{column_str}15%\"]:\n", + " elif row[column_percentile] <= row[f\"{column_str}5%\"]:\n", " return f\"{column_str} elapsed low\"\n", " elif row[column_percentile] == 0:\n", " return f\"{column_str} elapsed is 0\"\n", - " elif (\n", - " row[f\"{column_str}15%\"] < row[column_percentile] <= row[f\"{column_str}95%\"]\n", - " ):\n", + " elif (row[f\"{column_str}5%\"] < row[column_percentile] <= row[f\"{column_str}95%\"]):\n", " return f\"{column_str} elapsed avg\"\n", "\n", " elif row[column_percentile] > row[f\"{column_str}95%\"]:\n", @@ -325,12 +251,22 @@ "\n", " else:\n", " return \"other\"\n", - "\n", + " print('done tagging')\n", " merge1[f\"{column_str}cat\"] = merge1.apply(lambda x: percentile(x), axis=1)\n", + " merge1[f\"{column_str}cat\"] = merge1[f\"{column_str}cat\"].str.replace('_','')\n", " print(f\"Done with {column_str}\")\n", " return merge1" ] }, + { + "cell_type": "markdown", + "id": "728cbf4b-30a8-4929-b653-1b9a7f615bc4", + "metadata": {}, + "source": [ + "#### To Do\n", + "Change to dask maybe?" 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -340,14 +276,13 @@ "source": [ "def categorize_meters_speeds(df):\n", " start = datetime.datetime.now()\n", + " \n", " print(f\"Begin: {start}\")\n", " df.speed_mph = df.speed_mph.fillna(0)\n", " df = categorize_by_percentile(df, \"meters_elapsed\", \"meters_\")\n", " df = categorize_by_percentile(df, \"sec_elapsed\", \"seconds_\")\n", - " df = categorize_by_percentile(df, \"speed_mph\", \"speed_\")\n", - " df = df.rename(columns={\"speed_cat\": \"speed_flags\"})\n", " end = datetime.datetime.now()\n", - " print(f\"Finish: {end}\")\n", + " print(f\"Finish: {end-start}\")\n", " return df" ] }, @@ -366,453 +301,440 @@ { "cell_type": "code", "execution_count": null, - "id": "1f8fa537-8807-4544-8568-7d999ca9ecac", + "id": "fa44d793-ba32-40be-975b-cbe213a35392", "metadata": {}, "outputs": [], "source": [ - "# few_routes_cat.columns" + "few_routes_cat.head()" ] }, { "cell_type": "code", "execution_count": null, - "id": "1e6db506-6e09-49c7-84a1-02160178573d", + "id": "e016eb5a-0039-4063-ade4-c871e01c8a16", "metadata": {}, "outputs": [], "source": [ - "subset = [\n", - " \"stop_sequence\",\n", - " \"speed_flags\",\n", - " \"speed_mph\",\n", - " \"speed_15%\",\n", - " \"speed_50%\",\n", - " \"speed_95%\",\n", - " \"meters_cat\",\n", - " \"meters_elapsed\",\n", - " \"meters_mean\",\n", - " \"meters_15%\",\n", - " \"meters_50%\",\n", - " \"meters_95%\",\n", - " \"seconds_cat\",\n", - " \"sec_elapsed\",\n", - " \"seconds_mean\",\n", - " \"seconds_15%\",\n", - " \"seconds_50%\",\n", - " \"seconds_95%\",\n", - " \"gtfs_dataset_key\",\n", - "]" + "few_routes_cat.groupby([ \"meters_cat\", \"seconds_cat\",]).agg(\n", + " {\"trip_id\": \"count\"}\n", + ").reset_index().sort_values([\"trip_id\"], ascending=False)" ] }, { "cell_type": "code", "execution_count": null, - "id": "f5ae87ad-a3a4-4630-ac8c-b7ce711c2fb7", + "id": "963353ae-52ab-4ca4-bef5-b3649b6b74c3", "metadata": {}, "outputs": [], "source": [ - 
"few_routes_cat.speed_flags.value_counts()" + "def flag(row):\n", + " \n", + " if (row[\"meters_elapsed\"] == 0) & (row[\"sec_elapsed\"] == 0):\n", + " return \"division by 0\"\n", + " \n", + " elif (row[\"meters_cat\"] == \"meters elapsed low\"):\n", + " return \"meters too low\"\n", + "\n", + " elif (row[\"seconds_cat\"] == \"seconds elapsed high\"):\n", + " return \"seconds too high\"\n", + " \n", + " else:\n", + " return \"ok\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "83ede9e9-a76d-4e57-9fac-4cc2ce1af0c5", + "id": "e9994532-e658-480e-b009-8ad7ef6392b5", "metadata": {}, "outputs": [], "source": [ - "few_routes_cat.speed_flags.value_counts() / len(few_routes) * 100" + "few_routes_cat[\"unusual_flag\"] = few_routes_cat.apply(lambda x: flag(x), axis=1)" ] }, { "cell_type": "code", "execution_count": null, - "id": "e016eb5a-0039-4063-ade4-c871e01c8a16", + "id": "d7340f9e-62ae-4684-9415-a8d9189fb3f9", "metadata": {}, "outputs": [], "source": [ - "few_routes_cat.groupby([\"speed_flags\", \"meters_cat\", \"seconds_cat\",]).agg(\n", - " {\"trip_id\": \"count\"}\n", - ").reset_index().sort_values([\"trip_id\"], ascending=False)" + "few_routes_cat.unusual_flag.value_counts() / len(few_routes_cat) * 100" ] }, { "cell_type": "code", "execution_count": null, - "id": "87860e54-fd6e-42d1-a38f-613fea4a77e9", + "id": "40c0937e-de60-4508-a2aa-056209649c4b", "metadata": {}, "outputs": [], "source": [ - "# 65d9589130415c685b89f4f7c2d8bd7e 65" + "few_routes_cat.unusual_flag.value_counts()" ] }, { "cell_type": "code", "execution_count": null, - "id": "bc6df377-9c23-4b01-80de-8c977b797c47", + "id": "afdb5cf4-cc61-454f-bdb5-8261e91bedf3", "metadata": {}, "outputs": [], "source": [ - "# few_routes_cat[few_routes_cat.speed_flags == \"average\"][subset].sample(3)" + "subset=['unusual_flag','shape_array_key', 'stop_sequence', 'stop_id',\n", + " 'median_speed_mph','p20_speed_mph',\n", + " 'p80_speed_mph', '_gtfs_dataset_name', 'trip_id',\n", + " 'meters_elapsed',\n", + 
" 'sec_elapsed', 'speed_mph', 'meters_5%', 'meters_50%', 'meters_95%', \n", + " 'meters_cat',\n", + " 'seconds_5%', 'seconds_50%', 'seconds_95%',\n", + " 'seconds_cat', ]" ] }, { "cell_type": "code", "execution_count": null, - "id": "49c58012-004e-4c47-becf-20e7f18895d3", - "metadata": {}, + "id": "9978f816-a208-4dac-9eb8-154ba9e58d6b", + "metadata": { + "scrolled": true, + "tags": [] + }, "outputs": [], "source": [ - "# few_routes_cat[(few_routes_cat.meters_cat == \"meters_ elapsed avg\") & (few_routes_cat.seconds_cat == \"seconds_ elapsed avg\") & (few_routes_cat.speed_flags == \"speed low\")][subset]" + "div_zero = few_routes_cat[few_routes_cat.unusual_flag == \"division by 0\"]" ] }, { "cell_type": "code", "execution_count": null, - "id": "54f38944-af7a-47bf-a799-882899964c6a", + "id": "252db83a-4633-42c5-acfd-83e535b4bacc", "metadata": {}, "outputs": [], "source": [ - "# few_routes_cat[(few_routes_cat.meters_cat == \"meters_ elapsed low\") & (few_routes_cat.seconds_cat == \"seconds_ elapsed avg\") & (few_routes_cat.speed_flags == \"speed low\")][subset].sample(3)" + "div_zero[subset].sample(5)" ] }, { "cell_type": "code", "execution_count": null, - "id": "8b28c22b-9b7c-41ab-b3cc-36661c8439e5", + "id": "21432512-a578-4541-bd5c-132e3985e062", "metadata": {}, "outputs": [], "source": [ - "# few_routes_cat[(few_routes_cat.meters_cat == \"meters_ elapsed high\") & (few_routes_cat.seconds_cat == \"seconds_ elapsed low\") & (few_routes_cat.speed_flags == \"speed high\")][subset].sample(3)" + "stc_3 = few_routes_cat[\n", + " (few_routes_cat.stop_sequence == 3)\n", + " & (\n", + " few_routes_cat._gtfs_dataset_name\n", + " == \"Bay Area 511 Santa Clara Transit VehiclePositions\"\n", + " )\n", + "]" ] }, { "cell_type": "code", "execution_count": null, - "id": "0c00506f-00ed-4660-9c88-aa11bc925fd2", - "metadata": {}, + "id": "dd744395-e5f3-43d6-94af-8866b2be959d", + "metadata": { + "scrolled": true, + "tags": [] + }, "outputs": [], "source": [ - "# 
few_routes_cat[(few_routes_cat.meters_cat == \"meters_ elapsed high\") & (few_routes_cat.seconds_cat == \"seconds_ elapsed high\") & (few_routes_cat.speed_flags == \"speed low\")][subset].sample(3)" + "stc_3.loc[stc_3.unusual_flag != 'ok'][subset]" + ] + }, + { + "cell_type": "markdown", + "id": "e01d88e4-a3e2-43f1-9591-afe83e8a92cf", + "metadata": {}, + "source": [ + "#### Summarize" ] }, { "cell_type": "code", "execution_count": null, - "id": "17bceddf-6c73-466d-8a8c-115370ab3301", + "id": "bdd2b274-87a7-4306-8494-f65416ac88fb", "metadata": {}, "outputs": [], "source": [ - "# few_routes_cat[(few_routes_cat.meters_cat == \"meters_ elapsed avg\") & (few_routes_cat.seconds_cat == \"seconds_ elapsed high\") & (few_routes_cat.speed_flags == \"speed low\")][subset].sample(3)" + "high_low_zero = few_routes_cat[\n", + " few_routes_cat.unusual_flag != \"ok\"].reset_index()" ] }, { "cell_type": "code", "execution_count": null, - "id": "b8248dd3-c7de-4655-a8a2-9f7b486d01c1", - "metadata": { - "scrolled": true, - "tags": [] - }, + "id": "ebea8b6f-a011-4996-be52-9e10ec1f8342", + "metadata": {}, "outputs": [], "source": [ - "# few_routes_cat[(few_routes_cat.shape_array_key == \"d8b0826e923620f7b7cd74be090de936\") & (few_routes_cat.stop_sequence == 1)][subset]" + "few_routes_cat.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "56d7c123-5002-4ba5-916c-ed5d9097126c", - "metadata": { - "scrolled": true, - "tags": [] - }, + "id": "987d3500-7594-4d12-a9d0-45d2c19999d9", + "metadata": {}, "outputs": [], "source": [ - "# few_routes_cat[(few_routes_cat.stop_sequence == 65) & (few_routes_cat.gtfs_dataset_key == \"65d9589130415c685b89f4f7c2d8bd7e\")][subset].sort_values(by = ['speed_mph'])" + "high_low_zero.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "963353ae-52ab-4ca4-bef5-b3649b6b74c3", + "id": "630e745f-dd71-4cc2-bad6-e17c666900a8", "metadata": {}, "outputs": [], "source": [ - "def flag(row):\n", - "\n", - " # Ok rows\n", - " # If distance 
and time are average, flag as average\n", - " if (row[\"meters_cat\"] == \"meters_ elapsed avg\") & (row[\"seconds_cat\"] == \"seconds_ elapsed avg\"):\n", - " return \"ok\"\n", - " # If MPH is average, flag as average\n", - " elif row[\"speed_flags\"] == \"speed_ elapsed avg\":\n", - " return \"ok\"\n", - "\n", - " # Zero rows\n", - " elif ((row[\"speed_mph\"] == 0) | (row[\"sec_elapsed\"] == 0) | (row[\"meters_elapsed\"] == 0)):\n", - " return \"low\"\n", - "\n", - " # Tag as high\n", - " elif row[\"speed_flags\"] == \"speed_ elapsed high\":\n", - " return \"high\"\n", - "\n", - " # Tag as low\n", - " elif row[\"speed_flags\"] == \"speed_ elapsed low\":\n", - " return \"low\"\n", - "\n", - " else:\n", - " return \"other\"" + "len(high_low_zero.drop_duplicates())" ] }, { "cell_type": "code", "execution_count": null, - "id": "e9994532-e658-480e-b009-8ad7ef6392b5", + "id": "daa82644-dc46-409a-83ab-a86ea996c356", "metadata": {}, "outputs": [], "source": [ - "few_routes_cat[\"unusual_flag\"] = few_routes_cat.apply(lambda x: flag(x), axis=1)" + "len(few_routes_cat)-len(high_low_zero)" ] }, { "cell_type": "code", "execution_count": null, - "id": "f652b637-2682-4b00-895d-d1809bab7d12", + "id": "229c6d49-58e1-45bd-839f-b03f7e8cdd4a", "metadata": {}, "outputs": [], "source": [ - "len(few_routes_cat) == len(merge1)" + "def summarize(original: pd.DataFrame, filtered_unusual_stops: pd.DataFrame):\n", + "\n", + " subset = [\"_gtfs_dataset_name\", \"gtfs_dataset_key\", \"shape_array_key\"]\n", + "\n", + " def aggregate(df, total_trip_column_name: str):\n", + " agg = (\n", + " df.groupby(subset)\n", + " .agg({\"stop_sequence\": \"count\"})\n", + " .reset_index()\n", + " .rename(columns={\"stop_sequence\": total_trip_column_name})\n", + " )\n", + "\n", + " return agg\n", + "\n", + " total_unusual_stops = aggregate(filtered_unusual_stops, \"total_unusual_stops\")\n", + " total_stops = aggregate(original, \"total_stops\")\n", + "\n", + " # Merge them\n", + " merge1 = 
pd.merge(total_unusual_stops, total_stops, on=subset, how=\"inner\")\n", + "\n", + " # Add some columns\n", + " merge1[\"percent_of_unusual_stops\"] = ((merge1.total_unusual_stops / merge1.total_stops) * 100).astype(int)\n", + " \n", + " merge1[\"Percentage of Unusual Stops\"] = \"% of Unusual Stops: \" + merge1.percent_of_unusual_stops.astype(str)\n", + "\n", + " # Add dropdown menu\n", + " #merge1[\"Dropdown Menu\"] = merge1._gtfs_dataset_name + \" \" + merge1.shape_array_key\n", + "\n", + " # Clean\n", + " merge1 = merge1.sort_values(['percent_of_unusual_stops'], ascending = False)\n", + " merge1 = merge1.drop(columns = [\"percent_of_unusual_stops\"])\n", + " \n", + " merge1 = threshold_utils.pre_clean(merge1)\n", + " return merge1" ] }, { "cell_type": "code", "execution_count": null, - "id": "d7340f9e-62ae-4684-9415-a8d9189fb3f9", + "id": "dad7c9b8-9025-4d91-bf75-0e23c3ac2a52", "metadata": {}, "outputs": [], "source": [ - "few_routes_cat.unusual_flag.value_counts() / len(few_routes_cat) * 100" + "summarize(few_routes_cat, high_low_zero)" + ] + }, + { + "cell_type": "markdown", + "id": "d0282d6c-a580-4776-b94a-995bd7052f37", + "metadata": {}, + "source": [ + "### Visualize " ] }, { "cell_type": "code", "execution_count": null, - "id": "40c0937e-de60-4508-a2aa-056209649c4b", + "id": "4bfd50d8-37fb-4460-b022-78711ee84a11", "metadata": {}, "outputs": [], "source": [ - "few_routes_cat.unusual_flag.value_counts()" + "high_low_zero_sample = high_low_zero.sample(40)" ] }, { "cell_type": "code", "execution_count": null, - "id": "347a8e6b-162a-4079-b258-22017bad83e9", + "id": "5884edc6-c507-490d-95dd-352b26048719", "metadata": {}, "outputs": [], "source": [ - "subset2 = [\"unusual_flag\", \"_gtfs_dataset_name\"] + subset" + "high_low_zero_sample.unusual_flag.value_counts()" ] }, { "cell_type": "code", "execution_count": null, - "id": "9978f816-a208-4dac-9eb8-154ba9e58d6b", + "id": "50182e58-f92e-48e7-9ae3-006e400465b5", "metadata": { "scrolled": true, "tags": [] }, 
"outputs": [], "source": [ - "high_df = few_routes_cat[few_routes_cat.unusual_flag == \"high\"]" + "high_low_zero_sample[subset]" ] }, { "cell_type": "code", "execution_count": null, - "id": "3335a25d-8447-49b1-b885-ede8234ec16a", + "id": "88e3c94c-9b5c-4008-bbe7-8be4979b1dba", "metadata": {}, "outputs": [], "source": [ - "low_df = few_routes_cat[few_routes_cat.unusual_flag == \"low\"]" + "trips = list(high_low_zero_sample.trip_id.unique())" ] }, { "cell_type": "code", "execution_count": null, - "id": "21432512-a578-4541-bd5c-132e3985e062", + "id": "b7d48690-de9f-45d7-af31-410b376a6d7e", "metadata": {}, "outputs": [], "source": [ - "stc_3 = few_routes_cat[\n", - " (few_routes_cat.stop_sequence == 3)\n", - " & (\n", - " few_routes_cat._gtfs_dataset_name\n", - " == \"Bay Area 511 Santa Clara Transit VehiclePositions\"\n", - " )\n", - "]" + "stops = list(high_low_zero_sample.stop_id.unique())" ] }, { "cell_type": "code", "execution_count": null, - "id": "44a2debc-f188-4411-95a7-c9f822ea7f3c", + "id": "a1aba28a-92f5-417c-a0d3-e31bf80e3a7d", "metadata": {}, "outputs": [], "source": [ - "test1 = few_routes_cat[\n", - " (few_routes_cat.speed_flags == \"speed_ elapsed avg\")\n", - " & (few_routes_cat.meters_cat == \"meters_ elapsed low\")\n", - " & (few_routes_cat.seconds_cat == \"seconds_ elapsed low\")\n", - "]" + "vehicle_positions = gpd.read_parquet(\"gs://calitp-analytics-data/data-analyses/rt_segment_speeds/vp_2023-04-12.parquet\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "5f817b1c-f215-486c-ac3c-e75bc8fb9393", + "id": "140112e0-8ee0-481d-848c-4db091460418", "metadata": {}, "outputs": [], "source": [ - "# test1[subset2].sample(100)" + "type(vehicle_positions)" ] }, { "cell_type": "code", "execution_count": null, - "id": "f7b6bb03-dc37-478b-a1d5-ab5f2e59a556", - "metadata": { - "scrolled": true, - "tags": [] - }, + "id": "6a9f8a29-b004-474c-8e5d-cd5628d3337c", + "metadata": {}, "outputs": [], "source": [ - "# high_df[subset2].sample(10)" + 
"vehicle_positions.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "58b91d5d-9f9a-47e9-a5fe-b7f7e0e64587", + "id": "d843325a-f33c-40de-a3cb-befed24d645e", "metadata": {}, "outputs": [], "source": [ - "metro_62 = few_routes_cat[\n", - " (few_routes_cat.stop_sequence == 62)\n", - " & (few_routes_cat._gtfs_dataset_name == \"LA Metro Bus Vehicle Positions\")\n", - "]" + "vehicle_positions2 = vehicle_positions[vehicle_positions.trip_id.isin(trips)].reset_index()" ] }, { "cell_type": "code", "execution_count": null, - "id": "43ff6e38-5084-463f-a444-68e3bffdd7cc", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "stop_16 = few_routes_cat[\n", - " (few_routes_cat.stop_sequence == 16)\n", - " & (few_routes_cat._gtfs_dataset_name == \"Bay Area 511 Muni VehiclePositions\")\n", - "]" - ] - }, - { - "cell_type": "markdown", - "id": "e01d88e4-a3e2-43f1-9591-afe83e8a92cf", + "id": "8840809b-dd6f-4c0e-a68b-0a37f508df14", "metadata": {}, + "outputs": [], "source": [ - "#### Should filter even further." 
+ "vehicle_positions2.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "bdd2b274-87a7-4306-8494-f65416ac88fb", + "id": "fa6e826a-7aaf-4830-9f8e-7b26be84c149", "metadata": {}, "outputs": [], "source": [ - "high_low_zero = few_routes_cat[\n", - " few_routes_cat.unusual_flag.isin([\"high\", \"low\"])\n", - "].reset_index()" + "vehicle_positions2.sample()" ] }, { "cell_type": "code", "execution_count": null, - "id": "ebea8b6f-a011-4996-be52-9e10ec1f8342", + "id": "896b5c73-0835-4ee7-a4f2-9d960778fe35", "metadata": {}, "outputs": [], "source": [ - "few_routes_cat.shape" + "gdf1 = pd.merge(vehicle_positions2, high_low_zero_sample, how = \"inner\", on = ['gtfs_dataset_key','_gtfs_dataset_name','trip_id'])" ] }, { "cell_type": "code", "execution_count": null, - "id": "987d3500-7594-4d12-a9d0-45d2c19999d9", + "id": "c82d7834-567b-4de4-b465-aea8c1a62715", "metadata": {}, "outputs": [], "source": [ - "high_low_zero.shape" + "gdf1 = gdf1[gdf1.stop_id.isin(stops)]" ] }, { "cell_type": "code", "execution_count": null, - "id": "630e745f-dd71-4cc2-bad6-e17c666900a8", + "id": "a86cec8e-e7fd-48ce-9692-c766df2b68e8", "metadata": {}, "outputs": [], "source": [ - "len(high_low_zero.drop_duplicates())" + "gdf1.columns" ] }, { "cell_type": "code", "execution_count": null, - "id": "daa82644-dc46-409a-83ab-a86ea996c356", + "id": "3bf3c599-2aec-4ae7-b8c8-53a4eff8795a", "metadata": {}, "outputs": [], "source": [ - "len(few_routes_cat)-len(high_low_zero)" + "gdf1[['geometry','stop_id','stop_sequence','_gtfs_dataset_name','shape_array_key','speed_mph']].explore('stop_sequence')" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "8b912560-11ed-4aad-bcc3-ffb9e7966c24", + "cell_type": "markdown", + "id": "cde431f9-10ad-484f-b954-dd3c13a6e683", "metadata": { - "scrolled": true, "tags": [] }, - "outputs": [], "source": [ - "# To plot\n", - "# all_trips = one_route3.melt(id_vars=[ '_gtfs_dataset_name','shape_array_key','trip_id', 
'stop_sequence','gtfs_dataset_key','loop_or_inlining',\n", - "#'n_trips'], value_vars=['avg_speed_mph','speed_mph','p20_speed_mph', 'p80_speed_mph'])" + "#### Draft\n", + "* Show which stops are excluded from flags\n", + "* Show how many stops are dropped\n", + "* Show % of stops that were flagged compared to total stops." ] }, { "cell_type": "code", "execution_count": null, - "id": "bc09e6d6-9811-4c50-9f61-e00af00f0b83", + "id": "f6d7831f-aed2-4e87-aae1-8ab6ddc08666", "metadata": {}, "outputs": [], "source": [ - "# all_trips = all_trips.drop_duplicates(subset = [ '_gtfs_dataset_name','shape_array_key','stop_sequence','gtfs_dataset_key','variable','value']).reset_index(drop = True)" - ] - }, - { - "cell_type": "markdown", - "id": "cde431f9-10ad-484f-b954-dd3c13a6e683", - "metadata": {}, - "source": [ - "#### Other ideas\n", - "* Show which stops are excluded from flags\n", - "* Show how many stops are dropped\n", - "* Show % of stops that were flagged compared to total stops." + "stop" ] }, { @@ -834,8 +756,9 @@ " \"meters_cat\",\n", " \"seconds_cat\",\n", " \"unusual_flag\",\n", + " \"time_of_day\",\n", " ],\n", - " value_vars=[\"avg_speed_mph\", \"speed_mph\", \"p20_speed_mph\", \"p80_speed_mph\"],\n", + " value_vars=[\"median_speed_mph\", \"speed_mph\", \"p20_speed_mph\", \"p80_speed_mph\"],\n", ")" ] }, @@ -849,10 +772,9 @@ "high_low_zero2 = high_low_zero2.drop_duplicates(\n", " subset=[\n", " \"loop_or_inlining\",\n", - " \"_gtfs_dataset_name\",\n", " \"shape_array_key\",\n", " \"stop_sequence\",\n", - " \"gtfs_dataset_key\",\n", + " \"time_of_day\",\n", " \"variable\",\n", " \"value\",\n", " ]\n", @@ -869,58 +791,6 @@ "high_low_zero2.shape" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "229c6d49-58e1-45bd-839f-b03f7e8cdd4a", - "metadata": {}, - "outputs": [], - "source": [ - "def stops_info(original: pd.DataFrame, filtered_unusual_stops: pd.DataFrame):\n", - "\n", - " subset = [\"_gtfs_dataset_name\", \"gtfs_dataset_key\", 
\"shape_array_key\"]\n", - "\n", - " def aggregate(df, total_trip_column_name: str):\n", - " agg = (\n", - " df.groupby(subset)\n", - " .agg({\"stop_sequence\": \"count\"})\n", - " .reset_index()\n", - " .rename(columns={\"stop_sequence\": total_trip_column_name})\n", - " )\n", - "\n", - " return agg\n", - "\n", - " total_unusual_stops = aggregate(filtered_unusual_stops, \"total_unusual_stops\")\n", - " total_stops = aggregate(original, \"total_stops\")\n", - "\n", - " # Merge them\n", - " merge1 = pd.merge(total_unusual_stops, total_stops, on=subset, how=\"inner\")\n", - "\n", - " # Add some columns\n", - " merge1[\"percent_of_unusual_stops\"] = ((merge1.total_unusual_stops / merge1.total_stops) * 100).astype(int)\n", - " \n", - " merge1[\"Percentage of Unusual Stops\"] = \"% of Unusual Stops: \" + merge1.percent_of_unusual_stops.astype(str)\n", - "\n", - " # Add dropdown menu\n", - " merge1[\"Dropdown Menu\"] = merge1._gtfs_dataset_name + \" \" + merge1.shape_array_key\n", - "\n", - " # Clean\n", - " merge1 = threshold_utils.pre_clean(merge1)\n", - "\n", - " return merge1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4a22aba2-00ee-4f11-bcf5-c3001cc34b9f", - "metadata": {}, - "outputs": [], - "source": [ - "# Do not use melted version of the dataframe for second argument\n", - "stop_info = stops_info(merge1, high_low_zero)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -931,28 +801,6 @@ "merge1.shape_array_key.nunique(), high_low_zero.shape_array_key.nunique()" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "7049c7e9-85bc-43c3-8f0c-7111fabbe649", - "metadata": {}, - "outputs": [], - "source": [ - "stop_info.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c4fc643f-a156-419e-ad35-8ebc6da5d075", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "stop_info.sort_values(['Percent Of Unusual Stops'], ascending = False).head(10)" - ] - }, { "cell_type": "code", 
"execution_count": null, @@ -1028,46 +876,6 @@ "selection_test = alt_dropdown(high_low_zero2, \"Dropdown Menu\", \"Route\")" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "2189ef37-baa1-4447-802b-f21362e73e03", - "metadata": {}, - "outputs": [], - "source": [ - "alt.data_transformers.enable('default', max_rows=800000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "80014c5e-695d-4280-89cd-4e7e2bb3d302", - "metadata": {}, - "outputs": [], - "source": [ - "alt.data_transformers.disable_max_rows()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c975495-e436-4914-ace3-3f3c361a2c66", - "metadata": {}, - "outputs": [], - "source": [ - "high_low_zero2.columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e185065f-1d65-4720-9e72-a229c98fd1bd", - "metadata": {}, - "outputs": [], - "source": [ - "len(high_low_zero2[['Route Type']].drop_duplicates())" - ] - }, { "cell_type": "code", "execution_count": null, @@ -1094,7 +902,7 @@ "metadata": {}, "outputs": [], "source": [ - "total_stops_altair = (\n", + "\"\"\"total_stops_altair = (\n", " alt.Chart(stop_info)\n", " .mark_text(dy=-40, size=15, fontWeight=\"normal\")\n", " .encode(\n", @@ -1102,17 +910,7 @@ " )\n", " .add_selection(selection_test)\n", " .transform_filter(selection_test)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bbc75adc-739d-43b7-b29a-2018f98966f4", - "metadata": {}, - "outputs": [], - "source": [ - "total_stops_altair" + ")\"\"\"" ] }, { @@ -1149,11 +947,11 @@ { "cell_type": "code", "execution_count": null, - "id": "c71654d2-48f1-4cfb-a3a2-d22bc85d97a8", + "id": "da0d6aad-26c3-439b-93d2-ba5a3abac77d", "metadata": {}, "outputs": [], "source": [ - "main_chart" + "high_low_zero2.shape" ] }, { @@ -1169,31 +967,54 @@ { "cell_type": "code", "execution_count": null, - "id": "3e9e3056-137a-451e-a566-52e085499407", + "id": "9f709e13-aa1e-44da-9027-dfafaead5dad", "metadata": {}, 
"outputs": [], "source": [ - "print('hi')" + "high_low_zero.shape_array_key.unique()" ] }, { - "cell_type": "markdown", - "id": "5816ef70-bc10-4b87-b587-73f3789c5674", + "cell_type": "code", + "execution_count": null, + "id": "24c99824-35b7-476c-9dd2-8e07e075bb4d", "metadata": {}, + "outputs": [], "source": [ - "### Charts \n", - "Test with a few routes first\n", - "* Create new col that rounds up speed for plotting purposes only." + "merge1.time_of_day.value_counts()" ] }, { - "cell_type": "markdown", - "id": "c1b099f7-c8e3-4c37-a70d-e765763448d7", + "cell_type": "code", + "execution_count": null, + "id": "670047c2-2b24-4df3-a6f9-5f399063f521", "metadata": { + "scrolled": true, "tags": [] }, + "outputs": [], + "source": [ + "stc_test = merge1[(merge1.stop_sequence == 30) & (merge1.shape_array_key == 'ffeb8a6113e0fdcd18e95257bb5be9cb')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c83a6fd6-6fcb-4831-add9-ea844e3959ff", + "metadata": {}, + "outputs": [], + "source": [ + "stc_test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5b3c303-f9e9-4914-976e-3a9a39aafe3b", + "metadata": {}, + "outputs": [], "source": [ - "#### Manipulate DF for charts" + "stc_test.groupby(['time_of_day']).agg({'stop_sequence':'count'})" ] }, { @@ -1701,14 +1522,6 @@ "title & (chart1.interactive() & chart2.interactive())" ] }, - { - "cell_type": "markdown", - "id": "9745f078-1999-4e5c-9ddc-5eba135f55ab", - "metadata": {}, - "source": [ - "### Draft" - ] - }, { "cell_type": "code", "execution_count": null, From e77b01d57d450600ad56959962b4ec10ae05a818 Mon Sep 17 00:00:00 2001 From: amandaha8 Date: Tue, 13 Jun 2023 22:19:22 +0000 Subject: [PATCH 2/9] changed function to take dask --- rt_segment_speeds/12_speeds.ipynb | 731 +++++++++++++++++++----------- 1 file changed, 471 insertions(+), 260 deletions(-) diff --git a/rt_segment_speeds/12_speeds.ipynb b/rt_segment_speeds/12_speeds.ipynb index e214d27a5..994769fea 100644 --- 
a/rt_segment_speeds/12_speeds.ipynb +++ b/rt_segment_speeds/12_speeds.ipynb @@ -7,15 +7,21 @@ "metadata": {}, "outputs": [], "source": [ - "import _speed_utils as speed_utils\n", "import datetime\n", + "\n", + "import _speed_utils as speed_utils\n", "import _threshold_utils as threshold_utils\n", "import altair as alt\n", "import dask.dataframe as dd\n", "import geopandas as gpd\n", "import pandas as pd\n", "from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs\n", - "from segment_speed_utils.project_vars import analysis_date,SEGMENT_GCS, COMPILED_CACHED_VIEWS, PROJECT_CRS\n", + "from segment_speed_utils.project_vars import (\n", + " COMPILED_CACHED_VIEWS,\n", + " PROJECT_CRS,\n", + " SEGMENT_GCS,\n", + " analysis_date,\n", + ")\n", "from shared_utils import calitp_color_palette as cp" ] }, @@ -149,12 +155,12 @@ "outputs": [], "source": [ "test_shapes = [\n", - " '000624bd8453dbe4f2eb2765b04bcb98',\n", - " '000cf9d06f53da9b54fdd44e6d5eff27',\n", - " '00255a44e09390ac3381f7184a18f0c1', \n", - " 'ffeb8a6113e0fdcd18e95257bb5be9cb',\n", - " 'fffa6a34b26647eae8aea4172c83eba1',\n", - " 'fffe83eb37f2bc4982a4cc10ef8cb2d9'\n", + " \"000624bd8453dbe4f2eb2765b04bcb98\",\n", + " \"000cf9d06f53da9b54fdd44e6d5eff27\",\n", + " \"00255a44e09390ac3381f7184a18f0c1\",\n", + " \"ffeb8a6113e0fdcd18e95257bb5be9cb\",\n", + " \"fffa6a34b26647eae8aea4172c83eba1\",\n", + " \"fffe83eb37f2bc4982a4cc10ef8cb2d9\",\n", "]" ] }, @@ -208,6 +214,130 @@ "few_routes.trip_id.nunique()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "bafa496a-0fdc-4b03-a4dd-f0c0e39c04de", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"few_routes_agg = (\n", + " few_routes.groupby([\"shape_array_key\", \"stop_sequence\"])[\"meters_elapsed\"]\n", + " .describe(percentiles=[0.05, 0.95])\n", + " .reset_index()\n", + " .add_prefix(\"meters_\")\n", + ")\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a981533-fa29-4f4e-b873-49be562798cb", 
+ "metadata": {}, + "outputs": [], + "source": [ + "#few_routes_aggdask = dd.from_pandas(few_routes_agg, npartitions=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6fda994-bed0-4140-a269-207c20e0cc47", + "metadata": {}, + "outputs": [], + "source": [ + "#few_routes_dask = dd.from_pandas(few_routes, npartitions=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2275352-1c4e-4a11-bd96-776f6801ec97", + "metadata": {}, + "outputs": [], + "source": [ + "#type(few_routes_dask), type(few_routes_aggdask)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffde85cd-ee59-433a-b92f-c757d79e80ec", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"dask_merge1 = dd.merge(\n", + " few_routes_dask,\n", + " few_routes_aggdask,\n", + " how=\"inner\",\n", + " left_on=[\"shape_array_key\", \"stop_sequence\"],\n", + " right_on=[\n", + " \"meters_shape_array_key\",\n", + " \"meters_stop_sequence\",\n", + " ],\n", + ")\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d61f004-233b-4f0c-a1cc-1923f02a99e9", + "metadata": {}, + "outputs": [], + "source": [ + "#dask_merge1.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53106a8f-414f-4710-97d5-4c36d953024b", + "metadata": {}, + "outputs": [], + "source": [ + "def percentile(row):\n", + "\n", + " if row[\"meters_elapsed\"] == row[\"meters_mean\"]:\n", + " return \"meters average\"\n", + " elif row[\"meters_elapsed\"] <= row[\"meters_5%\"]:\n", + " return \"meters low\"\n", + "\n", + " else:\n", + " return \"other\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1887f21-26f5-4ea6-a384-7c6b5519000f", + "metadata": {}, + "outputs": [], + "source": [ + "#dask_merge1[\"test\"] = dask_merge1.apply(\n", + "# lambda x: percentile(x), axis=1, meta=(\"test\", \"string\")\n", + "#)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"dac25f8e-5d36-4464-b475-bfc2504ff2c2", + "metadata": {}, + "outputs": [], + "source": [ + "#dask_merge1 = dask_merge1.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e1e478f-0190-40ec-a32d-783c87482b79", + "metadata": {}, + "outputs": [], + "source": [ + "#dask_merge1[[\"test\"]].head()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -215,18 +345,26 @@ "metadata": {}, "outputs": [], "source": [ - "def categorize_by_percentile(df, column_percentile: str, column_str: str):\n", + "def categorize_by_percentile(\n", + " df: pd.DataFrame, column_percentile: str, column_str: str\n", + ") -> dd.DataFrame:\n", "\n", + " # Find percentiles\n", " agg1 = (\n", " df.groupby([\"shape_array_key\", \"stop_sequence\"])[column_percentile]\n", " .describe(percentiles=[0.05, 0.95])\n", " .reset_index()\n", " .add_prefix(column_str)\n", " )\n", - " print('done grouping')\n", - " merge1 = pd.merge(\n", - " df,\n", - " agg1,\n", + "\n", + " # Convert to dask because it takes a very long time\n", + " agg1_dask = dd.from_pandas(agg1, npartitions=1)\n", + " df_dask = dd.from_pandas(df, npartitions=1)\n", + "\n", + " # Merge using dask\n", + " merge1_dask = dd.merge(\n", + " df_dask,\n", + " agg1_dask,\n", " how=\"inner\",\n", " left_on=[\"shape_array_key\", \"stop_sequence\"],\n", " right_on=[\n", @@ -234,7 +372,7 @@ " f\"{column_str}stop_sequence\",\n", " ],\n", " )\n", - " print('done merging')\n", + "\n", " def percentile(row):\n", "\n", " if row[column_percentile] == row[f\"{column_str}mean\"]:\n", @@ -243,7 +381,7 @@ " return f\"{column_str} elapsed low\"\n", " elif row[column_percentile] == 0:\n", " return f\"{column_str} elapsed is 0\"\n", - " elif (row[f\"{column_str}5%\"] < row[column_percentile] <= row[f\"{column_str}95%\"]):\n", + " elif row[f\"{column_str}5%\"] < row[column_percentile] <= row[f\"{column_str}95%\"]:\n", " return f\"{column_str} elapsed avg\"\n", "\n", " elif row[column_percentile] > row[f\"{column_str}95%\"]:\n", 
@@ -251,20 +389,79 @@ "\n", " else:\n", " return \"other\"\n", - " print('done tagging')\n", - " merge1[f\"{column_str}cat\"] = merge1.apply(lambda x: percentile(x), axis=1)\n", - " merge1[f\"{column_str}cat\"] = merge1[f\"{column_str}cat\"].str.replace('_','')\n", + "\n", + " merge1_dask[f\"{column_str}cat\"] = merge1_dask.apply(\n", + " lambda x: percentile(x), axis=1, meta=(f\"{column_str}cat\", \"string\")\n", + " )\n", + "\n", + " # Clean\n", + " merge1_dask[f\"{column_str}cat\"] = merge1_dask[f\"{column_str}cat\"].str.replace(\"_\", \"\")\n", + "\n", + " columns_to_keep = [\n", + " \"shape_array_key\",\n", + " \"gtfs_dataset_key\",\n", + " \"_gtfs_dataset_name\",\n", + " \"speed_mph\",\n", + " \"loop_or_inlining\",\n", + " \"stop_sequence\",\n", + " \"stop_id\",\n", + " \"trip_id\",\n", + " \"n_trips\",\n", + " \"p20_speed_mph\",\n", + " \"p80_speed_mph\",\n", + " \"time_of_day\",\n", + " \"median_speed_mph\",\n", + " \"meters_elapsed\",\n", + " \"sec_elapsed\",\n", + " f\"{column_str}5%\",\n", + " f\"{column_str}95%\",\n", + " f\"{column_str}cat\"\n", + " ]\n", + " merge1_dask = merge1_dask[columns_to_keep]\n", " print(f\"Done with {column_str}\")\n", - " return merge1" + " return merge1_dask" ] }, { - "cell_type": "markdown", - "id": "728cbf4b-30a8-4929-b653-1b9a7f615bc4", + "cell_type": "code", + "execution_count": null, + "id": "55eb1e43-bea2-4e6c-b665-513115107c25", + "metadata": {}, + "outputs": [], + "source": [ + "#ddf_meters = ddf_meters.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12014229-3cf9-4e44-a603-8dbaca1506e5", "metadata": {}, + "outputs": [], "source": [ - "#### To Do\n", - "Change to dask maybe?" 
+ "#ddf_sec = ddf_sec.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "963353ae-52ab-4ca4-bef5-b3649b6b74c3", + "metadata": {}, + "outputs": [], + "source": [ + "def flag(row):\n", + "\n", + " if (row[\"meters_elapsed\"] == 0) & (row[\"sec_elapsed\"] == 0):\n", + " return \"division by 0\"\n", + "\n", + " elif row[\"meters_cat\"] == \"meters elapsed low\":\n", + " return \"meters too low\"\n", + "\n", + " elif row[\"seconds_cat\"] == \"seconds elapsed high\":\n", + " return \"seconds too high\"\n", + "\n", + " else:\n", + " return \"ok\"" ] }, { @@ -276,14 +473,34 @@ "source": [ "def categorize_meters_speeds(df):\n", " start = datetime.datetime.now()\n", - " \n", + "\n", " print(f\"Begin: {start}\")\n", + "\n", + " # Find percentiles\n", " df.speed_mph = df.speed_mph.fillna(0)\n", - " df = categorize_by_percentile(df, \"meters_elapsed\", \"meters_\")\n", - " df = categorize_by_percentile(df, \"sec_elapsed\", \"seconds_\")\n", + "\n", + " # These are now dask dataframes\n", + " ddf_meters = categorize_by_percentile(df, \"meters_elapsed\", \"meters_\")\n", + " ddf_seconds = categorize_by_percentile(df, \"sec_elapsed\", \"seconds_\")\n", + "\n", + " merge_cols = ['shape_array_key', 'gtfs_dataset_key', '_gtfs_dataset_name',\n", + " 'speed_mph', 'loop_or_inlining', 'stop_sequence', 'stop_id', 'n_trips',\n", + " 'p20_speed_mph', 'p80_speed_mph', 'time_of_day', 'median_speed_mph',\n", + " 'meters_elapsed', 'sec_elapsed',\"trip_id\",]\n", + " \n", + " # Merge using dask\n", + " m1 = dd.merge(\n", + " ddf_meters,\n", + " ddf_seconds,\n", + " how=\"inner\",\n", + " on=merge_cols)\n", + "\n", + " # Apply flags\n", + " m1[\"flag\"] = m1.apply(lambda x: flag(x), axis=1, meta=(\"flag\", \"string\"))\n", + "\n", " end = datetime.datetime.now()\n", " print(f\"Finish: {end-start}\")\n", - " return df" + " return m1" ] }, { @@ -301,11 +518,11 @@ { "cell_type": "code", "execution_count": null, - "id": "fa44d793-ba32-40be-975b-cbe213a35392", + "id": 
"fdde9f12-a650-41ef-9bf6-a83204647f30", "metadata": {}, "outputs": [], "source": [ - "few_routes_cat.head()" + "len(few_routes_cat)" ] }, { @@ -315,403 +532,469 @@ "metadata": {}, "outputs": [], "source": [ - "few_routes_cat.groupby([ \"meters_cat\", \"seconds_cat\",]).agg(\n", + "few_routes_cat.groupby([\"flag\",\"meters_cat\", \"seconds_cat\",]).agg(\n", " {\"trip_id\": \"count\"}\n", - ").reset_index().sort_values([\"trip_id\"], ascending=False)" + ").reset_index().sort_values([\"trip_id\"], ascending=False).compute()" ] }, { "cell_type": "code", "execution_count": null, - "id": "963353ae-52ab-4ca4-bef5-b3649b6b74c3", + "id": "afdb5cf4-cc61-454f-bdb5-8261e91bedf3", "metadata": {}, "outputs": [], "source": [ - "def flag(row):\n", - " \n", - " if (row[\"meters_elapsed\"] == 0) & (row[\"sec_elapsed\"] == 0):\n", - " return \"division by 0\"\n", - " \n", - " elif (row[\"meters_cat\"] == \"meters elapsed low\"):\n", - " return \"meters too low\"\n", - "\n", - " elif (row[\"seconds_cat\"] == \"seconds elapsed high\"):\n", - " return \"seconds too high\"\n", - " \n", - " else:\n", - " return \"ok\"" + "subset = [\n", + " \"unusual_flag\",\n", + " \"shape_array_key\",\n", + " \"stop_sequence\",\n", + " \"stop_id\",\n", + " \"median_speed_mph\",\n", + " \"p20_speed_mph\",\n", + " \"p80_speed_mph\",\n", + " \"_gtfs_dataset_name\",\n", + " \"trip_id\",\n", + " \"meters_elapsed\",\n", + " \"sec_elapsed\",\n", + " \"speed_mph\",\n", + " \"meters_5%\",\n", + " \"meters_50%\",\n", + " \"meters_95%\",\n", + " \"meters_cat\",\n", + " \"seconds_5%\",\n", + " \"seconds_50%\",\n", + " \"seconds_95%\",\n", + " \"seconds_cat\",\n", + "]" ] }, { "cell_type": "code", "execution_count": null, - "id": "e9994532-e658-480e-b009-8ad7ef6392b5", + "id": "bdd2b274-87a7-4306-8494-f65416ac88fb", "metadata": {}, "outputs": [], "source": [ - "few_routes_cat[\"unusual_flag\"] = few_routes_cat.apply(lambda x: flag(x), axis=1)" + "high_low_zero = few_routes_cat[few_routes_cat.flag != 
\"ok\"].reset_index()" ] }, { "cell_type": "code", "execution_count": null, - "id": "d7340f9e-62ae-4684-9415-a8d9189fb3f9", + "id": "5b2a9d41-925b-442f-a851-a354f58eb127", "metadata": {}, "outputs": [], "source": [ - "few_routes_cat.unusual_flag.value_counts() / len(few_routes_cat) * 100" + "high_low_zero = high_low_zero.compute()" ] }, { "cell_type": "code", "execution_count": null, - "id": "40c0937e-de60-4508-a2aa-056209649c4b", + "id": "798b9bfe-ca16-4453-8ca5-7a06f449e38e", "metadata": {}, "outputs": [], "source": [ - "few_routes_cat.unusual_flag.value_counts()" + "high_low_zero.trip_id.nunique()" ] }, { "cell_type": "code", "execution_count": null, - "id": "afdb5cf4-cc61-454f-bdb5-8261e91bedf3", + "id": "54a2b149-c5db-4917-8232-2ecf425fddc7", "metadata": {}, "outputs": [], "source": [ - "subset=['unusual_flag','shape_array_key', 'stop_sequence', 'stop_id',\n", - " 'median_speed_mph','p20_speed_mph',\n", - " 'p80_speed_mph', '_gtfs_dataset_name', 'trip_id',\n", - " 'meters_elapsed',\n", - " 'sec_elapsed', 'speed_mph', 'meters_5%', 'meters_50%', 'meters_95%', \n", - " 'meters_cat',\n", - " 'seconds_5%', 'seconds_50%', 'seconds_95%',\n", - " 'seconds_cat', ]" + "few_routes_cat = few_routes_cat.compute()" ] }, { "cell_type": "code", "execution_count": null, - "id": "9978f816-a208-4dac-9eb8-154ba9e58d6b", - "metadata": { - "scrolled": true, - "tags": [] - }, + "id": "c1fbe857-c760-4b22-8265-638326a36890", + "metadata": {}, "outputs": [], "source": [ - "div_zero = few_routes_cat[few_routes_cat.unusual_flag == \"division by 0\"]" + "few_routes_cat.trip_id.nunique()" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "252db83a-4633-42c5-acfd-83e535b4bacc", + "cell_type": "markdown", + "id": "8d16b36f-4ace-4ad5-beb6-fd0f5d98e56e", "metadata": {}, - "outputs": [], "source": [ - "div_zero[subset].sample(5)" + "### Visualize \n", + "#### One" ] }, { "cell_type": "code", "execution_count": null, - "id": "21432512-a578-4541-bd5c-132e3985e062", + "id": 
"c0fbb61d-586d-4513-99c0-793e7105ac2a", "metadata": {}, "outputs": [], "source": [ - "stc_3 = few_routes_cat[\n", - " (few_routes_cat.stop_sequence == 3)\n", - " & (\n", - " few_routes_cat._gtfs_dataset_name\n", - " == \"Bay Area 511 Santa Clara Transit VehiclePositions\"\n", - " )\n", - "]" + "equal_sampling = high_low_zero.groupby('flag').apply(lambda x: x.sample(n=2)).reset_index(drop = True)" ] }, { "cell_type": "code", "execution_count": null, - "id": "dd744395-e5f3-43d6-94af-8866b2be959d", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "stc_3.loc[stc_3.unusual_flag != 'ok'][subset]" - ] - }, - { - "cell_type": "markdown", - "id": "e01d88e4-a3e2-43f1-9591-afe83e8a92cf", + "id": "34840f61-1857-4f5c-b3a6-abcac18af0db", "metadata": {}, + "outputs": [], "source": [ - "#### Summarize" + "trips = list(equal_sampling.trip_id.unique())" ] }, { "cell_type": "code", "execution_count": null, - "id": "bdd2b274-87a7-4306-8494-f65416ac88fb", + "id": "759ac80e-9c98-4c94-add3-859ab998e038", "metadata": {}, "outputs": [], "source": [ - "high_low_zero = few_routes_cat[\n", - " few_routes_cat.unusual_flag != \"ok\"].reset_index()" + "stops = list(equal_sampling.stop_id.unique())" ] }, { "cell_type": "code", "execution_count": null, - "id": "ebea8b6f-a011-4996-be52-9e10ec1f8342", + "id": "94620388-876e-4e6d-80ab-ac68eb1061c9", "metadata": {}, "outputs": [], "source": [ - "few_routes_cat.shape" + "# Plot some of the trips\n", + "sample_data = few_routes_cat[few_routes_cat.trip_id.isin(trips)].reset_index()" ] }, { "cell_type": "code", "execution_count": null, - "id": "987d3500-7594-4d12-a9d0-45d2c19999d9", + "id": "417804f1-9e24-4ac6-9c96-95f4da8f9693", "metadata": {}, "outputs": [], "source": [ - "high_low_zero.shape" + "sample_data.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "630e745f-dd71-4cc2-bad6-e17c666900a8", + "id": "339a67c7-3634-43d4-b865-b10119507c44", "metadata": {}, "outputs": [], "source": [ - 
"len(high_low_zero.drop_duplicates())" + "# sample_data2 = sample_data[['shape_array_key','gtfs_dataset_key','trip_id']]" ] }, { "cell_type": "code", "execution_count": null, - "id": "daa82644-dc46-409a-83ab-a86ea996c356", + "id": "39231b51-f0ab-4bbe-80d0-8a2081849e50", "metadata": {}, "outputs": [], "source": [ - "len(few_routes_cat)-len(high_low_zero)" + "plotting = sample_data.melt(\n", + " id_vars=[\n", + " \"_gtfs_dataset_name\",\n", + " \"shape_array_key\",\n", + " \"trip_id\",\n", + " \"stop_sequence\",\n", + " \"gtfs_dataset_key\",\n", + " \"loop_or_inlining\",\n", + " \"n_trips\",\n", + " \"meters_elapsed\",\n", + " \"meters_cat\",\n", + " \"seconds_cat\",\n", + " \"sec_elapsed\",\n", + " \"flag\",\n", + " 'p20_speed_mph', 'p80_speed_mph',\n", + " 'median_speed_mph',\n", + " ],\n", + " value_vars=[\"speed_mph\"],\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "229c6d49-58e1-45bd-839f-b03f7e8cdd4a", + "id": "959bbe35-8c21-4795-b418-745a0caa94e8", "metadata": {}, "outputs": [], "source": [ - "def summarize(original: pd.DataFrame, filtered_unusual_stops: pd.DataFrame):\n", - "\n", - " subset = [\"_gtfs_dataset_name\", \"gtfs_dataset_key\", \"shape_array_key\"]\n", - "\n", - " def aggregate(df, total_trip_column_name: str):\n", - " agg = (\n", - " df.groupby(subset)\n", - " .agg({\"stop_sequence\": \"count\"})\n", - " .reset_index()\n", - " .rename(columns={\"stop_sequence\": total_trip_column_name})\n", - " )\n", - "\n", - " return agg\n", - "\n", - " total_unusual_stops = aggregate(filtered_unusual_stops, \"total_unusual_stops\")\n", - " total_stops = aggregate(original, \"total_stops\")\n", - "\n", - " # Merge them\n", - " merge1 = pd.merge(total_unusual_stops, total_stops, on=subset, how=\"inner\")\n", - "\n", - " # Add some columns\n", - " merge1[\"percent_of_unusual_stops\"] = ((merge1.total_unusual_stops / merge1.total_stops) * 100).astype(int)\n", - " \n", - " merge1[\"Percentage of Unusual Stops\"] = \"% of Unusual Stops: \" + 
merge1.percent_of_unusual_stops.astype(str)\n", - "\n", - " # Add dropdown menu\n", - " #merge1[\"Dropdown Menu\"] = merge1._gtfs_dataset_name + \" \" + merge1.shape_array_key\n", - "\n", - " # Clean\n", - " merge1 = merge1.sort_values(['percent_of_unusual_stops'], ascending = False)\n", - " merge1 = merge1.drop(columns = [\"percent_of_unusual_stops\"])\n", - " \n", - " merge1 = threshold_utils.pre_clean(merge1)\n", - " return merge1" + "# Clean\n", + "plotting = threshold_utils.pre_clean(plotting)" ] }, { "cell_type": "code", "execution_count": null, - "id": "dad7c9b8-9025-4d91-bf75-0e23c3ac2a52", + "id": "dd343cdd-acf3-4433-a702-8c3eb53ba1f0", "metadata": {}, "outputs": [], "source": [ - "summarize(few_routes_cat, high_low_zero)" + "plotting[\"Dropdown Menu\"] = (\n", + " plotting[\"Gtfs Dataset Name\"] + \" \" + plotting[\"Trip Id\"]\n", + ")" ] }, { - "cell_type": "markdown", - "id": "d0282d6c-a580-4776-b94a-995bd7052f37", + "cell_type": "code", + "execution_count": null, + "id": "072f39d9-4bf9-4efc-a393-24f096cecf7e", "metadata": {}, + "outputs": [], "source": [ - "### Visualize " + "def alt_dropdown(df, col_for_dropdown: str, dropdown_menu_title: str):\n", + " # Create dropdown menu\n", + " # Exclude \"none\" operators which are only scheduled data\n", + " df = df.loc[df[col_for_dropdown] != \"None\"][[col_for_dropdown]]\n", + " dropdown_list = df[col_for_dropdown].unique().tolist()\n", + "\n", + " # Show only first operator by default\n", + " initialize_first_op = sorted(dropdown_list)[0]\n", + " input_dropdown = alt.binding_select(\n", + " options=sorted(dropdown_list), name=dropdown_menu_title\n", + " )\n", + "\n", + " selection = alt.selection_single(\n", + " name=dropdown_menu_title,\n", + " fields=[col_for_dropdown],\n", + " bind=input_dropdown,\n", + " init={col_for_dropdown: initialize_first_op},\n", + " )\n", + "\n", + " return selection" ] }, { "cell_type": "code", "execution_count": null, - "id": "4bfd50d8-37fb-4460-b022-78711ee84a11", + "id": 
"6b919e59-03e8-426f-b9b3-4468d5b1b06b", "metadata": {}, "outputs": [], "source": [ - "high_low_zero_sample = high_low_zero.sample(40)" + "selection_test = alt_dropdown(plotting, \"Dropdown Menu\", \"Route\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "5884edc6-c507-490d-95dd-352b26048719", + "id": "d7b9d14f-b708-4426-a1b8-4afb4e7c95ba", "metadata": {}, "outputs": [], "source": [ - "high_low_zero_sample.unusual_flag.value_counts()" + "(\n", + " threshold_utils.chart_size(\n", + " alt.Chart(plotting)\n", + " .mark_tick(\n", + " size=15,\n", + " thickness=5,\n", + " )\n", + " .encode(\n", + " x=\"Stop Sequence:N\",\n", + " y=\"Value:Q\",\n", + " color=alt.Color(\n", + " \"Flag:N\", scale=alt.Scale(range=cp.CALITP_CATEGORY_BOLD_COLORS)\n", + " ),\n", + " tooltip=plotting.columns.tolist(),\n", + " )\n", + " .interactive(),\n", + " 1100,\n", + " 400,\n", + " )\n", + " .add_selection(selection_test)\n", + " .transform_filter(selection_test)\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "50182e58-f92e-48e7-9ae3-006e400465b5", - "metadata": { - "scrolled": true, - "tags": [] - }, + "id": "7ec90906-3364-440c-a951-eb5f2b1e84c2", + "metadata": {}, "outputs": [], "source": [ - "high_low_zero_sample[subset]" + "stop" ] }, { "cell_type": "code", "execution_count": null, - "id": "88e3c94c-9b5c-4008-bbe7-8be4979b1dba", + "id": "a1aba28a-92f5-417c-a0d3-e31bf80e3a7d", "metadata": {}, "outputs": [], "source": [ - "trips = list(high_low_zero_sample.trip_id.unique())" + "vehicle_positions = gpd.read_parquet(\n", + " \"gs://calitp-analytics-data/data-analyses/rt_segment_speeds/vp_2023-04-12.parquet\"\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "b7d48690-de9f-45d7-af31-410b376a6d7e", + "id": "140112e0-8ee0-481d-848c-4db091460418", "metadata": {}, "outputs": [], "source": [ - "stops = list(high_low_zero_sample.stop_id.unique())" + "type(vehicle_positions)" ] }, { "cell_type": "code", "execution_count": null, - "id": 
"a1aba28a-92f5-417c-a0d3-e31bf80e3a7d", + "id": "d843325a-f33c-40de-a3cb-befed24d645e", "metadata": {}, "outputs": [], "source": [ - "vehicle_positions = gpd.read_parquet(\"gs://calitp-analytics-data/data-analyses/rt_segment_speeds/vp_2023-04-12.parquet\")" + "vehicle_positions2 = vehicle_positions[\n", + " vehicle_positions.trip_id.isin(trips)\n", + "].reset_index()" ] }, { "cell_type": "code", "execution_count": null, - "id": "140112e0-8ee0-481d-848c-4db091460418", + "id": "8840809b-dd6f-4c0e-a68b-0a37f508df14", "metadata": {}, "outputs": [], "source": [ - "type(vehicle_positions)" + "vehicle_positions2.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "6a9f8a29-b004-474c-8e5d-cd5628d3337c", + "id": "896b5c73-0835-4ee7-a4f2-9d960778fe35", "metadata": {}, "outputs": [], "source": [ - "vehicle_positions.shape" + "gdf1 = pd.merge(\n", + " vehicle_positions2,\n", + " sample_data,\n", + " how=\"inner\",\n", + " on=[\"gtfs_dataset_key\", \"_gtfs_dataset_name\", \"trip_id\"],\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "d843325a-f33c-40de-a3cb-befed24d645e", + "id": "3725d07d-7b9f-483b-aa56-8d428b9f3d11", "metadata": {}, "outputs": [], "source": [ - "vehicle_positions2 = vehicle_positions[vehicle_positions.trip_id.isin(trips)].reset_index()" + "gdf1.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "8840809b-dd6f-4c0e-a68b-0a37f508df14", + "id": "c82d7834-567b-4de4-b465-aea8c1a62715", "metadata": {}, "outputs": [], "source": [ - "vehicle_positions2.shape" + "gdf1 = gdf1[gdf1.stop_id.isin(stops)]" ] }, { "cell_type": "code", "execution_count": null, - "id": "fa6e826a-7aaf-4830-9f8e-7b26be84c149", + "id": "a86cec8e-e7fd-48ce-9692-c766df2b68e8", "metadata": {}, "outputs": [], "source": [ - "vehicle_positions2.sample()" + "gdf1.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "896b5c73-0835-4ee7-a4f2-9d960778fe35", - "metadata": {}, + "id": "3bf3c599-2aec-4ae7-b8c8-53a4eff8795a", + "metadata": 
{ + "tags": [] + }, "outputs": [], "source": [ - "gdf1 = pd.merge(vehicle_positions2, high_low_zero_sample, how = \"inner\", on = ['gtfs_dataset_key','_gtfs_dataset_name','trip_id'])" + "gdf1[\n", + " [\n", + " \"geometry\",\n", + " \"stop_id\",\n", + " \"stop_sequence\",\n", + " \"_gtfs_dataset_name\",\n", + " \"shape_array_key\",\n", + " \"speed_mph\",\n", + " \"flag\",\n", + " ]\n", + "].explore(\"flag\")" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "c82d7834-567b-4de4-b465-aea8c1a62715", - "metadata": {}, - "outputs": [], + "cell_type": "markdown", + "id": "e01d88e4-a3e2-43f1-9591-afe83e8a92cf", + "metadata": { + "tags": [] + }, "source": [ - "gdf1 = gdf1[gdf1.stop_id.isin(stops)]" + "#### Summarize" ] }, { "cell_type": "code", "execution_count": null, - "id": "a86cec8e-e7fd-48ce-9692-c766df2b68e8", + "id": "229c6d49-58e1-45bd-839f-b03f7e8cdd4a", "metadata": {}, "outputs": [], "source": [ - "gdf1.columns" + "def summarize(original: pd.DataFrame, filtered_unusual_stops: pd.DataFrame):\n", + "\n", + " subset = [\"_gtfs_dataset_name\", \"gtfs_dataset_key\", \"shape_array_key\"]\n", + "\n", + " def aggregate(df, total_trip_column_name: str):\n", + " agg = (\n", + " df.groupby(subset)\n", + " .agg({\"stop_sequence\": \"count\"})\n", + " .reset_index()\n", + " .rename(columns={\"stop_sequence\": total_trip_column_name})\n", + " )\n", + "\n", + " return agg\n", + "\n", + " total_unusual_stops = aggregate(filtered_unusual_stops, \"total_unusual_stops\")\n", + " total_stops = aggregate(original, \"total_stops\")\n", + "\n", + " # Merge them\n", + " merge1 = pd.merge(total_unusual_stops, total_stops, on=subset, how=\"inner\")\n", + "\n", + " # Add some columns\n", + " merge1[\"percent_of_unusual_stops\"] = (\n", + " (merge1.total_unusual_stops / merge1.total_stops) * 100\n", + " ).astype(int)\n", + "\n", + " merge1[\n", + " \"Percentage of Unusual Stops\"\n", + " ] = \"% of Unusual Stops: \" + merge1.percent_of_unusual_stops.astype(str)\n", + "\n", + 
" # Add dropdown menu\n", + " # merge1[\"Dropdown Menu\"] = merge1._gtfs_dataset_name + \" \" + merge1.shape_array_key\n", + "\n", + " # Clean\n", + " merge1 = merge1.sort_values([\"percent_of_unusual_stops\"], ascending=False)\n", + " merge1 = merge1.drop(columns=[\"percent_of_unusual_stops\"])\n", + "\n", + " merge1 = threshold_utils.pre_clean(merge1)\n", + " return merge1" ] }, { "cell_type": "code", "execution_count": null, - "id": "3bf3c599-2aec-4ae7-b8c8-53a4eff8795a", + "id": "dad7c9b8-9025-4d91-bf75-0e23c3ac2a52", "metadata": {}, "outputs": [], "source": [ - "gdf1[['geometry','stop_id','stop_sequence','_gtfs_dataset_name','shape_array_key','speed_mph']].explore('stop_sequence')" + "summarize(few_routes_cat, high_low_zero)" ] }, { @@ -837,35 +1120,6 @@ "].astype(str)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "072f39d9-4bf9-4efc-a393-24f096cecf7e", - "metadata": {}, - "outputs": [], - "source": [ - "def alt_dropdown(df, col_for_dropdown: str, dropdown_menu_title: str):\n", - " # Create dropdown menu\n", - " # Exclude \"none\" operators which are only scheduled data\n", - " df = df.loc[df[col_for_dropdown] != \"None\"][[col_for_dropdown]]\n", - " dropdown_list = df[col_for_dropdown].unique().tolist()\n", - "\n", - " # Show only first operator by default\n", - " initialize_first_op = sorted(dropdown_list)[0]\n", - " input_dropdown = alt.binding_select(\n", - " options=sorted(dropdown_list), name=dropdown_menu_title\n", - " )\n", - "\n", - " selection = alt.selection_single(\n", - " name=dropdown_menu_title,\n", - " fields=[col_for_dropdown],\n", - " bind=input_dropdown,\n", - " init={col_for_dropdown: initialize_first_op},\n", - " )\n", - "\n", - " return selection" - ] - }, { "cell_type": "code", "execution_count": null, @@ -994,7 +1248,10 @@ }, "outputs": [], "source": [ - "stc_test = merge1[(merge1.stop_sequence == 30) & (merge1.shape_array_key == 'ffeb8a6113e0fdcd18e95257bb5be9cb')]" + "stc_test = merge1[\n", + " 
(merge1.stop_sequence == 30)\n", + " & (merge1.shape_array_key == \"ffeb8a6113e0fdcd18e95257bb5be9cb\")\n", + "]" ] }, { @@ -1014,7 +1271,7 @@ "metadata": {}, "outputs": [], "source": [ - "stc_test.groupby(['time_of_day']).agg({'stop_sequence':'count'})" + "stc_test.groupby([\"time_of_day\"]).agg({\"stop_sequence\": \"count\"})" ] }, { @@ -1337,22 +1594,6 @@ "title = title.add_selection(selection_test).transform_filter(selection_test)" ] }, - { - "cell_type": "markdown", - "id": "c4904d76-f74e-4fd5-84e1-cd3c4476d010", - "metadata": {}, - "source": [ - "#### Scatterplot" - ] - }, - { - "cell_type": "markdown", - "id": "9c8ae725-71da-4c21-a575-969d14a0aa17", - "metadata": {}, - "source": [ - "#### Jitter" - ] - }, { "cell_type": "code", "execution_count": null, @@ -2268,16 +2509,6 @@ "chart5" ] }, - { - "cell_type": "markdown", - "id": "c911a105-147a-4b8a-a741-5b67a8cf710a", - "metadata": { - "tags": [] - }, - "source": [ - "#### Look at one trip" - ] - }, { "cell_type": "code", "execution_count": null, @@ -2288,26 +2519,6 @@ "# foothill_og = speed_stops2[speed_stops2.trip_id == \"t604-b2791-sl5\"]" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "173615fa-c942-4d85-9ca8-64852b706d1f", - "metadata": {}, - "outputs": [], - "source": [ - "# len(foothill_og)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28521af6-c34e-4b20-9831-3177722b9b46", - "metadata": {}, - "outputs": [], - "source": [ - "# foothill_og.stop_sequence.nunique()" - ] - }, { "cell_type": "code", "execution_count": null, From f4ce07d7cdc36ccf5a16a7cd83f54570d027c709 Mon Sep 17 00:00:00 2001 From: amandaha8 Date: Wed, 21 Jun 2023 20:59:00 +0000 Subject: [PATCH 3/9] find 5% and 95% for all routes, stage3 --- rt_segment_speeds/12_speeds.ipynb | 2732 +++++++++-------------------- 1 file changed, 857 insertions(+), 1875 deletions(-) diff --git a/rt_segment_speeds/12_speeds.ipynb b/rt_segment_speeds/12_speeds.ipynb index 994769fea..ac9036789 100644 --- 
a/rt_segment_speeds/12_speeds.ipynb +++ b/rt_segment_speeds/12_speeds.ipynb @@ -9,6 +9,9 @@ "source": [ "import datetime\n", "\n", + "import gcsfs\n", + "\n", + "fs = gcsfs.GCSFileSystem()\n", "import _speed_utils as speed_utils\n", "import _threshold_utils as threshold_utils\n", "import altair as alt\n", @@ -21,6 +24,7 @@ " PROJECT_CRS,\n", " SEGMENT_GCS,\n", " analysis_date,\n", + " CONFIG_PATH\n", ")\n", "from shared_utils import calitp_color_palette as cp" ] @@ -45,7 +49,7 @@ "metadata": {}, "outputs": [], "source": [ - "alt.data_transformers.disable_max_rows()" + "# alt.data_transformers.disable_max_rows()" ] }, { @@ -59,2589 +63,1567 @@ { "cell_type": "code", "execution_count": null, - "id": "701ea573-bac3-453a-ae63-0e6f8ccf0033", + "id": "2f0c5f4f-f419-42a8-8527-7060ed412092", "metadata": {}, "outputs": [], "source": [ - "analysis_date" + "def merge_all_speeds(analysis_date:str) -> pd.DataFrame:\n", + " \"\"\"\n", + " Merge avg_speeds_stop_segments and\n", + " speed_stops parquets.\n", + " \n", + " Args:\n", + " date: analysis date\n", + " \"\"\"\n", + " # Open up avg speeds\n", + " avg_speeds = pd.read_parquet(f\"{speed_utils.GCS_PATH}avg_speeds_stop_segments_{analysis_date}.parquet\")\n", + " avg_speeds = avg_speeds.drop(columns=[\"geometry\", \"geometry_arrowized\", \"district\", \"district_name\"])\n", + " # Filter for all day flags\n", + " avg_speeds = avg_speeds[avg_speeds.time_of_day == 'all_day'].reset_index(drop = True)\n", + " \n", + " # Open up speeds\n", + " speeds = pd.read_parquet(f\"{speed_utils.GCS_PATH}speeds_stop_segments_{analysis_date}\")\n", + " \n", + " merge_cols = ['gtfs_dataset_key','shape_array_key', 'stop_sequence']\n", + " m1 = pd.merge(avg_speeds, speeds, on = merge_cols, how = 'inner')\n", + " \n", + " m1 = m1.drop_duplicates().reset_index(drop = True)\n", + " \n", + " return m1" ] }, { "cell_type": "code", "execution_count": null, - "id": "c3ab6f3f-2982-4466-aa76-06c7a235c62e", + "id": "84ac97bf-ee4f-4d85-b523-8a36823f9d9a", 
"metadata": {}, "outputs": [], "source": [ - "avg_speeds = pd.read_parquet(\n", - " f\"{speed_utils.GCS_PATH}avg_speeds_stop_segments_{analysis_date}.parquet\"\n", - ").drop(columns=[\"geometry\", \"geometry_arrowized\", \"district\", \"district_name\"])" + "# m1 = merge_all_speeds(analysis_date)" ] }, { "cell_type": "code", "execution_count": null, - "id": "8185a464-5ca2-43bb-89f6-062ee01b5e2d", + "id": "68950ae7-4061-47d6-ac48-5eac0b1f29c0", "metadata": {}, "outputs": [], "source": [ - "speeds = pd.read_parquet(f\"{speed_utils.GCS_PATH}speeds_stop_segments_{analysis_date}\")" + "# m1.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "01e1b2ad-edb7-4021-825f-612f070db139", + "id": "b04dfb8b-7476-49df-873a-cea75dc61763", "metadata": {}, "outputs": [], "source": [ - "avg_speeds.sample()" + "\n", + "# Picked 4 random routes\n", + "sample_0_keys = [\n", + " \"0fb4f3627996269dc7075276d3b69e36\",\n", + " \"07c9a47264a43d8d0d16ef7109e8fd68\",\n", + " \"106d979b9a9e6338827a8e1c145e69fd\",\n", + " \"000624bd8453dbe4f2eb2765b04bcb98\",\n", + "]" ] }, { "cell_type": "code", "execution_count": null, - "id": "4623eaa7-4594-4155-859d-af997094c3de", + "id": "7201d5e3-f765-4e5d-9bbd-aa6a336bcc0a", "metadata": {}, "outputs": [], "source": [ - "speeds.sample()" + "subset = m1[m1.shape_array_key.isin(sample_0_keys)].reset_index()" ] }, { "cell_type": "code", "execution_count": null, - "id": "bbc7804a-550a-40fa-a25f-7694b057c9b7", + "id": "6239a1c1-adc8-47b6-b6ee-b8b3f55d2257", "metadata": {}, "outputs": [], "source": [ - "merge_cols = [\"gtfs_dataset_key\", \"shape_array_key\", \"stop_sequence\"]\n", - "merge1 = pd.merge(avg_speeds, speeds, on=merge_cols, how=\"inner\")" + "def rate_column(df: pd.DataFrame, column_percentile:str, column_str:str): \n", + " #Get percentiles in objects for total vehicle.\n", + " p5 = df[column_percentile].quantile(0.05).astype(float)\n", + " p95 = df[column_percentile].quantile(0.95).astype(float)\n", + " \n", + " #Function for fleet 
size\n", + " def rate(row):\n", + " if ((row[column_percentile] >= 0) and (row[column_percentile] <= p5)):\n", + " return f\"{column_str} is low\"\n", + " elif (row[column_percentile] >= p95):\n", + " return f\"{column_str} is high\"\n", + " else:\n", + " return \"Ok\"\n", + " df[f\"{column_str}cat\"] = df.apply(lambda x: rate(x), axis=1)\n", + " \n", + " return df " ] }, { - "cell_type": "code", - "execution_count": null, - "id": "34c64e59-0379-4edf-a87f-ec621c0b668b", + "cell_type": "markdown", + "id": "898e3546-5298-4c4f-87d0-ee1d1a10f07d", "metadata": {}, - "outputs": [], "source": [ - "merge1.sample()" + "### Categorize" ] }, { "cell_type": "code", "execution_count": null, - "id": "e510e28b-a179-41d7-b738-e50edb26d878", + "id": "e81e59fd-cc2f-408e-9148-1a1055425fc4", "metadata": {}, "outputs": [], "source": [ - "merge1.shape" - ] - }, - { - "cell_type": "markdown", - "id": "801c89ce-0e7f-4758-a38a-201cc843ef28", - "metadata": {}, - "source": [ - "#### A few routes" + "def categorize_by_percentile_pandas(\n", + " df: pd.DataFrame, column_percentile: str, column_str: str\n", + ") -> pd.DataFrame:\n", + "\n", + " # Find percentiles\n", + " #Get percentiles in objects for total vehicle.\n", + " p5 = df[column_percentile].quantile(0.05).astype(float)\n", + " p95 = df[column_percentile].quantile(0.95).astype(float)\n", + " \n", + " def rate(row):\n", + " if ((row[column_percentile] >= 0) and (row[column_percentile] <= p5)):\n", + " return f\"{column_str} is low\"\n", + " elif (row[column_percentile] >= p95):\n", + " return f\"{column_str} is high\"\n", + " else:\n", + " return f\"{column_str} is avg\"\n", + " \n", + " # Apply flags\n", + " df[f\"{column_str}cat\"] = df.apply(lambda x: rate(x), axis=1)\n", + " \n", + " # Clean\n", + " df[f\"{column_str}cat\"] = df[f\"{column_str}cat\"].str.replace(\"_\", \"\")\n", + "\n", + " columns_to_keep = [\n", + " \"shape_array_key\",\n", + " \"gtfs_dataset_key\",\n", + " \"_gtfs_dataset_name\",\n", + " \"speed_mph\",\n", + " 
\"loop_or_inlining\",\n", + " \"stop_sequence\",\n", + " \"stop_id\",\n", + " \"trip_id\",\n", + " \"n_trips\",\n", + " \"p20_mph\",\n", + " \"p80_mph\",\n", + " \"p50_mph\",\n", + " \"time_of_day\",\n", + " \"meters_elapsed\",\n", + " \"sec_elapsed\",\n", + " f\"{column_str}cat\",\n", + " ]\n", + "\n", + " # df = df[columns_to_keep]\n", + " print(f\"Done with {column_str}\")\n", + " \n", + " return df " ] }, { "cell_type": "code", "execution_count": null, - "id": "8ca57e6f-fb38-4381-ac83-cb9dc9fabdf4", + "id": "0dfb836d-f919-4f2b-a0d1-9e4a4713ba8a", "metadata": {}, "outputs": [], "source": [ - "test_shapes = [\n", - " \"000624bd8453dbe4f2eb2765b04bcb98\",\n", - " \"000cf9d06f53da9b54fdd44e6d5eff27\",\n", - " \"00255a44e09390ac3381f7184a18f0c1\",\n", - " \"ffeb8a6113e0fdcd18e95257bb5be9cb\",\n", - " \"fffa6a34b26647eae8aea4172c83eba1\",\n", - " \"fffe83eb37f2bc4982a4cc10ef8cb2d9\",\n", - "]" + "# df1 = categorize_by_percentile_pandas(subset, \"meters_elapsed\", \"meters_\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "6858f9a8-2136-4aab-a099-25907b6ef7ef", + "id": "9f84205d-93db-49f3-be99-6b5014f7faeb", "metadata": {}, "outputs": [], "source": [ - "few_routes = merge1.loc[merge1.shape_array_key.isin(test_shapes)].reset_index(drop=True)" + "# df1.head()" ] }, { "cell_type": "code", "execution_count": null, - "id": "7dae3fdc-d8c0-4e50-8e11-5af4804c9a76", + "id": "b0d2184f-8a44-4489-a1b4-2be8317142f1", "metadata": {}, "outputs": [], "source": [ - "# few_routes = merge1.copy()" + "# df2 = categorize_by_percentile_pandas(df1, \"sec_elapsed\", \"sec_\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "a9cb974d-08fe-49a6-ba7f-f29fd53c771e", + "id": "940fb010-0dff-465e-bf8d-87dd3f4ba101", "metadata": {}, "outputs": [], "source": [ - "few_routes.shape_array_key.nunique()" + "# df2.head()" ] }, { "cell_type": "code", "execution_count": null, - "id": "602983ba-8f2e-41cb-9eb8-96e8ede9f58a", + "id": "9d38d541-5c9c-4d31-8986-9c3928eb2f59", 
"metadata": {}, "outputs": [], "source": [ - "few_routes.shape" + "def categorize_meters_speeds_pandas(df):\n", + " start = datetime.datetime.now()\n", + " print(start)\n", + " \n", + " #df = merge_all_speeds(analysis_date)\n", + " \n", + " # Categorize\n", + " df1 = categorize_by_percentile_pandas(df, \"meters_elapsed\", \"meters_\")\n", + " df2 = categorize_by_percentile_pandas(df1, \"sec_elapsed\", \"sec_\")\n", + " \n", + " # Find size of categories\n", + " print(df2.groupby(['sec_cat','meters_cat']).size())\n", + "\n", + " # Filter out \n", + " df2 = df2[(df2.meters_cat == 'meters is low') | (df2.sec_cat == 'sec is high')].reset_index(drop = True)\n", + " print(f\"{len(df2)} rows after filtering for rows with either high seconds OR low meters\") \n", + " \n", + " def flag_round(row):\n", + " if (row[\"meters_elapsed\"] == 0) & (row[\"sec_elapsed\"] == 0):\n", + " return \"division by 0\"\n", + " elif row[\"meters_cat\"] == \"meters is low\":\n", + " return \"meters too low\"\n", + " elif row[\"sec_cat\"] == \"sec is high\":\n", + " return \"seconds too high\"\n", + " else:\n", + " return \"ok\"\n", + " \n", + " df2[\"flag\"] = df2.apply(lambda x: flag_round(x), axis=1)\n", + " print(m2.flag.value_counts())\n", + " \n", + " # Filter out for only division by 0 \n", + " df3 = df2[(df2.flag == 'division by 0')].reset_index(drop = True)\n", + " print(f\"{len(df3)} rows after filtering for only division by 0 rows\") \n", + " \n", + " end = datetime.datetime.now()\n", + " print(f\"Took {end-start}\")\n", + " return df3" ] }, { "cell_type": "code", "execution_count": null, - "id": "f294ff32-b025-4037-9ebc-cefe6dca00b9", + "id": "2c5107cb-c574-449b-95b6-fb205f38502e", "metadata": {}, "outputs": [], "source": [ - "few_routes.trip_id.nunique()" + "m2 = categorize_meters_speeds_pandas(m1)" ] }, { "cell_type": "code", "execution_count": null, - "id": "bafa496a-0fdc-4b03-a4dd-f0c0e39c04de", + "id": "9fe08d2a-b874-4439-aa5b-a52de58cad09", "metadata": {}, "outputs": [], 
"source": [ - "\"\"\"few_routes_agg = (\n", - " few_routes.groupby([\"shape_array_key\", \"stop_sequence\"])[\"meters_elapsed\"]\n", - " .describe(percentiles=[0.05, 0.95])\n", - " .reset_index()\n", - " .add_prefix(\"meters_\")\n", - ")\"\"\"" + "m2.trip_id.nunique()" ] }, { "cell_type": "code", "execution_count": null, - "id": "7a981533-fa29-4f4e-b873-49be562798cb", + "id": "a96f031c-c785-4793-9dfc-4b87090e6128", "metadata": {}, "outputs": [], "source": [ - "#few_routes_aggdask = dd.from_pandas(few_routes_agg, npartitions=1)" + "m2.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "c6fda994-bed0-4140-a269-207c20e0cc47", + "id": "647fad46-7f9b-4ce2-a26a-1ea69d02daee", "metadata": {}, "outputs": [], "source": [ - "#few_routes_dask = dd.from_pandas(few_routes, npartitions=1)" + "m2._gtfs_dataset_name.nunique(), m1._gtfs_dataset_name.nunique()" ] }, { "cell_type": "code", "execution_count": null, - "id": "c2275352-1c4e-4a11-bd96-776f6801ec97", + "id": "83036ccc-7339-42c2-b1f7-183734253c21", "metadata": {}, "outputs": [], "source": [ - "#type(few_routes_dask), type(few_routes_aggdask)" + "m2.groupby([\"loop_or_inlining\"]).agg({\"shape_array_key\": \"nunique\"})" ] }, { "cell_type": "code", "execution_count": null, - "id": "ffde85cd-ee59-433a-b92f-c757d79e80ec", + "id": "b9519846-59ed-40cb-9087-3f8229e771d1", "metadata": {}, "outputs": [], "source": [ - "\"\"\"dask_merge1 = dd.merge(\n", - " few_routes_dask,\n", - " few_routes_aggdask,\n", - " how=\"inner\",\n", - " left_on=[\"shape_array_key\", \"stop_sequence\"],\n", - " right_on=[\n", - " \"meters_shape_array_key\",\n", - " \"meters_stop_sequence\",\n", - " ],\n", - ")\"\"\"" + "subset = m2[m2.shape_array_key.isin(sample_0_keys)].reset_index()" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "9d61f004-233b-4f0c-a1cc-1923f02a99e9", + "cell_type": "markdown", + "id": "a399d982-e400-43fa-b13f-fecafaa27262", "metadata": {}, - "outputs": [], "source": [ - "#dask_merge1.head(1)" + "### 
Investigate \n", + "#### Stage3: \"vp_pared_stops\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "53106a8f-414f-4710-97d5-4c36d953024b", + "id": "a2a705af-b588-463b-b6ce-f999b2050208", "metadata": {}, "outputs": [], "source": [ - "def percentile(row):\n", - "\n", - " if row[\"meters_elapsed\"] == row[\"meters_mean\"]:\n", - " return \"meters average\"\n", - " elif row[\"meters_elapsed\"] <= row[\"meters_5%\"]:\n", - " return \"meters low\"\n", - "\n", - " else:\n", - " return \"other\"" + "def load_vp_stage3(flagged_df:pd.DataFrame, date:str):\n", + " \n", + " # Subset the dataframe and use it to filter out for only the values of interest\n", + " flagged_df = flagged_df[['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key']]\n", + " \n", + " vp = pd.read_parquet(f\"{speed_utils.GCS_PATH}vp_pared_stops_{date}\")\n", + " \n", + " # Merge to filter\n", + " vp2 = pd.merge(flagged_df, vp, how = \"inner\", on = ['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key'])\n", + " \n", + " return vp2" ] }, { "cell_type": "code", "execution_count": null, - "id": "e1887f21-26f5-4ea6-a384-7c6b5519000f", + "id": "1e36c5fc-ab3f-4129-97f9-ad9472b7d32a", "metadata": {}, "outputs": [], "source": [ - "#dask_merge1[\"test\"] = dask_merge1.apply(\n", - "# lambda x: percentile(x), axis=1, meta=(\"test\", \"string\")\n", - "#)" + "vp2 = load_vp_stage3(subset, analysis_date)" ] }, { "cell_type": "code", "execution_count": null, - "id": "dac25f8e-5d36-4464-b475-bfc2504ff2c2", + "id": "21799f42-873e-41bd-b764-42cc297686a6", "metadata": {}, "outputs": [], "source": [ - "#dask_merge1 = dask_merge1.compute()" + "sort_cols = ['trip_id', 'stop_sequence','location_timestamp_local']" ] }, { "cell_type": "code", "execution_count": null, - "id": "9e1e478f-0190-40ec-a32d-783c87482b79", - "metadata": {}, + "id": "148e75f1-08dd-44c8-8179-319164d8e020", + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "#dask_merge1[[\"test\"]].head()" + "# Check out stop 
sequences for the trip below that have division by 0\n", + "# subset[subset.trip_id == \"1088383\"].stop_sequence.unique()" ] }, { "cell_type": "code", "execution_count": null, - "id": "b4f56307-6624-488d-8688-7a9f1e47ff65", - "metadata": {}, + "id": "b4350206-c237-44a3-abce-f8f38cde8117", + "metadata": { + "scrolled": true, + "tags": [] + }, "outputs": [], "source": [ - "def categorize_by_percentile(\n", - " df: pd.DataFrame, column_percentile: str, column_str: str\n", - ") -> dd.DataFrame:\n", - "\n", - " # Find percentiles\n", - " agg1 = (\n", - " df.groupby([\"shape_array_key\", \"stop_sequence\"])[column_percentile]\n", - " .describe(percentiles=[0.05, 0.95])\n", - " .reset_index()\n", - " .add_prefix(column_str)\n", - " )\n", - "\n", - " # Convert to dask because it takes a very long time\n", - " agg1_dask = dd.from_pandas(agg1, npartitions=1)\n", - " df_dask = dd.from_pandas(df, npartitions=1)\n", - "\n", - " # Merge using dask\n", - " merge1_dask = dd.merge(\n", - " df_dask,\n", - " agg1_dask,\n", - " how=\"inner\",\n", - " left_on=[\"shape_array_key\", \"stop_sequence\"],\n", - " right_on=[\n", - " f\"{column_str}shape_array_key\",\n", - " f\"{column_str}stop_sequence\",\n", - " ],\n", - " )\n", - "\n", - " def percentile(row):\n", - "\n", - " if row[column_percentile] == row[f\"{column_str}mean\"]:\n", - " return f\"{column_str} elapsed avg\"\n", - " elif row[column_percentile] <= row[f\"{column_str}5%\"]:\n", - " return f\"{column_str} elapsed low\"\n", - " elif row[column_percentile] == 0:\n", - " return f\"{column_str} elapsed is 0\"\n", - " elif row[f\"{column_str}5%\"] < row[column_percentile] <= row[f\"{column_str}95%\"]:\n", - " return f\"{column_str} elapsed avg\"\n", - "\n", - " elif row[column_percentile] > row[f\"{column_str}95%\"]:\n", - " return f\"{column_str} elapsed high\"\n", - "\n", - " else:\n", - " return \"other\"\n", - "\n", - " merge1_dask[f\"{column_str}cat\"] = merge1_dask.apply(\n", - " lambda x: percentile(x), axis=1, 
meta=(f\"{column_str}cat\", \"string\")\n", - " )\n", - "\n", - " # Clean\n", - " merge1_dask[f\"{column_str}cat\"] = merge1_dask[f\"{column_str}cat\"].str.replace(\"_\", \"\")\n", - "\n", - " columns_to_keep = [\n", - " \"shape_array_key\",\n", - " \"gtfs_dataset_key\",\n", - " \"_gtfs_dataset_name\",\n", - " \"speed_mph\",\n", - " \"loop_or_inlining\",\n", - " \"stop_sequence\",\n", - " \"stop_id\",\n", - " \"trip_id\",\n", - " \"n_trips\",\n", - " \"p20_speed_mph\",\n", - " \"p80_speed_mph\",\n", - " \"time_of_day\",\n", - " \"median_speed_mph\",\n", - " \"meters_elapsed\",\n", - " \"sec_elapsed\",\n", - " f\"{column_str}5%\",\n", - " f\"{column_str}95%\",\n", - " f\"{column_str}cat\"\n", - " ]\n", - " merge1_dask = merge1_dask[columns_to_keep]\n", - " print(f\"Done with {column_str}\")\n", - " return merge1_dask" + "# Stop sequences that were flagged as division by 0\n", + "# vp2[vp2.trip_id == \"1088383\"].sort_values(['trip_id', 'stop_sequence','location_timestamp_local'])" ] }, { "cell_type": "code", "execution_count": null, - "id": "55eb1e43-bea2-4e6c-b665-513115107c25", - "metadata": {}, + "id": "aa1e56d1-ec07-436c-8763-7bcf3dcbf7d4", + "metadata": { + "scrolled": true, + "tags": [] + }, "outputs": [], "source": [ - "#ddf_meters = ddf_meters.compute()" + "# All the stop sequences for this trip, even those that are ok\n", + "# vp_pared[vp_pared.trip_id == \"1088383\"].sort_values(['trip_id', 'stop_sequence','location_timestamp_local'])" ] }, { "cell_type": "code", "execution_count": null, - "id": "12014229-3cf9-4e44-a603-8dbaca1506e5", - "metadata": {}, + "id": "22e42aae-9281-4040-ab8c-6a10b93f6cf4", + "metadata": { + "scrolled": true, + "tags": [] + }, "outputs": [], "source": [ - "#ddf_sec = ddf_sec.compute()" + "# All the stop sequences for this trip, even those that are ok\n", + "# vp_pared[vp_pared.trip_id == \"1088383\"].sort_values(['location_timestamp_local','stop_sequence',])" ] }, { "cell_type": "code", "execution_count": null, - "id": 
"963353ae-52ab-4ca4-bef5-b3649b6b74c3", + "id": "0f21f08f-d4eb-4bbd-94d3-f4b031e97cf4", "metadata": {}, "outputs": [], "source": [ - "def flag(row):\n", - "\n", - " if (row[\"meters_elapsed\"] == 0) & (row[\"sec_elapsed\"] == 0):\n", - " return \"division by 0\"\n", - "\n", - " elif row[\"meters_cat\"] == \"meters elapsed low\":\n", - " return \"meters too low\"\n", - "\n", - " elif row[\"seconds_cat\"] == \"seconds elapsed high\":\n", - " return \"seconds too high\"\n", + "def stage3_repeated_timestamps(stage3_df:pd.DataFrame):\n", + " \"\"\"\n", + " Look at how many times a time stamp is repeated a route-trip-location.\n", + " Each of these 3 combos should have something different.\n", + " \"\"\"\n", + " agg = (stage3_df\n", + " .groupby(['shape_array_key','trip_id', 'location_timestamp_local'])\n", + " .agg({'stop_sequence':'nunique'})\n", + " .reset_index()\n", + " .rename(columns = {'stop_sequence':'number_of_repeated_timestamps'})\n", + " )\n", + " \n", + " # Only keep timestamps that are repeated more than once\n", + " agg = (agg[agg.number_of_repeated_timestamps > 1]).reset_index(drop = True)\n", "\n", - " else:\n", - " return \"ok\"" + " return agg" ] }, { "cell_type": "code", "execution_count": null, - "id": "d80535fb-8648-4216-918f-76e0484ba3ea", + "id": "5ce07566-c1f0-4fa7-9550-2fa07b98dba8", "metadata": {}, "outputs": [], "source": [ - "def categorize_meters_speeds(df):\n", - " start = datetime.datetime.now()\n", - "\n", - " print(f\"Begin: {start}\")\n", - "\n", - " # Find percentiles\n", - " df.speed_mph = df.speed_mph.fillna(0)\n", - "\n", - " # These are now dask dataframes\n", - " ddf_meters = categorize_by_percentile(df, \"meters_elapsed\", \"meters_\")\n", - " ddf_seconds = categorize_by_percentile(df, \"sec_elapsed\", \"seconds_\")\n", - "\n", - " merge_cols = ['shape_array_key', 'gtfs_dataset_key', '_gtfs_dataset_name',\n", - " 'speed_mph', 'loop_or_inlining', 'stop_sequence', 'stop_id', 'n_trips',\n", - " 'p20_speed_mph', 'p80_speed_mph', 
'time_of_day', 'median_speed_mph',\n", - " 'meters_elapsed', 'sec_elapsed',\"trip_id\",]\n", + "def stage3_repeated_locations(stage3_df:pd.DataFrame):\n", + " \"\"\"\n", + " Look at how many times a location is repeated for a stop-trip-route combo.\n", + " Each combo should have a different location.\n", + " \"\"\"\n", + " # Concat x and y into a string\n", + " stage3_df['pair'] = stage3_df.x.astype(str) + '/' + stage3_df.y.astype(str)\n", " \n", - " # Merge using dask\n", - " m1 = dd.merge(\n", - " ddf_meters,\n", - " ddf_seconds,\n", - " how=\"inner\",\n", - " on=merge_cols)\n", - "\n", - " # Apply flags\n", - " m1[\"flag\"] = m1.apply(lambda x: flag(x), axis=1, meta=(\"flag\", \"string\"))\n", + " # Count number of different stops that reference the same location\n", + " agg = (stage3_df\n", + " .groupby(['shape_array_key','trip_id','pair'])\n", + " .agg({'stop_sequence':'nunique'})\n", + " .reset_index()\n", + " .sort_values('stop_sequence', ascending = False)\n", + " .rename(columns = {'stop_sequence':'number_of_repeated_locs'}) \n", + " )\n", "\n", - " end = datetime.datetime.now()\n", - " print(f\"Finish: {end-start}\")\n", - " return m1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4676866e-674a-4561-bc4f-55dc2dcc4769", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "few_routes_cat = categorize_meters_speeds(few_routes)" + " # Only keep locations that are repeated more than once\n", + " # agg = agg[agg.number_of_repeated_locs != 1].reset_index(drop = True)\n", + " \n", + " return agg" ] }, { "cell_type": "code", "execution_count": null, - "id": "fdde9f12-a650-41ef-9bf6-a83204647f30", + "id": "15a49a52-1178-4e65-a443-b35b89812d54", "metadata": {}, "outputs": [], "source": [ - "len(few_routes_cat)" + "vp2.pair.nunique()" ] }, { "cell_type": "code", "execution_count": null, - "id": "e016eb5a-0039-4063-ade4-c871e01c8a16", + "id": "f0663258-c7d5-495f-9191-0664b65970a8", "metadata": {}, "outputs": [], "source": [ - 
"few_routes_cat.groupby([\"flag\",\"meters_cat\", \"seconds_cat\",]).agg(\n", - " {\"trip_id\": \"count\"}\n", - ").reset_index().sort_values([\"trip_id\"], ascending=False).compute()" + "vp2.trip_id.nunique()" ] }, { "cell_type": "code", "execution_count": null, - "id": "afdb5cf4-cc61-454f-bdb5-8261e91bedf3", + "id": "66e83169-2b4a-4912-bc0e-1a0b3e8deea6", "metadata": {}, "outputs": [], "source": [ - "subset = [\n", - " \"unusual_flag\",\n", - " \"shape_array_key\",\n", - " \"stop_sequence\",\n", - " \"stop_id\",\n", - " \"median_speed_mph\",\n", - " \"p20_speed_mph\",\n", - " \"p80_speed_mph\",\n", - " \"_gtfs_dataset_name\",\n", - " \"trip_id\",\n", - " \"meters_elapsed\",\n", - " \"sec_elapsed\",\n", - " \"speed_mph\",\n", - " \"meters_5%\",\n", - " \"meters_50%\",\n", - " \"meters_95%\",\n", - " \"meters_cat\",\n", - " \"seconds_5%\",\n", - " \"seconds_50%\",\n", - " \"seconds_95%\",\n", - " \"seconds_cat\",\n", - "]" + "def flag_stage3(flagged_df:pd.DataFrame, date:str):\n", + " \n", + " # Relevant rows from Vehicle Positions\n", + " vp = load_vp_stage3(flagged_df, date)\n", + " \n", + " # Find repeated timestamps.\n", + " multi_timestamps = stage3_repeated_timestamps(vp)\n", + " \n", + " # Find repeated locations\n", + " multi_locs = stage3_repeated_locations(vp)\n", + " \n", + " # Merge\n", + " timestamps_merge_cols = ['shape_array_key','trip_id','location_timestamp_local']\n", + " loc_merge_cols = ['shape_array_key','trip_id','pair']\n", + " \n", + " m1 = (vp\n", + " .merge(multi_timestamps, how=\"left\", on= timestamps_merge_cols)\n", + " .merge(multi_locs, how=\"left\", on=loc_merge_cols)\n", + " )\n", + " \n", + " drop_cols = ['vp_idx','x','y','hour','activity_date']\n", + " m1 = m1.drop(columns = drop_cols)\n", + " \n", + " return m1" ] }, { "cell_type": "code", "execution_count": null, - "id": "bdd2b274-87a7-4306-8494-f65416ac88fb", + "id": "cab32ef3-cc66-40ce-aa19-59631734f539", "metadata": {}, "outputs": [], "source": [ - "high_low_zero = 
few_routes_cat[few_routes_cat.flag != \"ok\"].reset_index()" + "m3 = flag_stage3(m2, analysis_date)" ] }, { "cell_type": "code", "execution_count": null, - "id": "5b2a9d41-925b-442f-a851-a354f58eb127", + "id": "9374b39f-f286-4b82-b18b-e596f602f6b0", "metadata": {}, "outputs": [], "source": [ - "high_low_zero = high_low_zero.compute()" + "len(m3)" ] }, { "cell_type": "code", "execution_count": null, - "id": "798b9bfe-ca16-4453-8ca5-7a06f449e38e", + "id": "4150ae22-7888-475f-9c24-f0ac17cf1b4d", "metadata": {}, "outputs": [], "source": [ - "high_low_zero.trip_id.nunique()" + "m3.groupby(['number_of_repeated_timestamps', 'number_of_repeated_locs']).size()" ] }, { "cell_type": "code", "execution_count": null, - "id": "54a2b149-c5db-4917-8232-2ecf425fddc7", + "id": "8fc4eeb2-ccbb-4691-a2fc-f4713664cb58", "metadata": {}, "outputs": [], "source": [ - "few_routes_cat = few_routes_cat.compute()" + "more_than_one = m3[(m3.number_of_repeated_timestamps > 1) | (m3.number_of_repeated_locs > 1)]" ] }, { "cell_type": "code", "execution_count": null, - "id": "c1fbe857-c760-4b22-8265-638326a36890", + "id": "e7b06af8-26c8-4394-865d-424cf25868d2", "metadata": {}, "outputs": [], "source": [ - "few_routes_cat.trip_id.nunique()" - ] - }, - { - "cell_type": "markdown", - "id": "8d16b36f-4ace-4ad5-beb6-fd0f5d98e56e", - "metadata": {}, - "source": [ - "### Visualize \n", - "#### One" + "more_than_one.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "c0fbb61d-586d-4513-99c0-793e7105ac2a", + "id": "6877289f-008d-4377-9748-270e05ae22f5", "metadata": {}, "outputs": [], "source": [ - "equal_sampling = high_low_zero.groupby('flag').apply(lambda x: x.sample(n=2)).reset_index(drop = True)" + "more_than_one.sample(5)" ] }, { "cell_type": "code", "execution_count": null, - "id": "34840f61-1857-4f5c-b3a6-abcac18af0db", + "id": "86ce812a-4081-44b3-b1f1-b02be149ebf1", "metadata": {}, "outputs": [], "source": [ - "trips = list(equal_sampling.trip_id.unique())" + 
"more_than_one._gtfs_dataset_name.nunique(), more_than_one.shape_array_key.nunique()" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "759ac80e-9c98-4c94-add3-859ab998e038", - "metadata": {}, - "outputs": [], + "cell_type": "markdown", + "id": "e01d88e4-a3e2-43f1-9591-afe83e8a92cf", + "metadata": { + "tags": [] + }, "source": [ - "stops = list(equal_sampling.stop_id.unique())" + "#### Summarize" ] }, { "cell_type": "code", "execution_count": null, - "id": "94620388-876e-4e6d-80ab-ac68eb1061c9", + "id": "229c6d49-58e1-45bd-839f-b03f7e8cdd4a", "metadata": {}, "outputs": [], "source": [ - "# Plot some of the trips\n", - "sample_data = few_routes_cat[few_routes_cat.trip_id.isin(trips)].reset_index()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "417804f1-9e24-4ac6-9c96-95f4da8f9693", - "metadata": {}, - "outputs": [], - "source": [ - "sample_data.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "339a67c7-3634-43d4-b865-b10119507c44", - "metadata": {}, - "outputs": [], - "source": [ - "# sample_data2 = sample_data[['shape_array_key','gtfs_dataset_key','trip_id']]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "39231b51-f0ab-4bbe-80d0-8a2081849e50", - "metadata": {}, - "outputs": [], - "source": [ - "plotting = sample_data.melt(\n", - " id_vars=[\n", - " \"_gtfs_dataset_name\",\n", - " \"shape_array_key\",\n", - " \"trip_id\",\n", - " \"stop_sequence\",\n", - " \"gtfs_dataset_key\",\n", - " \"loop_or_inlining\",\n", - " \"n_trips\",\n", - " \"meters_elapsed\",\n", - " \"meters_cat\",\n", - " \"seconds_cat\",\n", - " \"sec_elapsed\",\n", - " \"flag\",\n", - " 'p20_speed_mph', 'p80_speed_mph',\n", - " 'median_speed_mph',\n", - " ],\n", - " value_vars=[\"speed_mph\"],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "959bbe35-8c21-4795-b418-745a0caa94e8", - "metadata": {}, - "outputs": [], - "source": [ - "# Clean\n", - "plotting = 
threshold_utils.pre_clean(plotting)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dd343cdd-acf3-4433-a702-8c3eb53ba1f0", - "metadata": {}, - "outputs": [], - "source": [ - "plotting[\"Dropdown Menu\"] = (\n", - " plotting[\"Gtfs Dataset Name\"] + \" \" + plotting[\"Trip Id\"]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "072f39d9-4bf9-4efc-a393-24f096cecf7e", - "metadata": {}, - "outputs": [], - "source": [ - "def alt_dropdown(df, col_for_dropdown: str, dropdown_menu_title: str):\n", - " # Create dropdown menu\n", - " # Exclude \"none\" operators which are only scheduled data\n", - " df = df.loc[df[col_for_dropdown] != \"None\"][[col_for_dropdown]]\n", - " dropdown_list = df[col_for_dropdown].unique().tolist()\n", - "\n", - " # Show only first operator by default\n", - " initialize_first_op = sorted(dropdown_list)[0]\n", - " input_dropdown = alt.binding_select(\n", - " options=sorted(dropdown_list), name=dropdown_menu_title\n", - " )\n", - "\n", - " selection = alt.selection_single(\n", - " name=dropdown_menu_title,\n", - " fields=[col_for_dropdown],\n", - " bind=input_dropdown,\n", - " init={col_for_dropdown: initialize_first_op},\n", - " )\n", - "\n", - " return selection" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6b919e59-03e8-426f-b9b3-4468d5b1b06b", - "metadata": {}, - "outputs": [], - "source": [ - "selection_test = alt_dropdown(plotting, \"Dropdown Menu\", \"Route\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d7b9d14f-b708-4426-a1b8-4afb4e7c95ba", - "metadata": {}, - "outputs": [], - "source": [ - "(\n", - " threshold_utils.chart_size(\n", - " alt.Chart(plotting)\n", - " .mark_tick(\n", - " size=15,\n", - " thickness=5,\n", - " )\n", - " .encode(\n", - " x=\"Stop Sequence:N\",\n", - " y=\"Value:Q\",\n", - " color=alt.Color(\n", - " \"Flag:N\", scale=alt.Scale(range=cp.CALITP_CATEGORY_BOLD_COLORS)\n", - " ),\n", - " 
tooltip=plotting.columns.tolist(),\n", - " )\n", - " .interactive(),\n", - " 1100,\n", - " 400,\n", - " )\n", - " .add_selection(selection_test)\n", - " .transform_filter(selection_test)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ec90906-3364-440c-a951-eb5f2b1e84c2", - "metadata": {}, - "outputs": [], - "source": [ - "stop" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1aba28a-92f5-417c-a0d3-e31bf80e3a7d", - "metadata": {}, - "outputs": [], - "source": [ - "vehicle_positions = gpd.read_parquet(\n", - " \"gs://calitp-analytics-data/data-analyses/rt_segment_speeds/vp_2023-04-12.parquet\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "140112e0-8ee0-481d-848c-4db091460418", - "metadata": {}, - "outputs": [], - "source": [ - "type(vehicle_positions)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d843325a-f33c-40de-a3cb-befed24d645e", - "metadata": {}, - "outputs": [], - "source": [ - "vehicle_positions2 = vehicle_positions[\n", - " vehicle_positions.trip_id.isin(trips)\n", - "].reset_index()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8840809b-dd6f-4c0e-a68b-0a37f508df14", - "metadata": {}, - "outputs": [], - "source": [ - "vehicle_positions2.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "896b5c73-0835-4ee7-a4f2-9d960778fe35", - "metadata": {}, - "outputs": [], - "source": [ - "gdf1 = pd.merge(\n", - " vehicle_positions2,\n", - " sample_data,\n", - " how=\"inner\",\n", - " on=[\"gtfs_dataset_key\", \"_gtfs_dataset_name\", \"trip_id\"],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3725d07d-7b9f-483b-aa56-8d428b9f3d11", - "metadata": {}, - "outputs": [], - "source": [ - "gdf1.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c82d7834-567b-4de4-b465-aea8c1a62715", - "metadata": {}, - "outputs": [], - "source": [ - "gdf1 = 
gdf1[gdf1.stop_id.isin(stops)]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a86cec8e-e7fd-48ce-9692-c766df2b68e8", - "metadata": {}, - "outputs": [], - "source": [ - "gdf1.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3bf3c599-2aec-4ae7-b8c8-53a4eff8795a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "gdf1[\n", - " [\n", - " \"geometry\",\n", - " \"stop_id\",\n", - " \"stop_sequence\",\n", - " \"_gtfs_dataset_name\",\n", - " \"shape_array_key\",\n", - " \"speed_mph\",\n", - " \"flag\",\n", - " ]\n", - "].explore(\"flag\")" - ] - }, - { - "cell_type": "markdown", - "id": "e01d88e4-a3e2-43f1-9591-afe83e8a92cf", - "metadata": { - "tags": [] - }, - "source": [ - "#### Summarize" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "229c6d49-58e1-45bd-839f-b03f7e8cdd4a", - "metadata": {}, - "outputs": [], - "source": [ - "def summarize(original: pd.DataFrame, filtered_unusual_stops: pd.DataFrame):\n", - "\n", - " subset = [\"_gtfs_dataset_name\", \"gtfs_dataset_key\", \"shape_array_key\"]\n", - "\n", - " def aggregate(df, total_trip_column_name: str):\n", - " agg = (\n", - " df.groupby(subset)\n", - " .agg({\"stop_sequence\": \"count\"})\n", - " .reset_index()\n", - " .rename(columns={\"stop_sequence\": total_trip_column_name})\n", - " )\n", - "\n", - " return agg\n", - "\n", - " total_unusual_stops = aggregate(filtered_unusual_stops, \"total_unusual_stops\")\n", - " total_stops = aggregate(original, \"total_stops\")\n", - "\n", - " # Merge them\n", - " merge1 = pd.merge(total_unusual_stops, total_stops, on=subset, how=\"inner\")\n", - "\n", - " # Add some columns\n", - " merge1[\"percent_of_unusual_stops\"] = (\n", - " (merge1.total_unusual_stops / merge1.total_stops) * 100\n", - " ).astype(int)\n", - "\n", - " merge1[\n", - " \"Percentage of Unusual Stops\"\n", - " ] = \"% of Unusual Stops: \" + merge1.percent_of_unusual_stops.astype(str)\n", - "\n", - " # Add dropdown 
menu\n", - " # merge1[\"Dropdown Menu\"] = merge1._gtfs_dataset_name + \" \" + merge1.shape_array_key\n", - "\n", - " # Clean\n", - " merge1 = merge1.sort_values([\"percent_of_unusual_stops\"], ascending=False)\n", - " merge1 = merge1.drop(columns=[\"percent_of_unusual_stops\"])\n", - "\n", - " merge1 = threshold_utils.pre_clean(merge1)\n", - " return merge1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dad7c9b8-9025-4d91-bf75-0e23c3ac2a52", - "metadata": {}, - "outputs": [], - "source": [ - "summarize(few_routes_cat, high_low_zero)" - ] - }, - { - "cell_type": "markdown", - "id": "cde431f9-10ad-484f-b954-dd3c13a6e683", - "metadata": { - "tags": [] - }, - "source": [ - "#### Draft\n", - "* Show which stops are excluded from flags\n", - "* Show how many stops are dropped\n", - "* Show % of stops that were flagged compared to total stops." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f6d7831f-aed2-4e87-aae1-8ab6ddc08666", - "metadata": {}, - "outputs": [], - "source": [ - "stop" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aae6b6c0-ce47-4ca1-b104-68b39ebcf2ca", - "metadata": {}, - "outputs": [], - "source": [ - "high_low_zero2 = high_low_zero.melt(\n", - " id_vars=[\n", - " \"_gtfs_dataset_name\",\n", - " \"shape_array_key\",\n", - " \"trip_id\",\n", - " \"stop_sequence\",\n", - " \"gtfs_dataset_key\",\n", - " \"loop_or_inlining\",\n", - " \"n_trips\",\n", - " \"meters_cat\",\n", - " \"seconds_cat\",\n", - " \"unusual_flag\",\n", - " \"time_of_day\",\n", - " ],\n", - " value_vars=[\"median_speed_mph\", \"speed_mph\", \"p20_speed_mph\", \"p80_speed_mph\"],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "767abd20-d030-42d3-b85f-6d3023d69b8a", - "metadata": {}, - "outputs": [], - "source": [ - "high_low_zero2 = high_low_zero2.drop_duplicates(\n", - " subset=[\n", - " \"loop_or_inlining\",\n", - " \"shape_array_key\",\n", - " \"stop_sequence\",\n", - " 
\"time_of_day\",\n", - " \"variable\",\n", - " \"value\",\n", - " ]\n", - ").reset_index(drop=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52e7ee5a-a40e-423e-ba4b-dea14de17982", - "metadata": {}, - "outputs": [], - "source": [ - "high_low_zero2.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "23e7f746-b1b5-402f-92fd-dbc74840e013", - "metadata": {}, - "outputs": [], - "source": [ - "merge1.shape_array_key.nunique(), high_low_zero.shape_array_key.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a25f067-956e-46a7-aa7a-5abf57e662f6", - "metadata": {}, - "outputs": [], - "source": [ - "# Clean\n", - "high_low_zero2 = threshold_utils.pre_clean(high_low_zero2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5eea02c-05ec-4707-a06d-9de1864e8fbe", - "metadata": {}, - "outputs": [], - "source": [ - "# Add dropdown menu\n", - "high_low_zero2[\"Dropdown Menu\"] = (\n", - " high_low_zero2[\"Gtfs Dataset Name\"] + \" \" + high_low_zero2[\"Shape Array Key\"]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "571b471f-4a66-474f-8900-c3eaffde441e", - "metadata": {}, - "outputs": [], - "source": [ - "high_low_zero2[\"Route Type\"] = \"Route Type: \" + high_low_zero2[\n", - " \"Loop Or Inlining\"\n", - "].astype(str)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b7b429be-057c-4692-927e-92107b015ae6", - "metadata": {}, - "outputs": [], - "source": [ - "selection_test = alt_dropdown(high_low_zero2, \"Dropdown Menu\", \"Route\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa0fad8b-b49e-48be-8070-adaf6e63d541", - "metadata": {}, - "outputs": [], - "source": [ - "# https://github.com/altair-viz/altair/issues/1168\n", - "title = (\n", - " alt.Chart(high_low_zero2)\n", - " .mark_text(dy=-40, size=15, fontWeight=\"normal\")\n", - " .encode(\n", - " text=\"Route Type:N\",\n", - " )\n", 
- " .add_selection(selection_test)\n", - " .transform_filter(selection_test)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1042a774-165c-4f7e-bfc9-c4d4980bd29b", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"total_stops_altair = (\n", - " alt.Chart(stop_info)\n", - " .mark_text(dy=-40, size=15, fontWeight=\"normal\")\n", - " .encode(\n", - " text=\"Percentage Of Unusual Stops:N\",\n", - " )\n", - " .add_selection(selection_test)\n", - " .transform_filter(selection_test)\n", - ")\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6067e93b-3519-45fc-b027-11cbcc82d80f", - "metadata": {}, - "outputs": [], - "source": [ - "main_chart = (\n", - " threshold_utils.chart_size(\n", - " alt.Chart(high_low_zero2)\n", - " .mark_tick(\n", - " size=15,\n", - " thickness=5,\n", - " )\n", - " .encode(\n", - " x=\"Stop Sequence:N\",\n", - " y=\"Value:Q\",\n", - " color=alt.Color(\n", - " \"Variable:N\", scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS)\n", - " ),\n", - " tooltip=high_low_zero2.columns.tolist(),\n", - " )\n", - " .interactive(),\n", - " 1100,\n", - " 400,\n", - " )\n", - " .add_selection(selection_test)\n", - " .transform_filter(selection_test)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "da0d6aad-26c3-439b-93d2-ba5a3abac77d", - "metadata": {}, - "outputs": [], - "source": [ - "high_low_zero2.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c77e2503-e211-48cc-a220-d96f82ab72df", - "metadata": {}, - "outputs": [], - "source": [ - "(title & total_stops_altair | main_chart)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9f709e13-aa1e-44da-9027-dfafaead5dad", - "metadata": {}, - "outputs": [], - "source": [ - "high_low_zero.shape_array_key.unique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "24c99824-35b7-476c-9dd2-8e07e075bb4d", - "metadata": {}, - "outputs": [], - 
"source": [ - "merge1.time_of_day.value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "670047c2-2b24-4df3-a6f9-5f399063f521", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "stc_test = merge1[\n", - " (merge1.stop_sequence == 30)\n", - " & (merge1.shape_array_key == \"ffeb8a6113e0fdcd18e95257bb5be9cb\")\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c83a6fd6-6fcb-4831-add9-ea844e3959ff", - "metadata": {}, - "outputs": [], - "source": [ - "stc_test" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e5b3c303-f9e9-4914-976e-3a9a39aafe3b", - "metadata": {}, - "outputs": [], - "source": [ - "stc_test.groupby([\"time_of_day\"]).agg({\"stop_sequence\": \"count\"})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a6401179-2b91-444c-8c80-b97659f4225e", - "metadata": {}, - "outputs": [], - "source": [ - "m1 =" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "576c28b2-8c36-48a2-b312-9fc54010f7b5", - "metadata": {}, - "outputs": [], - "source": [ - "test1 = m1.melt(\n", - " id_vars=[\n", - " \"_gtfs_dataset_name\",\n", - " \"shape_array_key\",\n", - " \"trip_id\",\n", - " \"sorted_stop_seq\",\n", - " \"gtfs_dataset_key\",\n", - " \"loop_or_inlining\",\n", - " \"n_trips\",\n", - " ],\n", - " value_vars=[\"avg_speed_mph\", \"speed_mph\", \"p20_speed_mph\", \"p80_speed_mph\"],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2b55cb75-8ee8-477c-95bc-439d2ee65962", - "metadata": {}, - "outputs": [], - "source": [ - "test1.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "29dc19fd-0d8b-430d-a78a-a1c947263ef0", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "# test1[test1.shape_array_key == \"29d2bbdbeaec1d6888800f85bebf6e33\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"e2c00548-7615-409c-a7cc-7db5f16c9a88", - "metadata": {}, - "outputs": [], - "source": [ - "# Only need average speed/p20 speed/p80 to show up once for each stop sequence-operator-shape array\n", - "test2 = test1.drop_duplicates(\n", - " subset=[\n", - " \"_gtfs_dataset_name\",\n", - " \"shape_array_key\",\n", - " \"sorted_stop_seq\",\n", - " \"gtfs_dataset_key\",\n", - " \"variable\",\n", - " \"value\",\n", - " ]\n", - ").reset_index(drop=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "436aca32-61c9-4488-8fae-26fe66688851", - "metadata": {}, - "outputs": [], - "source": [ - "# test2.to_csv(\"./speeds.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "be4bac6e-2d9f-4d64-a835-22bb4f3c32f5", - "metadata": {}, - "outputs": [], - "source": [ - "test2.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5723b75c-0cc2-42e7-95e0-17b444971ee2", - "metadata": {}, - "outputs": [], - "source": [ - "other = [\n", - " \"cf688717cf0cd8dac0e6d1f12f9c7333\",\n", - " \"6f39f818c9a0c5496cd1c8bd1aa11e67\",\n", - " \"3de4482ec32ba0f2edb451d3528b5a5e\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7b08d810-9e33-4a4f-b600-b5c8de9bdef6", - "metadata": {}, - "outputs": [], - "source": [ - "# Take out routes that have over 85 stops\n", - "# subset = test2[~test2.shape_array_key.isin(routes_many_stops_list)].reset_index(drop = True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "600d9ed7-524e-4af6-b90b-81fd89cf8c02", - "metadata": {}, - "outputs": [], - "source": [ - "subset = test2[\n", - " test2.shape_array_key.isin(\n", - " [\n", - " \"29d2bbdbeaec1d6888800f85bebf6e33\",\n", - " \"754c5b012195800c38dc58e72e4f482e\",\n", - " \"e3c5ed2c6fa6cd5c5cd57d46aeb3cd8e\",\n", - " ]\n", - " )\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a92360af-5dfb-43e2-b89c-87c8b8268665", - "metadata": {}, - "outputs": [], - 
"source": [ - "subset = threshold_utils.pre_clean(subset)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3a4c7ac2-4d6f-4702-a20b-0de69e0c86d4", - "metadata": {}, - "outputs": [], - "source": [ - "subset.columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52df69f0-7083-4551-b5f5-5dd88f30a69a", - "metadata": {}, - "outputs": [], - "source": [ - "subset.sample()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4f0d0a5e-186e-4b7e-b307-6e6326c3e747", - "metadata": {}, - "outputs": [], - "source": [ - "subset[\"Route\"] = subset[\"Gtfs Dataset Name\"] + \" \" + subset[\"Shape Array Key\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a8d96219-49d0-4251-9b4b-a6968542424d", - "metadata": {}, - "outputs": [], - "source": [ - "subset = subset.rename(columns={\"Value\": \"Speed\"})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b1155355-b4f6-41c2-beb1-f839d9d46027", - "metadata": {}, - "outputs": [], - "source": [ - "subset[\"Speed_Int\"] = subset.Speed.fillna(0).astype(int)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6f695b90-4157-42bd-b9cf-aaa4d71773ef", - "metadata": {}, - "outputs": [], - "source": [ - "subset[\"Route Type\"] = \"Loop or Inlining: \" + subset[\"Loop Or Inlining\"].astype(str)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0834a228-c47d-4760-ae13-aca72784e747", - "metadata": {}, - "outputs": [], - "source": [ - "# subset['Rounded Speed'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "67cb1df7-6337-478d-bd9c-3fbc0a3f698f", - "metadata": {}, - "outputs": [], - "source": [ - "def speed(row):\n", - " # If partner is none, return Unknown.\n", - " if row.Speed_Int == 0:\n", - " return 0\n", - " elif 0 < row.Speed_Int < 6:\n", - " return 5\n", - " elif 5 < row.Speed_Int < 11:\n", - " return 10\n", - " elif 10 < row.Speed_Int < 16:\n", - " return 
15\n", - " elif 15 < row.Speed_Int < 21:\n", - " return 20\n", - " elif 20 < row.Speed_Int < 26:\n", - " return 25\n", - " elif 25 < row.Speed_Int < 31:\n", - " return 30\n", - " else:\n", - " return 35" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "de056258-240b-4aff-a3f9-aabe5330a9e0", - "metadata": {}, - "outputs": [], - "source": [ - "# Apply the function\n", - "subset[\"Rounded Speed\"] = subset.apply(speed, axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "afc90938-3fbd-4235-b6ab-68cfa445f0b6", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "# subset[['Rounded Speed', 'Speed', 'Speed_Int']]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9a39e842-70f6-4ff6-84b7-44b4dea48a40", - "metadata": {}, - "outputs": [], - "source": [ - "subset.Variable = subset.Variable.str.title().str.replace(\"_\", \" \")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a95e775c-4cff-44b6-a861-1a17efb5fbf2", - "metadata": {}, - "outputs": [], - "source": [ - "# One df for the actual speeds\n", - "subset_speedmph = subset[subset.Variable == \"Speed Mph\"].reset_index(drop=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "447aec9f-1a4a-408c-bc16-22c67746c262", - "metadata": {}, - "outputs": [], - "source": [ - "# One df for the percentiles\n", - "subset_other = subset[subset.Variable != \"Speed Mph\"].reset_index(drop=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2fd6ee4-c841-44fd-8659-0ee2bc8a926d", - "metadata": {}, - "outputs": [], - "source": [ - "selection_test = alt_dropdown(subset, \"Route\", \"Operator/Shape Array\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "27a02401-dc45-493b-baad-e4cb84b6db46", - "metadata": {}, - "outputs": [], - "source": [ - "title = title.add_selection(selection_test).transform_filter(selection_test)" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "e8c0fb00-e9a0-47c1-abc5-cea7a806a6df", - "metadata": {}, - "outputs": [], - "source": [ - "def create_jitter_plot(df):\n", + "def summarize(original: pd.DataFrame, filtered_unusual_stops: pd.DataFrame):\n", "\n", - " # title_op = df['Gtfs Dataset Name'].iloc[0].replace('VehiclePositions','').strip()\n", - " # inline = df['Loop Or Inlining'].iloc[0]\n", - " chart1 = (\n", - " alt.Chart(df, width=0.5)\n", - " .mark_circle(size=100)\n", - " .encode(\n", - " x=alt.X(\n", - " \"jitter:Q\",\n", - " title=None,\n", - " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n", - " scale=alt.Scale(),\n", - " ),\n", - " y=alt.Y(\n", - " \"Rounded Speed:Q\",\n", - " scale=alt.Scale(domain=[0, 50]),\n", - " title=\"Speed (MPH)\",\n", - " axis=alt.Axis(\n", - " labelAngle=360,\n", - " grid=False,\n", - " ),\n", - " ),\n", - " color=alt.Color(\n", - " \"Variable:N\",\n", - " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n", - " ),\n", - " tooltip=df.columns.tolist(),\n", - " column=alt.Column(\n", - " \"Sorted Stop Seq:N\",\n", - " header=alt.Header(\n", - " labelAngle=360,\n", - " titleOrient=\"top\",\n", - " labelOrient=\"top\",\n", - " labelAlign=\"right\",\n", - " labelPadding=2,\n", - " ),\n", - " ),\n", - " )\n", - " .transform_calculate(\n", - " # Generate Gaussian jitter with a Box-Muller transform\n", - " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", + " subset = [\"_gtfs_dataset_name\", \"gtfs_dataset_key\", \"shape_array_key\"]\n", + "\n", + " def aggregate(df, total_trip_column_name: str):\n", + " agg = (\n", + " df.groupby(subset)\n", + " .agg({\"stop_sequence\": \"count\"})\n", + " .reset_index()\n", + " .rename(columns={\"stop_sequence\": total_trip_column_name})\n", " )\n", - " .properties(title=\"Speeds by Operator-Shape Array\")\n", - " )\n", "\n", - " chart1 = threshold_utils.chart_size(chart1, 75, 200)\n", + " return agg\n", "\n", - " return chart1" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "19b97ca6-17c9-4540-acfe-9a4caf740ef4", - "metadata": {}, - "outputs": [], - "source": [ - "chart1 = (\n", - " create_jitter_plot(subset_speedmph)\n", - " .add_selection(selection_test)\n", - " .transform_filter(selection_test)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c13568df-5e19-4f0a-a9fe-c3edeb0a0073", - "metadata": {}, - "outputs": [], - "source": [ - "chart2 = (\n", - " alt.Chart(subset_other, width=0.5)\n", - " .mark_circle(size=200)\n", - " .encode(\n", - " x=alt.X(\n", - " \"jitter:Q\",\n", - " title=None,\n", - " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n", - " scale=alt.Scale(),\n", - " ),\n", - " y=alt.Y(\n", - " \"Rounded Speed:Q\",\n", - " title=\"Speed (MPH)\",\n", - " scale=alt.Scale(domain=[0, 50]),\n", - " axis=alt.Axis(grid=False),\n", - " ),\n", - " color=alt.Color(\n", - " \"Variable:N\",\n", - " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n", - " ),\n", - " tooltip=subset_other.columns.tolist(),\n", - " column=alt.Column(\n", - " \"Sorted Stop Seq:N\",\n", - " header=alt.Header(\n", - " labelAngle=360,\n", - " title=None,\n", - " titleOrient=\"top\",\n", - " labelOrient=\"top\",\n", - " labelAlign=\"right\",\n", - " labelPadding=2,\n", - " ),\n", - " ),\n", - " )\n", - " .transform_calculate(\n", - " # Generate Gaussian jitter with a Box-Muller transform\n", - " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", - " )\n", - ")" + " total_unusual_stops = aggregate(filtered_unusual_stops, \"total_unusual_stops\")\n", + " total_stops = aggregate(original, \"total_stops\")\n", + "\n", + " # Merge them\n", + " merge1 = pd.merge(total_unusual_stops, total_stops, on=subset, how=\"inner\")\n", + "\n", + " # Add some columns\n", + " merge1[\"percent_of_unusual_stops\"] = (\n", + " (merge1.total_unusual_stops / merge1.total_stops) * 100\n", + " ).astype(int)\n", + "\n", + " merge1[\n", + " \"Percentage of Unusual 
Stops\"\n", + " ] = \"% of Unusual Stops: \" + merge1.percent_of_unusual_stops.astype(str)\n", + "\n", + " # Add dropdown menu\n", + " # merge1[\"Dropdown Menu\"] = merge1._gtfs_dataset_name + \" \" + merge1.shape_array_key\n", + "\n", + " # Clean\n", + " merge1 = merge1.sort_values([\"percent_of_unusual_stops\"], ascending=False)\n", + " merge1 = merge1.drop(columns=[\"percent_of_unusual_stops\"])\n", + "\n", + " merge1 = threshold_utils.pre_clean(merge1)\n", + " return merge1" ] }, { "cell_type": "code", "execution_count": null, - "id": "2a2bda8b-d48c-44f3-a928-08aca894c565", + "id": "dad7c9b8-9025-4d91-bf75-0e23c3ac2a52", "metadata": {}, "outputs": [], "source": [ - "chart2 = threshold_utils.chart_size(chart2, 75, 200)" + "summarize(few_routes_cat, high_low_zero)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "dd20293d-7c43-42c4-b053-de945860b6f0", - "metadata": {}, - "outputs": [], + "cell_type": "markdown", + "id": "cde431f9-10ad-484f-b954-dd3c13a6e683", + "metadata": { + "tags": [] + }, "source": [ - "chart2 = chart2.add_selection(selection_test).transform_filter(selection_test)" + "#### Draft\n", + "* Show which stops are excluded from flags\n", + "* Show how many stops are dropped\n", + "* Show % of stops that were flagged compared to total stops." 
] }, { "cell_type": "code", "execution_count": null, - "id": "44dc6896-e95d-42df-bbd5-f2bb2c2a2cc6", + "id": "6a9ba66d-5421-4170-9201-881ad3704b39", "metadata": {}, "outputs": [], "source": [ - "title = threshold_utils.chart_size(title, 20, 20)" + "def read_back_gcs():\n", + " # Read back all the partitioned stuff - grab the file number\n", + " # part0.parquet, part1.parquet\n", + " start = datetime.datetime.now()\n", + " print(f\"Begin: {start}\")\n", + " gcs_file_path1 = f\"{speed_utils.GCS_PATH}partitioned_flags\"\n", + " file_names_dask = extract_number(gcs_file_path1, \"part\")\n", + "\n", + " # https://www.geeksforgeeks.org/read-multiple-csv-files-into-separate-dataframes-in-python/\n", + " # create empty list\n", + " all_df = []\n", + "\n", + " # append datasets into the list\n", + " for i in range(len(file_names_dask)):\n", + " gcs_file_path2 = f\"{gcs_file_path1}/part.\"\n", + " temp_df = dd.read_parquet(f\"{gcs_file_path2}{file_names_dask[i]}.parquet\")\n", + " all_df.append(temp_df)\n", + "\n", + " final_df = dd.concat(all_df, axis=0).reset_index(drop=True)\n", + " print(\"Begin computing\")\n", + " final_df = final_df.compute()\n", + " print(\"Done computing\")\n", + " end = datetime.datetime.now()\n", + " print(f\"Finish: {end-start}\")\n", + " return final_df" ] }, { "cell_type": "code", "execution_count": null, - "id": "d651c8c6-179c-4157-a671-11006cb419df", + "id": "8354439b-0ffc-4b3f-8114-05516a8e48ef", "metadata": {}, "outputs": [], "source": [ - "alt.data_transformers.enable(\"default\", max_rows=None)" + "def categorize_by_percentile_pandas(\n", + " df: pd.DataFrame, column_percentile: str, column_str: str\n", + ") -> pd.DataFrame:\n", + "\n", + " # Find percentiles\n", + " agg1 = (\n", + " df.groupby([\"shape_array_key\", \"stop_sequence\"])[column_percentile]\n", + " .describe(percentiles=[0.05, 0.95])\n", + " .reset_index()\n", + " .add_prefix(column_str)\n", + " )\n", + " \n", + " # Merge \n", + " m1 = dd.merge(\n", + " df,\n", + " agg1,\n", 
+ " how=\"inner\",\n", + " left_on=[\"shape_array_key\", \"stop_sequence\"],\n", + " right_on=[\n", + " f\"{column_str}shape_array_key\",\n", + " f\"{column_str}stop_sequence\",\n", + " ],\n", + " )\n", + " \n", + " def percentile(row):\n", + "\n", + " if row[column_percentile] == row[f\"{column_str}mean\"]:\n", + " return f\"{column_str} elapsed avg\"\n", + " elif row[f\"{column_str}5%\"] < row[column_percentile] <= row[f\"{column_str}95%\"]:\n", + " return f\"{column_str} elapsed avg\"\n", + " elif row[column_percentile] <= row[f\"{column_str}5%\"]:\n", + " return f\"{column_str} elapsed low\"\n", + " elif row[column_percentile] > row[f\"{column_str}95%\"]:\n", + " return f\"{column_str} elapsed high\"\n", + "\n", + " else:\n", + " return f\"{column_str} elapsed avg\"\n", + " \n", + " \n", + " # Apply flags\n", + " m1[f\"{column_str}cat\"] = m1.apply(lambda x: percentile(x), axis=1)\n", + " \n", + " # Delete out any average columns\n", + " m1 = m1.loc[m1[f\"{column_str}cat\"] != f\"{column_str} elapsed avg\"].reset_index(drop = True)\n", + " \n", + " # Clean\n", + " m1[f\"{column_str}cat\"] = m1[f\"{column_str}cat\"].str.replace(\"_\", \"\")\n", + " \n", + " columns_to_keep = [\n", + " \"shape_array_key\",\n", + " \"gtfs_dataset_key\",\n", + " \"_gtfs_dataset_name\",\n", + " \"speed_mph\",\n", + " \"loop_or_inlining\",\n", + " \"stop_sequence\",\n", + " \"stop_id\",\n", + " \"trip_id\",\n", + " \"n_trips\",\n", + " \"p20_mph\",\n", + " \"p80_mph\",\n", + " \"p50_mph\",\n", + " \"time_of_day\",\n", + " \"meters_elapsed\",\n", + " \"sec_elapsed\",\n", + " f\"{column_str}5%\",\n", + " f\"{column_str}95%\",\n", + " f\"{column_str}cat\",\n", + " ]\n", + " m1 = m1[columns_to_keep]\n", + " print(f\"Done with {column_str}\")\n", + " \n", + " return m1 " ] }, { "cell_type": "code", "execution_count": null, - "id": "8e753711-8506-4939-8d9d-22566a641988", + "id": "18c06e7d-9b09-4573-bdf6-1dc753b3d552", "metadata": {}, "outputs": [], "source": [ - "title & 
(chart1.interactive() & chart2.interactive())" + "def extract_number(folder: str, phrase_to_find: str) -> list:\n", + " \"\"\"\n", + " Extract the numeric portion of a file path.\n", + " \"\"\"\n", + " files = find_files(folder, phrase_to_find)\n", + " all_file_numbers = []\n", + " for file in files:\n", + " # https://stackoverflow.com/questions/11339210/how-to-get-integer-values-from-a-string-in-python\n", + " file_number = \"\".join(i for i in file if i.isdigit())\n", + " all_file_numbers.append(file_number)\n", + " return all_file_numbers" ] }, { "cell_type": "code", "execution_count": null, - "id": "64011717-6707-4a7a-8eec-9bdb0861bab6", + "id": "dba2d199-47b4-4116-9ddb-adc1259ea3e2", "metadata": {}, "outputs": [], "source": [ - "def meter_elapsed_categories(row):\n", - " lower_end = row[\"meters_mean\"] - row[\"meters_std\"]\n", - " higher_end = row[\"meters_mean\"] + row[\"meters_std\"]\n", - " if row[\"meters_elapsed\"] == row[\"meters_mean\"]:\n", - " return \"distance elapsed is average\"\n", - " elif row[\"meters_elapsed\"] <= lower_end:\n", - " return \"distance lapsed on lower end\"\n", - " elif row[\"meters_elapsed\"] >= higher_end:\n", - " return \"distance lapsed on higher end\"\n", - " elif lower_end < row[\"meters_elapsed\"] < higher_end:\n", - " return \"distance elapsed is average\"\n", - " else:\n", - " return \"other\"\n", + "# Find all the parquets again\n", + "def find_files(folder: str, phrase_to_find: str) -> list:\n", + " \"\"\"\n", + " Grab a list of files that contain the\n", + " phrase inputted.\n", + " \"\"\"\n", + " # Create a list of all the files in my folder\n", + " all_files_in_folder = fs.ls(folder)\n", + " my_files = [i for i in all_files_in_folder if phrase_to_find in i]\n", "\n", + " # String to add to read the files\n", + " my_string = \"gs://\"\n", + " my_files = [my_string + i for i in my_files]\n", "\n", - "def seconds_elapsed_categories(row):\n", - " lower_end = row[\"secs_mean\"] - row[\"secs_std\"]\n", - " higher_end = 
row[\"secs_mean\"] + row[\"secs_std\"]\n", - " if row[\"sec_elapsed\"] == row[\"secs_mean\"]:\n", - " return \"secs elapsed is average\"\n", - " elif row[\"sec_elapsed\"] <= lower_end:\n", - " return \"secs lapsed on lower end\"\n", - " elif row[\"sec_elapsed\"] >= higher_end:\n", - " return \"secs lapsed on higher end\"\n", - " elif lower_end < row[\"sec_elapsed\"] < higher_end:\n", - " return \"secs elapsed is average\"\n", - " else:\n", - " return \"other\"" + " # Extract digit of parquet\n", + " return my_files" ] }, { "cell_type": "code", "execution_count": null, - "id": "fa26aa94-6e1c-46ef-8701-3f4a849faa7d", + "id": "354cf0e3-7b2c-4403-bef9-74533691a0e9", "metadata": {}, "outputs": [], "source": [ - "\"\"\"def mph_categories(row):\n", - " if (row[\"speed_mph\"] <= row[\"p20_speed_mph\"]):\n", - " return \"speed low\"\n", - " elif (row[\"p20_speed_mph\"] < row[\"speed_mph\"] < row[\"p80_speed_mph\"]):\n", - " return \"speed average\"\n", - " elif (row[\"speed_mph\"] >= row[\"p80_speed_mph\"]):\n", - " return \"speed high\"\n", - " elif (row[\"speed_mph\"] == 0):\n", - " return \"speed is 0\"\n", + "def categorize_by_percentile(\n", + " df: pd.DataFrame, column_percentile: str, column_str: str\n", + ") -> dd.DataFrame:\n", + " \n", + " # Find percentiles\n", + " agg1 = (\n", + " df.groupby([\"shape_array_key\", \"stop_sequence\"])[column_percentile]\n", + " .describe(percentiles=[0.05, 0.95])\n", + " .reset_index()\n", + " .add_prefix(column_str)\n", + " )\n", + " \n", + " \n", + " # Convert to dask because it takes a very long time\n", + " agg1_dask = dd.from_pandas(agg1, npartitions=1)\n", + " df_dask = dd.from_pandas(df, npartitions=1)\n", + "\n", + " # Merge using dask\n", + " merge1_dask = dd.merge(\n", + " df_dask,\n", + " agg1_dask,\n", + " how=\"inner\",\n", + " left_on=[\"shape_array_key\", \"stop_sequence\"],\n", + " right_on=[\n", + " f\"{column_str}shape_array_key\",\n", + " f\"{column_str}stop_sequence\",\n", + " ],\n", + " )\n", + "\n", + " def 
percentile(row):\n", + "\n", + " if row[column_percentile] == row[f\"{column_str}mean\"]:\n", + " return f\"{column_str} elapsed avg\"\n", + " elif row[column_percentile] <= row[f\"{column_str}5%\"]:\n", + " return f\"{column_str} elapsed low\"\n", + " elif row[column_percentile] == 0:\n", + " return f\"{column_str} elapsed is 0\"\n", + " elif row[f\"{column_str}5%\"] < row[column_percentile] <= row[f\"{column_str}95%\"]:\n", + " return f\"{column_str} elapsed avg\"\n", + " elif row[column_percentile] > row[f\"{column_str}95%\"]:\n", + " return f\"{column_str} elapsed high\"\n", + "\n", " else:\n", " return \"other\"\n", - " \"\"\"" + "\n", + " merge1_dask[f\"{column_str}cat\"] = merge1_dask.apply(\n", + " lambda x: percentile(x), axis=1, meta=(f\"{column_str}cat\", \"string\")\n", + " )\n", + " \n", + " # Filter for only unsually high and low stuff\n", + " merge1_dask = merge1_dask[merge1_dask[f\"{column_str}cat\"].isin([f\"{column_str} elapsed high\", f\"{column_str} elapsed low\"]).reset_index(drop = True)\n", + " \n", + " # Clean\n", + " merge1_dask[f\"{column_str}cat\"] = merge1_dask[f\"{column_str}cat\"].str.replace(\n", + " \"_\", \"\"\n", + " )\n", + "\n", + " columns_to_keep = [\n", + " \"shape_array_key\",\n", + " \"gtfs_dataset_key\",\n", + " \"_gtfs_dataset_name\",\n", + " \"speed_mph\",\n", + " \"loop_or_inlining\",\n", + " \"stop_sequence\",\n", + " \"stop_id\",\n", + " \"trip_id\",\n", + " \"n_trips\",\n", + " \"p20_mph\",\n", + " \"p80_mph\",\n", + " \"p50_mph\",\n", + " \"time_of_day\",\n", + " \"meters_elapsed\",\n", + " \"sec_elapsed\",\n", + " f\"{column_str}5%\",\n", + " f\"{column_str}95%\",\n", + " f\"{column_str}cat\",\n", + " ]\n", + " merge1_dask = merge1_dask[columns_to_keep]\n", + " print(f\"Done with {column_str}\")\n", + " return merge1_dask" ] }, { "cell_type": "code", "execution_count": null, - "id": "e784cd39-b73e-478a-8b3b-458506c31a18", + "id": "d80535fb-8648-4216-918f-76e0484ba3ea", "metadata": {}, "outputs": [], "source": [ - 
"def flag(row):\n", - "\n", - " # Ok rows\n", - " # If distance and time are average, flag as average\n", - " if (row[\"meters_cat\"] == \"meters_ elapsed avg\") & (\n", - " row[\"seconds_cat\"] == \"seconds_ elapsed avg\"\n", - " ):\n", - " return \"ok\"\n", - " # If MPH is average, flag as average\n", - " elif row[\"speed_flags\"] == \"speed_ elapsed avg\":\n", + "def flag_round1(row):\n", + " if (row[\"meters_elapsed\"] == 0) & (row[\"sec_elapsed\"] == 0):\n", + " return \"division by 0\"\n", + " elif row[\"meters_cat\"] == \"meters elapsed low\":\n", + " return \"meters too low\"\n", + " elif row[\"seconds_cat\"] == \"seconds elapsed high\":\n", + " return \"seconds too high\"\n", + " else:\n", " return \"ok\"\n", + " \n", + "#def flag_round2(row):\n", + "# if (row[\"meters_elapsed\"] == 0) & (row[\"sec_elapsed\"] == 0):\n", + "# return \"division by 0\"\n", + "# else:\n", + "# return \"meters/seconds are filled but flagged\"\n", "\n", - " # Zero rows\n", - " elif (\n", - " (row[\"speed_mph\"] == 0)\n", - " | (row[\"sec_elapsed\"] == 0)\n", - " | (row[\"meters_elapsed\"] == 0)\n", - " ):\n", - " return \"low\"\n", + "def categorize_meters_speeds_dask(df):\n", + " start = datetime.datetime.now()\n", + " print(f\"Begin: {start}\")\n", "\n", - " # If meters and seconds are high, flag as average\n", - " # elif ((row[\"meters_cat\"] == \"meters_ elapsed high\") & (row[\"seconds_cat\"] == \"seconds_ elapsed high\")):\n", - " # return \"ok\"\n", - " # If meters and seconds are low, flag as average\n", - " # elif ((row[\"meters_cat\"] == \"meters_ elapsed low\") & (row[\"seconds_cat\"] == \"seconds_ elapsed low\")):\n", - " # return \"ok\"\n", + " # Find percentiles\n", + " df.speed_mph = df.speed_mph.fillna(0)\n", "\n", - " # Tag as high\n", - " # elif ((row[\"meters_cat\"] != \"meters_ elapsed avg\") & (row[\"seconds_cat\"] != \"seconds_ elapsed avg\") & (row[\"speed_flags\"] == \"speed_ elapsed high\")):\n", - " # return \"high\"\n", - " # elif 
((row[\"seconds_cat\"] == \"seconds_ elapsed low\") & (row[\"speed_flags\"] == \"speed_ elapsed high\")):\n", - " # return \"high\"\n", - " # elif ((row[\"meters_cat\"] == \"meters_ elapsed high\") & (row[\"speed_flags\"] == \"speed_ elapsed high\")):\n", - " # return \"high\"\n", + " # These are now dask dataframes\n", + " ddf_meters = categorize_by_percentile(df, \"meters_elapsed\", \"meters_\")\n", + " ddf_seconds = categorize_by_percentile(df, \"sec_elapsed\", \"seconds_\")\n", "\n", - " # Tag as low\n", - " elif (\n", - " (row[\"meters_cat\"] != \"meters_ elapsed avg\")\n", - " & (row[\"seconds_cat\"] != \"seconds_ elapsed avg\")\n", - " & (row[\"speed_flags\"] == \"speed_ elapsed low\")\n", - " ):\n", - " return \"low\"\n", - " elif (row[\"seconds_cat\"] == \"seconds_ elapsed high\") & (\n", - " row[\"speed_flags\"] == \"speed_ elapsed low\"\n", - " ):\n", - " return \"low\"\n", - " elif (row[\"meters_cat\"] == \"meters_ elapsed avg\") & (\n", - " row[\"speed_flags\"] == \"speed_ elapsed low\"\n", - " ):\n", - " return \"high\"\n", + " merge_cols = [\n", + " \"shape_array_key\",\n", + " \"gtfs_dataset_key\",\n", + " \"_gtfs_dataset_name\",\n", + " \"speed_mph\",\n", + " \"loop_or_inlining\",\n", + " \"stop_sequence\",\n", + " \"stop_id\",\n", + " \"n_trips\",\n", + " \"p20_mph\",\n", + " \"p80_mph\",\n", + " \"p50_mph\",\n", + " \"meters_elapsed\",\n", + " \"sec_elapsed\",\n", + " \"trip_id\",\n", + " \"time_of_day\",\n", + " ]\n", "\n", - " else:\n", - " return \"other\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05d629ed-abfd-477e-b87b-7e321a6d0831", - "metadata": {}, - "outputs": [], - "source": [ - "def speed_categories(row):\n", - " \"\"\"\n", - " Stricter thresholds for speed categories.\n", - " Just because a speed is below the 25th or\n", - " above the 75th percentile doesn't mean it\n", - " should be flagged. 
Take into account how far away\n", - " it is from that.\n", - " \"\"\"\n", - " # lower_end = (row[\"speed_mean\"] - row[\"speed_std\"])\n", - " # higher_end = (row[\"speed_mean\"] + row[\"speed_std\"])\n", - " if row[\"speed_mph\"] == row[\"avg_speed_mph\"]:\n", - " return \"average\"\n", - " elif row[\"speed_mph\"] <= lower_end:\n", - " return \"speed low\"\n", - " elif row[\"speed_mph\"] >= higher_end:\n", - " return \"speed high\"\n", - " elif (row[\"speed_mph\"] == 0) | (row[\"speed_mph\"] == None):\n", - " return \"speed is 0\"\n", - " else:\n", - " return \"average\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "86a5c48c-6720-41a8-b626-94305506ab3a", - "metadata": {}, - "outputs": [], - "source": [ - "# Determine if an agency has a small, medium, or large fleet size.\n", - "def categorize_by_percentile(df, column_percentile: str):\n", + " # Merge using dask\n", + " m1 = dd.merge(ddf_meters, ddf_seconds, how=\"inner\", on=merge_cols)\n", "\n", - " # Get percentiles in objects for total vehicle.\n", - " p75 = df[column_percentile].quantile(0.75).astype(float)\n", - " p25 = df[column_percentile].quantile(0.25).astype(float)\n", - " p50 = df[column_percentile].quantile(0.50).astype(float)\n", + " # Apply flags\n", + " m1[\"flag\"] = m1.apply(lambda x: flag_round1(x), axis=1, meta=(\"flag\", \"string\"))\n", + " print(\"Apply first round of flags\")\n", "\n", - " def percentile(row):\n", - " if row[column_percentile] <= p25:\n", - " return f\"{column_percentile}: low\"\n", - " elif (p25 < row[column_percentile]) and (row[column_percentile] <= p75):\n", - " return f\"{column_percentile}: average\"\n", - " elif row[column_percentile] > p75:\n", - " return f\"{column_percentile}: high\"\n", - " else:\n", - " return \"other\"\n", + " # Filter out for projects that are ok, retag for zeroes\n", + " m2 = m1[m1.flag != \"ok\"].reset_index()\n", "\n", - " df[f\"{column_percentile}_cat\"] = df.apply(lambda x: percentile(x), axis=1)\n", + " # Apply 
flag for zeroes\n", + " m2[\"flag_division_0\"] = m2.apply(\n", + " lambda x: flag_round2(x), axis=1, meta=(\"flag\", \"string\")\n", + " )\n", + " print(\"Apply second round of flags\")\n", "\n", - " return df\n", + " # Replace values in the original flag\n", + " # https://stackoverflow.com/questions/54302694/updating-the-values-of-a-column-in-a-dask-dataframe-based-on-some-condition-on-s\n", + " condition = m2.flag_division_0 == \"division by 0\"\n", + " m2[\"flag\"] = m2[\"flag\"].mask(condition, m2.flag_division_0)\n", + " print(\"Done flagging\")\n", "\n", + " # Print value counts\n", + " # print(f\"breakout of rows after separating out for 0: \\n {m2.flag.value_counts().compute()}\")\n", "\n", - "def categorize_all(df):\n", + " # Filter for only projects that are divided by 0\n", + " # m2 = m2[m2.flag == \"division by 0\"].reset_index()\n", + " # Delete older column\n", + " m2 = m2.drop(columns=[\"flag_division_0\", \"level_0\", \"index\"])\n", + " print(\"Drop columns\")\n", "\n", - " # Hold results\n", - " final = pd.DataFrame()\n", + " # Save\n", + " # m2 = m2.repartition(partition_size=\"5MB\")\n", + " # m2.to_parquet(f\"{speed_utils.GCS_PATH}partitioned_flags\", overwrite = True)\n", + " print(\"Saved\")\n", "\n", - " for column in [\"meters_elapsed\", \"sec_elapsed\"]:\n", - " for shape_array_key in df.shape_array_key.tolist():\n", - " for stop in df.stop_sequence.tolist():\n", - " filtered = df[\n", - " (df.shape_array_key == shape_array_key) & (df.stop_sequence == stop)\n", - " ].reset_index()\n", - " categorized = categorize_by_percentile(filtered, column)\n", - " final = pd.concat([final, categorized], axis=0)\n", - " print(f\"done for {column}/{stop}\")\n", + " end = datetime.datetime.now()\n", + " print(f\"Finish: {end-start}\")\n", "\n", - " return final" + " return m2" ] }, { "cell_type": "code", "execution_count": null, - "id": "1bb82075-6269-472e-8582-27a7640f0aa5", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": 
"code", - "execution_count": null, - "id": "4f25a17e-cea1-4bff-888d-d8f37dc31775", + "id": "34840f61-1857-4f5c-b3a6-abcac18af0db", "metadata": {}, "outputs": [], "source": [ - "\"\"\"\n", - "p25 = troubleshoot.total_stops.quantile(0.25).astype(float)\n", - "p50 = troubleshoot.total_stops.quantile(0.50).astype(float)\n", - "p75 = troubleshoot.total_stops.quantile(0.75).astype(float)\n", - "p95 = troubleshoot.total_stops.quantile(0.95).astype(float)\n", - "p99 = troubleshoot.total_stops.quantile(0.99).astype(float)\n", - "\"\"\"" + "trips = list(equal_sampling.trip_id.unique())" ] }, { "cell_type": "code", "execution_count": null, - "id": "0e676189-df19-4630-8dc6-3c4fc8c60e16", + "id": "759ac80e-9c98-4c94-add3-859ab998e038", "metadata": {}, "outputs": [], "source": [ - "def stop_categories1(row):\n", - " if (row.total_stops > 0) and (row.total_stops <= p25):\n", - " return \"25th <= 17 stops\"\n", - " elif (row.total_stops > p25) and (row.total_stops <= p75):\n", - " return \"50th <= 30 stops\"\n", - " elif (row.total_stops > p75) and (row.total_stops <= p95):\n", - " return \"75th <= 50 stops\"\n", - " elif (row.total_stops > p95) and (row.total_stops <= p99):\n", - " return \"95th <= 85 stops\"\n", - " elif row.total_stops >= p95:\n", - " return \"99th >= 203 stops\"\n", - " else:\n", - " return \"other\"" + "stops = list(equal_sampling.stop_id.unique())" ] }, { "cell_type": "code", "execution_count": null, - "id": "af72c1c9-df07-4e7f-973f-f0c07223b7a4", + "id": "94620388-876e-4e6d-80ab-ac68eb1061c9", "metadata": {}, "outputs": [], "source": [ - "def create_jitter_plot(df):\n", - "\n", - " title_op = df[\"Gtfs Dataset Name\"].iloc[0].replace(\"VehiclePositions\", \"\").strip()\n", - " inline = df[\"Loop Or Inlining\"].iloc[0]\n", - "\n", - " chart1 = (\n", - " alt.Chart(df, width=0.5)\n", - " .mark_circle(size=100)\n", - " .encode(\n", - " x=alt.X(\n", - " \"jitter:Q\",\n", - " title=None,\n", - " axis=alt.Axis(values=[0], ticks=False, grid=False, 
labels=False),\n", - " scale=alt.Scale(),\n", - " ),\n", - " y=alt.Y(\"Rounded Speed:Q\", axis=alt.Axis(labelAngle=360)),\n", - " color=alt.Color(\n", - " \"Variable:N\",\n", - " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n", - " ),\n", - " tooltip=df.columns.tolist(),\n", - " column=alt.Column(\n", - " \"Stop Sequence:N\",\n", - " header=alt.Header(\n", - " labelAngle=360,\n", - " titleOrient=\"top\",\n", - " labelOrient=\"bottom\",\n", - " labelAlign=\"right\",\n", - " labelPadding=2,\n", - " ),\n", - " ),\n", - " )\n", - " .transform_calculate(\n", - " # Generate Gaussian jitter with a Box-Muller transform\n", - " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", - " )\n", - " .properties(title=f\"{title_op} - Route Type {inline}\")\n", - " )\n", - "\n", - " chart1 = threshold_utils.chart_size(chart1, 40, 250)\n", - "\n", - " return chart1" + "# Plot some of the trips\n", + "sample_data = few_routes_cat[few_routes_cat.trip_id.isin(trips)].reset_index()" ] }, { "cell_type": "code", "execution_count": null, - "id": "18ebd177-efc1-4bdc-9fac-bf947db810a4", + "id": "417804f1-9e24-4ac6-9c96-95f4da8f9693", "metadata": {}, "outputs": [], - "source": [ - "chart2 = (\n", - " alt.Chart(anaheim_test, width=0.5)\n", - " .mark_circle(size=100)\n", - " .encode(\n", - " x=alt.X(\n", - " \"jitter:Q\",\n", - " title=None,\n", - " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n", - " scale=alt.Scale(),\n", - " ),\n", - " y=alt.Y(\"rounded_speed:Q\", axis=alt.Axis(labelAngle=-90)),\n", - " color=alt.Color(\n", - " \"variable:N\",\n", - " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n", - " ),\n", - " tooltip=anaheim_test.columns.tolist(),\n", - " column=alt.Column(\n", - " \"stop_sequence:N\",\n", - " header=alt.Header(\n", - " labelAngle=360,\n", - " titleOrient=\"top\",\n", - " labelOrient=\"bottom\",\n", - " labelAlign=\"right\",\n", - " labelPadding=2,\n", - " ),\n", - " ),\n", - " )\n", - " .transform_calculate(\n", - " # 
Generate Gaussian jitter with a Box-Muller transform\n", - " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", - " )\n", - " .configure_facet(spacing=0)\n", - " .configure_view(stroke=None)\n", - " .properties(title=\"Trip Duration by RT Category\")\n", - ")" + "source": [ + "sample_data.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "a7287305-20d8-434f-9756-42b18cf172a4", + "id": "339a67c7-3634-43d4-b865-b10119507c44", "metadata": {}, "outputs": [], "source": [ - "chart2 = threshold_utils.chart_size(chart2, 80, 300)" + "# sample_data2 = sample_data[['shape_array_key','gtfs_dataset_key','trip_id']]" ] }, { "cell_type": "code", "execution_count": null, - "id": "3f9e9258-b954-45bf-bd7a-17822c9c607a", + "id": "39231b51-f0ab-4bbe-80d0-8a2081849e50", "metadata": {}, "outputs": [], "source": [ - "chart2" + "plotting = sample_data.melt(\n", + " id_vars=[\n", + " \"_gtfs_dataset_name\",\n", + " \"shape_array_key\",\n", + " \"trip_id\",\n", + " \"stop_sequence\",\n", + " \"gtfs_dataset_key\",\n", + " \"loop_or_inlining\",\n", + " \"n_trips\",\n", + " \"meters_elapsed\",\n", + " \"meters_cat\",\n", + " \"seconds_cat\",\n", + " \"sec_elapsed\",\n", + " \"flag\",\n", + " \"p20_speed_mph\",\n", + " \"p80_speed_mph\",\n", + " \"median_speed_mph\",\n", + " ],\n", + " value_vars=[\"speed_mph\"],\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "ea1d8514-ae9c-42e6-b59a-49a44fecbc98", + "id": "959bbe35-8c21-4795-b418-745a0caa94e8", "metadata": {}, "outputs": [], "source": [ - "chart1 = (\n", - " alt.Chart(anaheim_test_speedmph, width=0.5)\n", - " .mark_circle(size=100)\n", - " .encode(\n", - " x=alt.X(\n", - " \"jitter:Q\",\n", - " title=None,\n", - " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n", - " scale=alt.Scale(),\n", - " ),\n", - " y=alt.Y(\"rounded_speed:Q\", axis=alt.Axis(labelAngle=-90)),\n", - " color=alt.Color(\n", - " \"stop_sequence:N\",\n", - " 
scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n", - " ),\n", - " tooltip=anaheim_test.columns.tolist(),\n", - " column=alt.Column(\n", - " \"stop_sequence:N\",\n", - " header=alt.Header(\n", - " labelAngle=360,\n", - " titleOrient=\"top\",\n", - " labelOrient=\"bottom\",\n", - " labelAlign=\"right\",\n", - " labelPadding=2,\n", - " ),\n", - " ),\n", - " )\n", - " .transform_calculate(\n", - " # Generate Gaussian jitter with a Box-Muller transform\n", - " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", - " )\n", - " .configure_facet(spacing=0)\n", - " .configure_view(stroke=None)\n", - " .properties(title=f\"Loop/Inling:{anaheim_test_speedmph.loop_or_inlining.iloc[0]}\")\n", - ")" + "# Clean\n", + "plotting = threshold_utils.pre_clean(plotting)" ] }, { "cell_type": "code", "execution_count": null, - "id": "4e1eea88-640b-4e96-b2d5-07cd2f19afdf", + "id": "dd343cdd-acf3-4433-a702-8c3eb53ba1f0", "metadata": {}, "outputs": [], "source": [ - "chart1 = threshold_utils.chart_size(chart1, 80, 300)" + "plotting[\"Dropdown Menu\"] = plotting[\"Gtfs Dataset Name\"] + \" \" + plotting[\"Trip Id\"]" ] }, { "cell_type": "code", "execution_count": null, - "id": "c54559c6-8fb0-4109-8e7d-84c1ecb6497e", + "id": "072f39d9-4bf9-4efc-a393-24f096cecf7e", "metadata": {}, "outputs": [], "source": [ - "chart1" + "def alt_dropdown(df, col_for_dropdown: str, dropdown_menu_title: str):\n", + " # Create dropdown menu\n", + " # Exclude \"none\" operators which are only scheduled data\n", + " df = df.loc[df[col_for_dropdown] != \"None\"][[col_for_dropdown]]\n", + " dropdown_list = df[col_for_dropdown].unique().tolist()\n", + "\n", + " # Show only first operator by default\n", + " initialize_first_op = sorted(dropdown_list)[0]\n", + " input_dropdown = alt.binding_select(\n", + " options=sorted(dropdown_list), name=dropdown_menu_title\n", + " )\n", + "\n", + " selection = alt.selection_single(\n", + " name=dropdown_menu_title,\n", + " fields=[col_for_dropdown],\n", + " 
bind=input_dropdown,\n", + " init={col_for_dropdown: initialize_first_op},\n", + " )\n", + "\n", + " return selection" ] }, { "cell_type": "code", "execution_count": null, - "id": "79375476-00a1-4b85-a532-26889fc465e4", + "id": "6b919e59-03e8-426f-b9b3-4468d5b1b06b", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "selection_test = alt_dropdown(plotting, \"Dropdown Menu\", \"Route\")" + ] }, { "cell_type": "code", "execution_count": null, - "id": "0e5c052e-cccd-4f75-be65-9658cf7616b1", + "id": "d7b9d14f-b708-4426-a1b8-4afb4e7c95ba", "metadata": {}, "outputs": [], "source": [ - "def create_dot_plot2(\n", - " df,\n", - " col_for_dots: str,\n", - " x_axis_col: str,\n", - " y_axis_col: str,\n", - " tooltip_cols: list,\n", - " chart_title: str,\n", - "):\n", - "\n", - " chart = (\n", - " alt.Chart(df)\n", - " .mark_circle(opacity=1, size=100)\n", - " .transform_window(id=\"rank()\", groupby=[col_for_dots])\n", + "(\n", + " threshold_utils.chart_size(\n", + " alt.Chart(plotting)\n", + " .mark_tick(\n", + " size=15,\n", + " thickness=5,\n", + " )\n", " .encode(\n", - " alt.X(\n", - " f\"{x_axis_col}:O\",\n", - " sort=\"descending\",\n", - " axis=alt.Axis(ticks=False, grid=True),\n", - " ),\n", - " alt.Y(f\"{y_axis_col}:N\"),\n", + " x=\"Stop Sequence:N\",\n", + " y=\"Value:Q\",\n", " color=alt.Color(\n", - " f\"{col_for_dots}:N\",\n", - " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n", - " legend=None,\n", + " \"Flag:N\", scale=alt.Scale(range=cp.CALITP_CATEGORY_BOLD_COLORS)\n", " ),\n", - " tooltip=tooltip_cols,\n", + " tooltip=plotting.columns.tolist(),\n", " )\n", - " .properties(title=chart_title)\n", + " .interactive(),\n", + " 1100,\n", + " 400,\n", " )\n", - "\n", - " return chart" + " .add_selection(selection_test)\n", + " .transform_filter(selection_test)\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "e38644ab-7fb8-47b1-b692-512b55f05625", + "id": "7ec90906-3364-440c-a951-eb5f2b1e84c2", "metadata": {}, 
"outputs": [], "source": [ - "chart3 = create_dot_plot1(\n", - " anaheim_test_other,\n", - " \"variable\",\n", - " \"stop_sequence\",\n", - " \"rounded_speed\",\n", - " anaheim_test_other.columns.tolist(),\n", - " \"Percentile/Average\",\n", - ")" + "stop" ] }, { "cell_type": "code", "execution_count": null, - "id": "2b6df39b-b074-494d-8f5e-65fe05356f4b", + "id": "140112e0-8ee0-481d-848c-4db091460418", "metadata": {}, "outputs": [], "source": [ - "chart3 = threshold_utils.chart_size(chart3, 650, 300)" + "type(vehicle_positions)" ] }, { "cell_type": "code", "execution_count": null, - "id": "c2c565de-3c55-4827-a5bb-b48c6d49d4c1", + "id": "d843325a-f33c-40de-a3cb-befed24d645e", "metadata": {}, "outputs": [], "source": [ - "chart4 = create_dot_plot2(\n", - " anaheim_test_speedmph,\n", - " \"variable\",\n", - " \"stop_sequence\",\n", - " \"rounded_speed\",\n", - " anaheim_test_speedmph.columns.tolist(),\n", - " \"Speed per Trip\",\n", - ")" + "vehicle_positions2 = vehicle_positions[\n", + " vehicle_positions.trip_id.isin(trips)\n", + "].reset_index()" ] }, { "cell_type": "code", "execution_count": null, - "id": "29b3b7b1-8f44-4a8d-97c3-ff934250e874", + "id": "8840809b-dd6f-4c0e-a68b-0a37f508df14", "metadata": {}, "outputs": [], "source": [ - "chart4 = threshold_utils.chart_size(chart4, 650, 300)" + "vehicle_positions2.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "54146c88-15df-49e2-9de6-18f2666d9f9f", + "id": "896b5c73-0835-4ee7-a4f2-9d960778fe35", "metadata": {}, "outputs": [], "source": [ - "chart4" + "gdf1 = pd.merge(\n", + " vehicle_positions2,\n", + " sample_data,\n", + " how=\"inner\",\n", + " on=[\"gtfs_dataset_key\", \"_gtfs_dataset_name\", \"trip_id\"],\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "3ccbc087-f799-4429-b6c7-a9c1f38f021d", + "id": "3725d07d-7b9f-483b-aa56-8d428b9f3d11", "metadata": {}, "outputs": [], "source": [ - "chart3 + chart4" + "gdf1.shape" ] }, { "cell_type": "code", "execution_count": null, 
- "id": "9829d749-580c-4440-9397-4562192417b7", + "id": "c82d7834-567b-4de4-b465-aea8c1a62715", "metadata": {}, "outputs": [], "source": [ - "chart7 = (\n", - " alt.Chart(anaheim_test_other, width=0.5)\n", - " .mark_circle(size=100)\n", - " .encode(\n", - " x=alt.X(\n", - " \"jitter:Q\",\n", - " title=None,\n", - " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n", - " scale=alt.Scale(),\n", - " ),\n", - " y=alt.Y(\"rounded_speed:Q\", axis=alt.Axis(labelAngle=-90)),\n", - " color=alt.Color(\n", - " \"variable:N\",\n", - " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n", - " ),\n", - " tooltip=anaheim_test.columns.tolist(),\n", - " column=alt.Column(\n", - " \"stop_sequence:N\",\n", - " header=alt.Header(\n", - " labelAngle=-90,\n", - " titleOrient=\"top\",\n", - " labelOrient=\"bottom\",\n", - " labelAlign=\"right\",\n", - " labelPadding=2,\n", - " ),\n", - " ),\n", - " )\n", - " .transform_calculate(\n", - " # Generate Gaussian jitter with a Box-Muller transform\n", - " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", - " )\n", - " .configure_facet(spacing=0)\n", - " .configure_view(stroke=None)\n", - " .properties(title=\"Trip Duration by RT Category\")\n", - ")" + "gdf1 = gdf1[gdf1.stop_id.isin(stops)]" ] }, { "cell_type": "code", "execution_count": null, - "id": "454980fc-e6ab-4d53-b844-120a075b8034", + "id": "a86cec8e-e7fd-48ce-9692-c766df2b68e8", "metadata": {}, "outputs": [], "source": [ - "chart7 = threshold_utils.chart_size(chart7, 80, 300)" + "gdf1.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3bf3c599-2aec-4ae7-b8c8-53a4eff8795a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "gdf1[\n", + " [\n", + " \"geometry\",\n", + " \"stop_id\",\n", + " \"stop_sequence\",\n", + " \"_gtfs_dataset_name\",\n", + " \"shape_array_key\",\n", + " \"speed_mph\",\n", + " \"flag\",\n", + " ]\n", + "].explore(\"flag\")" ] }, { "cell_type": "code", "execution_count": null, - "id": 
"58e57fb2-1c3b-47e1-8bb6-d4941f0ee985", + "id": "f6d7831f-aed2-4e87-aae1-8ab6ddc08666", "metadata": {}, "outputs": [], "source": [ - "chart8 = (\n", - " alt.Chart(anaheim_test_other, width=0.5)\n", - " .mark_circle(size=100)\n", - " .encode(\n", - " x=alt.X(\n", - " \"jitter:Q\",\n", - " title=None,\n", - " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n", - " scale=alt.Scale(),\n", - " ),\n", - " y=alt.Y(\"rounded_speed:Q\", axis=alt.Axis(labelAngle=-90)),\n", - " color=alt.Color(\n", - " \"variable:N\",\n", - " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n", - " ),\n", - " tooltip=anaheim_test.columns.tolist(),\n", - " column=alt.Column(\n", - " \"stop_sequence:N\",\n", - " header=alt.Header(\n", - " labelAngle=-90,\n", - " titleOrient=\"top\",\n", - " labelOrient=\"bottom\",\n", - " labelAlign=\"right\",\n", - " labelPadding=2,\n", - " ),\n", - " ),\n", - " )\n", - " .transform_calculate(\n", - " # Generate Gaussian jitter with a Box-Muller transform\n", - " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", - " )\n", - " .properties(title=\"Trip Duration by RT Category\")\n", - ")" + "stop" ] }, { "cell_type": "code", "execution_count": null, - "id": "c2ce0879-f45f-4c66-9b1a-870a243a1260", + "id": "aae6b6c0-ce47-4ca1-b104-68b39ebcf2ca", "metadata": {}, "outputs": [], "source": [ - "chart9 = (\n", - " alt.Chart(anaheim_test_speedmph, width=0.5)\n", - " .mark_circle(size=100)\n", - " .encode(\n", - " x=alt.X(\n", - " \"jitter:Q\",\n", - " title=None,\n", - " axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),\n", - " scale=alt.Scale(),\n", - " ),\n", - " y=alt.Y(\"rounded_speed:Q\", axis=alt.Axis(labelAngle=-90)),\n", - " color=alt.Color(\n", - " \"variable:N\",\n", - " scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n", - " ),\n", - " tooltip=anaheim_test.columns.tolist(),\n", - " column=alt.Column(\n", - " \"stop_sequence:N\",\n", - " header=alt.Header(\n", - " labelAngle=360,\n", - " 
titleOrient=\"top\",\n", - " labelOrient=\"bottom\",\n", - " labelAlign=\"right\",\n", - " labelPadding=2,\n", - " ),\n", - " ),\n", - " )\n", - " .transform_calculate(\n", - " # Generate Gaussian jitter with a Box-Muller transform\n", - " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", - " )\n", - " .properties(title=f\"Loop/Inling:{anaheim_test_speedmph.loop_or_inlining.iloc[0]}\")\n", + "high_low_zero2 = high_low_zero.melt(\n", + " id_vars=[\n", + " \"_gtfs_dataset_name\",\n", + " \"shape_array_key\",\n", + " \"trip_id\",\n", + " \"stop_sequence\",\n", + " \"gtfs_dataset_key\",\n", + " \"loop_or_inlining\",\n", + " \"n_trips\",\n", + " \"meters_cat\",\n", + " \"seconds_cat\",\n", + " \"unusual_flag\",\n", + " \"time_of_day\",\n", + " ],\n", + " value_vars=[\"median_speed_mph\", \"speed_mph\", \"p20_speed_mph\", \"p80_speed_mph\"],\n", ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "04e3ce9a-a5ab-4aeb-a7d0-4ead6738324b", + "id": "767abd20-d030-42d3-b85f-6d3023d69b8a", "metadata": {}, "outputs": [], "source": [ - "chart8" + "high_low_zero2 = high_low_zero2.drop_duplicates(\n", + " subset=[\n", + " \"loop_or_inlining\",\n", + " \"shape_array_key\",\n", + " \"stop_sequence\",\n", + " \"time_of_day\",\n", + " \"variable\",\n", + " \"value\",\n", + " ]\n", + ").reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": null, - "id": "04d2026e-c040-45ea-aab6-2c09c2211632", + "id": "52e7ee5a-a40e-423e-ba4b-dea14de17982", "metadata": {}, "outputs": [], "source": [ - "chart9 | chart8" + "high_low_zero2.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "edb11b29-7038-4a5d-9e44-4d3b88e6cc88", + "id": "23e7f746-b1b5-402f-92fd-dbc74840e013", "metadata": {}, "outputs": [], "source": [ - "# pip install altair==5.0.0rc3\n", - "chart5 = (\n", - " alt.Chart(anaheim_test_speedmph, title=\"Normally distributed jitter\")\n", - " .mark_circle(size=50)\n", - " .encode(\n", - " y=\"rounded_speed:Q\",\n", - " 
x=\"stop_sequence:N\",\n", - " yOffset=\"jitter:Q\",\n", - " color=alt.Color(\"stop_sequence:Q\").legend(None),\n", - " )\n", - " .transform_calculate(\n", - " # Generate Gaussian jitter with a Box-Muller transform\n", - " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", - " )\n", - ")" + "merge1.shape_array_key.nunique(), high_low_zero.shape_array_key.nunique()" ] }, { "cell_type": "code", "execution_count": null, - "id": "9bbcc880-7866-495c-9a68-3b7ca6c00c4a", + "id": "8a25f067-956e-46a7-aa7a-5abf57e662f6", "metadata": {}, "outputs": [], "source": [ - "chart5 = threshold_utils.chart_size(chart5, 650, 300)" + "# Clean\n", + "high_low_zero2 = threshold_utils.pre_clean(high_low_zero2)" ] }, { "cell_type": "code", "execution_count": null, - "id": "a765f55e-73e2-4bae-9326-98ed1bbfadaf", + "id": "b5eea02c-05ec-4707-a06d-9de1864e8fbe", "metadata": {}, "outputs": [], "source": [ - "chart5" + "# Add dropdown menu\n", + "high_low_zero2[\"Dropdown Menu\"] = (\n", + " high_low_zero2[\"Gtfs Dataset Name\"] + \" \" + high_low_zero2[\"Shape Array Key\"]\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "f02f2c0b-f5fc-4452-b39b-bb1a443bc727", + "id": "571b471f-4a66-474f-8900-c3eaffde441e", "metadata": {}, "outputs": [], "source": [ - "# foothill_og = speed_stops2[speed_stops2.trip_id == \"t604-b2791-sl5\"]" + "high_low_zero2[\"Route Type\"] = \"Route Type: \" + high_low_zero2[\n", + " \"Loop Or Inlining\"\n", + "].astype(str)" ] }, { "cell_type": "code", "execution_count": null, - "id": "1136e4c7-2e5b-492d-8ae6-299a04164ac3", + "id": "b7b429be-057c-4692-927e-92107b015ae6", "metadata": {}, "outputs": [], "source": [ - "# foothill_og.stop_sequence.describe()" + "selection_test = alt_dropdown(high_low_zero2, \"Dropdown Menu\", \"Route\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "9cee6f4c-8786-478a-97ff-702be25d0788", + "id": "aa0fad8b-b49e-48be-8070-adaf6e63d541", "metadata": {}, "outputs": [], "source": [ - "# 
foothill_og.sort_values('stop_sequence').head()" + "# https://github.com/altair-viz/altair/issues/1168\n", + "title = (\n", + " alt.Chart(high_low_zero2)\n", + " .mark_text(dy=-40, size=15, fontWeight=\"normal\")\n", + " .encode(\n", + " text=\"Route Type:N\",\n", + " )\n", + " .add_selection(selection_test)\n", + " .transform_filter(selection_test)\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "97ec3ff5-bcda-4edd-9ba6-50821923dd98", + "id": "1042a774-165c-4f7e-bfc9-c4d4980bd29b", "metadata": {}, "outputs": [], "source": [ - "# foothill_renumbered_stop_seq = m2[m2.trip_id == \"t604-b2791-sl5\"]" + "\"\"\"total_stops_altair = (\n", + " alt.Chart(stop_info)\n", + " .mark_text(dy=-40, size=15, fontWeight=\"normal\")\n", + " .encode(\n", + " text=\"Percentage Of Unusual Stops:N\",\n", + " )\n", + " .add_selection(selection_test)\n", + " .transform_filter(selection_test)\n", + ")\"\"\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "f2733f91-b890-4818-a649-d79bd6f9a16a", + "id": "6067e93b-3519-45fc-b027-11cbcc82d80f", "metadata": {}, "outputs": [], "source": [ - "# foothill_renumbered_stop_seq['Test Stop Sequence'].describe()" + "main_chart = (\n", + " threshold_utils.chart_size(\n", + " alt.Chart(high_low_zero2)\n", + " .mark_tick(\n", + " size=15,\n", + " thickness=5,\n", + " )\n", + " .encode(\n", + " x=\"Stop Sequence:N\",\n", + " y=\"Value:Q\",\n", + " color=alt.Color(\n", + " \"Variable:N\", scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS)\n", + " ),\n", + " tooltip=high_low_zero2.columns.tolist(),\n", + " )\n", + " .interactive(),\n", + " 1100,\n", + " 400,\n", + " )\n", + " .add_selection(selection_test)\n", + " .transform_filter(selection_test)\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "c9b04672-2b10-4d06-b84a-1fd92a6a78ac", + "id": "da0d6aad-26c3-439b-93d2-ba5a3abac77d", "metadata": {}, "outputs": [], "source": [ - "# foothill_renumbered_stop_seq.sort_values('stop_sequence').head()" + 
"high_low_zero2.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "9581ead1-5d91-44a4-a9d3-299445d55056", + "id": "c77e2503-e211-48cc-a220-d96f82ab72df", "metadata": {}, "outputs": [], "source": [ - "# len(troubleshoot)" + "(title & total_stops_altair | main_chart)" ] }, { "cell_type": "code", "execution_count": null, - "id": "1f50eb55-9373-4004-b23b-8129db734c1b", + "id": "9f709e13-aa1e-44da-9027-dfafaead5dad", "metadata": {}, "outputs": [], "source": [ - "# Number of test stops should match stop sequence...\n", - "# troubleshoot['sequences_are_equal'] = troubleshoot['Test Stop Sequence'] - troubleshoot['stop_sequence']" + "high_low_zero.shape_array_key.unique()" ] }, { "cell_type": "code", "execution_count": null, - "id": "46843106-2e31-46c9-9220-2f12a3e6a4fb", + "id": "2a2bda8b-d48c-44f3-a928-08aca894c565", "metadata": {}, "outputs": [], "source": [ - "# troubleshoot['sequences_are_equal'].value_counts()" + "chart2 = threshold_utils.chart_size(chart2, 75, 200)" ] }, { "cell_type": "code", "execution_count": null, - "id": "34d189fa-632c-40d7-a793-e576645e879e", + "id": "dd20293d-7c43-42c4-b053-de945860b6f0", "metadata": {}, "outputs": [], "source": [ - "# Look at this trip id in the original df\n", - "# og_trip = speed_stops2[speed_stops2.trip_id == \"t640-b15FF1-sl5\"]" + "chart2 = chart2.add_selection(selection_test).transform_filter(selection_test)" ] }, { "cell_type": "code", "execution_count": null, - "id": "34b1e2f4-fe20-4029-9ec1-e5cbaaf73178", + "id": "44dc6896-e95d-42df-bbd5-f2bb2c2a2cc6", "metadata": {}, "outputs": [], "source": [ - "# Look at this trip id in the manipulated df\n", - "# new_trip = m2[m2.trip_id == \"t640-b15FF1-sl5\"]" + "title = threshold_utils.chart_size(title, 20, 20)" ] }, { "cell_type": "code", "execution_count": null, - "id": "81190a32-907b-4a5d-818a-ab5c2740dbc3", - "metadata": { - "tags": [] - }, + "id": "d651c8c6-179c-4157-a671-11006cb419df", + "metadata": {}, "outputs": [], "source": [ - "# og_trip.shape, 
og_trip.stop_sequence.nunique()" + "alt.data_transformers.enable(\"default\", max_rows=None)" ] }, { "cell_type": "code", "execution_count": null, - "id": "ae8d06ad-3016-4909-b766-c370ef074aae", + "id": "8e753711-8506-4939-8d9d-22566a641988", "metadata": {}, "outputs": [], "source": [ - "# new_trip.shape, new_trip.stop_sequence.nunique()" + "title & (chart1.interactive() & chart2.interactive())" ] } ], From 725193ba3e6bcc2bbf63f758a57c62c3bb71a981 Mon Sep 17 00:00:00 2001 From: amandaha8 Date: Thu, 22 Jun 2023 23:45:49 +0000 Subject: [PATCH 4/9] look at stage2 --- rt_segment_speeds/12_speeds.ipynb | 444 +++++++++++++++++++++--------- 1 file changed, 318 insertions(+), 126 deletions(-) diff --git a/rt_segment_speeds/12_speeds.ipynb b/rt_segment_speeds/12_speeds.ipynb index ac9036789..c02446de6 100644 --- a/rt_segment_speeds/12_speeds.ipynb +++ b/rt_segment_speeds/12_speeds.ipynb @@ -8,10 +8,6 @@ "outputs": [], "source": [ "import datetime\n", - "\n", - "import gcsfs\n", - "\n", - "fs = gcsfs.GCSFileSystem()\n", "import _speed_utils as speed_utils\n", "import _threshold_utils as threshold_utils\n", "import altair as alt\n", @@ -99,7 +95,7 @@ "metadata": {}, "outputs": [], "source": [ - "# m1 = merge_all_speeds(analysis_date)" + "m1 = merge_all_speeds(analysis_date)" ] }, { @@ -129,41 +125,6 @@ "]" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "7201d5e3-f765-4e5d-9bbd-aa6a336bcc0a", - "metadata": {}, - "outputs": [], - "source": [ - "subset = m1[m1.shape_array_key.isin(sample_0_keys)].reset_index()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6239a1c1-adc8-47b6-b6ee-b8b3f55d2257", - "metadata": {}, - "outputs": [], - "source": [ - "def rate_column(df: pd.DataFrame, column_percentile:str, column_str:str): \n", - " #Get percentiles in objects for total vehicle.\n", - " p5 = df[column_percentile].quantile(0.05).astype(float)\n", - " p95 = df[column_percentile].quantile(0.95).astype(float)\n", - " \n", - " #Function for 
fleet size\n", - " def rate(row):\n", - " if ((row[column_percentile] >= 0) and (row[column_percentile] <= p5)):\n", - " return f\"{column_str} is low\"\n", - " elif (row[column_percentile] >= p95):\n", - " return f\"{column_str} is high\"\n", - " else:\n", - " return \"Ok\"\n", - " df[f\"{column_str}cat\"] = df.apply(lambda x: rate(x), axis=1)\n", - " \n", - " return df " - ] - }, { "cell_type": "markdown", "id": "898e3546-5298-4c4f-87d0-ee1d1a10f07d", @@ -202,26 +163,6 @@ " # Clean\n", " df[f\"{column_str}cat\"] = df[f\"{column_str}cat\"].str.replace(\"_\", \"\")\n", "\n", - " columns_to_keep = [\n", - " \"shape_array_key\",\n", - " \"gtfs_dataset_key\",\n", - " \"_gtfs_dataset_name\",\n", - " \"speed_mph\",\n", - " \"loop_or_inlining\",\n", - " \"stop_sequence\",\n", - " \"stop_id\",\n", - " \"trip_id\",\n", - " \"n_trips\",\n", - " \"p20_mph\",\n", - " \"p80_mph\",\n", - " \"p50_mph\",\n", - " \"time_of_day\",\n", - " \"meters_elapsed\",\n", - " \"sec_elapsed\",\n", - " f\"{column_str}cat\",\n", - " ]\n", - "\n", - " # df = df[columns_to_keep]\n", " print(f\"Done with {column_str}\")\n", " \n", " return df " @@ -274,7 +215,7 @@ "metadata": {}, "outputs": [], "source": [ - "def categorize_meters_speeds_pandas(df):\n", + "def categorize_meters_speeds_pandas(df)-> pd.DataFrame:\n", " start = datetime.datetime.now()\n", " print(start)\n", " \n", @@ -287,9 +228,9 @@ " # Find size of categories\n", " print(df2.groupby(['sec_cat','meters_cat']).size())\n", "\n", - " # Filter out \n", + " # Filter out for only meters that are low or seconds that are high\n", " df2 = df2[(df2.meters_cat == 'meters is low') | (df2.sec_cat == 'sec is high')].reset_index(drop = True)\n", - " print(f\"{len(df2)} rows after filtering for rows with either high seconds OR low meters\") \n", + " print(f\"{len(df2)} rows left after filtering for rows with either high seconds OR low meters\") \n", " \n", " def flag_round(row):\n", " if (row[\"meters_elapsed\"] == 0) & (row[\"sec_elapsed\"] == 
0):\n", @@ -302,17 +243,36 @@ " return \"ok\"\n", " \n", " df2[\"flag\"] = df2.apply(lambda x: flag_round(x), axis=1)\n", - " print(m2.flag.value_counts())\n", + " print(df2.flag.value_counts())\n", " \n", " # Filter out for only division by 0 \n", " df3 = df2[(df2.flag == 'division by 0')].reset_index(drop = True)\n", - " print(f\"{len(df3)} rows after filtering for only division by 0 rows\") \n", " \n", " end = datetime.datetime.now()\n", " print(f\"Took {end-start}\")\n", " return df3" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "7201d5e3-f765-4e5d-9bbd-aa6a336bcc0a", + "metadata": {}, + "outputs": [], + "source": [ + "subset = m1[m1.shape_array_key.isin(sample_0_keys)].reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce0fbb35-f81e-4343-92d2-4382d2173dbd", + "metadata": {}, + "outputs": [], + "source": [ + "len(m1)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -320,7 +280,7 @@ "metadata": {}, "outputs": [], "source": [ - "m2 = categorize_meters_speeds_pandas(m1)" + "m2 = categorize_meters_speeds_pandas(subset)" ] }, { @@ -363,23 +323,14 @@ "m2.groupby([\"loop_or_inlining\"]).agg({\"shape_array_key\": \"nunique\"})" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "b9519846-59ed-40cb-9087-3f8229e771d1", - "metadata": {}, - "outputs": [], - "source": [ - "subset = m2[m2.shape_array_key.isin(sample_0_keys)].reset_index()" - ] - }, { "cell_type": "markdown", "id": "a399d982-e400-43fa-b13f-fecafaa27262", "metadata": {}, "source": [ "### Investigate \n", - "#### Stage3: \"vp_pared_stops\"" + "#### Stage3: \"vp_pared_stops\"\n", + "* Keeps only first and last point of a segment." 
] }, { @@ -389,7 +340,7 @@ "metadata": {}, "outputs": [], "source": [ - "def load_vp_stage3(flagged_df:pd.DataFrame, date:str):\n", + "def load_vp_stage3(flagged_df:pd.DataFrame, date:str) -> pd.DataFrame:\n", " \n", " # Subset the dataframe and use it to filter out for only the values of interest\n", " flagged_df = flagged_df[['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key']]\n", @@ -412,16 +363,6 @@ "vp2 = load_vp_stage3(subset, analysis_date)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "21799f42-873e-41bd-b764-42cc297686a6", - "metadata": {}, - "outputs": [], - "source": [ - "sort_cols = ['trip_id', 'stop_sequence','location_timestamp_local']" - ] - }, { "cell_type": "code", "execution_count": null, @@ -484,10 +425,11 @@ "metadata": {}, "outputs": [], "source": [ - "def stage3_repeated_timestamps(stage3_df:pd.DataFrame):\n", + "def stage3_repeated_timestamps(stage3_df:pd.DataFrame)-> pd.DataFrame:\n", " \"\"\"\n", " Look at how many times a time stamp is repeated a route-trip-location.\n", - " Each of these 3 combos should have something different.\n", + " Each of these 3 combos should have a different time for each \n", + " stop sequence or else the vehicle is not changing locations.\n", " \"\"\"\n", " agg = (stage3_df\n", " .groupby(['shape_array_key','trip_id', 'location_timestamp_local'])\n", @@ -512,7 +454,8 @@ "def stage3_repeated_locations(stage3_df:pd.DataFrame):\n", " \"\"\"\n", " Look at how many times a time stamp is repeated for a stop-trip-route combo.\n", - " Each combo should have a differnt location.\n", + " Each of these 3 combos should have a different location for each \n", + " stop sequence or else the vehicle is not changing locations.\n", " \"\"\"\n", " # Concat x and y into a string\n", " stage3_df['pair'] = stage3_df.x.astype(str) + '/' + vp2.y.astype(str)\n", @@ -527,31 +470,11 @@ " )\n", "\n", " # Only keep locations that are repeated more than once\n", - " # agg = agg[agg.number_of_repeated_locs 
!= 1].reset_index(drop = True)\n", + " agg = agg[agg.number_of_repeated_locs != 1].reset_index(drop = True)\n", " \n", " return agg" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "15a49a52-1178-4e65-a443-b35b89812d54", - "metadata": {}, - "outputs": [], - "source": [ - "vp2.pair.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f0663258-c7d5-495f-9191-0664b65970a8", - "metadata": {}, - "outputs": [], - "source": [ - "vp2.trip_id.nunique()" - ] - }, { "cell_type": "code", "execution_count": null, @@ -559,7 +482,12 @@ "metadata": {}, "outputs": [], "source": [ - "def flag_stage3(flagged_df:pd.DataFrame, date:str):\n", + "def flag_stage3(flagged_df:pd.DataFrame, date:str) -> pd.DataFrame:\n", + " \"\"\"\n", + " Flag the errors in stage3\n", + " \"\"\"\n", + " start = datetime.datetime.now()\n", + " print(start)\n", " \n", " # Relevant rows from Vehicle Positions\n", " vp = load_vp_stage3(flagged_df, date)\n", @@ -574,14 +502,35 @@ " timestamps_merge_cols = ['shape_array_key','trip_id','location_timestamp_local']\n", " loc_merge_cols = ['shape_array_key','trip_id','pair']\n", " \n", + " # Want everything found in vehicle positions, so do left merges\n", " m1 = (vp\n", " .merge(multi_timestamps, how=\"left\", on= timestamps_merge_cols)\n", " .merge(multi_locs, how=\"left\", on=loc_merge_cols)\n", " )\n", " \n", - " drop_cols = ['vp_idx','x','y','hour','activity_date']\n", + " drop_cols = ['vp_idx','x','y','hour','activity_date',]\n", " m1 = m1.drop(columns = drop_cols)\n", " \n", + " # Flag\n", + " def flag(row):\n", + " if (row[\"number_of_repeated_timestamps\"] > 1) & (row[\"number_of_repeated_locs\"] > 1):\n", + " return \"repeated timestamps & locations\"\n", + " elif (row[\"number_of_repeated_timestamps\"] > 1):\n", + " return \"repeated timestamps\"\n", + " elif (row[\"number_of_repeated_locs\"] > 1):\n", + " return \"repeated locations\"\n", + " else:\n", + " return \"check in stage 2\"\n", + " \n", + " 
m1[\"stage3_flag\"] = m1.apply(lambda x: flag(x), axis=1)\n", + " \n", + " print(m1.stage3_flag.value_counts())\n", + " \n", + " check_in_stage2 = m1[m1.stage3_flag == \"check in stage 2\"]\n", + " print(f\"Have to check {len(check_in_stage2)/len(m1) * 100} % of rows in stage 2\")\n", + " \n", + " end = datetime.datetime.now()\n", + " print(f\"Took {end-start}\")\n", " return m1" ] }, @@ -598,61 +547,304 @@ { "cell_type": "code", "execution_count": null, - "id": "9374b39f-f286-4b82-b18b-e596f602f6b0", + "id": "68a9dbba-ee6b-42b1-9203-1146d6cd56e9", + "metadata": {}, + "outputs": [], + "source": [ + "m3.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21799f42-873e-41bd-b764-42cc297686a6", + "metadata": {}, + "outputs": [], + "source": [ + "sort_cols = ['trip_id', 'shape_array_key', 'stop_sequence']" + ] + }, + { + "cell_type": "markdown", + "id": "4b1876cf-9e8b-4c30-8723-2226133b8e01", + "metadata": {}, + "source": [ + "### Stage2: \"vp_stop_segment\"\n", + "* Were the right points kept?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68fba15b-7c2d-4c1b-a556-286dc4acc4e1", + "metadata": {}, + "outputs": [], + "source": [ + "# Find rows that need to be tagged in stage 2\n", + "stage2_rows = m3[m3.stage3_flag == \"check in stage 2\"].reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f14e0ab1-59d2-42cc-884c-868da650cfa6", + "metadata": {}, + "outputs": [], + "source": [ + "stage2_routes = stage2_rows.shape_array_key.unique().tolist() " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddb1ae2a-d66a-45b7-9dd9-2b11f7d325b2", + "metadata": {}, + "outputs": [], + "source": [ + "# Use flagged df\n", + "stage2_rows = m2[m2.shape_array_key.isin(stage2_routes)].reset_index(drop = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf64a406-1124-423c-8577-705a21c9b422", + "metadata": {}, + "outputs": [], + "source": [ + "# Subset df to filter the vp \n", + "subset_for_merge = stage2_rows[['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key', 'loop_or_inlining', 'stop_id']].drop_duplicates().reset_index(drop = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6397dc45-c271-4057-a0d8-1962846d4f94", + "metadata": {}, + "outputs": [], + "source": [ + "# What's the diff between stop segments normal/special/and without any notation?\n", + "stg2 = gpd.read_parquet(f\"{speed_utils.GCS_PATH}stop_segments_{analysis_date}.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45440033-aae7-4f94-9495-5e14529b7c5c", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "stg2_m = pd.merge(stg2, subset_for_merge, how = \"inner\", on = ['gtfs_dataset_key', 'stop_sequence','shape_array_key', 'loop_or_inlining','stop_id'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe37945a-26d9-4891-831f-1bb92f85b39e", + "metadata": {}, + "outputs": [], + 
"source": [ + "stage2_rows.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2d79d69-bf4f-4b4f-9c9a-3d70547eeb63", + "metadata": {}, + "outputs": [], + "source": [ + "type(stg2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "198195f7-5c6d-4fda-afc4-4da8307426cb", + "metadata": {}, + "outputs": [], + "source": [ + "stg2.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fab30694-9948-4e82-b19b-0814a459c340", + "metadata": {}, + "outputs": [], + "source": [ + "# Delete out empty geo \n", + "filtered = stg2[~stg2.geometry_arrowized.is_empty]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d9238cc-99da-49a1-abf1-392e3bd5bcc7", + "metadata": {}, + "outputs": [], + "source": [ + "subset.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97403c1c-e39f-4179-a7bb-2fc8588d3249", + "metadata": {}, + "outputs": [], + "source": [ + "# This is the original dataframe loaded in from merging stage\n", + "original = pd.merge(filtered, \n", + " subset[['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key', 'loop_or_inlining', 'stop_id']],\n", + " how = \"inner\", \n", + " on = ['gtfs_dataset_key', 'stop_sequence','shape_array_key', 'loop_or_inlining','stop_id'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f81683e1-94a1-4436-a2fb-ee378acaacef", + "metadata": {}, + "outputs": [], + "source": [ + "original.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac442f11-ff84-4e3f-9f05-5f9ece3c0d51", + "metadata": {}, + "outputs": [], + "source": [ + "# filtered[filtered.shape_array_key == \"000624bd8453dbe4f2eb2765b04bcb98\"].set_geometry(\"geometry_arrowized\").explore('stop_sequence')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94481b9a-9a4a-4fa2-b7ff-63d1ef93e1dc", + "metadata": {}, + "outputs": [], + "source": [ + "stg2.shape" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "2965a631-1a2e-4551-bfcb-c89ba56ecf92", + "metadata": {}, + "outputs": [], + "source": [ + "stg2_m.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "145e129e-b119-44d9-940b-18ac70c93e0a", + "metadata": {}, + "outputs": [], + "source": [ + "for i in ['geometry','geometry_arrowized']:\n", + " print(f\"{i}: {stg2_m[i].is_valid.sum()/len(stg2_m)}\")\n", + " print(f\"{i}: {len(stg2_m[stg2_m[i].is_empty])}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a72b5744-ff22-44b0-834d-5b3718162661", + "metadata": {}, + "outputs": [], + "source": [ + "geo_arrowized = stg2_m[~stg2_m.geometry_arrowized.is_empty]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fad7b35-d1bf-4df2-99d7-b63b2381ba34", + "metadata": {}, + "outputs": [], + "source": [ + "len(geo_arrowized)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97b08e7f-4902-44a2-a904-d15800382ccd", + "metadata": {}, + "outputs": [], + "source": [ + "keep_tripid = ['1350', '1089339', '16939087', '1088383',]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "298f3ddb-0fde-4b90-b9b4-5ea9e45d461d", "metadata": {}, "outputs": [], "source": [ - "len(m3)" + "geo_arrowized[geo_arrowized.trip_id == '1088383'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] }, { "cell_type": "code", "execution_count": null, - "id": "4150ae22-7888-475f-9c24-f0ac17cf1b4d", + "id": "b28c57bb-95ee-4a27-85c9-12cb0d6f3c19", "metadata": {}, "outputs": [], "source": [ - "m3.groupby(['number_of_repeated_timestamps', 'number_of_repeated_locs']).size()" + "original[original.trip_id == '1088383'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] }, { "cell_type": "code", "execution_count": null, - "id": "8fc4eeb2-ccbb-4691-a2fc-f4713664cb58", + "id": "c9121bc1-9192-45fa-976a-4f818c4a39a7", 
"metadata": {}, "outputs": [], "source": [ - "more_than_one = m3[(m3.number_of_repeated_timestamps > 1) | (m3.number_of_repeated_locs > 1)]" + "filtered[filtered.trip_id == '16939087'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] }, { "cell_type": "code", "execution_count": null, - "id": "e7b06af8-26c8-4394-865d-424cf25868d2", + "id": "4213b966-f843-4d14-ab9a-f2344df1962f", "metadata": {}, "outputs": [], "source": [ - "more_than_one.shape" + "geo_arrowized[geo_arrowized.trip_id == '16939087'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] }, { "cell_type": "code", "execution_count": null, - "id": "6877289f-008d-4377-9748-270e05ae22f5", + "id": "6ac43a58-184c-41b3-9d53-9e21ef321bc3", "metadata": {}, "outputs": [], "source": [ - "more_than_one.sample(5)" + "geo_arrowized[geo_arrowized.trip_id == '1350'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] }, { "cell_type": "code", "execution_count": null, - "id": "86ce812a-4081-44b3-b1f1-b02be149ebf1", + "id": "60a53891-66cb-40ea-a42a-55ea290c6584", "metadata": {}, "outputs": [], "source": [ - "more_than_one._gtfs_dataset_name.nunique(), more_than_one.shape_array_key.nunique()" + "filtered[filtered.trip_id == '1350'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] }, { From a8703d379c9e1639bf98d01cfe712067a4aec6ee Mon Sep 17 00:00:00 2001 From: amandaha8 Date: Fri, 23 Jun 2023 17:38:29 +0000 Subject: [PATCH 5/9] looked at sjoin more --- rt_segment_speeds/12_speeds.ipynb | 979 ++++++++++++++++++++++++------ 1 file changed, 780 insertions(+), 199 deletions(-) diff --git a/rt_segment_speeds/12_speeds.ipynb b/rt_segment_speeds/12_speeds.ipynb index c02446de6..86954b1d2 100644 --- a/rt_segment_speeds/12_speeds.ipynb +++ b/rt_segment_speeds/12_speeds.ipynb @@ -2,10 +2,27 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + 
"execution_count": 1, "id": "2c7feec3-aa18-42ab-94b9-cab4be608152", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.9/site-packages/geopandas/_compat.py:123: UserWarning: The Shapely GEOS version (3.11.1-CAPI-1.17.1) is incompatible with the GEOS version PyGEOS was compiled with (3.10.1-CAPI-1.16.0). Conversions between both will be slow.\n", + " warnings.warn(\n", + "/home/jovyan/data-analyses/rt_segment_speeds/_threshold_utils.py:1: UserWarning: Shapely 2.0 is installed, but because PyGEOS is also installed, GeoPandas will still use PyGEOS by default for now. To force to use and test Shapely 2.0, you have to set the environment variable USE_PYGEOS=0. You can do this before starting the Python process, or in your code before importing geopandas:\n", + "\n", + "import os\n", + "os.environ['USE_PYGEOS'] = '0'\n", + "import geopandas\n", + "\n", + "In a future release, GeoPandas will switch to using Shapely by default. 
If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).\n", + " import geopandas as gpd\n" + ] + } + ], "source": [ "import datetime\n", "import _speed_utils as speed_utils\n", @@ -27,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "0108ae4a-4518-4487-85f7-a5faa3e9cbf6", "metadata": {}, "outputs": [], @@ -40,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "80014c5e-695d-4280-89cd-4e7e2bb3d302", "metadata": {}, "outputs": [], @@ -58,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "2f0c5f4f-f419-42a8-8527-7060ed412092", "metadata": {}, "outputs": [], @@ -90,7 +107,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "84ac97bf-ee4f-4d85-b523-8a36823f9d9a", "metadata": {}, "outputs": [], @@ -100,7 +117,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "68950ae7-4061-47d6-ac48-5eac0b1f29c0", "metadata": {}, "outputs": [], @@ -110,7 +127,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "b04dfb8b-7476-49df-873a-cea75dc61763", "metadata": {}, "outputs": [], @@ -135,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "e81e59fd-cc2f-408e-9148-1a1055425fc4", "metadata": {}, "outputs": [], @@ -170,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "0dfb836d-f919-4f2b-a0d1-9e4a4713ba8a", "metadata": {}, "outputs": [], @@ -180,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "9f84205d-93db-49f3-be99-6b5014f7faeb", "metadata": {}, "outputs": [], @@ -190,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": null, 
+ "execution_count": 11, "id": "b0d2184f-8a44-4489-a1b4-2be8317142f1", "metadata": {}, "outputs": [], @@ -200,7 +217,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "940fb010-0dff-465e-bf8d-87dd3f4ba101", "metadata": {}, "outputs": [], @@ -210,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "9d38d541-5c9c-4d31-8986-9c3928eb2f59", "metadata": {}, "outputs": [], @@ -255,7 +272,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "7201d5e3-f765-4e5d-9bbd-aa6a336bcc0a", "metadata": {}, "outputs": [], @@ -265,64 +282,342 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, + "id": "2c5107cb-c574-449b-95b6-fb205f38502e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-06-23 10:25:44.437503\n", + "Done with meters_\n", + "Done with sec_\n", + "sec_cat meters_cat \n", + "sec is avg meters is avg 1829\n", + " meters is high 110\n", + " meters is low 22\n", + "sec is high meters is avg 63\n", + " meters is high 40\n", + " meters is low 47\n", + "sec is low meters is low 850\n", + "dtype: int64\n", + "1022 rows left after filtering for rows with either high seconds OR low meters\n", + "division by 0 850\n", + "seconds too high 103\n", + "meters too low 69\n", + "Name: flag, dtype: int64\n", + "Took 0:00:00.209815\n" + ] + } + ], + "source": [ + "m2 = categorize_meters_speeds_pandas(subset)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, "id": "ce0fbb35-f81e-4343-92d2-4382d2173dbd", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "3075512" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "len(m1)" + "len(m1)-len(m2)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "2c5107cb-c574-449b-95b6-fb205f38502e", + "execution_count": 17, + "id": 
"508f1411-4328-4b80-a029-0ae516107ed0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "850" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "m2 = categorize_meters_speeds_pandas(subset)" + "len(m2)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "9fe08d2a-b874-4439-aa5b-a52de58cad09", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(73, 72067)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "m2.trip_id.nunique()" + "m2.trip_id.nunique(), m1.trip_id.nunique()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "a96f031c-c785-4793-9dfc-4b87090e6128", + "execution_count": 19, + "id": "8e6d31ab-46a7-4e20-bb2f-9cac1a2d672d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(4, 4837)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "m2.shape" + "m2.shape_array_key.nunique(), m1.shape_array_key.nunique()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "647fad46-7f9b-4ce2-a26a-1ea69d02daee", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(3, 76)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "m2._gtfs_dataset_name.nunique(), m1._gtfs_dataset_name.nunique()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "83036ccc-7339-42c2-b1f7-183734253c21", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
shape_array_key
loop_or_inlining
04
\n", + "
" + ], + "text/plain": [ + " shape_array_key\n", + "loop_or_inlining \n", + "0 4" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "m2.groupby([\"loop_or_inlining\"]).agg({\"shape_array_key\": \"nunique\"})" ] }, + { + "cell_type": "code", + "execution_count": 22, + "id": "468be3c9-7a24-4f01-84fd-31c137bc45e8", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# Number of trips that have at least one row that was divided by 0 \n", + "# for this shape array key\n", + "df1 = m2.groupby(['shape_array_key']).agg({'trip_id':'nunique'}).rename(columns = {'trip_id':'trips_with_zero'}).reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "4350f540-8f6b-4fb0-8b16-836245c0e44c", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# Original number of trips\n", + "df2 = subset.groupby(['shape_array_key']).agg({'trip_id':'nunique'}).rename(columns = {'trip_id':'all_trips'}).reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "ac68bdf7-26a0-4679-9a35-26f8a670018a", + "metadata": {}, + "outputs": [], + "source": [ + "df3 = pd.merge(df1, df2, how = \"inner\", on = 'shape_array_key')" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "81d443cc-122f-46f1-87ec-dbdc74e0ca6a", + "metadata": {}, + "outputs": [], + "source": [ + "df3['percent_of_trips_with_problematic_rows'] = df3.trips_with_zero/df3.all_trips * 100" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "5de3efe6-2233-4251-93a8-1f8dd6fb2dae", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
shape_array_keytrips_with_zeroall_tripspercent_of_trips_with_problematic_rows
0000624bd8453dbe4f2eb2765b04bcb981414100.00
107c9a47264a43d8d0d16ef7109e8fd682020100.00
20fb4f3627996269dc7075276d3b69e3699100.00
3106d979b9a9e6338827a8e1c145e69fd3030100.00
\n", + "
" + ], + "text/plain": [ + " shape_array_key trips_with_zero all_trips \\\n", + "0 000624bd8453dbe4f2eb2765b04bcb98 14 14 \n", + "1 07c9a47264a43d8d0d16ef7109e8fd68 20 20 \n", + "2 0fb4f3627996269dc7075276d3b69e36 9 9 \n", + "3 106d979b9a9e6338827a8e1c145e69fd 30 30 \n", + "\n", + " percent_of_trips_with_problematic_rows \n", + "0 100.00 \n", + "1 100.00 \n", + "2 100.00 \n", + "3 100.00 " + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3" + ] + }, { "cell_type": "markdown", "id": "a399d982-e400-43fa-b13f-fecafaa27262", @@ -335,7 +630,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "id": "a2a705af-b588-463b-b6ce-f999b2050208", "metadata": {}, "outputs": [], @@ -355,7 +650,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "id": "1e36c5fc-ab3f-4129-97f9-ad9472b7d32a", "metadata": {}, "outputs": [], @@ -365,7 +660,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "id": "148e75f1-08dd-44c8-8179-319164d8e020", "metadata": { "tags": [] @@ -378,7 +673,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "b4350206-c237-44a3-abce-f8f38cde8117", "metadata": { "scrolled": true, @@ -392,7 +687,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "id": "aa1e56d1-ec07-436c-8763-7bcf3dcbf7d4", "metadata": { "scrolled": true, @@ -406,7 +701,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "id": "22e42aae-9281-4040-ab8c-6a10b93f6cf4", "metadata": { "scrolled": true, @@ -420,7 +715,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "id": "0f21f08f-d4eb-4bbd-94d3-f4b031e97cf4", "metadata": {}, "outputs": [], @@ -446,7 +741,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "id": "5ce07566-c1f0-4fa7-9550-2fa07b98dba8", "metadata": {}, "outputs": [], @@ -477,7 
+772,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "id": "66e83169-2b4a-4912-bc0e-1a0b3e8deea6", "metadata": {}, "outputs": [], @@ -536,27 +831,53 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "id": "cab32ef3-cc66-40ce-aa19-59631734f539", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-06-23 10:28:50.163006\n", + "check in stage 2 1503\n", + "repeated timestamps 149\n", + "repeated timestamps & locations 27\n", + "repeated locations 21\n", + "Name: stage3_flag, dtype: int64\n", + "Have to check 88.41176470588236 % of rows in stage 2\n", + "Took 0:00:15.103777\n" + ] + } + ], "source": [ "m3 = flag_stage3(m2, analysis_date)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "id": "68a9dbba-ee6b-42b1-9203-1146d6cd56e9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(1700, 11)" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "m3.shape" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "id": "21799f42-873e-41bd-b764-42cc297686a6", "metadata": {}, "outputs": [], @@ -575,7 +896,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, + "id": "933b83af-137c-4402-86ac-ebd3f2693ee1", + "metadata": {}, + "outputs": [], + "source": [ + "subset_cols = ['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key', 'loop_or_inlining', 'stop_id', 'meters_elapsed','sec_elapsed']" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "55df584c-f36d-4c3d-9760-dd9ecb0471a6", + "metadata": {}, + "outputs": [], + "source": [ + "m_cols = ['gtfs_dataset_key', 'stop_sequence','shape_array_key', 'loop_or_inlining','stop_id']" + ] + }, + { + "cell_type": "code", + "execution_count": 43, "id": "68fba15b-7c2d-4c1b-a556-286dc4acc4e1", "metadata": {}, 
"outputs": [], @@ -586,7 +927,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "id": "f14e0ab1-59d2-42cc-884c-868da650cfa6", "metadata": {}, "outputs": [], @@ -596,7 +937,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 45, "id": "ddb1ae2a-d66a-45b7-9dd9-2b11f7d325b2", "metadata": {}, "outputs": [], @@ -607,18 +948,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "id": "bf64a406-1124-423c-8577-705a21c9b422", "metadata": {}, "outputs": [], "source": [ "# Subset df to filter the vp \n", - "subset_for_merge = stage2_rows[['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key', 'loop_or_inlining', 'stop_id']].drop_duplicates().reset_index(drop = True)" + "subset_for_merge = stage2_rows[subset_cols].drop_duplicates().reset_index(drop = True)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "id": "6397dc45-c271-4057-a0d8-1962846d4f94", "metadata": {}, "outputs": [], @@ -629,7 +970,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 48, "id": "45440033-aae7-4f94-9495-5e14529b7c5c", "metadata": { "scrolled": true, @@ -637,279 +978,519 @@ }, "outputs": [], "source": [ - "stg2_m = pd.merge(stg2, subset_for_merge, how = \"inner\", on = ['gtfs_dataset_key', 'stop_sequence','shape_array_key', 'loop_or_inlining','stop_id'])" + "# Merge\n", + "stg2_m = pd.merge(stg2,\n", + " subset_for_merge, \n", + " how = \"inner\",\n", + " on = m_cols\n", + " )" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 49, "id": "fe37945a-26d9-4891-831f-1bb92f85b39e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(850, 23)" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "stage2_rows.shape" ] }, { "cell_type": "code", - "execution_count": null, - "id": "e2d79d69-bf4f-4b4f-9c9a-3d70547eeb63", + "execution_count": 50, 
+ "id": "fab30694-9948-4e82-b19b-0814a459c340", "metadata": {}, "outputs": [], "source": [ - "type(stg2)" + "# Delete out empty geo \n", + "filtered = stg2[~stg2.geometry_arrowized.is_empty]" ] }, { "cell_type": "code", - "execution_count": null, - "id": "198195f7-5c6d-4fda-afc4-4da8307426cb", + "execution_count": 51, + "id": "4d9238cc-99da-49a1-abf1-392e3bd5bcc7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gtfs_dataset_keytrip_idstop_sequenceshape_array_keyloop_or_inliningstop_idmeters_elapsedsec_elapsed
336db56b50ab86b5f7a4ae2fc2dd9889bbe10893424407c9a47264a43d8d0d16ef7109e8fd68023080.000.00
\n", + "
" + ], + "text/plain": [ + " gtfs_dataset_key trip_id stop_sequence \\\n", + "336 db56b50ab86b5f7a4ae2fc2dd9889bbe 1089342 44 \n", + "\n", + " shape_array_key loop_or_inlining stop_id \\\n", + "336 07c9a47264a43d8d0d16ef7109e8fd68 0 2308 \n", + "\n", + " meters_elapsed sec_elapsed \n", + "336 0.00 0.00 " + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subset_for_merge.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "a72b5744-ff22-44b0-834d-5b3718162661", "metadata": {}, "outputs": [], "source": [ - "stg2.columns" + "# Delete out empty geometry arrowized\n", + "geo_arrowized = stg2_m[~stg2_m.geometry_arrowized.is_empty]" ] }, { "cell_type": "code", - "execution_count": null, - "id": "fab30694-9948-4e82-b19b-0814a459c340", + "execution_count": 53, + "id": "145e129e-b119-44d9-940b-18ac70c93e0a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "geometry: 0.9847058823529412\n", + "geometry: 0\n", + "geometry_arrowized: 1.0\n", + "geometry_arrowized: 13\n" + ] + } + ], "source": [ - "# Delete out empty geo \n", - "filtered = stg2[~stg2.geometry_arrowized.is_empty]" + "for i in ['geometry','geometry_arrowized']:\n", + " print(f\"{i}: {stg2_m[i].is_valid.sum()/len(stg2_m)}\")\n", + " print(f\"{i}: {len(stg2_m[stg2_m[i].is_empty])}\")" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "4d9238cc-99da-49a1-abf1-392e3bd5bcc7", + "cell_type": "markdown", + "id": "cf7a7ff5-7aca-4c1b-8751-34981d90da15", "metadata": {}, - "outputs": [], "source": [ - "subset.sample()" + "#### Look at the original routes" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "id": "97403c1c-e39f-4179-a7bb-2fc8588d3249", "metadata": {}, "outputs": [], "source": [ "# This is the original dataframe loaded in from merging stage\n", + "# It's not even flagged. 
\n", "original = pd.merge(filtered, \n", - " subset[['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key', 'loop_or_inlining', 'stop_id']],\n", + " subset[subset_cols],\n", " how = \"inner\", \n", - " on = ['gtfs_dataset_key', 'stop_sequence','shape_array_key', 'loop_or_inlining','stop_id'])" + " on = m_cols)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "f81683e1-94a1-4436-a2fb-ee378acaacef", + "execution_count": 55, + "id": "2fad7b35-d1bf-4df2-99d7-b63b2381ba34", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "837" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "original.columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ac442f11-ff84-4e3f-9f05-5f9ece3c0d51", - "metadata": {}, - "outputs": [], - "source": [ - "# filtered[filtered.shape_array_key == \"000624bd8453dbe4f2eb2765b04bcb98\"].set_geometry(\"geometry_arrowized\").explore('stop_sequence')" + "len(geo_arrowized)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "94481b9a-9a4a-4fa2-b7ff-63d1ef93e1dc", + "execution_count": 56, + "id": "c86b76a0-bc6f-4e85-811c-0076892828da", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
number_of_trips_with_problematic_stop_seq
shape_array_keystop_sequence
106d979b9a9e6338827a8e1c145e69fd5225
4620
2018
4818
4516
\n", + "
" + ], + "text/plain": [ + " number_of_trips_with_problematic_stop_seq\n", + "shape_array_key stop_sequence \n", + "106d979b9a9e6338827a8e1c145e69fd 52 25\n", + " 46 20\n", + " 20 18\n", + " 48 18\n", + " 45 16" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Find number of messed up sequences...are the same sequences being hit?\n", + "(subset_for_merge\n", + " .groupby(['shape_array_key','stop_sequence'])\n", + " .agg({'trip_id':'nunique'})\n", + " .rename(columns = {'trip_id':'number_of_trips_with_problematic_stop_seq'})\n", + " .sort_values(['shape_array_key','number_of_trips_with_problematic_stop_seq']\n", + " , ascending = False)\n", + ").head()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "97b08e7f-4902-44a2-a904-d15800382ccd", "metadata": {}, "outputs": [], "source": [ - "stg2.shape" + "keep_tripid = ['1350', '1089339', '16939087', '1088383',]" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "2965a631-1a2e-4551-bfcb-c89ba56ecf92", + "cell_type": "markdown", + "id": "a6cdedf7-c112-40ac-ac3e-81bf1144cefe", "metadata": {}, - "outputs": [], "source": [ - "stg2_m.shape" + "#### Look at all the stop sequences vs the ones flagged as 0 for each trip" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "145e129e-b119-44d9-940b-18ac70c93e0a", + "cell_type": "markdown", + "id": "44ea63a5-afdb-4030-b3d3-ec8804f3116b", "metadata": {}, - "outputs": [], "source": [ - "for i in ['geometry','geometry_arrowized']:\n", - " print(f\"{i}: {stg2_m[i].is_valid.sum()/len(stg2_m)}\")\n", - " print(f\"{i}: {len(stg2_m[stg2_m[i].is_empty])}\")" + "##### 106d979b9a9e6338827a8e1c145e69fd\n", + "* 1088383\n", + "* 1088403" ] }, { "cell_type": "code", - "execution_count": null, - "id": "a72b5744-ff22-44b0-834d-5b3718162661", + "execution_count": 58, + "id": "298f3ddb-0fde-4b90-b9b4-5ea9e45d461d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + 
"text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "geo_arrowized = stg2_m[~stg2_m.geometry_arrowized.is_empty]" + "# SEgments that show up have something wrong with them\n", + "geo_arrowized[geo_arrowized.trip_id == '1088383'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] }, { "cell_type": "code", - "execution_count": null, - "id": "2fad7b35-d1bf-4df2-99d7-b63b2381ba34", + "execution_count": 59, + "id": "e42cd7cf-3521-498c-be7d-c08146fe8070", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "len(geo_arrowized)" + "# SEgments that show up have something wrong with them\n", + "geo_arrowized[geo_arrowized.trip_id == '1088403'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] }, { "cell_type": "code", - "execution_count": null, - "id": "97b08e7f-4902-44a2-a904-d15800382ccd", + "execution_count": 60, + "id": "b28c57bb-95ee-4a27-85c9-12cb0d6f3c19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "keep_tripid = ['1350', '1089339', '16939087', '1088383',]" + "original[original.trip_id == '1088383'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "298f3ddb-0fde-4b90-b9b4-5ea9e45d461d", + "cell_type": "markdown", + "id": "80f135f9-141e-4e43-9e9e-5e483fcd4f23", "metadata": {}, - "outputs": [], "source": [ - "geo_arrowized[geo_arrowized.trip_id == '1088383'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" + "#### 16939089" ] }, { "cell_type": "code", - "execution_count": null, - "id": "b28c57bb-95ee-4a27-85c9-12cb0d6f3c19", + "execution_count": 61, + "id": "c9121bc1-9192-45fa-976a-4f818c4a39a7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "original[original.trip_id == '1088383'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" + "geo_arrowized[geo_arrowized.trip_id == '16939089'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] }, { "cell_type": "code", - "execution_count": null, - "id": "c9121bc1-9192-45fa-976a-4f818c4a39a7", + "execution_count": 62, + "id": "4213b966-f843-4d14-ab9a-f2344df1962f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "filtered[filtered.trip_id == '16939087'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" + "original[original.trip_id == '16939089'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "4213b966-f843-4d14-ab9a-f2344df1962f", + "cell_type": "markdown", + "id": "6f048e93-5ffc-45e8-8cc2-24d891a9dac3", "metadata": {}, - "outputs": [], "source": [ - "geo_arrowized[geo_arrowized.trip_id == '16939087'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" + "#### 1350" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 63, "id": "6ac43a58-184c-41b3-9d53-9e21ef321bc3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "geo_arrowized[geo_arrowized.trip_id == '1350'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 64, "id": "60a53891-66cb-40ea-a42a-55ea290c6584", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "filtered[filtered.trip_id == '1350'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" - ] - }, - { - "cell_type": "markdown", - "id": "e01d88e4-a3e2-43f1-9591-afe83e8a92cf", - "metadata": { - "tags": [] - }, - "source": [ - "#### Summarize" + "original[original.trip_id == '1350'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] }, { "cell_type": "code", - "execution_count": null, - "id": "229c6d49-58e1-45bd-839f-b03f7e8cdd4a", - "metadata": {}, - "outputs": [], - "source": [ - "def summarize(original: pd.DataFrame, filtered_unusual_stops: pd.DataFrame):\n", - "\n", - " subset = [\"_gtfs_dataset_name\", \"gtfs_dataset_key\", \"shape_array_key\"]\n", - "\n", - " def aggregate(df, total_trip_column_name: str):\n", - " agg = (\n", - " df.groupby(subset)\n", - " .agg({\"stop_sequence\": \"count\"})\n", - " .reset_index()\n", - " .rename(columns={\"stop_sequence\": total_trip_column_name})\n", - " )\n", - "\n", - " return agg\n", - "\n", - " total_unusual_stops = aggregate(filtered_unusual_stops, \"total_unusual_stops\")\n", - " total_stops = aggregate(original, \"total_stops\")\n", - "\n", - " # Merge them\n", - " merge1 = pd.merge(total_unusual_stops, total_stops, on=subset, how=\"inner\")\n", - "\n", - " # Add some columns\n", - " merge1[\"percent_of_unusual_stops\"] = (\n", - " (merge1.total_unusual_stops / merge1.total_stops) * 100\n", - " ).astype(int)\n", - "\n", - " merge1[\n", - " \"Percentage of Unusual Stops\"\n", - " ] = \"% of Unusual Stops: \" + merge1.percent_of_unusual_stops.astype(str)\n", - "\n", - " # Add dropdown menu\n", - " # merge1[\"Dropdown Menu\"] = merge1._gtfs_dataset_name + \" \" + merge1.shape_array_key\n", - "\n", - " # Clean\n", - " merge1 = merge1.sort_values([\"percent_of_unusual_stops\"], ascending=False)\n", - " 
merge1 = merge1.drop(columns=[\"percent_of_unusual_stops\"])\n", - "\n", - " merge1 = threshold_utils.pre_clean(merge1)\n", - " return merge1" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 65, "id": "dad7c9b8-9025-4d91-bf75-0e23c3ac2a52", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'summarize' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[65], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msummarize\u001b[49m(few_routes_cat, high_low_zero)\n", + "\u001b[0;31mNameError\u001b[0m: name 'summarize' is not defined" + ] + } + ], "source": [ "summarize(few_routes_cat, high_low_zero)" ] From 9bff5e91ddc8afd104e868d1ca865098701686b3 Mon Sep 17 00:00:00 2001 From: amandaha8 Date: Mon, 26 Jun 2023 16:19:46 +0000 Subject: [PATCH 6/9] ran for all shapes --- rt_segment_speeds/12_speeds.ipynb | 664 +++++++++++------------------- 1 file changed, 242 insertions(+), 422 deletions(-) diff --git a/rt_segment_speeds/12_speeds.ipynb b/rt_segment_speeds/12_speeds.ipynb index 86954b1d2..3c04c2eb5 100644 --- a/rt_segment_speeds/12_speeds.ipynb +++ b/rt_segment_speeds/12_speeds.ipynb @@ -162,7 +162,6 @@ ") -> pd.DataFrame:\n", "\n", " # Find percentiles\n", - " #Get percentiles in objects for total vehicle.\n", " p5 = df[column_percentile].quantile(0.05).astype(float)\n", " p95 = df[column_percentile].quantile(0.95).astype(float)\n", " \n", @@ -290,41 +289,42 @@ "name": "stdout", "output_type": "stream", "text": [ - "2023-06-23 10:25:44.437503\n", + "2023-06-26 09:14:38.408932\n", "Done with meters_\n", "Done with sec_\n", "sec_cat meters_cat \n", - "sec is avg meters is avg 1829\n", - " meters is high 110\n", - " meters is low 22\n", - "sec is high meters is avg 63\n", - " meters is high 
40\n", - " meters is low 47\n", - "sec is low meters is low 850\n", + "sec is avg meters is avg 2415102\n", + " meters is high 70745\n", + " meters is low 139528\n", + "sec is high meters is avg 57245\n", + " meters is high 83074\n", + " meters is low 13695\n", + "sec is low meters is low 296973\n", "dtype: int64\n", - "1022 rows left after filtering for rows with either high seconds OR low meters\n", - "division by 0 850\n", - "seconds too high 103\n", - "meters too low 69\n", + "590515 rows left after filtering for rows with either high seconds OR low meters\n", + "division by 0 296973\n", + "meters too low 153223\n", + "seconds too high 140319\n", "Name: flag, dtype: int64\n", - "Took 0:00:00.209815\n" + "Took 0:02:17.630093\n" ] } ], "source": [ - "m2 = categorize_meters_speeds_pandas(subset)" + "m2 = categorize_meters_speeds_pandas(m1)" ] }, { "cell_type": "code", "execution_count": 16, - "id": "ce0fbb35-f81e-4343-92d2-4382d2173dbd", + "id": "2d1bf90c-d9ed-4861-a1be-23f356165a4c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "3075512" + "division by 0 296973\n", + "Name: flag, dtype: int64" ] }, "execution_count": 16, @@ -333,19 +333,19 @@ } ], "source": [ - "len(m1)-len(m2)" + "m2.flag.value_counts()" ] }, { "cell_type": "code", "execution_count": 17, - "id": "508f1411-4328-4b80-a029-0ae516107ed0", + "id": "ce0fbb35-f81e-4343-92d2-4382d2173dbd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "850" + "2779389" ] }, "execution_count": 17, @@ -354,19 +354,19 @@ } ], "source": [ - "len(m2)" + "len(m1)-len(m2)" ] }, { "cell_type": "code", "execution_count": 18, - "id": "9fe08d2a-b874-4439-aa5b-a52de58cad09", + "id": "508f1411-4328-4b80-a029-0ae516107ed0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(73, 72067)" + "296973" ] }, "execution_count": 18, @@ -375,19 +375,19 @@ } ], "source": [ - "m2.trip_id.nunique(), m1.trip_id.nunique()" + "len(m2)" ] }, { "cell_type": "code", "execution_count": 19, - "id": 
"8e6d31ab-46a7-4e20-bb2f-9cac1a2d672d", + "id": "9fe08d2a-b874-4439-aa5b-a52de58cad09", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(4, 4837)" + "(45357, 72067)" ] }, "execution_count": 19, @@ -396,19 +396,19 @@ } ], "source": [ - "m2.shape_array_key.nunique(), m1.shape_array_key.nunique()" + "m2.trip_id.nunique(), m1.trip_id.nunique()" ] }, { "cell_type": "code", "execution_count": 20, - "id": "647fad46-7f9b-4ce2-a26a-1ea69d02daee", + "id": "8e6d31ab-46a7-4e20-bb2f-9cac1a2d672d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(3, 76)" + "(2682, 4837)" ] }, "execution_count": 20, @@ -417,12 +417,33 @@ } ], "source": [ - "m2._gtfs_dataset_name.nunique(), m1._gtfs_dataset_name.nunique()" + "m2.shape_array_key.nunique(), m1.shape_array_key.nunique()" ] }, { "cell_type": "code", "execution_count": 21, + "id": "647fad46-7f9b-4ce2-a26a-1ea69d02daee", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(63, 76)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m2._gtfs_dataset_name.nunique(), m1._gtfs_dataset_name.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, "id": "83036ccc-7339-42c2-b1f7-183734253c21", "metadata": {}, "outputs": [ @@ -457,7 +478,7 @@ " \n", " \n", " 0\n", - " 4\n", + " 2682\n", " \n", " \n", "\n", @@ -466,10 +487,10 @@ "text/plain": [ " shape_array_key\n", "loop_or_inlining \n", - "0 4" + "0 2682" ] }, - "execution_count": 21, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -478,9 +499,17 @@ "m2.groupby([\"loop_or_inlining\"]).agg({\"shape_array_key\": \"nunique\"})" ] }, + { + "cell_type": "markdown", + "id": "4486cd7c-31d7-4420-ac67-f9783676ede8", + "metadata": {}, + "source": [ + "#### See how many trips for a shape ID have problematic rows\n" + ] + }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "id": "468be3c9-7a24-4f01-84fd-31c137bc45e8", "metadata": { 
"scrolled": true, @@ -495,7 +524,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 40, "id": "4350f540-8f6b-4fb0-8b16-836245c0e44c", "metadata": { "scrolled": true, @@ -504,12 +533,12 @@ "outputs": [], "source": [ "# Original number of trips\n", - "df2 = subset.groupby(['shape_array_key']).agg({'trip_id':'nunique'}).rename(columns = {'trip_id':'all_trips'}).reset_index()" + "df2 = m1.groupby(['shape_array_key']).agg({'trip_id':'nunique'}).rename(columns = {'trip_id':'all_trips'}).reset_index()" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 41, "id": "ac68bdf7-26a0-4679-9a35-26f8a670018a", "metadata": {}, "outputs": [], @@ -519,7 +548,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 42, "id": "81d443cc-122f-46f1-87ec-dbdc74e0ca6a", "metadata": {}, "outputs": [], @@ -529,7 +558,36 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 44, + "id": "314d9baf-de0e-460a-8c29-4504ba94cfa6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 2682.00\n", + "mean 82.86\n", + "std 26.65\n", + "min 1.52\n", + "25% 75.00\n", + "50% 100.00\n", + "75% 100.00\n", + "max 100.00\n", + "Name: percent_of_trips_with_problematic_rows, dtype: float64" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3['percent_of_trips_with_problematic_rows'].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, "id": "5de3efe6-2233-4251-93a8-1f8dd6fb2dae", "metadata": { "tags": [] @@ -564,58 +622,67 @@ " \n", " \n", " \n", - " 0\n", - " 000624bd8453dbe4f2eb2765b04bcb98\n", - " 14\n", - " 14\n", - " 100.00\n", + " 1397\n", + " 82f0e3379d90a630b9e42e5ec79e0279\n", + " 6\n", + " 7\n", + " 85.71\n", " \n", " \n", - " 1\n", - " 07c9a47264a43d8d0d16ef7109e8fd68\n", - " 20\n", + " 333\n", + " 1e469c778efe30b55db3dd93ee1d9946\n", + " 19\n", " 20\n", - " 100.00\n", + " 95.00\n", " \n", " \n", - " 2\n", - 
" 0fb4f3627996269dc7075276d3b69e36\n", - " 9\n", - " 9\n", - " 100.00\n", + " 2060\n", + " c750d9ce7a9e659d5d443f0925e175e7\n", + " 6\n", + " 7\n", + " 85.71\n", " \n", " \n", - " 3\n", - " 106d979b9a9e6338827a8e1c145e69fd\n", - " 30\n", - " 30\n", + " 908\n", + " 59626d7e12b3fec5d917b3e052e87d70\n", + " 17\n", + " 17\n", " 100.00\n", " \n", + " \n", + " 47\n", + " 0485a3b83c38283730ce3e9372baf031\n", + " 2\n", + " 3\n", + " 66.67\n", + " \n", " \n", "\n", "" ], "text/plain": [ - " shape_array_key trips_with_zero all_trips \\\n", - "0 000624bd8453dbe4f2eb2765b04bcb98 14 14 \n", - "1 07c9a47264a43d8d0d16ef7109e8fd68 20 20 \n", - "2 0fb4f3627996269dc7075276d3b69e36 9 9 \n", - "3 106d979b9a9e6338827a8e1c145e69fd 30 30 \n", + " shape_array_key trips_with_zero all_trips \\\n", + "1397 82f0e3379d90a630b9e42e5ec79e0279 6 7 \n", + "333 1e469c778efe30b55db3dd93ee1d9946 19 20 \n", + "2060 c750d9ce7a9e659d5d443f0925e175e7 6 7 \n", + "908 59626d7e12b3fec5d917b3e052e87d70 17 17 \n", + "47 0485a3b83c38283730ce3e9372baf031 2 3 \n", "\n", - " percent_of_trips_with_problematic_rows \n", - "0 100.00 \n", - "1 100.00 \n", - "2 100.00 \n", - "3 100.00 " + " percent_of_trips_with_problematic_rows \n", + "1397 85.71 \n", + "333 95.00 \n", + "2060 85.71 \n", + "908 100.00 \n", + "47 66.67 " ] }, - "execution_count": 28, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df3" + "df3.sample(5)" ] }, { @@ -630,7 +697,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 28, "id": "a2a705af-b588-463b-b6ce-f999b2050208", "metadata": {}, "outputs": [], @@ -639,7 +706,6 @@ " \n", " # Subset the dataframe and use it to filter out for only the values of interest\n", " flagged_df = flagged_df[['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key']]\n", - " \n", " vp = pd.read_parquet(f\"{speed_utils.GCS_PATH}vp_pared_stops_{date}\")\n", " \n", " # Merge to filter\n", @@ -650,7 +716,7 @@ }, { "cell_type": "code", - 
"execution_count": 30, + "execution_count": 29, "id": "1e36c5fc-ab3f-4129-97f9-ad9472b7d32a", "metadata": {}, "outputs": [], @@ -660,7 +726,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "id": "148e75f1-08dd-44c8-8179-319164d8e020", "metadata": { "tags": [] @@ -673,7 +739,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 31, "id": "b4350206-c237-44a3-abce-f8f38cde8117", "metadata": { "scrolled": true, @@ -687,7 +753,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 32, "id": "aa1e56d1-ec07-436c-8763-7bcf3dcbf7d4", "metadata": { "scrolled": true, @@ -701,7 +767,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 33, "id": "22e42aae-9281-4040-ab8c-6a10b93f6cf4", "metadata": { "scrolled": true, @@ -715,7 +781,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 34, "id": "0f21f08f-d4eb-4bbd-94d3-f4b031e97cf4", "metadata": {}, "outputs": [], @@ -741,7 +807,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 35, "id": "5ce07566-c1f0-4fa7-9550-2fa07b98dba8", "metadata": {}, "outputs": [], @@ -772,7 +838,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 36, "id": "66e83169-2b4a-4912-bc0e-1a0b3e8deea6", "metadata": {}, "outputs": [], @@ -831,7 +897,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 37, "id": "cab32ef3-cc66-40ce-aa19-59631734f539", "metadata": {}, "outputs": [ @@ -839,14 +905,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "2023-06-23 10:28:50.163006\n", - "check in stage 2 1503\n", - "repeated timestamps 149\n", - "repeated timestamps & locations 27\n", - "repeated locations 21\n", + "2023-06-26 09:17:07.320679\n", + "check in stage 2 538914\n", + "repeated timestamps 54883\n", + "repeated timestamps & locations 107\n", + "repeated locations 42\n", "Name: stage3_flag, dtype: int64\n", - "Have to check 88.41176470588236 % of rows in 
stage 2\n", - "Took 0:00:15.103777\n" + "Have to check 90.73451121819154 % of rows in stage 2\n", + "Took 0:00:27.798047\n" ] } ], @@ -856,17 +922,17 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 38, "id": "68a9dbba-ee6b-42b1-9203-1146d6cd56e9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(1700, 11)" + "(593946, 11)" ] }, - "execution_count": 39, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -877,7 +943,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 39, "id": "21799f42-873e-41bd-b764-42cc297686a6", "metadata": {}, "outputs": [], @@ -896,7 +962,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "id": "933b83af-137c-4402-86ac-ebd3f2693ee1", "metadata": {}, "outputs": [], @@ -906,7 +972,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": null, "id": "55df584c-f36d-4c3d-9760-dd9ecb0471a6", "metadata": {}, "outputs": [], @@ -916,7 +982,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "id": "68fba15b-7c2d-4c1b-a556-286dc4acc4e1", "metadata": {}, "outputs": [], @@ -927,7 +993,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "id": "f14e0ab1-59d2-42cc-884c-868da650cfa6", "metadata": {}, "outputs": [], @@ -937,7 +1003,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": null, "id": "ddb1ae2a-d66a-45b7-9dd9-2b11f7d325b2", "metadata": {}, "outputs": [], @@ -948,7 +1014,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "id": "bf64a406-1124-423c-8577-705a21c9b422", "metadata": {}, "outputs": [], @@ -959,7 +1025,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": null, "id": "6397dc45-c271-4057-a0d8-1962846d4f94", "metadata": {}, "outputs": [], @@ -970,7 +1036,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": null, "id": 
"45440033-aae7-4f94-9495-5e14529b7c5c", "metadata": { "scrolled": true, @@ -988,28 +1054,29 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": null, "id": "fe37945a-26d9-4891-831f-1bb92f85b39e", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(850, 23)" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "stage2_rows.shape" ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": null, + "id": "145e129e-b119-44d9-940b-18ac70c93e0a", + "metadata": {}, + "outputs": [], + "source": [ + "for i in ['geometry','geometry_arrowized']:\n", + " print(f\"{i}: {stg2_m[i].is_valid.sum()/len(stg2_m)}\")\n", + " print(f\"{i}: {len(stg2_m[stg2_m[i].is_empty])}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "fab30694-9948-4e82-b19b-0814a459c340", "metadata": {}, "outputs": [], @@ -1020,80 +1087,17 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": null, "id": "4d9238cc-99da-49a1-abf1-392e3bd5bcc7", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
gtfs_dataset_keytrip_idstop_sequenceshape_array_keyloop_or_inliningstop_idmeters_elapsedsec_elapsed
336db56b50ab86b5f7a4ae2fc2dd9889bbe10893424407c9a47264a43d8d0d16ef7109e8fd68023080.000.00
\n", - "
" - ], - "text/plain": [ - " gtfs_dataset_key trip_id stop_sequence \\\n", - "336 db56b50ab86b5f7a4ae2fc2dd9889bbe 1089342 44 \n", - "\n", - " shape_array_key loop_or_inlining stop_id \\\n", - "336 07c9a47264a43d8d0d16ef7109e8fd68 0 2308 \n", - "\n", - " meters_elapsed sec_elapsed \n", - "336 0.00 0.00 " - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "subset_for_merge.sample()" ] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": null, "id": "a72b5744-ff22-44b0-834d-5b3718162661", "metadata": {}, "outputs": [], @@ -1102,29 +1106,6 @@ "geo_arrowized = stg2_m[~stg2_m.geometry_arrowized.is_empty]" ] }, - { - "cell_type": "code", - "execution_count": 53, - "id": "145e129e-b119-44d9-940b-18ac70c93e0a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "geometry: 0.9847058823529412\n", - "geometry: 0\n", - "geometry_arrowized: 1.0\n", - "geometry_arrowized: 13\n" - ] - } - ], - "source": [ - "for i in ['geometry','geometry_arrowized']:\n", - " print(f\"{i}: {stg2_m[i].is_valid.sum()/len(stg2_m)}\")\n", - " print(f\"{i}: {len(stg2_m[stg2_m[i].is_empty])}\")" - ] - }, { "cell_type": "markdown", "id": "cf7a7ff5-7aca-4c1b-8751-34981d90da15", @@ -1135,7 +1116,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": null, "id": "97403c1c-e39f-4179-a7bb-2fc8588d3249", "metadata": {}, "outputs": [], @@ -1150,123 +1131,45 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": null, "id": "2fad7b35-d1bf-4df2-99d7-b63b2381ba34", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "837" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len(geo_arrowized)" ] }, { "cell_type": "code", - "execution_count": 56, + "execution_count": null, "id": "c86b76a0-bc6f-4e85-811c-0076892828da", "metadata": { "tags": [] }, - 
"outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
number_of_trips_with_problematic_stop_seq
shape_array_keystop_sequence
106d979b9a9e6338827a8e1c145e69fd5225
4620
2018
4818
4516
\n", - "
" - ], - "text/plain": [ - " number_of_trips_with_problematic_stop_seq\n", - "shape_array_key stop_sequence \n", - "106d979b9a9e6338827a8e1c145e69fd 52 25\n", - " 46 20\n", - " 20 18\n", - " 48 18\n", - " 45 16" - ] - }, - "execution_count": 56, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Find number of messed up sequences...are the same sequences being hit?\n", + "\"\"\"\n", "(subset_for_merge\n", " .groupby(['shape_array_key','stop_sequence'])\n", " .agg({'trip_id':'nunique'})\n", " .rename(columns = {'trip_id':'number_of_trips_with_problematic_stop_seq'})\n", " .sort_values(['shape_array_key','number_of_trips_with_problematic_stop_seq']\n", " , ascending = False)\n", - ").head()" + ")\"\"\"" ] }, { "cell_type": "code", - "execution_count": 57, - "id": "97b08e7f-4902-44a2-a904-d15800382ccd", - "metadata": {}, + "execution_count": null, + "id": "e0c6367d-cd6b-44e8-a325-4f2bb1615599", + "metadata": { + "scrolled": true, + "tags": [] + }, "outputs": [], "source": [ - "keep_tripid = ['1350', '1089339', '16939087', '1088383',]" + "# subset_for_merge.groupby(['shape_array_key','trip_id']).agg({'stop_sequence':'nunique'})" ] }, { @@ -1289,24 +1192,10 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": null, "id": "298f3ddb-0fde-4b90-b9b4-5ea9e45d461d", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# SEgments that show up have something wrong with them\n", "geo_arrowized[geo_arrowized.trip_id == '1088383'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" @@ -1314,24 +1203,10 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": null, "id": "e42cd7cf-3521-498c-be7d-c08146fe8070", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# SEgments that show up have something wrong with them\n", "geo_arrowized[geo_arrowized.trip_id == '1088403'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" @@ -1339,24 +1214,10 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": null, "id": "b28c57bb-95ee-4a27-85c9-12cb0d6f3c19", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "original[original.trip_id == '1088383'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] @@ -1366,53 +1227,26 @@ "id": "80f135f9-141e-4e43-9e9e-5e483fcd4f23", "metadata": {}, "source": [ - "#### 16939089" + "#### 0fb4f3627996269dc7075276d3b69e36 \n", + "* 16939089" ] }, { "cell_type": "code", - "execution_count": 61, + "execution_count": null, "id": "c9121bc1-9192-45fa-976a-4f818c4a39a7", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "geo_arrowized[geo_arrowized.trip_id == '16939089'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": null, "id": "4213b966-f843-4d14-ab9a-f2344df1962f", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "original[original.trip_id == '16939089'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] @@ -1420,77 +1254,63 @@ { "cell_type": "markdown", "id": "6f048e93-5ffc-45e8-8cc2-24d891a9dac3", + "metadata": { + "tags": [] + }, + "source": [ + "#### 000624bd8453dbe4f2eb2765b04bcb98 \n", + "* 1350" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75d7e270-48c5-4421-ae89-50d696604da0", "metadata": {}, + "outputs": [], "source": [ - "#### 1350" + "geo_arrowized[geo_arrowized.trip_id == '1359'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] }, { "cell_type": "code", - "execution_count": 63, + "execution_count": null, "id": "6ac43a58-184c-41b3-9d53-9e21ef321bc3", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "geo_arrowized[geo_arrowized.trip_id == '1350'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": null, + "id": "68a59632-cfbd-43fd-aca4-a502e400a854", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "m2[m2.trip_id == '1350']" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "60a53891-66cb-40ea-a42a-55ea290c6584", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "original[original.trip_id == '1350'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" ] }, { "cell_type": "code", - "execution_count": 65, + "execution_count": null, "id": "dad7c9b8-9025-4d91-bf75-0e23c3ac2a52", "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'summarize' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[65], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msummarize\u001b[49m(few_routes_cat, high_low_zero)\n", - "\u001b[0;31mNameError\u001b[0m: name 'summarize' is not defined" - ] - } - ], + "outputs": [], "source": [ "summarize(few_routes_cat, high_low_zero)" ] From 0917de416c5f3769c7716e7a376dc4dfb8d984e1 Mon Sep 17 00:00:00 2001 From: amandaha8 Date: Wed, 28 Jun 2023 22:51:19 +0000 Subject: [PATCH 7/9] looking at all pts vs first and last pt --- rt_segment_speeds/12_speeds.ipynb | 1570 ++++++----------------------- 1 file changed, 322 insertions(+), 1248 deletions(-) diff --git a/rt_segment_speeds/12_speeds.ipynb b/rt_segment_speeds/12_speeds.ipynb index 3c04c2eb5..31d49bf30 100644 --- a/rt_segment_speeds/12_speeds.ipynb +++ b/rt_segment_speeds/12_speeds.ipynb @@ -2,27 +2,10 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "2c7feec3-aa18-42ab-94b9-cab4be608152", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/conda/lib/python3.9/site-packages/geopandas/_compat.py:123: UserWarning: The Shapely GEOS version (3.11.1-CAPI-1.17.1) is incompatible with the GEOS version PyGEOS was compiled with (3.10.1-CAPI-1.16.0). 
Conversions between both will be slow.\n", - " warnings.warn(\n", - "/home/jovyan/data-analyses/rt_segment_speeds/_threshold_utils.py:1: UserWarning: Shapely 2.0 is installed, but because PyGEOS is also installed, GeoPandas will still use PyGEOS by default for now. To force to use and test Shapely 2.0, you have to set the environment variable USE_PYGEOS=0. You can do this before starting the Python process, or in your code before importing geopandas:\n", - "\n", - "import os\n", - "os.environ['USE_PYGEOS'] = '0'\n", - "import geopandas\n", - "\n", - "In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).\n", - " import geopandas as gpd\n" - ] - } - ], + "outputs": [], "source": [ "import datetime\n", "import _speed_utils as speed_utils\n", @@ -39,12 +22,13 @@ " analysis_date,\n", " CONFIG_PATH\n", ")\n", + "from scripts import A1_sjoin_vp_segments\n", "from shared_utils import calitp_color_palette as cp" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "0108ae4a-4518-4487-85f7-a5faa3e9cbf6", "metadata": {}, "outputs": [], @@ -57,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "80014c5e-695d-4280-89cd-4e7e2bb3d302", "metadata": {}, "outputs": [], @@ -75,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "2f0c5f4f-f419-42a8-8527-7060ed412092", "metadata": {}, "outputs": [], @@ -89,13 +73,13 @@ " date: analysis date\n", " \"\"\"\n", " # Open up avg speeds\n", - " avg_speeds = pd.read_parquet(f\"{speed_utils.GCS_PATH}avg_speeds_stop_segments_{analysis_date}.parquet\")\n", - " avg_speeds = avg_speeds.drop(columns=[\"geometry\", \"geometry_arrowized\", \"district\", \"district_name\"])\n", + 
" avg_speeds = pd.read_parquet(f\"{SEGMENT_GCS}avg_speeds_stop_segments_{analysis_date}.parquet\")\n", + " avg_speeds = avg_speeds.drop(columns=[\"geometry\", \"district\", \"district_name\"])\n", " # Filter for all day flags\n", " avg_speeds = avg_speeds[avg_speeds.time_of_day == 'all_day'].reset_index(drop = True)\n", " \n", " # Open up speeds\n", - " speeds = pd.read_parquet(f\"{speed_utils.GCS_PATH}speeds_stop_segments_{analysis_date}\")\n", + " speeds = pd.read_parquet(f\"{SEGMENT_GCS}speeds_stop_segments_{analysis_date}\")\n", " \n", " merge_cols = ['gtfs_dataset_key','shape_array_key', 'stop_sequence']\n", " m1 = pd.merge(avg_speeds, speeds, on = merge_cols, how = 'inner')\n", @@ -107,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "84ac97bf-ee4f-4d85-b523-8a36823f9d9a", "metadata": {}, "outputs": [], @@ -117,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "68950ae7-4061-47d6-ac48-5eac0b1f29c0", "metadata": {}, "outputs": [], @@ -127,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "b04dfb8b-7476-49df-873a-cea75dc61763", "metadata": {}, "outputs": [], @@ -152,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "e81e59fd-cc2f-408e-9148-1a1055425fc4", "metadata": {}, "outputs": [], @@ -186,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "0dfb836d-f919-4f2b-a0d1-9e4a4713ba8a", "metadata": {}, "outputs": [], @@ -196,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "9f84205d-93db-49f3-be99-6b5014f7faeb", "metadata": {}, "outputs": [], @@ -206,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "b0d2184f-8a44-4489-a1b4-2be8317142f1", "metadata": {}, "outputs": [], @@ -216,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 12, + 
"execution_count": null, "id": "940fb010-0dff-465e-bf8d-87dd3f4ba101", "metadata": {}, "outputs": [], @@ -226,7 +210,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "9d38d541-5c9c-4d31-8986-9c3928eb2f59", "metadata": {}, "outputs": [], @@ -271,7 +255,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "7201d5e3-f765-4e5d-9bbd-aa6a336bcc0a", "metadata": {}, "outputs": [], @@ -281,220 +265,80 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "2c5107cb-c574-449b-95b6-fb205f38502e", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2023-06-26 09:14:38.408932\n", - "Done with meters_\n", - "Done with sec_\n", - "sec_cat meters_cat \n", - "sec is avg meters is avg 2415102\n", - " meters is high 70745\n", - " meters is low 139528\n", - "sec is high meters is avg 57245\n", - " meters is high 83074\n", - " meters is low 13695\n", - "sec is low meters is low 296973\n", - "dtype: int64\n", - "590515 rows left after filtering for rows with either high seconds OR low meters\n", - "division by 0 296973\n", - "meters too low 153223\n", - "seconds too high 140319\n", - "Name: flag, dtype: int64\n", - "Took 0:02:17.630093\n" - ] - } - ], - "source": [ - "m2 = categorize_meters_speeds_pandas(m1)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, + "outputs": [], + "source": [ + "m2 = categorize_meters_speeds_pandas(subset)" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "2d1bf90c-d9ed-4861-a1be-23f356165a4c", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "division by 0 296973\n", - "Name: flag, dtype: int64" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "m2.flag.value_counts()" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": 
"ce0fbb35-f81e-4343-92d2-4382d2173dbd", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2779389" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len(m1)-len(m2)" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "508f1411-4328-4b80-a029-0ae516107ed0", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "296973" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len(m2)" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "9fe08d2a-b874-4439-aa5b-a52de58cad09", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(45357, 72067)" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "m2.trip_id.nunique(), m1.trip_id.nunique()" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "8e6d31ab-46a7-4e20-bb2f-9cac1a2d672d", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(2682, 4837)" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "m2.shape_array_key.nunique(), m1.shape_array_key.nunique()" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "id": "647fad46-7f9b-4ce2-a26a-1ea69d02daee", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(63, 76)" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "m2._gtfs_dataset_name.nunique(), m1._gtfs_dataset_name.nunique()" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "id": "83036ccc-7339-42c2-b1f7-183734253c21", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
shape_array_key
loop_or_inlining
02682
\n", - "
" - ], - "text/plain": [ - " shape_array_key\n", - "loop_or_inlining \n", - "0 2682" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "m2.groupby([\"loop_or_inlining\"]).agg({\"shape_array_key\": \"nunique\"})" ] @@ -509,7 +353,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "468be3c9-7a24-4f01-84fd-31c137bc45e8", "metadata": { "scrolled": true, @@ -524,7 +368,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "id": "4350f540-8f6b-4fb0-8b16-836245c0e44c", "metadata": { "scrolled": true, @@ -538,7 +382,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "id": "ac68bdf7-26a0-4679-9a35-26f8a670018a", "metadata": {}, "outputs": [], @@ -548,7 +392,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": null, "id": "81d443cc-122f-46f1-87ec-dbdc74e0ca6a", "metadata": {}, "outputs": [], @@ -558,131 +402,24 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "id": "314d9baf-de0e-460a-8c29-4504ba94cfa6", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 2682.00\n", - "mean 82.86\n", - "std 26.65\n", - "min 1.52\n", - "25% 75.00\n", - "50% 100.00\n", - "75% 100.00\n", - "max 100.00\n", - "Name: percent_of_trips_with_problematic_rows, dtype: float64" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df3['percent_of_trips_with_problematic_rows'].describe()" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "id": "5de3efe6-2233-4251-93a8-1f8dd6fb2dae", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
shape_array_keytrips_with_zeroall_tripspercent_of_trips_with_problematic_rows
139782f0e3379d90a630b9e42e5ec79e02796785.71
3331e469c778efe30b55db3dd93ee1d9946192095.00
2060c750d9ce7a9e659d5d443f0925e175e76785.71
90859626d7e12b3fec5d917b3e052e87d701717100.00
470485a3b83c38283730ce3e9372baf0312366.67
\n", - "
" - ], - "text/plain": [ - " shape_array_key trips_with_zero all_trips \\\n", - "1397 82f0e3379d90a630b9e42e5ec79e0279 6 7 \n", - "333 1e469c778efe30b55db3dd93ee1d9946 19 20 \n", - "2060 c750d9ce7a9e659d5d443f0925e175e7 6 7 \n", - "908 59626d7e12b3fec5d917b3e052e87d70 17 17 \n", - "47 0485a3b83c38283730ce3e9372baf031 2 3 \n", - "\n", - " percent_of_trips_with_problematic_rows \n", - "1397 85.71 \n", - "333 95.00 \n", - "2060 85.71 \n", - "908 100.00 \n", - "47 66.67 " - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df3.sample(5)" + "outputs": [], + "source": [ + "# df3.sample(5)" ] }, { @@ -691,13 +428,13 @@ "metadata": {}, "source": [ "### Investigate \n", - "#### Stage3: \"vp_pared_stops\"\n", - "* Keeps only first and last point of a segment." + "#### Stage3: \"vp_pared_stops\"/A3_loop_inlining\n", + "* Rewrite this part to filter read_parquet with the shape array and whatnot" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "id": "a2a705af-b588-463b-b6ce-f999b2050208", "metadata": {}, "outputs": [], @@ -705,8 +442,17 @@ "def load_vp_stage3(flagged_df:pd.DataFrame, date:str) -> pd.DataFrame:\n", " \n", " # Subset the dataframe and use it to filter out for only the values of interest\n", - " flagged_df = flagged_df[['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key']]\n", - " vp = pd.read_parquet(f\"{speed_utils.GCS_PATH}vp_pared_stops_{date}\")\n", + " shape_array_keys = flagged_df.shape_array_key.unique().tolist()\n", + " stop_seq = flagged_df.stop_sequence.unique().tolist() \n", + " trip_id = flagged_df.trip_id.unique().tolist() \n", + " gtfs_dataset_key = flagged_df.gtfs_dataset_key.unique().tolist() \n", + " \n", + " #flagged_df = flagged_df[['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key']]\n", + " vp = pd.read_parquet(f\"{SEGMENT_GCS}vp_pared_stops_{date}\",\n", + " filters = [[('shape_array_key', \"in\", shape_array_keys),\n", + " 
('stop_sequence', 'in', stop_seq), \n", + " ('trip_id', 'in', trip_id), \n", + " ('gtfs_dataset_key', 'in', gtfs_dataset_key)]],)\n", " \n", " # Merge to filter\n", " vp2 = pd.merge(flagged_df, vp, how = \"inner\", on = ['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key'])\n", @@ -716,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "id": "1e36c5fc-ab3f-4129-97f9-ad9472b7d32a", "metadata": {}, "outputs": [], @@ -726,7 +472,17 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, + "id": "1d6fe654-40ca-4758-bc2c-316e33d1a9d1", + "metadata": {}, + "outputs": [], + "source": [ + "# vp = pd.read_parquet(f\"{SEGMENT_GCS}vp_pared_stops_{analysis_date}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "148e75f1-08dd-44c8-8179-319164d8e020", "metadata": { "tags": [] @@ -739,7 +495,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "id": "b4350206-c237-44a3-abce-f8f38cde8117", "metadata": { "scrolled": true, @@ -753,7 +509,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "id": "aa1e56d1-ec07-436c-8763-7bcf3dcbf7d4", "metadata": { "scrolled": true, @@ -767,7 +523,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "id": "22e42aae-9281-4040-ab8c-6a10b93f6cf4", "metadata": { "scrolled": true, @@ -781,7 +537,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "0f21f08f-d4eb-4bbd-94d3-f4b031e97cf4", "metadata": {}, "outputs": [], @@ -807,7 +563,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "id": "5ce07566-c1f0-4fa7-9550-2fa07b98dba8", "metadata": {}, "outputs": [], @@ -838,7 +594,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "id": "66e83169-2b4a-4912-bc0e-1a0b3e8deea6", "metadata": {}, "outputs": [], @@ -895,1328 +651,646 @@ " return m1" ] }, - { - "cell_type": "code", - 
"execution_count": 37, - "id": "cab32ef3-cc66-40ce-aa19-59631734f539", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2023-06-26 09:17:07.320679\n", - "check in stage 2 538914\n", - "repeated timestamps 54883\n", - "repeated timestamps & locations 107\n", - "repeated locations 42\n", - "Name: stage3_flag, dtype: int64\n", - "Have to check 90.73451121819154 % of rows in stage 2\n", - "Took 0:00:27.798047\n" - ] - } - ], - "source": [ - "m3 = flag_stage3(m2, analysis_date)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "68a9dbba-ee6b-42b1-9203-1146d6cd56e9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(593946, 11)" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "m3.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "21799f42-873e-41bd-b764-42cc297686a6", - "metadata": {}, - "outputs": [], - "source": [ - "sort_cols = ['trip_id', 'shape_array_key', 'stop_sequence']" - ] - }, - { - "cell_type": "markdown", - "id": "4b1876cf-9e8b-4c30-8723-2226133b8e01", - "metadata": {}, - "source": [ - "### Stage2: \"vp_stop_segment\"\n", - "* Were the right points kept?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "933b83af-137c-4402-86ac-ebd3f2693ee1", - "metadata": {}, - "outputs": [], - "source": [ - "subset_cols = ['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key', 'loop_or_inlining', 'stop_id', 'meters_elapsed','sec_elapsed']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "55df584c-f36d-4c3d-9760-dd9ecb0471a6", - "metadata": {}, - "outputs": [], - "source": [ - "m_cols = ['gtfs_dataset_key', 'stop_sequence','shape_array_key', 'loop_or_inlining','stop_id']" - ] - }, { "cell_type": "code", "execution_count": null, - "id": "68fba15b-7c2d-4c1b-a556-286dc4acc4e1", + "id": "cab32ef3-cc66-40ce-aa19-59631734f539", "metadata": {}, "outputs": [], "source": [ - "# Find rows that need to be tagged in stage 2\n", - "stage2_rows = m3[m3.stage3_flag == \"check in stage 2\"].reset_index()" + "m3 = flag_stage3(m2, analysis_date)" ] }, { "cell_type": "code", "execution_count": null, - "id": "f14e0ab1-59d2-42cc-884c-868da650cfa6", + "id": "68a9dbba-ee6b-42b1-9203-1146d6cd56e9", "metadata": {}, "outputs": [], "source": [ - "stage2_routes = stage2_rows.shape_array_key.unique().tolist() " + "m3.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "ddb1ae2a-d66a-45b7-9dd9-2b11f7d325b2", + "id": "1cca329c-14bc-4ad5-9465-1a63ca53df49", "metadata": {}, "outputs": [], "source": [ - "# Use flagged df\n", - "stage2_rows = m2[m2.shape_array_key.isin(stage2_routes)].reset_index(drop = True)" + "m3 = m3[m3.stage3_flag == \"check in stage 2\"]" ] }, { "cell_type": "code", "execution_count": null, - "id": "bf64a406-1124-423c-8577-705a21c9b422", + "id": "93e87778-edef-4d62-98aa-a4241f177892", "metadata": {}, "outputs": [], "source": [ - "# Subset df to filter the vp \n", - "subset_for_merge = stage2_rows[subset_cols].drop_duplicates().reset_index(drop = True)" + "m3.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "6397dc45-c271-4057-a0d8-1962846d4f94", + "id": 
"21799f42-873e-41bd-b764-42cc297686a6", "metadata": {}, "outputs": [], "source": [ - "# What's the diff between stop segments normal/special/and without any notation?\n", - "stg2 = gpd.read_parquet(f\"{speed_utils.GCS_PATH}stop_segments_{analysis_date}.parquet\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "45440033-aae7-4f94-9495-5e14529b7c5c", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "# Merge\n", - "stg2_m = pd.merge(stg2,\n", - " subset_for_merge, \n", - " how = \"inner\",\n", - " on = m_cols\n", - " )" + "sort_cols = ['trip_id', 'shape_array_key', 'stop_sequence']" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "fe37945a-26d9-4891-831f-1bb92f85b39e", + "cell_type": "markdown", + "id": "4b1876cf-9e8b-4c30-8723-2226133b8e01", "metadata": {}, - "outputs": [], "source": [ - "stage2_rows.shape" + "#### Stage2: \"vp_stop_segment\"/A1_sjoin_vp_segments\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "145e129e-b119-44d9-940b-18ac70c93e0a", + "id": "0a469849-f903-44e4-9d2a-4f3775270a52", "metadata": {}, "outputs": [], "source": [ - "for i in ['geometry','geometry_arrowized']:\n", - " print(f\"{i}: {stg2_m[i].is_valid.sum()/len(stg2_m)}\")\n", - " print(f\"{i}: {len(stg2_m[stg2_m[i].is_empty])}\")" + "# Select one route to look at\n", + "test_route = \"106d979b9a9e6338827a8e1c145e69fd\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "fab30694-9948-4e82-b19b-0814a459c340", + "id": "c23a767c-80b5-439d-97e2-b7fe5e6bfd06", "metadata": {}, "outputs": [], "source": [ - "# Delete out empty geo \n", - "filtered = stg2[~stg2.geometry_arrowized.is_empty]" + "test_sequence = 39" ] }, { "cell_type": "code", "execution_count": null, - "id": "4d9238cc-99da-49a1-abf1-392e3bd5bcc7", + "id": "6e946c68-3476-459d-a869-77ac37b5fb07", "metadata": {}, "outputs": [], "source": [ - "subset_for_merge.sample()" + "test_gtfs_key = \"db56b50ab86b5f7a4ae2fc2dd9889bbe\"" ] }, { 
"cell_type": "code", "execution_count": null, - "id": "a72b5744-ff22-44b0-834d-5b3718162661", + "id": "b4fa40bf-387c-4301-ba13-2bd16b15cd24", "metadata": {}, "outputs": [], "source": [ - "# Delete out empty geometry arrowized\n", - "geo_arrowized = stg2_m[~stg2_m.geometry_arrowized.is_empty]" + "test_trip = '1088405'" ] }, { "cell_type": "markdown", - "id": "cf7a7ff5-7aca-4c1b-8751-34981d90da15", + "id": "ef18eb20-a43e-4d32-80eb-7f902116a944", "metadata": {}, "source": [ - "#### Look at the original routes" + "#### Look at export file" ] }, { "cell_type": "code", "execution_count": null, - "id": "97403c1c-e39f-4179-a7bb-2fc8588d3249", + "id": "6397dc45-c271-4057-a0d8-1962846d4f94", "metadata": {}, "outputs": [], "source": [ - "# This is the original dataframe loaded in from merging stage\n", - "# It's not even flagged. \n", - "original = pd.merge(filtered, \n", - " subset[subset_cols],\n", - " how = \"inner\", \n", - " on = m_cols)" + "def import_stage_2(date:str, route:str, stop_sequence:str):\n", + " df = pd.read_parquet(\n", + " f\"{SEGMENT_GCS}vp_sjoin/vp_stop_segment_{date}\",\n", + " filters = [[('shape_array_key', \"==\", route),\n", + " ('stop_sequence', \"==\", stop_sequence)]],\n", + " )\n", + " return df" ] }, { "cell_type": "code", "execution_count": null, - "id": "2fad7b35-d1bf-4df2-99d7-b63b2381ba34", + "id": "fe8f800a-f180-4495-a387-0367528823ba", "metadata": {}, "outputs": [], "source": [ - "len(geo_arrowized)" + "# stg2 = import_stage_2(analysis_date, test_route, test_sequence)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "c86b76a0-bc6f-4e85-811c-0076892828da", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Find number of messed up sequences...are the same sequences being hit?\n", - "\"\"\"\n", - "(subset_for_merge\n", - " .groupby(['shape_array_key','stop_sequence'])\n", - " .agg({'trip_id':'nunique'})\n", - " .rename(columns = {'trip_id':'number_of_trips_with_problematic_stop_seq'})\n", - " 
.sort_values(['shape_array_key','number_of_trips_with_problematic_stop_seq']\n", - " , ascending = False)\n", - ")\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e0c6367d-cd6b-44e8-a325-4f2bb1615599", + "cell_type": "markdown", + "id": "4b5dec8d-c4f5-49dd-9a11-1b10ff30fb55", "metadata": { - "scrolled": true, "tags": [] }, - "outputs": [], - "source": [ - "# subset_for_merge.groupby(['shape_array_key','trip_id']).agg({'stop_sequence':'nunique'})" - ] - }, - { - "cell_type": "markdown", - "id": "a6cdedf7-c112-40ac-ac3e-81bf1144cefe", - "metadata": {}, - "source": [ - "#### Look at all the stop sequences vs the ones flagged as 0 for each trip" - ] - }, - { - "cell_type": "markdown", - "id": "44ea63a5-afdb-4030-b3d3-ec8804f3116b", - "metadata": {}, - "source": [ - "##### 106d979b9a9e6338827a8e1c145e69fd\n", - "* 1088383\n", - "* 1088403" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "298f3ddb-0fde-4b90-b9b4-5ea9e45d461d", - "metadata": {}, - "outputs": [], "source": [ - "# SEgments that show up have something wrong with them\n", - "geo_arrowized[geo_arrowized.trip_id == '1088383'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" + "#### Look at vp trips -> import unique trips" ] }, { "cell_type": "code", "execution_count": null, - "id": "e42cd7cf-3521-498c-be7d-c08146fe8070", + "id": "ade9e07f-0b55-4561-96d1-6fd6adec0f1a", "metadata": {}, "outputs": [], "source": [ - "# SEgments that show up have something wrong with them\n", - "geo_arrowized[geo_arrowized.trip_id == '1088403'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" + "def import_unique_trips(gtfs_key:str, trip: str, route:str):\n", + " vp_trips = A1_sjoin_vp_segments.add_grouping_col_to_vp(\n", + " f\"vp_usable_{analysis_date}\",\n", + " analysis_date,\n", + " [\"shape_array_key\"]\n", + " )\n", + " \n", + " df = vp_trips[(vp_trips.gtfs_dataset_key == gtfs_key)\n", + " 
& (vp_trips.shape_array_key == route)\n", + " & (vp_trips.trip_id == trip)].reset_index(drop = True)\n", + " return df\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "b28c57bb-95ee-4a27-85c9-12cb0d6f3c19", + "id": "c8003044-b7e4-477e-9395-fa881a2fa2b3", "metadata": {}, "outputs": [], "source": [ - "original[original.trip_id == '1088383'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" + "unique_trips = import_unique_trips(test_gtfs_key, test_trip, test_route)" ] }, { "cell_type": "markdown", - "id": "80f135f9-141e-4e43-9e9e-5e483fcd4f23", + "id": "52ce333b-9f75-4c9b-a1af-130c93786f94", "metadata": {}, "source": [ - "#### 0fb4f3627996269dc7075276d3b69e36 \n", - "* 16939089" + "#### Look at vehicle positions" ] }, { "cell_type": "code", "execution_count": null, - "id": "c9121bc1-9192-45fa-976a-4f818c4a39a7", + "id": "ac6e78c3-694d-4297-8db1-f0f4d6faadbf", "metadata": {}, "outputs": [], "source": [ - "geo_arrowized[geo_arrowized.trip_id == '16939089'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4213b966-f843-4d14-ab9a-f2344df1962f", - "metadata": {}, - "outputs": [], - "source": [ - "original[original.trip_id == '16939089'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" - ] - }, - { - "cell_type": "markdown", - "id": "6f048e93-5ffc-45e8-8cc2-24d891a9dac3", - "metadata": { - "tags": [] - }, - "source": [ - "#### 000624bd8453dbe4f2eb2765b04bcb98 \n", - "* 1350" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75d7e270-48c5-4421-ae89-50d696604da0", - "metadata": {}, - "outputs": [], - "source": [ - "geo_arrowized[geo_arrowized.trip_id == '1359'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"6ac43a58-184c-41b3-9d53-9e21ef321bc3", - "metadata": {}, - "outputs": [], - "source": [ - "geo_arrowized[geo_arrowized.trip_id == '1350'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "68a59632-cfbd-43fd-aca4-a502e400a854", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "m2[m2.trip_id == '1350']" + "def import_vehicle_positions(unique_trips:pd.DataFrame, gtfs_key:str, trip_id:str)-> gpd.GeoDataFrame:\n", + " vp = helpers.import_vehicle_positions(\n", + " SEGMENT_GCS,\n", + " f\"vp_usable_{analysis_date}/\",\n", + " filters = [[(\"gtfs_dataset_key\", \"==\", gtfs_key),\n", + " ('trip_id', '==', trip_id)]],\n", + " columns = [\"gtfs_dataset_key\", \"trip_id\", \n", + " \"vp_idx\", \"x\", \"y\"],\n", + " partitioned = True\n", + " )\n", + " \n", + " vp = vp.compute()\n", + " vp = vp.merge(unique_trips, on = [\"gtfs_dataset_key\", \"trip_id\"],\n", + " how = \"inner\"\n", + " )\n", + " \n", + " vp_gdf = gpd.GeoDataFrame(\n", + " vp, \n", + " geometry = gpd.points_from_xy(vp.x, vp.y, crs = \"EPSG:4326\")\n", + " ).to_crs(PROJECT_CRS).drop(columns = [\"x\", \"y\"])\n", + " \n", + " return vp_gdf" ] }, { "cell_type": "code", "execution_count": null, - "id": "60a53891-66cb-40ea-a42a-55ea290c6584", + "id": "b47ea0cb-6031-4d98-b963-efbef949d169", "metadata": {}, "outputs": [], "source": [ - "original[original.trip_id == '1350'].set_geometry(\"geometry_arrowized\").explore('stop_sequence', style_kwds = {'weight':10})" + "vehicle_positions = import_vehicle_positions(unique_trips, test_gtfs_key, test_trip)" ] }, { "cell_type": "code", "execution_count": null, - "id": "dad7c9b8-9025-4d91-bf75-0e23c3ac2a52", + "id": "f0f96480-8328-43ed-9add-0a74b533fc8d", "metadata": {}, "outputs": [], "source": [ - "summarize(few_routes_cat, high_low_zero)" + "len(vehicle_positions)" ] }, { "cell_type": "markdown", - "id": 
"cde431f9-10ad-484f-b954-dd3c13a6e683", - "metadata": { - "tags": [] - }, - "source": [ - "#### Draft\n", - "* Show which stops are excluded from flags\n", - "* Show how many stops are dropped\n", - "* Show % of stops that were flagged compared to total stops." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6a9ba66d-5421-4170-9201-881ad3704b39", + "id": "896e00be-27f2-43b7-9cb4-69d61a061af0", "metadata": {}, - "outputs": [], "source": [ - "def read_back_gcs():\n", - " # Read back all the partitioned stuff - grab the file number\n", - " # part0.parquet, part1.parquet\n", - " start = datetime.datetime.now()\n", - " print(f\"Begin: {start}\")\n", - " gcs_file_path1 = f\"{speed_utils.GCS_PATH}partitioned_flags\"\n", - " file_names_dask = extract_number(gcs_file_path1, \"part\")\n", - "\n", - " # https://www.geeksforgeeks.org/read-multiple-csv-files-into-separate-dataframes-in-python/\n", - " # create empty list\n", - " all_df = []\n", - "\n", - " # append datasets into the list\n", - " for i in range(len(file_names_dask)):\n", - " gcs_file_path2 = f\"{gcs_file_path1}/part.\"\n", - " temp_df = dd.read_parquet(f\"{gcs_file_path2}{file_names_dask[i]}.parquet\")\n", - " all_df.append(temp_df)\n", - "\n", - " final_df = dd.concat(all_df, axis=0).reset_index(drop=True)\n", - " print(\"Begin computing\")\n", - " final_df = final_df.compute()\n", - " print(\"Done computing\")\n", - " end = datetime.datetime.now()\n", - " print(f\"Finish: {end-start}\")\n", - " return final_df" + "#### Look at segments" ] }, { "cell_type": "code", "execution_count": null, - "id": "8354439b-0ffc-4b3f-8114-05516a8e48ef", + "id": "17bb5083-ace2-4e2d-8400-3bf948625909", "metadata": {}, "outputs": [], "source": [ - "def categorize_by_percentile_pandas(\n", - " df: pd.DataFrame, column_percentile: str, column_str: str\n", - ") -> pd.DataFrame:\n", - "\n", - " # Find percentiles\n", - " agg1 = (\n", - " df.groupby([\"shape_array_key\", 
\"stop_sequence\"])[column_percentile]\n", - " .describe(percentiles=[0.05, 0.95])\n", - " .reset_index()\n", - " .add_prefix(column_str)\n", - " )\n", - " \n", - " # Merge \n", - " m1 = dd.merge(\n", - " df,\n", - " agg1,\n", - " how=\"inner\",\n", - " left_on=[\"shape_array_key\", \"stop_sequence\"],\n", - " right_on=[\n", - " f\"{column_str}shape_array_key\",\n", - " f\"{column_str}stop_sequence\",\n", - " ],\n", - " )\n", - " \n", - " def percentile(row):\n", - "\n", - " if row[column_percentile] == row[f\"{column_str}mean\"]:\n", - " return f\"{column_str} elapsed avg\"\n", - " elif row[f\"{column_str}5%\"] < row[column_percentile] <= row[f\"{column_str}95%\"]:\n", - " return f\"{column_str} elapsed avg\"\n", - " elif row[column_percentile] <= row[f\"{column_str}5%\"]:\n", - " return f\"{column_str} elapsed low\"\n", - " elif row[column_percentile] > row[f\"{column_str}95%\"]:\n", - " return f\"{column_str} elapsed high\"\n", - "\n", - " else:\n", - " return f\"{column_str} elapsed avg\"\n", + "def import_segments(flagged_df: pd.DataFrame, route:str, gtfs_key:str) -> gpd.GeoDataFrame:\n", + " gdf = gpd.read_parquet(f\"{SEGMENT_GCS}stop_segments_{analysis_date}.parquet\",\n", + " filters = [[(\"shape_array_key\", \"==\", route),\n", + " (\"gtfs_dataset_key\", \"==\", gtfs_key),\n", + " ]]).to_crs(PROJECT_CRS)\n", " \n", + " gdf[\"geometry_buffered\"] = gdf.geometry.buffer(35)\n", + " gdf = gdf.set_geometry('geometry_buffered')\n", " \n", - " # Apply flags\n", - " m1[f\"{column_str}cat\"] = m1.apply(lambda x: percentile(x), axis=1)\n", - " \n", - " # Delete out any average columns\n", - " m1 = m1.loc[m1[f\"{column_str}cat\"] != f\"{column_str} elapsed avg\"].reset_index(drop = True)\n", + " # Distinguish between \"correct\" and \"incorrect\" seq\n", + " # A sequence can be incorrect even if just one row is \"divided by 0\"\n", + " incorrect_segments = flagged_df[(flagged_df.shape_array_key == route) & (flagged_df.gtfs_dataset_key == gtfs_key)]\n", + " 
incorrect_segments_list = incorrect_segments.stop_sequence.unique().tolist()\n", + " incorrect_segments_filtered = gdf[gdf.stop_sequence.isin(incorrect_segments_list)].reset_index(drop = True)\n", + " incorrect_segments_filtered['flag'] = 'incorrect'\n", " \n", - " # Clean\n", - " m1[f\"{column_str}cat\"] = m1[f\"{column_str}cat\"].str.replace(\"_\", \"\")\n", + " # Filter for correct segments\n", + " correct_segments = flagged_df[~flagged_df.stop_sequence.isin(incorrect_segments_list)]\n", + " correct_segments_list = correct_segments.stop_sequence.unique().tolist()\n", + " correct_segments_filtered = gdf[gdf.stop_sequence.isin(correct_segments_list)].reset_index(drop = True)\n", + " correct_segments_filtered['flag'] = 'correct'\n", " \n", - " columns_to_keep = [\n", - " \"shape_array_key\",\n", - " \"gtfs_dataset_key\",\n", - " \"_gtfs_dataset_name\",\n", - " \"speed_mph\",\n", - " \"loop_or_inlining\",\n", - " \"stop_sequence\",\n", - " \"stop_id\",\n", - " \"trip_id\",\n", - " \"n_trips\",\n", - " \"p20_mph\",\n", - " \"p80_mph\",\n", - " \"p50_mph\",\n", - " \"time_of_day\",\n", - " \"meters_elapsed\",\n", - " \"sec_elapsed\",\n", - " f\"{column_str}5%\",\n", - " f\"{column_str}95%\",\n", - " f\"{column_str}cat\",\n", - " ]\n", - " m1 = m1[columns_to_keep]\n", - " print(f\"Done with {column_str}\")\n", + " final = pd.concat([correct_segments_filtered, incorrect_segments_filtered])\n", " \n", - " return m1 " + " return final" ] }, { "cell_type": "code", "execution_count": null, - "id": "18c06e7d-9b09-4573-bdf6-1dc753b3d552", + "id": "9f3a302a-f604-49fe-ae9b-ee8db85466de", "metadata": {}, "outputs": [], "source": [ - "def extract_number(folder: str, phrase_to_find: str) -> list:\n", - " \"\"\"\n", - " Extract the numeric portion of a file path.\n", - " \"\"\"\n", - " files = find_files(folder, phrase_to_find)\n", - " all_file_numbers = []\n", - " for file in files:\n", - " # 
https://stackoverflow.com/questions/11339210/how-to-get-integer-values-from-a-string-in-python\n", - " file_number = \"\".join(i for i in file if i.isdigit())\n", - " all_file_numbers.append(file_number)\n", - " return all_file_numbers" + "# flagged_segments = import_segments(m3, test_route, test_gtfs_key)" ] }, { "cell_type": "code", "execution_count": null, - "id": "dba2d199-47b4-4116-9ddb-adc1259ea3e2", + "id": "e8a14cfd-38b8-4326-9eac-a711f1a189e8", "metadata": {}, "outputs": [], "source": [ - "# Find all the parquets again\n", - "def find_files(folder: str, phrase_to_find: str) -> list:\n", - " \"\"\"\n", - " Grab a list of files that contain the\n", - " phrase inputted.\n", - " \"\"\"\n", - " # Create a list of all the files in my folder\n", - " all_files_in_folder = fs.ls(folder)\n", - " my_files = [i for i in all_files_in_folder if phrase_to_find in i]\n", - "\n", - " # String to add to read the files\n", - " my_string = \"gs://\"\n", - " my_files = [my_string + i for i in my_files]\n", - "\n", - " # Extract digit of parquet\n", - " return my_files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "354cf0e3-7b2c-4403-bef9-74533691a0e9", - "metadata": {}, - "outputs": [], - "source": [ - "def categorize_by_percentile(\n", - " df: pd.DataFrame, column_percentile: str, column_str: str\n", - ") -> dd.DataFrame:\n", - " \n", - " # Find percentiles\n", - " agg1 = (\n", - " df.groupby([\"shape_array_key\", \"stop_sequence\"])[column_percentile]\n", - " .describe(percentiles=[0.05, 0.95])\n", - " .reset_index()\n", - " .add_prefix(column_str)\n", - " )\n", - " \n", - " \n", - " # Convert to dask because it takes a very long time\n", - " agg1_dask = dd.from_pandas(agg1, npartitions=1)\n", - " df_dask = dd.from_pandas(df, npartitions=1)\n", - "\n", - " # Merge using dask\n", - " merge1_dask = dd.merge(\n", - " df_dask,\n", - " agg1_dask,\n", - " how=\"inner\",\n", - " left_on=[\"shape_array_key\", \"stop_sequence\"],\n", - " right_on=[\n", - " 
f\"{column_str}shape_array_key\",\n", - " f\"{column_str}stop_sequence\",\n", - " ],\n", - " )\n", - "\n", - " def percentile(row):\n", - "\n", - " if row[column_percentile] == row[f\"{column_str}mean\"]:\n", - " return f\"{column_str} elapsed avg\"\n", - " elif row[column_percentile] <= row[f\"{column_str}5%\"]:\n", - " return f\"{column_str} elapsed low\"\n", - " elif row[column_percentile] == 0:\n", - " return f\"{column_str} elapsed is 0\"\n", - " elif row[f\"{column_str}5%\"] < row[column_percentile] <= row[f\"{column_str}95%\"]:\n", - " return f\"{column_str} elapsed avg\"\n", - " elif row[column_percentile] > row[f\"{column_str}95%\"]:\n", - " return f\"{column_str} elapsed high\"\n", - "\n", - " else:\n", - " return \"other\"\n", - "\n", - " merge1_dask[f\"{column_str}cat\"] = merge1_dask.apply(\n", - " lambda x: percentile(x), axis=1, meta=(f\"{column_str}cat\", \"string\")\n", - " )\n", - " \n", - " # Filter for only unsually high and low stuff\n", - " merge1_dask = merge1_dask[merge1_dask[f\"{column_str}cat\"].isin([f\"{column_str} elapsed high\", f\"{column_str} elapsed low\"]).reset_index(drop = True)\n", - " \n", - " # Clean\n", - " merge1_dask[f\"{column_str}cat\"] = merge1_dask[f\"{column_str}cat\"].str.replace(\n", - " \"_\", \"\"\n", - " )\n", - "\n", - " columns_to_keep = [\n", - " \"shape_array_key\",\n", - " \"gtfs_dataset_key\",\n", - " \"_gtfs_dataset_name\",\n", - " \"speed_mph\",\n", - " \"loop_or_inlining\",\n", - " \"stop_sequence\",\n", - " \"stop_id\",\n", - " \"trip_id\",\n", - " \"n_trips\",\n", - " \"p20_mph\",\n", - " \"p80_mph\",\n", - " \"p50_mph\",\n", - " \"time_of_day\",\n", - " \"meters_elapsed\",\n", - " \"sec_elapsed\",\n", - " f\"{column_str}5%\",\n", - " f\"{column_str}95%\",\n", - " f\"{column_str}cat\",\n", - " ]\n", - " merge1_dask = merge1_dask[columns_to_keep]\n", - " print(f\"Done with {column_str}\")\n", - " return merge1_dask" + "#segments = A1_sjoin_vp_segments.import_segments_and_buffer(\n", + " # 
f\"stop_segments_{analysis_date}\",\n", + "# 35,\n", + " # [\"shape_array_key\", \"stop_sequence\"]+ [\"seg_idx\", \"geometry\"]\n", + "#)" ] }, { "cell_type": "code", "execution_count": null, - "id": "d80535fb-8648-4216-918f-76e0484ba3ea", + "id": "3c08c38e-7419-4ff6-b74f-5f15615e52c4", "metadata": {}, "outputs": [], "source": [ - "def flag_round1(row):\n", - " if (row[\"meters_elapsed\"] == 0) & (row[\"sec_elapsed\"] == 0):\n", - " return \"division by 0\"\n", - " elif row[\"meters_cat\"] == \"meters elapsed low\":\n", - " return \"meters too low\"\n", - " elif row[\"seconds_cat\"] == \"seconds elapsed high\":\n", - " return \"seconds too high\"\n", - " else:\n", - " return \"ok\"\n", - " \n", - "#def flag_round2(row):\n", - "# if (row[\"meters_elapsed\"] == 0) & (row[\"sec_elapsed\"] == 0):\n", - "# return \"division by 0\"\n", - "# else:\n", - "# return \"meters/seconds are filled but flagged\"\n", - "\n", - "def categorize_meters_speeds_dask(df):\n", - " start = datetime.datetime.now()\n", - " print(f\"Begin: {start}\")\n", - "\n", - " # Find percentiles\n", - " df.speed_mph = df.speed_mph.fillna(0)\n", - "\n", - " # These are now dask dataframes\n", - " ddf_meters = categorize_by_percentile(df, \"meters_elapsed\", \"meters_\")\n", - " ddf_seconds = categorize_by_percentile(df, \"sec_elapsed\", \"seconds_\")\n", - "\n", - " merge_cols = [\n", - " \"shape_array_key\",\n", - " \"gtfs_dataset_key\",\n", - " \"_gtfs_dataset_name\",\n", - " \"speed_mph\",\n", - " \"loop_or_inlining\",\n", - " \"stop_sequence\",\n", - " \"stop_id\",\n", - " \"n_trips\",\n", - " \"p20_mph\",\n", - " \"p80_mph\",\n", - " \"p50_mph\",\n", - " \"meters_elapsed\",\n", - " \"sec_elapsed\",\n", - " \"trip_id\",\n", - " \"time_of_day\",\n", - " ]\n", - "\n", - " # Merge using dask\n", - " m1 = dd.merge(ddf_meters, ddf_seconds, how=\"inner\", on=merge_cols)\n", - "\n", - " # Apply flags\n", - " m1[\"flag\"] = m1.apply(lambda x: flag_round1(x), axis=1, meta=(\"flag\", \"string\"))\n", - " 
print(\"Apply first round of flags\")\n", - "\n", - " # Filter out for projects that are ok, retag for zeroes\n", - " m2 = m1[m1.flag != \"ok\"].reset_index()\n", - "\n", - " # Apply flag for zeroes\n", - " m2[\"flag_division_0\"] = m2.apply(\n", - " lambda x: flag_round2(x), axis=1, meta=(\"flag\", \"string\")\n", - " )\n", - " print(\"Apply second round of flags\")\n", - "\n", - " # Replace values in the original flag\n", - " # https://stackoverflow.com/questions/54302694/updating-the-values-of-a-column-in-a-dask-dataframe-based-on-some-condition-on-s\n", - " condition = m2.flag_division_0 == \"division by 0\"\n", - " m2[\"flag\"] = m2[\"flag\"].mask(condition, m2.flag_division_0)\n", - " print(\"Done flagging\")\n", - "\n", - " # Print value counts\n", - " # print(f\"breakout of rows after separating out for 0: \\n {m2.flag.value_counts().compute()}\")\n", - "\n", - " # Filter for only projects that are divided by 0\n", - " # m2 = m2[m2.flag == \"division by 0\"].reset_index()\n", - " # Delete older column\n", - " m2 = m2.drop(columns=[\"flag_division_0\", \"level_0\", \"index\"])\n", - " print(\"Drop columns\")\n", - "\n", - " # Save\n", - " # m2 = m2.repartition(partition_size=\"5MB\")\n", - " # m2.to_parquet(f\"{speed_utils.GCS_PATH}partitioned_flags\", overwrite = True)\n", - " print(\"Saved\")\n", - "\n", - " end = datetime.datetime.now()\n", - " print(f\"Finish: {end-start}\")\n", - "\n", - " return m2" + "# segments = segments.compute()" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "34840f61-1857-4f5c-b3a6-abcac18af0db", + "cell_type": "markdown", + "id": "5a688361-e7a9-40f0-877d-3693be99a960", "metadata": {}, - "outputs": [], "source": [ - "trips = list(equal_sampling.trip_id.unique())" + "#### Stops kept: last and first" ] }, { "cell_type": "code", "execution_count": null, - "id": "759ac80e-9c98-4c94-add3-859ab998e038", + "id": "9ea744bf-8019-4b7c-988f-f95196b56435", "metadata": {}, "outputs": [], "source": [ - "stops = 
list(equal_sampling.stop_id.unique())" + "def find_first_last_points(route:str, trip:str, gtfs_key:str):\n", + " df = pd.read_parquet(f\"{SEGMENT_GCS}vp_pared_stops_{analysis_date}\",\n", + " filters = [[('shape_array_key', \"==\", route),\n", + " \n", + " ('trip_id', \"==\", trip), \n", + " ('gtfs_dataset_key', '==', gtfs_key)]],)\n", + " \n", + " gdf = gpd.GeoDataFrame(\n", + " df, \n", + " geometry = gpd.points_from_xy(df.x, df.y, crs = \"EPSG:4326\")\n", + " ).to_crs(PROJECT_CRS).drop(columns = [\"x\", \"y\"])\n", + " \n", + " gdf = gdf[['geometry','stop_sequence']]\n", + " \n", + " return gdf" ] }, { "cell_type": "code", "execution_count": null, - "id": "94620388-876e-4e6d-80ab-ac68eb1061c9", + "id": "7c7aa90f-3e80-472e-b61d-94afd6c0ec01", "metadata": {}, "outputs": [], "source": [ - "# Plot some of the trips\n", - "sample_data = few_routes_cat[few_routes_cat.trip_id.isin(trips)].reset_index()" + "first_last = find_first_last_points(test_route, test_trip, test_gtfs_key)" ] }, { "cell_type": "code", "execution_count": null, - "id": "417804f1-9e24-4ac6-9c96-95f4da8f9693", + "id": "5634169a-f26b-4174-b46a-3aa872bc1bdb", "metadata": {}, "outputs": [], "source": [ - "sample_data.shape" + "len(first_last)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "339a67c7-3634-43d4-b865-b10119507c44", - "metadata": {}, - "outputs": [], + "cell_type": "markdown", + "id": "1aeb604b-249b-41e7-be71-fe8d3205e54a", + "metadata": { + "tags": [] + }, "source": [ - "# sample_data2 = sample_data[['shape_array_key','gtfs_dataset_key','trip_id']]" + "#### Mapping" ] }, { "cell_type": "code", "execution_count": null, - "id": "39231b51-f0ab-4bbe-80d0-8a2081849e50", + "id": "1a785180-38bb-4d33-a96e-3385c84ed2f1", "metadata": {}, "outputs": [], "source": [ - "plotting = sample_data.melt(\n", - " id_vars=[\n", - " \"_gtfs_dataset_name\",\n", - " \"shape_array_key\",\n", - " \"trip_id\",\n", - " \"stop_sequence\",\n", - " \"gtfs_dataset_key\",\n", - " 
\"loop_or_inlining\",\n", - " \"n_trips\",\n", - " \"meters_elapsed\",\n", - " \"meters_cat\",\n", - " \"seconds_cat\",\n", - " \"sec_elapsed\",\n", - " \"flag\",\n", - " \"p20_speed_mph\",\n", - " \"p80_speed_mph\",\n", - " \"median_speed_mph\",\n", - " ],\n", - " value_vars=[\"speed_mph\"],\n", - ")" + "def display_maps(all_points: gpd.GeoDataFrame, first_last_points: gpd.GeoDataFrame, segments: gpd.GeoDataFrame):\n", + " base1 = segments.explore('flag', cmap= 'tab10', height = 400, width = 600, name = 'segments')\n", + " \n", + " all_points_map = all_points.explore(m = base1, color = 'red',style_kwds = {'weight':5}, legend_kwds = {'caption': 'all_points'}, name= 'points')\n", + " \n", + " display(all_points_map) \n", + " first_last_map = first_last_points.explore('stop_sequence', cmap = 'tab10',style_kwds = {'weight':5},height = 400, width = 600,)\n", + " display(first_last_map)" ] }, { "cell_type": "code", "execution_count": null, - "id": "959bbe35-8c21-4795-b418-745a0caa94e8", + "id": "bab6bfc4-3d08-46fe-be33-6179ac5df34d", "metadata": {}, "outputs": [], "source": [ - "# Clean\n", - "plotting = threshold_utils.pre_clean(plotting)" + "# display_maps(vehicle_positions,first_last,flagged_segments)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "dd343cdd-acf3-4433-a702-8c3eb53ba1f0", + "cell_type": "markdown", + "id": "f7ed9dc6-80ce-46f9-ae59-a5ed2bbcad50", "metadata": {}, - "outputs": [], "source": [ - "plotting[\"Dropdown Menu\"] = plotting[\"Gtfs Dataset Name\"] + \" \" + plotting[\"Trip Id\"]" + "#### Function" ] }, { "cell_type": "code", "execution_count": null, - "id": "072f39d9-4bf9-4efc-a393-24f096cecf7e", + "id": "509bc2c0-14f3-4021-baf1-6d89a2409a79", "metadata": {}, "outputs": [], "source": [ - "def alt_dropdown(df, col_for_dropdown: str, dropdown_menu_title: str):\n", - " # Create dropdown menu\n", - " # Exclude \"none\" operators which are only scheduled data\n", - " df = df.loc[df[col_for_dropdown] != 
\"None\"][[col_for_dropdown]]\n", - " dropdown_list = df[col_for_dropdown].unique().tolist()\n", - "\n", - " # Show only first operator by default\n", - " initialize_first_op = sorted(dropdown_list)[0]\n", - " input_dropdown = alt.binding_select(\n", - " options=sorted(dropdown_list), name=dropdown_menu_title\n", - " )\n", - "\n", - " selection = alt.selection_single(\n", - " name=dropdown_menu_title,\n", - " fields=[col_for_dropdown],\n", - " bind=input_dropdown,\n", - " init={col_for_dropdown: initialize_first_op},\n", - " )\n", - "\n", - " return selection" + "def stage2_trouble_shooting(flagged_df:pd.DataFrame,\n", + " date:str, route:str, \n", + " stop_sequence:str, \n", + " trip:str, gtfs_key:str):\n", + " stg2 = import_stage_2(date, route, stop_sequence)\n", + " unique_trips = import_unique_trips(gtfs_key, trip, route)\n", + " \n", + " vehicle_positions = import_vehicle_positions(unique_trips, gtfs_key, trip)\n", + " flagged_segments = import_segments(flagged_df, route, gtfs_key)\n", + " first_last = find_first_last_points(route, trip, gtfs_key)\n", + " \n", + " display_maps(vehicle_positions,first_last,flagged_segments)\n", + " return vehicle_positions, first_last, flagged_segments\n", + " " ] }, { "cell_type": "code", "execution_count": null, - "id": "6b919e59-03e8-426f-b9b3-4468d5b1b06b", + "id": "7a4ae095-d010-46b5-80b2-0bbe948f249f", "metadata": {}, "outputs": [], "source": [ - "selection_test = alt_dropdown(plotting, \"Dropdown Menu\", \"Route\")" + "test1_allpts, test1_firstlast_pts, test1_flagged = stage2_trouble_shooting(flagged_df= m3,\n", + " date = analysis_date,\n", + " route = test_route,\n", + " stop_sequence = test_sequence,\n", + " trip = test_trip,\n", + " gtfs_key = test_gtfs_key)" ] }, { "cell_type": "code", "execution_count": null, - "id": "d7b9d14f-b708-4426-a1b8-4afb4e7c95ba", + "id": "b45a9b37-21ed-4ceb-9561-3c04568e8e68", "metadata": {}, "outputs": [], "source": [ - "(\n", - " threshold_utils.chart_size(\n", - " 
alt.Chart(plotting)\n", - " .mark_tick(\n", - " size=15,\n", - " thickness=5,\n", - " )\n", - " .encode(\n", - " x=\"Stop Sequence:N\",\n", - " y=\"Value:Q\",\n", - " color=alt.Color(\n", - " \"Flag:N\", scale=alt.Scale(range=cp.CALITP_CATEGORY_BOLD_COLORS)\n", - " ),\n", - " tooltip=plotting.columns.tolist(),\n", - " )\n", - " .interactive(),\n", - " 1100,\n", - " 400,\n", - " )\n", - " .add_selection(selection_test)\n", - " .transform_filter(selection_test)\n", - ")" + "test1_allpts.shape, test1_firstlast_pts.shape" ] }, { "cell_type": "code", "execution_count": null, - "id": "7ec90906-3364-440c-a951-eb5f2b1e84c2", + "id": "ad92a3aa-84db-4b25-9514-bd7be30861a2", "metadata": {}, "outputs": [], - "source": [ - "stop" - ] + "source": [] }, { "cell_type": "code", "execution_count": null, - "id": "140112e0-8ee0-481d-848c-4db091460418", + "id": "268c5f17-df4a-4d84-b3b0-d76a2727bcf7", "metadata": {}, "outputs": [], "source": [ - "type(vehicle_positions)" + "test_route2 = \"0fb4f3627996269dc7075276d3b69e36\"\n", + "test_stop = 13\n", + "test_gtfs_key2 = \"a4f6fd5552107e05fe9743ac7cce2c55\"\n", + "test_trip2 = \"16939095\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "d843325a-f33c-40de-a3cb-befed24d645e", + "id": "deaa6fdf-37b8-49ee-97f2-46f74d41a449", "metadata": {}, "outputs": [], "source": [ - "vehicle_positions2 = vehicle_positions[\n", - " vehicle_positions.trip_id.isin(trips)\n", - "].reset_index()" + "test2_allpts, test2_firstlast_pts, test2_flagged = stage2_trouble_shooting(flagged_df= m3,\n", + " date = analysis_date,\n", + " route = test_route2,\n", + " stop_sequence = test_stop,\n", + " trip = test_trip2,\n", + " gtfs_key = test_gtfs_key2)" ] }, { "cell_type": "code", "execution_count": null, - "id": "8840809b-dd6f-4c0e-a68b-0a37f508df14", + "id": "330951ba-846e-408f-a581-ce946df0cc76", "metadata": {}, "outputs": [], "source": [ - "vehicle_positions2.shape" + "test2_allpts.shape, test2_firstlast_pts.shape" ] }, { "cell_type": "code", 
"execution_count": null, - "id": "896b5c73-0835-4ee7-a4f2-9d960778fe35", + "id": "db8465be-8938-4ebe-977c-6a784141b4da", "metadata": {}, "outputs": [], "source": [ - "gdf1 = pd.merge(\n", - " vehicle_positions2,\n", - " sample_data,\n", - " how=\"inner\",\n", - " on=[\"gtfs_dataset_key\", \"_gtfs_dataset_name\", \"trip_id\"],\n", - ")" + "# test2_base = test2_flagged.explore('flag', cmap= 'tab10', height = 400, width = 600, name = 'segments')" ] }, { "cell_type": "code", "execution_count": null, - "id": "3725d07d-7b9f-483b-aa56-8d428b9f3d11", + "id": "0d784404-f514-4648-b9c8-7809fb2fa474", "metadata": {}, "outputs": [], "source": [ - "gdf1.shape" + "# test2_all_pts_map = test2_allpts.explore(m = test2_base, color = 'red',style_kwds = {'weight':5}, legend_kwds = {'caption': 'all_points'}, name= 'points')" ] }, { "cell_type": "code", "execution_count": null, - "id": "c82d7834-567b-4de4-b465-aea8c1a62715", + "id": "e58353e4-d9c9-402d-99f7-3dd259a4b00e", "metadata": {}, "outputs": [], "source": [ - "gdf1 = gdf1[gdf1.stop_id.isin(stops)]" + "# test2_all_pts_map" ] }, { "cell_type": "code", "execution_count": null, - "id": "a86cec8e-e7fd-48ce-9692-c766df2b68e8", + "id": "2d7e0287-d90f-45c0-b3f1-45a6a0e7a2aa", "metadata": {}, "outputs": [], "source": [ - "gdf1.shape" + "# test2_firstlast_pts.explore( color = 'red',style_kwds = {'weight':5}, height = 400, width = 600, )" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "3bf3c599-2aec-4ae7-b8c8-53a4eff8795a", + "cell_type": "markdown", + "id": "b0ff9ce9-9642-4f91-93de-f35bc45d2e1e", "metadata": { "tags": [] }, - "outputs": [], - "source": [ - "gdf1[\n", - " [\n", - " \"geometry\",\n", - " \"stop_id\",\n", - " \"stop_sequence\",\n", - " \"_gtfs_dataset_name\",\n", - " \"shape_array_key\",\n", - " \"speed_mph\",\n", - " \"flag\",\n", - " ]\n", - "].explore(\"flag\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f6d7831f-aed2-4e87-aae1-8ab6ddc08666", - "metadata": {}, - "outputs": [], 
- "source": [ - "stop" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aae6b6c0-ce47-4ca1-b104-68b39ebcf2ca", - "metadata": {}, - "outputs": [], - "source": [ - "high_low_zero2 = high_low_zero.melt(\n", - " id_vars=[\n", - " \"_gtfs_dataset_name\",\n", - " \"shape_array_key\",\n", - " \"trip_id\",\n", - " \"stop_sequence\",\n", - " \"gtfs_dataset_key\",\n", - " \"loop_or_inlining\",\n", - " \"n_trips\",\n", - " \"meters_cat\",\n", - " \"seconds_cat\",\n", - " \"unusual_flag\",\n", - " \"time_of_day\",\n", - " ],\n", - " value_vars=[\"median_speed_mph\", \"speed_mph\", \"p20_speed_mph\", \"p80_speed_mph\"],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "767abd20-d030-42d3-b85f-6d3023d69b8a", - "metadata": {}, - "outputs": [], - "source": [ - "high_low_zero2 = high_low_zero2.drop_duplicates(\n", - " subset=[\n", - " \"loop_or_inlining\",\n", - " \"shape_array_key\",\n", - " \"stop_sequence\",\n", - " \"time_of_day\",\n", - " \"variable\",\n", - " \"value\",\n", - " ]\n", - ").reset_index(drop=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52e7ee5a-a40e-423e-ba4b-dea14de17982", - "metadata": {}, - "outputs": [], - "source": [ - "high_low_zero2.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "23e7f746-b1b5-402f-92fd-dbc74840e013", - "metadata": {}, - "outputs": [], - "source": [ - "merge1.shape_array_key.nunique(), high_low_zero.shape_array_key.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a25f067-956e-46a7-aa7a-5abf57e662f6", - "metadata": {}, - "outputs": [], - "source": [ - "# Clean\n", - "high_low_zero2 = threshold_utils.pre_clean(high_low_zero2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5eea02c-05ec-4707-a06d-9de1864e8fbe", - "metadata": {}, - "outputs": [], "source": [ - "# Add dropdown menu\n", - "high_low_zero2[\"Dropdown Menu\"] = (\n", - " high_low_zero2[\"Gtfs Dataset 
Name\"] + \" \" + high_low_zero2[\"Shape Array Key\"]\n", - ")" + "##### Sjoin" ] }, { "cell_type": "code", "execution_count": null, - "id": "571b471f-4a66-474f-8900-c3eaffde441e", + "id": "fd2a6beb-a2e9-498c-beed-ce7365ffa7a6", "metadata": {}, "outputs": [], "source": [ - "high_low_zero2[\"Route Type\"] = \"Route Type: \" + high_low_zero2[\n", - " \"Loop Or Inlining\"\n", - "].astype(str)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b7b429be-057c-4692-927e-92107b015ae6", - "metadata": {}, - "outputs": [], - "source": [ - "selection_test = alt_dropdown(high_low_zero2, \"Dropdown Menu\", \"Route\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa0fad8b-b49e-48be-8070-adaf6e63d541", - "metadata": {}, - "outputs": [], - "source": [ - "# https://github.com/altair-viz/altair/issues/1168\n", - "title = (\n", - " alt.Chart(high_low_zero2)\n", - " .mark_text(dy=-40, size=15, fontWeight=\"normal\")\n", - " .encode(\n", - " text=\"Route Type:N\",\n", - " )\n", - " .add_selection(selection_test)\n", - " .transform_filter(selection_test)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1042a774-165c-4f7e-bfc9-c4d4980bd29b", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"total_stops_altair = (\n", - " alt.Chart(stop_info)\n", - " .mark_text(dy=-40, size=15, fontWeight=\"normal\")\n", - " .encode(\n", - " text=\"Percentage Of Unusual Stops:N\",\n", + "def sjoin_vp_segments(segments: gpd.GeoDataFrame, vp_gdf: gpd.GeoDataFrame):\n", + " vp_in_seg = gpd.sjoin(\n", + " vp_gdf,\n", + " segments,\n", + " how = \"inner\",\n", + " predicate = \"within\"\n", " )\n", - " .add_selection(selection_test)\n", - " .transform_filter(selection_test)\n", - ")\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6067e93b-3519-45fc-b027-11cbcc82d80f", - "metadata": {}, - "outputs": [], - "source": [ - "main_chart = (\n", - " threshold_utils.chart_size(\n", - " 
alt.Chart(high_low_zero2)\n", - " .mark_tick(\n", - " size=15,\n", - " thickness=5,\n", - " )\n", - " .encode(\n", - " x=\"Stop Sequence:N\",\n", - " y=\"Value:Q\",\n", - " color=alt.Color(\n", - " \"Variable:N\", scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS)\n", - " ),\n", - " tooltip=high_low_zero2.columns.tolist(),\n", - " )\n", - " .interactive(),\n", - " 1100,\n", - " 400,\n", - " )\n", - " .add_selection(selection_test)\n", - " .transform_filter(selection_test)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "da0d6aad-26c3-439b-93d2-ba5a3abac77d", - "metadata": {}, - "outputs": [], - "source": [ - "high_low_zero2.shape" + " # vp_in_seg = vp_in_seg.set_geometry('geometry_left')\n", + " \n", + " return vp_in_seg\n", + "\n" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "c77e2503-e211-48cc-a220-d96f82ab72df", + "cell_type": "markdown", + "id": "06df58c1-c7b7-4769-bd8d-696e337eefb3", "metadata": {}, - "outputs": [], "source": [ - "(title & total_stops_altair | main_chart)" + "### Stage1: \"vp_usable\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "9f709e13-aa1e-44da-9027-dfafaead5dad", + "id": "b9eab37f-0569-4f07-9113-87200b0c7dfd", "metadata": {}, "outputs": [], "source": [ - "high_low_zero.shape_array_key.unique()" + "# What's the diff between stop segments normal/special/and without any notation?\n", + "usable = pd.read_parquet(f\"{SEGMENT_GCS}vp_usable_{analysis_date}\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "2a2bda8b-d48c-44f3-a928-08aca894c565", + "id": "deb486c8-a800-485e-8a46-d994af1c0074", "metadata": {}, "outputs": [], "source": [ - "chart2 = threshold_utils.chart_size(chart2, 75, 200)" + "usable.sample()" ] }, { "cell_type": "code", "execution_count": null, - "id": "dd20293d-7c43-42c4-b053-de945860b6f0", + "id": "b3d8352e-7c60-4368-b78f-02e20136947a", "metadata": {}, "outputs": [], "source": [ - "chart2 = 
chart2.add_selection(selection_test).transform_filter(selection_test)" + "subset_for_merge2 = subset_for_merge.drop(columns = ['stop_sequence','stop_id','meters_elapsed','sec_elapsed'])" ] }, { "cell_type": "code", "execution_count": null, - "id": "44dc6896-e95d-42df-bbd5-f2bb2c2a2cc6", + "id": "813ae4db-0fef-4f10-9408-7284fc531ed2", "metadata": {}, "outputs": [], "source": [ - "title = threshold_utils.chart_size(title, 20, 20)" + "m_cols2 = ['gtfs_dataset_key',\n", + " 'trip_id']" ] }, { "cell_type": "code", "execution_count": null, - "id": "d651c8c6-179c-4157-a671-11006cb419df", + "id": "d08fa8db-f3a3-43f2-a763-a39cacc9cf9c", "metadata": {}, "outputs": [], "source": [ - "alt.data_transformers.enable(\"default\", max_rows=None)" + "subset_for_merge2.head()" ] }, { "cell_type": "code", "execution_count": null, - "id": "8e753711-8506-4939-8d9d-22566a641988", - "metadata": {}, + "id": "68a59632-cfbd-43fd-aca4-a502e400a854", + "metadata": { + "scrolled": true, + "tags": [] + }, "outputs": [], "source": [ - "title & (chart1.interactive() & chart2.interactive())" + "# m2[m2.trip_id == '1350']" ] } ], From cfb90e4f84a1988e929351e89fe772b93762ac16 Mon Sep 17 00:00:00 2001 From: amandaha8 Date: Thu, 29 Jun 2023 18:29:52 +0000 Subject: [PATCH 8/9] added sjoin --- rt_segment_speeds/12_speeds.ipynb | 865 +++++++++++++++++++++++------- 1 file changed, 665 insertions(+), 200 deletions(-) diff --git a/rt_segment_speeds/12_speeds.ipynb b/rt_segment_speeds/12_speeds.ipynb index 31d49bf30..e76bc7696 100644 --- a/rt_segment_speeds/12_speeds.ipynb +++ b/rt_segment_speeds/12_speeds.ipynb @@ -2,10 +2,27 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "2c7feec3-aa18-42ab-94b9-cab4be608152", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.9/site-packages/geopandas/_compat.py:123: UserWarning: The Shapely GEOS version (3.11.1-CAPI-1.17.1) is incompatible 
with the GEOS version PyGEOS was compiled with (3.10.1-CAPI-1.16.0). Conversions between both will be slow.\n", + " warnings.warn(\n", + "/home/jovyan/data-analyses/rt_segment_speeds/_threshold_utils.py:1: UserWarning: Shapely 2.0 is installed, but because PyGEOS is also installed, GeoPandas will still use PyGEOS by default for now. To force to use and test Shapely 2.0, you have to set the environment variable USE_PYGEOS=0. You can do this before starting the Python process, or in your code before importing geopandas:\n", + "\n", + "import os\n", + "os.environ['USE_PYGEOS'] = '0'\n", + "import geopandas\n", + "\n", + "In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).\n", + " import geopandas as gpd\n" + ] + } + ], "source": [ "import datetime\n", "import _speed_utils as speed_utils\n", @@ -28,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "0108ae4a-4518-4487-85f7-a5faa3e9cbf6", "metadata": {}, "outputs": [], @@ -41,7 +58,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "80014c5e-695d-4280-89cd-4e7e2bb3d302", "metadata": {}, "outputs": [], @@ -59,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "2f0c5f4f-f419-42a8-8527-7060ed412092", "metadata": {}, "outputs": [], @@ -91,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "84ac97bf-ee4f-4d85-b523-8a36823f9d9a", "metadata": {}, "outputs": [], @@ -101,7 +118,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "68950ae7-4061-47d6-ac48-5eac0b1f29c0", "metadata": {}, "outputs": [], @@ -111,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": null, 
+ "execution_count": 7, "id": "b04dfb8b-7476-49df-873a-cea75dc61763", "metadata": {}, "outputs": [], @@ -136,7 +153,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "e81e59fd-cc2f-408e-9148-1a1055425fc4", "metadata": {}, "outputs": [], @@ -170,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "0dfb836d-f919-4f2b-a0d1-9e4a4713ba8a", "metadata": {}, "outputs": [], @@ -180,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "9f84205d-93db-49f3-be99-6b5014f7faeb", "metadata": {}, "outputs": [], @@ -190,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "b0d2184f-8a44-4489-a1b4-2be8317142f1", "metadata": {}, "outputs": [], @@ -200,7 +217,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "940fb010-0dff-465e-bf8d-87dd3f4ba101", "metadata": {}, "outputs": [], @@ -210,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "9d38d541-5c9c-4d31-8986-9c3928eb2f59", "metadata": {}, "outputs": [], @@ -255,7 +272,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "7201d5e3-f765-4e5d-9bbd-aa6a336bcc0a", "metadata": {}, "outputs": [], @@ -265,80 +282,220 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "2c5107cb-c574-449b-95b6-fb205f38502e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-06-29 11:20:08.772300\n", + "Done with meters_\n", + "Done with sec_\n", + "sec_cat meters_cat \n", + "sec is avg meters is avg 1829\n", + " meters is high 110\n", + " meters is low 22\n", + "sec is high meters is avg 63\n", + " meters is high 40\n", + " meters is low 47\n", + "sec is low meters is low 850\n", + "dtype: int64\n", + "1022 rows left after filtering for rows with either high seconds OR low 
meters\n", + "division by 0 850\n", + "seconds too high 103\n", + "meters too low 69\n", + "Name: flag, dtype: int64\n", + "Took 0:00:00.217325\n" + ] + } + ], "source": [ "m2 = categorize_meters_speeds_pandas(subset)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "2d1bf90c-d9ed-4861-a1be-23f356165a4c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "division by 0 850\n", + "Name: flag, dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "m2.flag.value_counts()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "ce0fbb35-f81e-4343-92d2-4382d2173dbd", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "3075512" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "len(m1)-len(m2)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "508f1411-4328-4b80-a029-0ae516107ed0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "850" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "len(m2)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "9fe08d2a-b874-4439-aa5b-a52de58cad09", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(73, 72067)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "m2.trip_id.nunique(), m1.trip_id.nunique()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "8e6d31ab-46a7-4e20-bb2f-9cac1a2d672d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(4, 4837)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ 
"m2.shape_array_key.nunique(), m1.shape_array_key.nunique()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "647fad46-7f9b-4ce2-a26a-1ea69d02daee", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(3, 76)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "m2._gtfs_dataset_name.nunique(), m1._gtfs_dataset_name.nunique()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "83036ccc-7339-42c2-b1f7-183734253c21", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
shape_array_key
loop_or_inlining
04
\n", + "
" + ], + "text/plain": [ + " shape_array_key\n", + "loop_or_inlining \n", + "0 4" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "m2.groupby([\"loop_or_inlining\"]).agg({\"shape_array_key\": \"nunique\"})" ] @@ -353,7 +510,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "468be3c9-7a24-4f01-84fd-31c137bc45e8", "metadata": { "scrolled": true, @@ -368,7 +525,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "4350f540-8f6b-4fb0-8b16-836245c0e44c", "metadata": { "scrolled": true, @@ -382,7 +539,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "ac68bdf7-26a0-4679-9a35-26f8a670018a", "metadata": {}, "outputs": [], @@ -392,7 +549,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "81d443cc-122f-46f1-87ec-dbdc74e0ca6a", "metadata": {}, "outputs": [], @@ -402,17 +559,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "id": "314d9baf-de0e-460a-8c29-4504ba94cfa6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "count 4.00\n", + "mean 100.00\n", + "std 0.00\n", + "min 100.00\n", + "25% 100.00\n", + "50% 100.00\n", + "75% 100.00\n", + "max 100.00\n", + "Name: percent_of_trips_with_problematic_rows, dtype: float64" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df3['percent_of_trips_with_problematic_rows'].describe()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "id": "5de3efe6-2233-4251-93a8-1f8dd6fb2dae", "metadata": { "tags": [] @@ -434,7 +610,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "id": "a2a705af-b588-463b-b6ce-f999b2050208", "metadata": {}, "outputs": [], @@ -462,7 +638,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, 
"id": "1e36c5fc-ab3f-4129-97f9-ad9472b7d32a", "metadata": {}, "outputs": [], @@ -472,7 +648,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "id": "1d6fe654-40ca-4758-bc2c-316e33d1a9d1", "metadata": {}, "outputs": [], @@ -482,7 +658,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "148e75f1-08dd-44c8-8179-319164d8e020", "metadata": { "tags": [] @@ -495,7 +671,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "id": "b4350206-c237-44a3-abce-f8f38cde8117", "metadata": { "scrolled": true, @@ -509,7 +685,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "id": "aa1e56d1-ec07-436c-8763-7bcf3dcbf7d4", "metadata": { "scrolled": true, @@ -523,7 +699,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "id": "22e42aae-9281-4040-ab8c-6a10b93f6cf4", "metadata": { "scrolled": true, @@ -537,7 +713,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "id": "0f21f08f-d4eb-4bbd-94d3-f4b031e97cf4", "metadata": {}, "outputs": [], @@ -563,7 +739,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "id": "5ce07566-c1f0-4fa7-9550-2fa07b98dba8", "metadata": {}, "outputs": [], @@ -594,7 +770,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "id": "66e83169-2b4a-4912-bc0e-1a0b3e8deea6", "metadata": {}, "outputs": [], @@ -653,27 +829,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "id": "cab32ef3-cc66-40ce-aa19-59631734f539", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-06-29 11:20:18.037861\n", + "check in stage 2 1503\n", + "repeated timestamps 149\n", + "repeated timestamps & locations 27\n", + "repeated locations 21\n", + "Name: stage3_flag, dtype: int64\n", + "Have to check 88.41176470588236 % of rows in stage 2\n", + 
"Took 0:00:05.668398\n" + ] + } + ], "source": [ "m3 = flag_stage3(m2, analysis_date)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "68a9dbba-ee6b-42b1-9203-1146d6cd56e9", - "metadata": {}, - "outputs": [], - "source": [ - "m3.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 40, "id": "1cca329c-14bc-4ad5-9465-1a63ca53df49", "metadata": {}, "outputs": [], @@ -683,17 +864,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "id": "93e87778-edef-4d62-98aa-a4241f177892", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(1503, 30)" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "m3.shape" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "id": "21799f42-873e-41bd-b764-42cc297686a6", "metadata": {}, "outputs": [], @@ -711,7 +903,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "id": "0a469849-f903-44e4-9d2a-4f3775270a52", "metadata": {}, "outputs": [], @@ -722,7 +914,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "id": "c23a767c-80b5-439d-97e2-b7fe5e6bfd06", "metadata": {}, "outputs": [], @@ -732,7 +924,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 45, "id": "6e946c68-3476-459d-a869-77ac37b5fb07", "metadata": {}, "outputs": [], @@ -742,7 +934,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "id": "b4fa40bf-387c-4301-ba13-2bd16b15cd24", "metadata": {}, "outputs": [], @@ -760,7 +952,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "id": "6397dc45-c271-4057-a0d8-1962846d4f94", "metadata": {}, "outputs": [], @@ -776,7 +968,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 48, "id": "fe8f800a-f180-4495-a387-0367528823ba", "metadata": {}, "outputs": [], @@ -796,7 
+988,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 49, "id": "ade9e07f-0b55-4561-96d1-6fd6adec0f1a", "metadata": {}, "outputs": [], @@ -808,6 +1000,7 @@ " [\"shape_array_key\"]\n", " )\n", " \n", + " # Filter to just one trip/route/operator\n", " df = vp_trips[(vp_trips.gtfs_dataset_key == gtfs_key)\n", " & (vp_trips.shape_array_key == route)\n", " & (vp_trips.trip_id == trip)].reset_index(drop = True)\n", @@ -816,12 +1009,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "id": "c8003044-b7e4-477e-9395-fa881a2fa2b3", "metadata": {}, "outputs": [], "source": [ - "unique_trips = import_unique_trips(test_gtfs_key, test_trip, test_route)" + "# unique_trips = import_unique_trips(test_gtfs_key, test_trip, test_route)" ] }, { @@ -834,7 +1027,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 51, "id": "ac6e78c3-694d-4297-8db1-f0f4d6faadbf", "metadata": {}, "outputs": [], @@ -865,22 +1058,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 52, "id": "b47ea0cb-6031-4d98-b963-efbef949d169", "metadata": {}, "outputs": [], "source": [ - "vehicle_positions = import_vehicle_positions(unique_trips, test_gtfs_key, test_trip)" + "#vehicle_positions = import_vehicle_positions(unique_trips, test_gtfs_key, test_trip)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 53, "id": "f0f96480-8328-43ed-9add-0a74b533fc8d", "metadata": {}, "outputs": [], "source": [ - "len(vehicle_positions)" + "#len(vehicle_positions)" ] }, { @@ -893,12 +1086,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "id": "17bb5083-ace2-4e2d-8400-3bf948625909", "metadata": {}, "outputs": [], "source": [ "def import_segments(flagged_df: pd.DataFrame, route:str, gtfs_key:str) -> gpd.GeoDataFrame:\n", + " \n", + " # Load in ALL segments, flag them.\n", " gdf = gpd.read_parquet(f\"{SEGMENT_GCS}stop_segments_{analysis_date}.parquet\",\n", " 
filters = [[(\"shape_array_key\", \"==\", route),\n", " (\"gtfs_dataset_key\", \"==\", gtfs_key),\n", @@ -912,13 +1107,13 @@ " incorrect_segments = flagged_df[(flagged_df.shape_array_key == route) & (flagged_df.gtfs_dataset_key == gtfs_key)]\n", " incorrect_segments_list = incorrect_segments.stop_sequence.unique().tolist()\n", " incorrect_segments_filtered = gdf[gdf.stop_sequence.isin(incorrect_segments_list)].reset_index(drop = True)\n", - " incorrect_segments_filtered['flag'] = 'incorrect'\n", + " incorrect_segments_filtered['flag'] = 'contains 0m/0sec'\n", " \n", " # Filter for correct segments\n", " correct_segments = flagged_df[~flagged_df.stop_sequence.isin(incorrect_segments_list)]\n", " correct_segments_list = correct_segments.stop_sequence.unique().tolist()\n", " correct_segments_filtered = gdf[gdf.stop_sequence.isin(correct_segments_list)].reset_index(drop = True)\n", - " correct_segments_filtered['flag'] = 'correct'\n", + " correct_segments_filtered['flag'] = 'does not contain 0m/0sec'\n", " \n", " final = pd.concat([correct_segments_filtered, incorrect_segments_filtered])\n", " \n", @@ -927,17 +1122,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 55, "id": "9f3a302a-f604-49fe-ae9b-ee8db85466de", "metadata": {}, "outputs": [], "source": [ - "# flagged_segments = import_segments(m3, test_route, test_gtfs_key)" + "flagged_segments = import_segments(m3, test_route, test_gtfs_key)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 56, "id": "e8a14cfd-38b8-4326-9eac-a711f1a189e8", "metadata": {}, "outputs": [], @@ -951,7 +1146,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 57, "id": "3c08c38e-7419-4ff6-b74f-5f15615e52c4", "metadata": {}, "outputs": [], @@ -969,7 +1164,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 58, "id": "9ea744bf-8019-4b7c-988f-f95196b56435", "metadata": {}, "outputs": [], @@ -993,22 +1188,49 @@ }, { "cell_type": 
"code", - "execution_count": null, + "execution_count": 59, "id": "7c7aa90f-3e80-472e-b61d-94afd6c0ec01", "metadata": {}, "outputs": [], "source": [ - "first_last = find_first_last_points(test_route, test_trip, test_gtfs_key)" + "# first_last = find_first_last_points(test_route, test_trip, test_gtfs_key)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "id": "5634169a-f26b-4174-b46a-3aa872bc1bdb", "metadata": {}, "outputs": [], "source": [ - "len(first_last)" + "# len(first_last)" + ] + }, + { + "cell_type": "markdown", + "id": "9aba7f4e-2b1a-4f1b-aaf2-1bc7bb3cf221", + "metadata": {}, + "source": [ + "#### Sjoin " + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "d535d059-efd9-49dd-9759-8663679ad5e1", + "metadata": {}, + "outputs": [], + "source": [ + "def sjoin_vp_segments(segments: gpd.GeoDataFrame, vp_gdf: gpd.GeoDataFrame):\n", + " vp_in_seg = gpd.sjoin(\n", + " vp_gdf,\n", + " segments,\n", + " how = \"inner\",\n", + " predicate = \"within\"\n", + " )\n", + " \n", + " \n", + " return vp_in_seg" ] }, { @@ -1023,24 +1245,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 62, "id": "1a785180-38bb-4d33-a96e-3385c84ed2f1", "metadata": {}, "outputs": [], "source": [ - "def display_maps(all_points: gpd.GeoDataFrame, first_last_points: gpd.GeoDataFrame, segments: gpd.GeoDataFrame):\n", - " base1 = segments.explore('flag', cmap= 'tab10', height = 400, width = 600, name = 'segments')\n", + "def display_maps(all_points: gpd.GeoDataFrame, \n", + " first_last_points: gpd.GeoDataFrame,\n", + " segments: gpd.GeoDataFrame,\n", + " sjoin_results: gpd.GeoDataFrame):\n", " \n", - " all_points_map = all_points.explore(m = base1, color = 'red',style_kwds = {'weight':5}, legend_kwds = {'caption': 'all_points'}, name= 'points')\n", + " base1 = segments.explore('flag', cmap= 'tab10', height = 400, width = 600, name = 'segments')\n", + " all_points_map = all_points.explore(m = base1, color = 
'red',style_kwds = {'weight':6}, name= 'points')\n", " \n", + " print('ALL POINTS')\n", " display(all_points_map) \n", - " first_last_map = first_last_points.explore('stop_sequence', cmap = 'tab10',style_kwds = {'weight':5},height = 400, width = 600,)\n", - " display(first_last_map)" + " \n", + " \n", + " # Right left geo\n", + " sjoin_points = sjoin_results.set_geometry('geometry_left')\n", + " sjoin_segments = sjoin_results.set_geometry('geometry_right')\n", + " sjoin_segments.geometry_right = sjoin_segments.geometry_right.buffer(35)\n", + " base3 = sjoin_segments.explore('flag', cmap= 'tab10', height = 400, width = 600, name = 'segments')\n", + " sjoin_map = sjoin_points.explore(m = base3, color = 'orange',style_kwds = {'weight':6}, name= 'points')\n", + " \n", + " print('SJOIN')\n", + " display(sjoin_map)\n", + " \n", + " base2 = segments.explore('flag', cmap= 'tab10', height = 400, width = 600, name = 'segments')\n", + " first_last_map = first_last_points.explore(m = base2, color = 'pink',style_kwds = {'weight':6},height = 400, width = 600,)\n", + " \n", + " print('ALL FIRST AND LAST')\n", + " display(first_last_map)\n", + " " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 63, "id": "bab6bfc4-3d08-46fe-be33-6179ac5df34d", "metadata": {}, "outputs": [], @@ -1058,166 +1300,389 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 64, "id": "509bc2c0-14f3-4021-baf1-6d89a2409a79", "metadata": {}, "outputs": [], "source": [ "def stage2_trouble_shooting(flagged_df:pd.DataFrame,\n", - " date:str, route:str, \n", - " stop_sequence:str, \n", - " trip:str, gtfs_key:str):\n", - " stg2 = import_stage_2(date, route, stop_sequence)\n", + " date:str, \n", + " route:str, \n", + " trip:str, \n", + " gtfs_key:str):\n", " unique_trips = import_unique_trips(gtfs_key, trip, route)\n", " \n", + " # Find all recorded vps\n", " vehicle_positions = import_vehicle_positions(unique_trips, gtfs_key, trip)\n", + " \n", + " # Flag 
segments, whether one row contains 1+ 0/0 division or not\n", " flagged_segments = import_segments(flagged_df, route, gtfs_key)\n", + " \n", + " # Find first and last pt kept\n", " first_last = find_first_last_points(route, trip, gtfs_key)\n", " \n", - " display_maps(vehicle_positions,first_last,flagged_segments)\n", - " return vehicle_positions, first_last, flagged_segments\n", + " # Sjoin \n", + " sjoin_results = sjoin_vp_segments(flagged_segments,vehicle_positions)\n", + " \n", + " # Display maps\n", + " display_maps(vehicle_positions,first_last,flagged_segments,sjoin_results)\n", " " ] }, { - "cell_type": "code", - "execution_count": null, - "id": "7a4ae095-d010-46b5-80b2-0bbe948f249f", - "metadata": {}, - "outputs": [], - "source": [ - "test1_allpts, test1_firstlast_pts, test1_flagged = stage2_trouble_shooting(flagged_df= m3,\n", - " date = analysis_date,\n", - " route = test_route,\n", - " stop_sequence = test_sequence,\n", - " trip = test_trip,\n", - " gtfs_key = test_gtfs_key)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b45a9b37-21ed-4ceb-9561-3c04568e8e68", + "cell_type": "markdown", + "id": "056dc4ec-dde7-4f5a-bcd6-8ebd5b9d6982", "metadata": {}, - "outputs": [], "source": [ - "test1_allpts.shape, test1_firstlast_pts.shape" + "#### Example Trip 1" ] }, { "cell_type": "code", - "execution_count": null, - "id": "ad92a3aa-84db-4b25-9514-bd7be30861a2", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "268c5f17-df4a-4d84-b3b0-d76a2727bcf7", - "metadata": {}, + "execution_count": 65, + "id": "2bd26525-b824-4b1a-a2bd-817b2207e3fe", + "metadata": { + "scrolled": true, + "tags": [] + }, "outputs": [], "source": [ - "test_route2 = \"0fb4f3627996269dc7075276d3b69e36\"\n", - "test_stop = 13\n", - "test_gtfs_key2 = \"a4f6fd5552107e05fe9743ac7cce2c55\"\n", - "test_trip2 = \"16939095\"" + "# subset[(subset.stop_sequence == test_sequence) & (subset.shape_array_key == 
test_route)]" ] }, { "cell_type": "code", - "execution_count": null, - "id": "deaa6fdf-37b8-49ee-97f2-46f74d41a449", + "execution_count": 66, + "id": "7a4ae095-d010-46b5-80b2-0bbe948f249f", "metadata": {}, - "outputs": [], - "source": [ - "test2_allpts, test2_firstlast_pts, test2_flagged = stage2_trouble_shooting(flagged_df= m3,\n", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ALL POINTS\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SJOIN\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ALL FIRST AND LAST\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "stage2_trouble_shooting(flagged_df= m3,\n", " date = analysis_date,\n", - " route = test_route2,\n", - " stop_sequence = test_stop,\n", - " trip = test_trip2,\n", - " gtfs_key = test_gtfs_key2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "330951ba-846e-408f-a581-ce946df0cc76", - "metadata": {}, - "outputs": [], - "source": [ - "test2_allpts.shape, test2_firstlast_pts.shape" + " route = test_route,\n", + " trip = test_trip,\n", + " gtfs_key = test_gtfs_key)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "db8465be-8938-4ebe-977c-6a784141b4da", + "cell_type": "markdown", + "id": "38c6d374-155d-4fcc-985a-7c892eaecb46", "metadata": {}, - "outputs": [], "source": [ - "# test2_base = test2_flagged.explore('flag', cmap= 'tab10', height = 400, width = 600, name = 'segments')" + "#### Example Trip 2" ] }, { "cell_type": "code", - "execution_count": null, - "id": "0d784404-f514-4648-b9c8-7809fb2fa474", + "execution_count": 67, + "id": "268c5f17-df4a-4d84-b3b0-d76a2727bcf7", "metadata": {}, "outputs": [], "source": [ - "# test2_all_pts_map = test2_allpts.explore(m = test2_base, color = 'red',style_kwds = {'weight':5}, legend_kwds = {'caption': 'all_points'}, name= 'points')" + "test_route2 = \"0fb4f3627996269dc7075276d3b69e36\"\n", + "test_gtfs_key2 = \"a4f6fd5552107e05fe9743ac7cce2c55\"\n", + "test_trip2 = \"16939095\"" ] }, { "cell_type": "code", - "execution_count": null, - "id": "e58353e4-d9c9-402d-99f7-3dd259a4b00e", + "execution_count": 68, + "id": "c7a491e8-4659-4104-8786-0d1d38cf89b4", "metadata": {}, "outputs": [], "source": [ - "# test2_all_pts_map" + "#unique_trips = import_unique_trips(gtfs_key, trip, route)\n", + " \n", + "#vehicle_positions = import_vehicle_positions(unique_trips, gtfs_key, trip)" ] }, { "cell_type": "code", - "execution_count": null, - "id": "2d7e0287-d90f-45c0-b3f1-45a6a0e7a2aa", + 
"execution_count": 69, + "id": "deaa6fdf-37b8-49ee-97f2-46f74d41a449", "metadata": {}, - "outputs": [], - "source": [ - "# test2_firstlast_pts.explore( color = 'red',style_kwds = {'weight':5}, height = 400, width = 600, )" + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ALL POINTS\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SJOIN\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ALL FIRST AND LAST\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "stage2_trouble_shooting(flagged_df= m3,\n", + " date = analysis_date,\n", + " route = test_route2,\n", + " trip = test_trip2,\n", + " gtfs_key = test_gtfs_key2)" ] }, { "cell_type": "markdown", - "id": "b0ff9ce9-9642-4f91-93de-f35bc45d2e1e", - "metadata": { - "tags": [] - }, - "source": [ - "##### Sjoin" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fd2a6beb-a2e9-498c-beed-ce7365ffa7a6", - "metadata": {}, - "outputs": [], - "source": [ - "def sjoin_vp_segments(segments: gpd.GeoDataFrame, vp_gdf: gpd.GeoDataFrame):\n", - " vp_in_seg = gpd.sjoin(\n", - " vp_gdf,\n", - " segments,\n", - " how = \"inner\",\n", - " predicate = \"within\"\n", - " )\n", - " # vp_in_seg = vp_in_seg.set_geometry('geometry_left')\n", - " \n", - " return vp_in_seg\n", - "\n" + "id": "27d10dab-c7b0-4ddd-b70b-b3c6b7b3e579", + "metadata": {}, + "source": [ + "#### Example Trip 3" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "f951ac97-43af-452f-9cb7-d40f71c114c9", + "metadata": {}, + "outputs": [], + "source": [ + "test_route3 = \"07c9a47264a43d8d0d16ef7109e8fd68\"\n", + "test_gtfs_key3 = \"db56b50ab86b5f7a4ae2fc2dd9889bbe\"\n", + "test_trip3 = \"1089348\"" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "26e85057-05a9-4606-af4a-7be3e08ae2a2", + "metadata": {}, + "outputs": [], + "source": [ + "# subset[(subset.stop_sequence == 34) & (subset.shape_array_key == test_route3)]" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "91d07c20-9d78-4eea-8b9c-293df8ade5a3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ALL POINTS\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SJOIN\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ALL FIRST AND LAST\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ALL POINTS\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SJOIN\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ALL FIRST AND LAST\n" + ] + }, + { + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "stage2_trouble_shooting(flagged_df= m3,\n", + " date = analysis_date,\n", + " route = test_route3,\n", + " trip = test_trip3,\n", + " gtfs_key = test_gtfs_key3)" ] }, { From 447e7144998590f038926827a08eedf29e715148 Mon Sep 17 00:00:00 2001 From: amandaha8 Date: Fri, 30 Jun 2023 18:02:55 +0000 Subject: [PATCH 9/9] tested routes w/ many trips --- rt_segment_speeds/12_speeds.ipynb | 767 ++++++++++++++++++++---------- 1 file changed, 509 insertions(+), 258 deletions(-) diff --git a/rt_segment_speeds/12_speeds.ipynb b/rt_segment_speeds/12_speeds.ipynb index e76bc7696..b3ea92af5 100644 --- a/rt_segment_speeds/12_speeds.ipynb +++ b/rt_segment_speeds/12_speeds.ipynb @@ -2,27 +2,10 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 36, "id": "2c7feec3-aa18-42ab-94b9-cab4be608152", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/conda/lib/python3.9/site-packages/geopandas/_compat.py:123: UserWarning: The Shapely GEOS version (3.11.1-CAPI-1.17.1) is incompatible with the GEOS version PyGEOS was compiled with (3.10.1-CAPI-1.16.0). Conversions between both will be slow.\n", - " warnings.warn(\n", - "/home/jovyan/data-analyses/rt_segment_speeds/_threshold_utils.py:1: UserWarning: Shapely 2.0 is installed, but because PyGEOS is also installed, GeoPandas will still use PyGEOS by default for now. To force to use and test Shapely 2.0, you have to set the environment variable USE_PYGEOS=0. You can do this before starting the Python process, or in your code before importing geopandas:\n", - "\n", - "import os\n", - "os.environ['USE_PYGEOS'] = '0'\n", - "import geopandas\n", - "\n", - "In a future release, GeoPandas will switch to using Shapely by default. 
If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).\n", - " import geopandas as gpd\n" - ] - } - ], + "outputs": [], "source": [ "import datetime\n", "import _speed_utils as speed_utils\n", @@ -45,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 37, "id": "0108ae4a-4518-4487-85f7-a5faa3e9cbf6", "metadata": {}, "outputs": [], @@ -58,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 38, "id": "80014c5e-695d-4280-89cd-4e7e2bb3d302", "metadata": {}, "outputs": [], @@ -76,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 39, "id": "2f0c5f4f-f419-42a8-8527-7060ed412092", "metadata": {}, "outputs": [], @@ -108,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 40, "id": "84ac97bf-ee4f-4d85-b523-8a36823f9d9a", "metadata": {}, "outputs": [], @@ -118,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 41, "id": "68950ae7-4061-47d6-ac48-5eac0b1f29c0", "metadata": {}, "outputs": [], @@ -128,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 42, "id": "b04dfb8b-7476-49df-873a-cea75dc61763", "metadata": {}, "outputs": [], @@ -153,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 43, "id": "e81e59fd-cc2f-408e-9148-1a1055425fc4", "metadata": {}, "outputs": [], @@ -187,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 44, "id": "0dfb836d-f919-4f2b-a0d1-9e4a4713ba8a", "metadata": {}, "outputs": [], @@ -197,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 45, "id": "9f84205d-93db-49f3-be99-6b5014f7faeb", "metadata": {}, "outputs": [], @@ -207,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 11, + 
"execution_count": 46, "id": "b0d2184f-8a44-4489-a1b4-2be8317142f1", "metadata": {}, "outputs": [], @@ -217,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 47, "id": "940fb010-0dff-465e-bf8d-87dd3f4ba101", "metadata": {}, "outputs": [], @@ -227,16 +210,16 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 48, "id": "9d38d541-5c9c-4d31-8986-9c3928eb2f59", "metadata": {}, "outputs": [], "source": [ - "def categorize_meters_speeds_pandas(df)-> pd.DataFrame:\n", + "def categorize_meters_speeds_pandas()-> pd.DataFrame:\n", " start = datetime.datetime.now()\n", " print(start)\n", " \n", - " #df = merge_all_speeds(analysis_date)\n", + " df = merge_all_speeds(analysis_date)\n", " \n", " # Categorize\n", " df1 = categorize_by_percentile_pandas(df, \"meters_elapsed\", \"meters_\")\n", @@ -272,17 +255,17 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 49, "id": "7201d5e3-f765-4e5d-9bbd-aa6a336bcc0a", "metadata": {}, "outputs": [], "source": [ - "subset = m1[m1.shape_array_key.isin(sample_0_keys)].reset_index()" + "# subset = m1[m1.shape_array_key.isin(sample_0_keys)].reset_index()" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 50, "id": "2c5107cb-c574-449b-95b6-fb205f38502e", "metadata": {}, "outputs": [ @@ -290,45 +273,45 @@ "name": "stdout", "output_type": "stream", "text": [ - "2023-06-29 11:20:08.772300\n", + "2023-06-30 10:15:15.239284\n", "Done with meters_\n", "Done with sec_\n", "sec_cat meters_cat \n", - "sec is avg meters is avg 1829\n", - " meters is high 110\n", - " meters is low 22\n", - "sec is high meters is avg 63\n", - " meters is high 40\n", - " meters is low 47\n", - "sec is low meters is low 850\n", + "sec is avg meters is avg 2415102\n", + " meters is high 70745\n", + " meters is low 139528\n", + "sec is high meters is avg 57245\n", + " meters is high 83074\n", + " meters is low 13695\n", + "sec is low meters is low 296973\n", "dtype: 
int64\n", - "1022 rows left after filtering for rows with either high seconds OR low meters\n", - "division by 0 850\n", - "seconds too high 103\n", - "meters too low 69\n", + "590515 rows left after filtering for rows with either high seconds OR low meters\n", + "division by 0 296973\n", + "meters too low 153223\n", + "seconds too high 140319\n", "Name: flag, dtype: int64\n", - "Took 0:00:00.217325\n" + "Took 0:02:29.450038\n" ] } ], "source": [ - "m2 = categorize_meters_speeds_pandas(subset)" + "m2 = categorize_meters_speeds_pandas()" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 51, "id": "2d1bf90c-d9ed-4861-a1be-23f356165a4c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "division by 0 850\n", + "division by 0 296973\n", "Name: flag, dtype: int64" ] }, - "execution_count": 16, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -339,17 +322,17 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 52, "id": "ce0fbb35-f81e-4343-92d2-4382d2173dbd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "3075512" + "2779389" ] }, - "execution_count": 17, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -360,17 +343,17 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 53, "id": "508f1411-4328-4b80-a029-0ae516107ed0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "850" + "296973" ] }, - "execution_count": 18, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } @@ -381,17 +364,17 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 54, "id": "9fe08d2a-b874-4439-aa5b-a52de58cad09", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(73, 72067)" + "(45357, 72067)" ] }, - "execution_count": 19, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } @@ -402,38 +385,38 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 
55, "id": "8e6d31ab-46a7-4e20-bb2f-9cac1a2d672d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(4, 4837)" + "'2155 routes flagged'" ] }, - "execution_count": 20, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "m2.shape_array_key.nunique(), m1.shape_array_key.nunique()" + "f\"{m1.shape_array_key.nunique() - m2.shape_array_key.nunique()} routes flagged\"" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 56, "id": "647fad46-7f9b-4ce2-a26a-1ea69d02daee", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(3, 76)" + "(63, 76)" ] }, - "execution_count": 21, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } @@ -444,7 +427,70 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 57, + "id": "0f4e4b50-081d-4516-81cf-d5bdfb5d469f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
shape_array_key
loop_or_inlining
03970
1867
\n", + "
" + ], + "text/plain": [ + " shape_array_key\n", + "loop_or_inlining \n", + "0 3970\n", + "1 867" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m1.groupby([\"loop_or_inlining\"]).agg({\"shape_array_key\": \"nunique\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 58, "id": "83036ccc-7339-42c2-b1f7-183734253c21", "metadata": {}, "outputs": [ @@ -479,7 +525,7 @@ " \n", " \n", " 0\n", - " 4\n", + " 2682\n", " \n", " \n", "\n", @@ -488,10 +534,10 @@ "text/plain": [ " shape_array_key\n", "loop_or_inlining \n", - "0 4" + "0 2682" ] }, - "execution_count": 22, + "execution_count": 58, "metadata": {}, "output_type": "execute_result" } @@ -510,7 +556,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 59, "id": "468be3c9-7a24-4f01-84fd-31c137bc45e8", "metadata": { "scrolled": true, @@ -525,7 +571,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 60, "id": "4350f540-8f6b-4fb0-8b16-836245c0e44c", "metadata": { "scrolled": true, @@ -539,7 +585,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 61, "id": "ac68bdf7-26a0-4679-9a35-26f8a670018a", "metadata": {}, "outputs": [], @@ -549,7 +595,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 62, "id": "81d443cc-122f-46f1-87ec-dbdc74e0ca6a", "metadata": {}, "outputs": [], @@ -559,25 +605,25 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 63, "id": "314d9baf-de0e-460a-8c29-4504ba94cfa6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "count 4.00\n", - "mean 100.00\n", - "std 0.00\n", - "min 100.00\n", - "25% 100.00\n", - "50% 100.00\n", - "75% 100.00\n", - "max 100.00\n", + "count 2682.00\n", + "mean 82.86\n", + "std 26.65\n", + "min 1.52\n", + "25% 75.00\n", + "50% 100.00\n", + "75% 100.00\n", + "max 100.00\n", "Name: percent_of_trips_with_problematic_rows, dtype: float64" ] }, - "execution_count": 27, + 
"execution_count": 63, "metadata": {}, "output_type": "execute_result" } @@ -588,7 +634,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 64, "id": "5de3efe6-2233-4251-93a8-1f8dd6fb2dae", "metadata": { "tags": [] @@ -610,7 +656,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 65, "id": "a2a705af-b588-463b-b6ce-f999b2050208", "metadata": {}, "outputs": [], @@ -638,7 +684,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 66, "id": "1e36c5fc-ab3f-4129-97f9-ad9472b7d32a", "metadata": {}, "outputs": [], @@ -648,7 +694,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 67, "id": "1d6fe654-40ca-4758-bc2c-316e33d1a9d1", "metadata": {}, "outputs": [], @@ -658,7 +704,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 68, "id": "148e75f1-08dd-44c8-8179-319164d8e020", "metadata": { "tags": [] @@ -671,7 +717,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 69, "id": "b4350206-c237-44a3-abce-f8f38cde8117", "metadata": { "scrolled": true, @@ -685,7 +731,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 70, "id": "aa1e56d1-ec07-436c-8763-7bcf3dcbf7d4", "metadata": { "scrolled": true, @@ -699,7 +745,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 71, "id": "22e42aae-9281-4040-ab8c-6a10b93f6cf4", "metadata": { "scrolled": true, @@ -713,7 +759,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 72, "id": "0f21f08f-d4eb-4bbd-94d3-f4b031e97cf4", "metadata": {}, "outputs": [], @@ -739,7 +785,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 73, "id": "5ce07566-c1f0-4fa7-9550-2fa07b98dba8", "metadata": {}, "outputs": [], @@ -770,7 +816,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 74, "id": "66e83169-2b4a-4912-bc0e-1a0b3e8deea6", "metadata": {}, "outputs": [], @@ -829,7 +875,7 @@ }, { 
"cell_type": "code", - "execution_count": 39, + "execution_count": 75, "id": "cab32ef3-cc66-40ce-aa19-59631734f539", "metadata": {}, "outputs": [ @@ -837,14 +883,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "2023-06-29 11:20:18.037861\n", - "check in stage 2 1503\n", - "repeated timestamps 149\n", - "repeated timestamps & locations 27\n", - "repeated locations 21\n", + "2023-06-30 10:17:51.135694\n", + "check in stage 2 538914\n", + "repeated timestamps 54883\n", + "repeated timestamps & locations 107\n", + "repeated locations 42\n", "Name: stage3_flag, dtype: int64\n", - "Have to check 88.41176470588236 % of rows in stage 2\n", - "Took 0:00:05.668398\n" + "Have to check 90.73451121819154 % of rows in stage 2\n", + "Took 0:00:27.583738\n" ] } ], @@ -854,7 +900,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 76, "id": "1cca329c-14bc-4ad5-9465-1a63ca53df49", "metadata": {}, "outputs": [], @@ -864,17 +910,17 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 77, "id": "93e87778-edef-4d62-98aa-a4241f177892", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(1503, 30)" + "(538914, 29)" ] }, - "execution_count": 41, + "execution_count": 77, "metadata": {}, "output_type": "execute_result" } @@ -885,7 +931,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 78, "id": "21799f42-873e-41bd-b764-42cc297686a6", "metadata": {}, "outputs": [], @@ -894,52 +940,233 @@ ] }, { - "cell_type": "markdown", - "id": "4b1876cf-9e8b-4c30-8723-2226133b8e01", + "cell_type": "code", + "execution_count": 149, + "id": "17ac977c-d220-414e-be9f-540eec051e06", "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
shape_array_keygtfs_dataset_keytrip_idn_trips
296389809fd4704a18ae0ad64f8170e0167b565222fe2cf728fd3f16b2ff51e133fe8c183-oeiebzuc1162
295464805fef558a9bf81d57143cab635b27b1c0e3039da063db95ebabd3fe4ee611a411083276_M11159
396202ac5104538290bb7c7d14b926884e6efac0e3039da063db95ebabd3fe4ee611a411060883_M11157
527457e5ec67542d6f30fa38fdcf2f63c90109c0e3039da063db95ebabd3fe4ee611a411083144_M11156
1166043928b30e00772c10a38c11ea12ad78695222fe2cf728fd3f16b2ff51e133fe8c183-0rjkhjagy150
555304edc5ab1a2be1d269306161ce38e0b2adc0e3039da063db95ebabd3fe4ee611a411042148_M11138
\n", + "
" + ], + "text/plain": [ + " shape_array_key gtfs_dataset_key \\\n", + "296389 809fd4704a18ae0ad64f8170e0167b56 5222fe2cf728fd3f16b2ff51e133fe8c \n", + "295464 805fef558a9bf81d57143cab635b27b1 c0e3039da063db95ebabd3fe4ee611a4 \n", + "396202 ac5104538290bb7c7d14b926884e6efa c0e3039da063db95ebabd3fe4ee611a4 \n", + "527457 e5ec67542d6f30fa38fdcf2f63c90109 c0e3039da063db95ebabd3fe4ee611a4 \n", + "116604 3928b30e00772c10a38c11ea12ad7869 5222fe2cf728fd3f16b2ff51e133fe8c \n", + "555304 edc5ab1a2be1d269306161ce38e0b2ad c0e3039da063db95ebabd3fe4ee611a4 \n", + "\n", + " trip_id n_trips \n", + "296389 183-oeiebzuc1 162 \n", + "295464 11083276_M11 159 \n", + "396202 11060883_M11 157 \n", + "527457 11083144_M11 156 \n", + "116604 183-0rjkhjagy 150 \n", + "555304 11042148_M11 138 " + ] + }, + "execution_count": 149, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#### Stage2: \"vp_stop_segment\"/A1_sjoin_vp_segments\n" + "# Find routes with the most trips\n", + "(m3\n", + " .sort_values(['n_trips'], ascending = False)\n", + " .drop_duplicates(['shape_array_key'])\n", + " [['shape_array_key','gtfs_dataset_key', 'trip_id', 'n_trips']]\n", + " .head(6)\n", + ")" ] }, { "cell_type": "code", - "execution_count": 43, - "id": "0a469849-f903-44e4-9d2a-4f3775270a52", + "execution_count": 90, + "id": "3869ed7a-a951-4ed0-bfa9-bbdba7177790", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "n_trips\n", + "140 40\n", + "146 28\n", + "150 28\n", + "147 24\n", + "148 18\n", + "152 18\n", + "149 16\n", + "151 16\n", + "158 12\n", + "157 8\n", + "154 6\n", + "156 6\n", + "159 4\n", + "160 2\n", + "161 2\n", + "162 2\n", + "dtype: int64" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Select one route to look at\n", - "test_route = \"106d979b9a9e6338827a8e1c145e69fd\"" + "m3[m3.shape_array_key == \"809fd4704a18ae0ad64f8170e0167b56\"][['n_trips']].value_counts()" ] }, { 
"cell_type": "code", - "execution_count": 44, - "id": "c23a767c-80b5-439d-97e2-b7fe5e6bfd06", + "execution_count": 91, + "id": "a6c3db80-c4bf-4264-873c-c9912cdc9dc5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "n_trips\n", + "140 50\n", + "54 22\n", + "125 20\n", + "132 14\n", + "126 12\n", + "145 10\n", + "136 8\n", + "141 6\n", + "142 6\n", + "155 4\n", + "158 4\n", + "144 4\n", + "151 4\n", + "143 4\n", + "159 4\n", + "153 2\n", + "156 2\n", + "157 2\n", + "dtype: int64" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m3[m3.shape_array_key == \"805fef558a9bf81d57143cab635b27b1\"][['n_trips']].value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "4b1876cf-9e8b-4c30-8723-2226133b8e01", + "metadata": {}, + "source": [ + "#### Stage2: \"vp_stop_segment\"/A1_sjoin_vp_segments\n" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "0a469849-f903-44e4-9d2a-4f3775270a52", "metadata": {}, "outputs": [], "source": [ - "test_sequence = 39" + "# Select one route to look at\n", + "test_route = \"3928b30e00772c10a38c11ea12ad7869\"" ] }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 93, "id": "6e946c68-3476-459d-a869-77ac37b5fb07", "metadata": {}, "outputs": [], "source": [ - "test_gtfs_key = \"db56b50ab86b5f7a4ae2fc2dd9889bbe\"" + "test_gtfs_key = \"5222fe2cf728fd3f16b2ff51e133fe8c\"" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 94, "id": "b4fa40bf-387c-4301-ba13-2bd16b15cd24", "metadata": {}, "outputs": [], "source": [ - "test_trip = '1088405'" + "test_trip = '183-0rjkhjagy'" ] }, { @@ -952,7 +1179,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 95, "id": "6397dc45-c271-4057-a0d8-1962846d4f94", "metadata": {}, "outputs": [], @@ -968,7 +1195,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 96, "id": "fe8f800a-f180-4495-a387-0367528823ba", "metadata": 
{}, "outputs": [], @@ -988,7 +1215,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 97, "id": "ade9e07f-0b55-4561-96d1-6fd6adec0f1a", "metadata": {}, "outputs": [], @@ -1009,7 +1236,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 98, "id": "c8003044-b7e4-477e-9395-fa881a2fa2b3", "metadata": {}, "outputs": [], @@ -1027,7 +1254,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 99, "id": "ac6e78c3-694d-4297-8db1-f0f4d6faadbf", "metadata": {}, "outputs": [], @@ -1058,7 +1285,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 100, "id": "b47ea0cb-6031-4d98-b963-efbef949d169", "metadata": {}, "outputs": [], @@ -1068,7 +1295,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 101, "id": "f0f96480-8328-43ed-9add-0a74b533fc8d", "metadata": {}, "outputs": [], @@ -1086,7 +1313,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 102, "id": "17bb5083-ace2-4e2d-8400-3bf948625909", "metadata": {}, "outputs": [], @@ -1122,7 +1349,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 103, "id": "9f3a302a-f604-49fe-ae9b-ee8db85466de", "metadata": {}, "outputs": [], @@ -1132,7 +1359,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 104, "id": "e8a14cfd-38b8-4326-9eac-a711f1a189e8", "metadata": {}, "outputs": [], @@ -1146,7 +1373,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 105, "id": "3c08c38e-7419-4ff6-b74f-5f15615e52c4", "metadata": {}, "outputs": [], @@ -1164,7 +1391,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 106, "id": "9ea744bf-8019-4b7c-988f-f95196b56435", "metadata": {}, "outputs": [], @@ -1188,7 +1415,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 107, "id": "7c7aa90f-3e80-472e-b61d-94afd6c0ec01", "metadata": {}, "outputs": [], @@ -1198,7 +1425,7 @@ }, { "cell_type": "code", 
- "execution_count": 60, + "execution_count": 108, "id": "5634169a-f26b-4174-b46a-3aa872bc1bdb", "metadata": {}, "outputs": [], @@ -1216,7 +1443,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 109, "id": "d535d059-efd9-49dd-9759-8663679ad5e1", "metadata": {}, "outputs": [], @@ -1245,7 +1472,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 124, "id": "1a785180-38bb-4d33-a96e-3385c84ed2f1", "metadata": {}, "outputs": [], @@ -1275,14 +1502,14 @@ " base2 = segments.explore('flag', cmap= 'tab10', height = 400, width = 600, name = 'segments')\n", " first_last_map = first_last_points.explore(m = base2, color = 'pink',style_kwds = {'weight':6},height = 400, width = 600,)\n", " \n", - " print('ALL FIRST AND LAST')\n", + " print('FIRST AND LAST')\n", " display(first_last_map)\n", " " ] }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 125, "id": "bab6bfc4-3d08-46fe-be33-6179ac5df34d", "metadata": {}, "outputs": [], @@ -1295,12 +1522,26 @@ "id": "f7ed9dc6-80ce-46f9-ae59-a5ed2bbcad50", "metadata": {}, "source": [ - "#### Function" + "#### Function\n", + "\n", + "Previous tried routes\n", + "test_route = \"106d979b9a9e6338827a8e1c145e69fd\"\n", + "test_sequence = 39\n", + "test_gtfs_key = \"db56b50ab86b5f7a4ae2fc2dd9889bbe\"\n", + "test_trip = '1088405'\n", + "\n", + "test_route2 = \"0fb4f3627996269dc7075276d3b69e36\"\n", + "test_gtfs_key2 = \"a4f6fd5552107e05fe9743ac7cce2c55\"\n", + "test_trip2 = \"16939095\"\n", + "\n", + "test_route3 = \"07c9a47264a43d8d0d16ef7109e8fd68\"\n", + "test_gtfs_key3 = \"db56b50ab86b5f7a4ae2fc2dd9889bbe\"\n", + "test_trip3 = \"1089348\"" ] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 126, "id": "509bc2c0-14f3-4021-baf1-6d89a2409a79", "metadata": {}, "outputs": [], @@ -1339,7 +1580,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 127, "id": "2bd26525-b824-4b1a-a2bd-817b2207e3fe", "metadata": { "scrolled": true, @@ 
-1352,7 +1593,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 128, "id": "7a4ae095-d010-46b5-80b2-0bbe948f249f", "metadata": {}, "outputs": [ @@ -1366,10 +1607,10 @@ { "data": { "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -1385,10 +1626,10 @@ { "data": { "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -1398,16 +1639,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "ALL FIRST AND LAST\n" + "FIRST AND LAST\n" ] }, { "data": { "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -1432,31 +1673,29 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 129, "id": "268c5f17-df4a-4d84-b3b0-d76a2727bcf7", "metadata": {}, "outputs": [], "source": [ - "test_route2 = \"0fb4f3627996269dc7075276d3b69e36\"\n", - "test_gtfs_key2 = \"a4f6fd5552107e05fe9743ac7cce2c55\"\n", - "test_trip2 = \"16939095\"" + "test_route2 = \"805fef558a9bf81d57143cab635b27b1\"\n", + "test_gtfs_key2 = \"c0e3039da063db95ebabd3fe4ee611a4\"\n", + "test_trip2 = \"11083276_M11\"" ] }, { "cell_type": "code", - "execution_count": 68, - "id": "c7a491e8-4659-4104-8786-0d1d38cf89b4", + "execution_count": 141, + "id": "c0914a84-b24d-442f-aaca-acf401b9209c", "metadata": {}, "outputs": [], "source": [ - "#unique_trips = import_unique_trips(gtfs_key, trip, route)\n", - " \n", - "#vehicle_positions = import_vehicle_positions(unique_trips, gtfs_key, trip)" + "# m1[(m1.stop_sequence == 17) & (m1.shape_array_key == test_route2)]" ] }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 131, "id": "deaa6fdf-37b8-49ee-97f2-46f74d41a449", "metadata": {}, "outputs": [ @@ -1470,10 +1709,10 @@ { "data": { "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -1489,10 +1728,10 @@ { "data": { "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -1502,16 +1741,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "ALL FIRST AND LAST\n" + "FIRST AND LAST\n" ] }, { "data": { "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -1536,19 +1775,19 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 150, "id": "f951ac97-43af-452f-9cb7-d40f71c114c9", "metadata": {}, "outputs": [], "source": [ - "test_route3 = \"07c9a47264a43d8d0d16ef7109e8fd68\"\n", - "test_gtfs_key3 = \"db56b50ab86b5f7a4ae2fc2dd9889bbe\"\n", - "test_trip3 = \"1089348\"" + "test_route3 = \"edc5ab1a2be1d269306161ce38e0b2ad\"\n", + "test_gtfs_key3 = \"c0e3039da063db95ebabd3fe4ee611a4\"\n", + "test_trip3 = \"11042148_M11\"" ] }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 151, "id": "26e85057-05a9-4606-af4a-7be3e08ae2a2", "metadata": {}, "outputs": [], @@ -1558,7 +1797,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 152, "id": "91d07c20-9d78-4eea-8b9c-293df8ade5a3", "metadata": {}, "outputs": [ @@ -1572,10 +1811,10 @@ { "data": { "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -1591,10 +1830,10 @@ { "data": { "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -1604,73 +1843,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "ALL FIRST AND LAST\n" + "FIRST AND LAST\n" ] }, { "data": { "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" ], "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ALL POINTS\n" - ] - }, - { - "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "SJOIN\n" - ] - }, - { - "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ALL FIRST AND LAST\n" - ] - }, - { - "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" - ], - "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -1695,7 +1877,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 153, "id": "b9eab37f-0569-4f07-9113-87200b0c7dfd", "metadata": {}, "outputs": [], @@ -1706,27 +1888,84 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 154, "id": "deb486c8-a800-485e-8a46-d994af1c0074", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_gtfs_dataset_nametrip_idlocation_timestamplocation_timestamp_localactivity_datehourxyvp_idxgtfs_dataset_key
8658254San Diego Vehicle Positions168484052023-04-12 15:37:42+00:002023-04-12 08:37:422023-04-128-117.1432.798658254a4f6fd5552107e05fe9743ac7cce2c55
\n", + "
" + ], + "text/plain": [ + " _gtfs_dataset_name trip_id location_timestamp \\\n", + "8658254 San Diego Vehicle Positions 16848405 2023-04-12 15:37:42+00:00 \n", + "\n", + " location_timestamp_local activity_date hour x y vp_idx \\\n", + "8658254 2023-04-12 08:37:42 2023-04-12 8 -117.14 32.79 8658254 \n", + "\n", + " gtfs_dataset_key \n", + "8658254 a4f6fd5552107e05fe9743ac7cce2c55 " + ] + }, + "execution_count": 154, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "usable.sample()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "b3d8352e-7c60-4368-b78f-02e20136947a", - "metadata": {}, - "outputs": [], - "source": [ - "subset_for_merge2 = subset_for_merge.drop(columns = ['stop_sequence','stop_id','meters_elapsed','sec_elapsed'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 155, "id": "813ae4db-0fef-4f10-9408-7284fc531ed2", "metadata": {}, "outputs": [], @@ -1737,10 +1976,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 156, "id": "d08fa8db-f3a3-43f2-a763-a39cacc9cf9c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'subset_for_merge2' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[156], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msubset_for_merge2\u001b[49m\u001b[38;5;241m.\u001b[39mhead()\n", + "\u001b[0;31mNameError\u001b[0m: name 'subset_for_merge2' is not defined" + ] + } + ], "source": [ "subset_for_merge2.head()" ]