diff --git a/database_compendium/_modidx.py b/database_compendium/_modidx.py index 5c7b32d..9836616 100644 --- a/database_compendium/_modidx.py +++ b/database_compendium/_modidx.py @@ -29,6 +29,8 @@ 'database_compendium/utils/Nomis_scraper_functions.py')}, 'database_compendium.utils.ONS_scraper_functions': { 'database_compendium.utils.ONS_scraper_functions.find_ONS_cols': ( 'ons_scraper_functions.html#find_ons_cols', 'database_compendium/utils/ONS_scraper_functions.py'), + 'database_compendium.utils.ONS_scraper_functions.find_ONS_cols_and_unique_vals': ( 'ons_scraper_functions.html#find_ons_cols_and_unique_vals', + 'database_compendium/utils/ONS_scraper_functions.py'), 'database_compendium.utils.ONS_scraper_functions.get_ONS_datasets_titles_descriptions': ( 'ons_scraper_functions.html#get_ons_datasets_titles_descriptions', 'database_compendium/utils/ONS_scraper_functions.py'), 'database_compendium.utils.ONS_scraper_functions.get_ONS_datasets_urls': ( 'ons_scraper_functions.html#get_ons_datasets_urls',
diff --git a/database_compendium/utils/ONS_scraper_functions.py b/database_compendium/utils/ONS_scraper_functions.py index a7e9649..13e0012 100644 --- a/database_compendium/utils/ONS_scraper_functions.py +++ b/database_compendium/utils/ONS_scraper_functions.py @@ -1,7 +1,8 @@ # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/00_ONS_scraper_functions.ipynb. # %% auto 0 -__all__ = ['get_ONS_datasets_titles_descriptions', 'get_ONS_datasets_urls', 'find_ONS_cols', 'get_ONS_long_description'] +__all__ = ['get_ONS_datasets_titles_descriptions', 'get_ONS_datasets_urls', 'find_ONS_cols', 'get_ONS_long_description', + 'find_ONS_cols_and_unique_vals'] # %% ../../nbs/00_ONS_scraper_functions.ipynb 4 import requests @@ -153,3 +154,40 @@ def get_ONS_long_description(): description_L.append(temp_desc) return description_L
+
+# %% ../../nbs/00_ONS_scraper_functions.ipynb 13
+def find_ONS_cols_and_unique_vals(url):
+    """
+    Using the provided url, this function:
+    - Checks that a download is possible
+    - Downloads the csv file of the dataset
+    - Gets all the column titles
+    - Gets the unique values from columns containing non-numeric data
+
+    Check if a string contains a number: https://stackoverflow.com/questions/354038/how-do-i-check-if-a-string-represents-a-number-float-or-int
+    """
+
+    temp = requests.get(url)
+    try:
+        temp = temp.json()
+    except ValueError: # response body wasn't valid JSON
+        return {}
+
+    if temp.get('downloads'):
+        temp_url = temp['downloads']['csv']['href']
+
+        csv_url = requests.get(temp_url).text
+        temp_df = pd.read_csv(StringIO(csv_url), low_memory=False)
+
+        col_data = {}
+
+        for col in temp_df.columns:
+            col_data[col] = None
+
+            if type(temp_df.loc[:, col][0]) == str: # Check for string data type
+                if not temp_df.loc[:, col][0].replace('.','', 1).isdigit(): # if the data is a string, make sure it isn't a numeric string
+                    col_data[col] = list(temp_df.loc[:, col].unique())
+    else:
+        col_data = {} # This means the link didn't have a csv file href
+
+    return col_data
diff --git a/database_compendium/utils/generate_Metadata.py b/database_compendium/utils/generate_Metadata.py index 7f191c1..55d598e 100644 --- a/database_compendium/utils/generate_Metadata.py +++ b/database_compendium/utils/generate_Metadata.py @@ -32,7 +32,6 @@ def createMetadata(): except: cols.append('') - try: col_data.append(osf.find_ONS_cols_and_unique_vals(url)) except:
diff --git a/nbs/00_ONS_scraper_functions.ipynb b/nbs/00_ONS_scraper_functions.ipynb index ff0432a..5b4eec7 100644 --- a/nbs/00_ONS_scraper_functions.ipynb +++ b/nbs/00_ONS_scraper_functions.ipynb @@ -171,7
+171,8 @@ ], "source": [ "# The url for the first dataset in the list\n", - "get_ONS_datasets_urls()[0]" + "urls = get_ONS_datasets_urls()\n", + "urls[0]" ] }, { @@ -301,6 +302,70 @@ " raise" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [
+ "#| export\n",
+ "def find_ONS_cols_and_unique_vals(url):\n",
+ "    \"\"\"\n",
+ "    Using the provided url, this function:\n",
+ "    - Checks that a download is possible\n",
+ "    - Downloads the csv file of the dataset\n",
+ "    - Gets all the column titles\n",
+ "    - Gets the unique values from columns containing non-numeric data\n",
+ "\n",
+ "    Check if a string contains a number: https://stackoverflow.com/questions/354038/how-do-i-check-if-a-string-represents-a-number-float-or-int\n",
+ "    \"\"\"\n",
+ "\n",
+ "    temp = requests.get(url)\n",
+ "    try:\n",
+ "        temp = temp.json()\n",
+ "    except ValueError: # response body wasn't valid JSON\n",
+ "        return {}\n",
+ "\n",
+ "    if temp.get('downloads'):\n",
+ "        temp_url = temp['downloads']['csv']['href']\n",
+ "\n",
+ "        csv_url = requests.get(temp_url).text\n",
+ "        temp_df = pd.read_csv(StringIO(csv_url), low_memory=False)\n",
+ "\n",
+ "        col_data = {}\n",
+ "\n",
+ "        for col in temp_df.columns:\n",
+ "            col_data[col] = None\n",
+ "\n",
+ "            if type(temp_df.loc[:, col][0]) == str: # Check for string data type\n",
+ "                if not temp_df.loc[:, col][0].replace('.','', 1).isdigit(): # if the data is a string, make sure it isn't a numeric string\n",
+ "                    col_data[col] = list(temp_df.loc[:, col].unique())\n",
+ "    else:\n",
+ "        col_data = {} # This means the link didn't have a csv file href\n",
+ "\n",
+ "    return col_data"
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['v4_2', 'LCL', 'UCL', 'yyyy-qq', 'Time', 'uk-only', 'Geography', 'measure-of-wellbeing', 'MeasureOfWellbeing', 'wellbeing-estimate', 'Estimate', 'seasonal-adjustment', 'SeasonalAdjustment'])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "find_ONS_cols_and_unique_vals(urls[0]).keys()" + ] + }, { "cell_type": "code", "execution_count": null,
diff --git a/nbs/01_Nomis_scraper_functions.ipynb b/nbs/01_Nomis_scraper_functions.ipynb index 1e22780..62257ed 100644 --- a/nbs/01_Nomis_scraper_functions.ipynb +++ b/nbs/01_Nomis_scraper_functions.ipynb @@ -301,42 +301,28 @@ "metadata": {}, "outputs": [ { - "ename": "ConnectTimeout", - "evalue": "HTTPSConnectionPool(host='www.nomisweb.co.uk', port=443): Max retries exceeded with url: /api/v01/dataset/NM_1089_1/OCCRATBROOM.def.sdmx.json (Caused by ConnectTimeoutError(, 'Connection to www.nomisweb.co.uk timed out.
(connect timeout=None)'))", - "output_type": "error", - "traceback": [ ...ANSI-escaped traceback strings omitted: urllib3 TimeoutError -> ConnectTimeoutError -> MaxRetryError -> requests ConnectTimeout, raised from the requests.get() call inside get_nomis_datasets_parameters()... - ] + "data": { + "text/plain": [
+ "{'GEOGRAPHY': ['United Kingdom',\n",
+ " 'Great Britain',\n",
+ " 'England',\n",
+ " 'Wales',\n",
+ " 'Scotland',\n",
+ " 'Northern Ireland',\n",
+ " 'England and Wales'],\n",
+ " 'SEX': ['Male', 'Female', 'Total'],\n",
+ " 'ITEM': ['Total claimants',\n",
+ " 'Students on vacation',\n",
+ " 'Temporarily stopped',\n",
+ " 'Claimants under 18 years',\n",
+ " 'Married females'],\n",
+ " 'MEASURES': ['claimants', 'workforce', 'active', 'residence'],\n",
+ " 'FREQ': ['Monthly', 'Quarterly', 'Half-yearly, semester', 'Annually']}"
+ ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [
diff --git a/nbs/combiningEverything.ipynb b/nbs/combiningEverything.ipynb index 5fad428..8fe0485 100644 --- a/nbs/combiningEverything.ipynb +++ b/nbs/combiningEverything.ipynb @@ -64,79 +64,78 @@ "\n", " \"\"\" ONS Data \"\"\"\n", " \n", - " print(\"Loading ONS...\")\n", + " # print(\"Loading ONS...\")\n", " \n", - " # Takes over 70 mins to run\n", - " titles, descriptions = osf.get_ONS_datasets_titles_descriptions()\n", - " urls = osf.get_ONS_datasets_urls()\n", - " long_desc = osf.get_ONS_long_description()\n", + " # # Takes over 70 mins to run\n", + " # titles, descriptions = osf.get_ONS_datasets_titles_descriptions()\n", + " # urls = osf.get_ONS_datasets_urls()\n", + " # long_desc = osf.get_ONS_long_description()\n", " \n", - " latest_release = []\n", - " cols = []\n", - " col_data = []\n", - " count = 0\n", + " # latest_release = []\n", + " # cols = []\n", + " # col_data = []\n", + " # count = 0\n", " \n", - " for url in urls:\n", - " response = requests.get(url)\n", - " try: \n", - " latest_release.append(response.json()['release_date'])\n", - " except:\n", - " latest_release.append(float('nan'))\n", + " # for url in urls:\n", + " # response = requests.get(url)\n", + " # try: \n", + " # latest_release.append(response.json()['release_date'])\n", + " # except:\n", + " # latest_release.append(float('nan'))\n", " \n", - " try:\n", - " cols.append(osf.find_ONS_cols(url))\n", - " except:\n", - " cols.append('')\n", + " # try:\n", + " # cols.append(osf.find_ONS_cols(url))\n", + " # except:\n", + " # cols.append('')\n", " \n", - " \n", - " try:\n", - " col_data.append(osf.find_ONS_cols_and_unique_vals(url))\n", - " except:\n", - " col_data.append('')\n", + " # try:\n", + " # col_data.append(osf.find_ONS_cols_and_unique_vals(url))\n", + " # except:\n", + " # col_data.append('')\n", " \n", " \n", - " count +=1\n", + " # count +=1\n", " \n", - " ONS_df = pd.DataFrame({'Title': titles,
'Description': descriptions, \n", - " 'Long_description': long_desc, 'Columns': cols, \n", - " 'Unique_parameters': col_data, 'Latest_release': latest_release})\n", + " # ONS_df = pd.DataFrame({'Title': titles, 'Description': descriptions, \n", + " # 'Long_description': long_desc, 'Columns': cols, \n", + " # 'Unique_parameters': col_data, 'Latest_release': latest_release})\n", " \n", " \n", " \"\"\" Nomis Data \"\"\"\n", " \n", - " print(\"Loading Nomis...\")\n", + " # print(\"Loading Nomis...\")\n", " \n", - " # Takes around 50 mins\n", - " titles, descriptions, l_descriptions = nsf.get_nomis_datasets_titles_descriptions()\n", - " latest_release = nsf.get_nomis_last_updated()\n", - " cols = np.empty(len(titles))\n", - " params = nsf.get_nomis_datasets_parameters()\n", + " # # Takes around 50 mins\n", + " # titles, descriptions, l_descriptions = nsf.get_nomis_datasets_titles_descriptions()\n", + " # latest_release = nsf.get_nomis_last_updated()\n", + " # cols = np.empty(len(titles))\n", + " # params = nsf.get_nomis_datasets_parameters()\n", " \n", - " nomis_df = pd.DataFrame({'Title': titles, 'Description': descriptions, \n", - " 'Long_description': l_descriptions, 'Columns': cols, \n", - " 'Unique_parameters': params, 'Latest_release': latest_release})\n", + " # nomis_df = pd.DataFrame({'Title': titles, 'Description': descriptions, \n", + " # 'Long_description': l_descriptions, 'Columns': cols, \n", + " # 'Unique_parameters': params, 'Latest_release': latest_release})\n", "\n", "\n", " \"\"\" Monthly Insolvency Statistics \"\"\"\n", " \n", - " print(\"Loading insolvency stats\")\n", - " cols = []\n", - " col_data = []\n", - " insolvency_stats, long_desc = iss.get_insolvency_stats()\n", - " titles = insolvency_stats.keys()\n", + " # print(\"Loading insolvency stats\")\n", + " # cols = []\n", + " # col_data = []\n", + " # insolvency_stats, long_desc = iss.get_insolvency_stats()\n", + " # titles = insolvency_stats.keys()\n", " \n", - " # The descriptions on latest releases are all the same so making a repeated list\n", - " description = [iss.get_mis_description()] * len(titles)\n", - " latest_release = [iss.get_mis_last_updated()] * len(titles)\n", - " long_desc = [long_desc] * len(titles)\n", + " # # The descriptions on latest releases are all the same so making a repeated list\n", + " # description = [iss.get_mis_description()] * len(titles)\n", + " # latest_release = [iss.get_mis_last_updated()] * len(titles)\n", + " # long_desc = [long_desc] * len(titles)\n", " \n", - " for title in titles:\n", - " cols.append(list(insolvency_stats[title].columns))\n", - " col_data.append(iss.get_insolvency_unique_column_vals(insolvency_stats[title]))\n", + " # for title in titles:\n", + " # cols.append(list(insolvency_stats[title].columns))\n", + " # col_data.append(iss.get_insolvency_unique_column_vals(insolvency_stats[title]))\n", " \n", - " insolvency_df = pd.DataFrame({'Title': titles, 'Description': description, \n", - " 'Long_description': long_desc, 'Columns': cols, \n", - " 'Unique_parameters': col_data, 'Latest_release': latest_release})\n", + " # insolvency_df = pd.DataFrame({'Title': titles, 'Description': description, \n", + " # 'Long_description': long_desc, 'Columns': cols, \n", + " # 'Unique_parameters': col_data, 'Latest_release': latest_release})\n", "\n", " \n", " \"\"\" Police Data - currently only for Bethnal green so is commented out \"\"\"\n", @@ -146,20 +145,20 @@ " response = requests.get(url)\n", " records = response.json()['records']\n", "\n", - " constituency_coords = {}\n", + 
" constituency_coords = pds.get_constituency_coordinates()\n", "\n", - " for record in records:\n", - " coords = []\n", - " name = record['fields']['pcon22nm'] # This will change as the dataset gets updated each year\n", - " temp_coords = record['fields']['geo_shape']['coordinates']\n", + " # for record in records:\n", + " # coords = []\n", + " # name = record['fields']['pcon22nm'] # This will change as the dataset gets updated each year\n", + " # temp_coords = record['fields']['geo_shape']['coordinates']\n", " \n", - " temp_coords = pds.flatten_list(temp_coords)\n", + " # temp_coords = pds.flatten_list(temp_coords)\n", " \n", - " offset = math.floor(len(temp_coords)/4)\n", - " constituency_coords[name] = (str(round(temp_coords[0][1], 3)) + ',' + str(round(temp_coords[0][0], 3)) + ':')\n", - " constituency_coords[name] += (str(round(temp_coords[offset][1], 3)) + ',' + str(round(temp_coords[offset][0], 3)) + ':')\n", - " constituency_coords[name] += (str(round(temp_coords[offset*2][1], 3)) + ',' + str(round(temp_coords[offset*2][0], 3)) + ':')\n", - " constituency_coords[name] += (str(round(temp_coords[offset*3][1], 3)) + ',' + str(round(temp_coords[offset*3][0], 3)))\n", + " # offset = math.floor(len(temp_coords)/4)\n", + " # constituency_coords[name] = (str(round(temp_coords[0][1], 3)) + ',' + str(round(temp_coords[0][0], 3)) + ':')\n", + " # constituency_coords[name] += (str(round(temp_coords[offset][1], 3)) + ',' + str(round(temp_coords[offset][0], 3)) + ':')\n", + " # constituency_coords[name] += (str(round(temp_coords[offset*2][1], 3)) + ',' + str(round(temp_coords[offset*2][0], 3)) + ':')\n", + " # constituency_coords[name] += (str(round(temp_coords[offset*3][1], 3)) + ',' + str(round(temp_coords[offset*3][0], 3)))\n", "\n", " street_level_crimes, sl_last_updated = pds.get_street_level_crimes(constituency_coords['Bethnal Green and Bow'], '2023-03', 'poly')\n", " no_loc_crimes = pds.get_crimes_no_loc('metropolitan', '2023-03')\n", @@ -205,36 +204,36 @@ "\n", " \"\"\" NHS Quality and Outcomes \"\"\"\n", "\n", - " print(\"Loading NHS Quality and Outcomes...\")\n", - " NHS_quality_outcomes, long_description, latest_release = qos.get_NHS_qualityOutcomes()\n", + " # print(\"Loading NHS Quality and Outcomes...\")\n", + " # NHS_quality_outcomes, long_description, latest_release = qos.get_NHS_qualityOutcomes()\n", "\n", - " titles = []\n", - " for title in NHS_quality_outcomes.keys():\n", - " titles.append(title.split(':')[1]) # Removing the text before : which is 'Table n:'\n", + " # titles = []\n", + " # for title in NHS_quality_outcomes.keys():\n", + " # titles.append(title.split(':')[1]) # Removing the text before : which is 'Table n:'\n", " \n", - " num_tables = len(titles)\n", - " descriptions = [''] * num_tables\n", - " long_descriptions = [long_description] * num_tables\n", - " last_rel = [latest_release] * num_tables\n", + " # num_tables = len(titles)\n", + " # descriptions = [''] * num_tables\n", + " # long_descriptions = [long_description] * num_tables\n", + " # last_rel = [latest_release] * num_tables\n", " \n", - " cols = []\n", - " unique_params = []\n", - " for dataset in NHS_quality_outcomes.keys():\n", - " sheet = NHS_quality_outcomes[dataset]\n", - " temp_cols, temp_unqParams = qos.get_qualityOutcomes_uniqueColumnValues(sheet)\n", - " cols.append(temp_cols)\n", - " unique_params.append(temp_unqParams)\n", + " # cols = []\n", + " # unique_params = []\n", + " # for dataset in NHS_quality_outcomes.keys():\n", + " # sheet = NHS_quality_outcomes[dataset]\n", + " # temp_cols, 
temp_unqParams = qos.get_qualityOutcomes_uniqueColumnValues(sheet)\n", + " # cols.append(temp_cols)\n", + " # unique_params.append(temp_unqParams)\n", "\n", - " qualOutcomes_df = pd.DataFrame({'Title': titles, 'Description': descriptions, \n", - " 'Long_description': long_descriptions, 'Columns': cols, \n", - " 'Unique_parameters': unique_params, 'Latest_release': latest_release})\n", + " # qualOutcomes_df = pd.DataFrame({'Title': titles, 'Description': descriptions, \n", + " # 'Long_description': long_descriptions, 'Columns': cols, \n", + " # 'Unique_parameters': unique_params, 'Latest_release': latest_release})\n", "\n", " \n", - " # metadata_df = pd.concat([ONS_df, nomis_df, insolvency_df, police_df, ]).reset_index(drop=True)\n", - " metadata_df = pd.concat([police_df, qualOutcomes_df]).reset_index(drop=True)\n", + " # # metadata_df = pd.concat([ONS_df, nomis_df, insolvency_df, police_df, ]).reset_index(drop=True)\n", + " # metadata_df = pd.concat([police_df, qualOutcomes_df]).reset_index(drop=True)\n", " print(\"Complete\")\n", "\n", - " return metadata_df" + " return police_df #metadata_df" ] }, { @@ -242,20 +241,7 @@ "execution_count": null, "id": "7f46ab34-fc88-4b17-843a-4fcd0bc66eb0", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading ONS...\n", - "Loading Nomis...\n", - "Loading insolvency stats\n", - "Loding police data...\n", - "Loading NHS Quality and Outcomes...\n", - "Complete\n" - ] - } - ], + "outputs": [], "source": [ "metadata_df = createMetadata()" ] @@ -265,492 +251,7 @@ "execution_count": null, "id": "13b2d6c1-bda6-4d4e-aa67-1de6b61e225b", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TitleDescriptionLong_descriptionColumnsUnique_parametersLatest_release
0Police Data - Street-level crimesCrimes at street-level; either within a 1 mile...The Police API Documentation for Street-Level ...[category, location_type, context, location_su...{'category': ['anti-social-behaviour', 'bicycl...2023-06-01
1Police Data - Crimes with no locationReturns a list of crimes that could not be map...The Police API Documentation for Crimes with N...[category, context, location_subtype, outcome_...{'category': ['bicycle-theft', 'burglary', 'cr...2023-06-01
2Police Data - Stop and searches by areaStop and searches at street-level; either with...The Police API Documentation for Stop and Sear...[outcome, self_defined_ethnicity, gender, legi...{'outcome': ['A no further action disposal', '...2023-06
3Police Data - Stop and searches with no locationStop and searches that could not be mapped to ...The Police API Documentation for Stop and Sear...[age_range, outcome, self_defined_ethnicity, g...{'age_range': ['over 34', '18-24', None, '25-3...2023-06
4Prevalence, achievement and personalised care...The objective of the Quality and Outcomes Fra...[Region name, Achievement Score (max 12), Regi...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
5Achievement and personalised care adjustments...The objective of the Quality and Outcomes Fra...[Region name, Region ODS code, Total Achieveme...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
6Prevalence, achievement and personalised care...The objective of the Quality and Outcomes Fra...[Region name, Achievement Score (max 12), Regi...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
7Prevalence, achievement and personalised care...The objective of the Quality and Outcomes Fra...[Region name, Achievement Score (max 12), Achi...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
8Prevalence, achievement and personalised care...The objective of the Quality and Outcomes Fra...[Region name, Region ODS code, Total Denominat...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
9Prevalence and achievement, cardiovascular gr...The objective of the Quality and Outcomes Fra...[Region name, List size, Region ONS code, Achi...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
10Prevalence, achievement and personalised care...The objective of the Quality and Outcomes Fra...[Region name, Region ODS code, Total Achieveme...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
11Prevalence, achievement and personalised care...The objective of the Quality and Outcomes Fra...[Region name, Region ODS code, Total Denominat...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
12Prevalence, achievement and personalised care...The objective of the Quality and Outcomes Fra...[Region name, Region ODS code, Total Denominat...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
13Prevalence and achievement, lifestyle group, ...The objective of the Quality and Outcomes Fra...[Region name, Total Achievement Score (max 8),...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
14Achievement and personalised care adjustments...The objective of the Quality and Outcomes Fra...[Region name, Achievement Score (max 12), Regi...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
15Prevalence, achievement and personalised care...The objective of the Quality and Outcomes Fra...[Region name, Region ODS code, Total Denominat...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
16Prevalence and achievement, high dependancy a...The objective of the Quality and Outcomes Fra...[Region name, Region ONS code, Year on year ch...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
17Prevalence, achievement and personalised care...The objective of the Quality and Outcomes Fra...[Region name, Total Achievement Score (max 76)...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
18Prevalence, achievement and personalised care...The objective of the Quality and Outcomes Fra...[Region name, Region ODS code, Total Denominat...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
19Prevalence and achievement, high dependancy a...The objective of the Quality and Outcomes Fra...[Region name, List size, Region ONS code, Year...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
20Prevalence, achievement and personalised care...The objective of the Quality and Outcomes Fra...[Region name, Total Achievement Score (max 44)...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
21Prevalence, achievement and personalised care...The objective of the Quality and Outcomes Fra...[Region name, Region ODS code, Total Denominat...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
22Prevalence and achievement, mental health and...The objective of the Quality and Outcomes Fra...[Region name, Region ONS code, Total Achieveme...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
23Prevalence and achievement, mental health and...The objective of the Quality and Outcomes Fra...[Region name, List size, Region ONS code, Year...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
24Prevalence, achievement and personalised care...The objective of the Quality and Outcomes Fra...[Region name, Region ODS code, Total Denominat...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
25Prevalence and achievement, musculoskeletal g...The objective of the Quality and Outcomes Fra...[Region name, Region ONS code, Year on year ch...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
26Prevalence, achievement and personalised care...The objective of the Quality and Outcomes Fra...[Region name, Total Achievement Score (max 6),...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
27Achievement and personalised care adjustments...The objective of the Quality and Outcomes Fra...[Region name, Achievement Score (max 7), Regio...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
28Achievement and personalised care adjustments...The objective of the Quality and Outcomes Fra...[Region name, List size ages 1 to 2, Region OD...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
29Achievement, quality improvement group, early...The objective of the Quality and Outcomes Fra...[Region name, List size, Region ONS code, Achi...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
30Achievement, quality improvement group, care ...The objective of the Quality and Outcomes Fra...[Region name, List size, Region ONS code, Achi...{'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59...22/09/2022
\n", - "
" - ], - "text/plain": [ - " Title \\\n", - "0 Police Data - Street-level crimes \n", - "1 Police Data - Crimes with no location \n", - "2 Police Data - Stop and searches by area \n", - "3 Police Data - Stop and searches with no location \n", - "4 Prevalence, achievement and personalised care... \n", - "5 Achievement and personalised care adjustments... \n", - "6 Prevalence, achievement and personalised care... \n", - "7 Prevalence, achievement and personalised care... \n", - "8 Prevalence, achievement and personalised care... \n", - "9 Prevalence and achievement, cardiovascular gr... \n", - "10 Prevalence, achievement and personalised care... \n", - "11 Prevalence, achievement and personalised care... \n", - "12 Prevalence, achievement and personalised care... \n", - "13 Prevalence and achievement, lifestyle group, ... \n", - "14 Achievement and personalised care adjustments... \n", - "15 Prevalence, achievement and personalised care... \n", - "16 Prevalence and achievement, high dependancy a... \n", - "17 Prevalence, achievement and personalised care... \n", - "18 Prevalence, achievement and personalised care... \n", - "19 Prevalence and achievement, high dependancy a... \n", - "20 Prevalence, achievement and personalised care... \n", - "21 Prevalence, achievement and personalised care... \n", - "22 Prevalence and achievement, mental health and... \n", - "23 Prevalence and achievement, mental health and... \n", - "24 Prevalence, achievement and personalised care... \n", - "25 Prevalence and achievement, musculoskeletal g... \n", - "26 Prevalence, achievement and personalised care... \n", - "27 Achievement and personalised care adjustments... \n", - "28 Achievement and personalised care adjustments... \n", - "29 Achievement, quality improvement group, early... \n", - "30 Achievement, quality improvement group, care ... \n", - "\n", - " Description \\\n", - "0 Crimes at street-level; either within a 1 mile... \n", - "1 Returns a list of crimes that could not be map... \n", - "2 Stop and searches at street-level; either with... \n", - "3 Stop and searches that could not be mapped to ... \n", - "4 \n", - "5 \n", - "6 \n", - "7 \n", - "8 \n", - "9 \n", - "10 \n", - "11 \n", - "12 \n", - "13 \n", - "14 \n", - "15 \n", - "16 \n", - "17 \n", - "18 \n", - "19 \n", - "20 \n", - "21 \n", - "22 \n", - "23 \n", - "24 \n", - "25 \n", - "26 \n", - "27 \n", - "28 \n", - "29 \n", - "30 \n", - "\n", - " Long_description \\\n", - "0 The Police API Documentation for Street-Level ... \n", - "1 The Police API Documentation for Crimes with N... \n", - "2 The Police API Documentation for Stop and Sear... \n", - "3 The Police API Documentation for Stop and Sear... \n", - "4 The objective of the Quality and Outcomes Fra... \n", - "5 The objective of the Quality and Outcomes Fra... \n", - "6 The objective of the Quality and Outcomes Fra... \n", - "7 The objective of the Quality and Outcomes Fra... \n", - "8 The objective of the Quality and Outcomes Fra... \n", - "9 The objective of the Quality and Outcomes Fra... \n", - "10 The objective of the Quality and Outcomes Fra... \n", - "11 The objective of the Quality and Outcomes Fra... \n", - "12 The objective of the Quality and Outcomes Fra... \n", - "13 The objective of the Quality and Outcomes Fra... \n", - "14 The objective of the Quality and Outcomes Fra... \n", - "15 The objective of the Quality and Outcomes Fra... \n", - "16 The objective of the Quality and Outcomes Fra... \n", - "17 The objective of the Quality and Outcomes Fra... 
\n", - "18 The objective of the Quality and Outcomes Fra... \n", - "19 The objective of the Quality and Outcomes Fra... \n", - "20 The objective of the Quality and Outcomes Fra... \n", - "21 The objective of the Quality and Outcomes Fra... \n", - "22 The objective of the Quality and Outcomes Fra... \n", - "23 The objective of the Quality and Outcomes Fra... \n", - "24 The objective of the Quality and Outcomes Fra... \n", - "25 The objective of the Quality and Outcomes Fra... \n", - "26 The objective of the Quality and Outcomes Fra... \n", - "27 The objective of the Quality and Outcomes Fra... \n", - "28 The objective of the Quality and Outcomes Fra... \n", - "29 The objective of the Quality and Outcomes Fra... \n", - "30 The objective of the Quality and Outcomes Fra... \n", - "\n", - " Columns \\\n", - "0 [category, location_type, context, location_su... \n", - "1 [category, context, location_subtype, outcome_... \n", - "2 [outcome, self_defined_ethnicity, gender, legi... \n", - "3 [age_range, outcome, self_defined_ethnicity, g... \n", - "4 [Region name, Achievement Score (max 12), Regi... \n", - "5 [Region name, Region ODS code, Total Achieveme... \n", - "6 [Region name, Achievement Score (max 12), Regi... \n", - "7 [Region name, Achievement Score (max 12), Achi... \n", - "8 [Region name, Region ODS code, Total Denominat... \n", - "9 [Region name, List size, Region ONS code, Achi... \n", - "10 [Region name, Region ODS code, Total Achieveme... \n", - "11 [Region name, Region ODS code, Total Denominat... \n", - "12 [Region name, Region ODS code, Total Denominat... \n", - "13 [Region name, Total Achievement Score (max 8),... \n", - "14 [Region name, Achievement Score (max 12), Regi... \n", - "15 [Region name, Region ODS code, Total Denominat... \n", - "16 [Region name, Region ONS code, Year on year ch... \n", - "17 [Region name, Total Achievement Score (max 76)... \n", - "18 [Region name, Region ODS code, Total Denominat... \n", - "19 [Region name, List size, Region ONS code, Year... \n", - "20 [Region name, Total Achievement Score (max 44)... \n", - "21 [Region name, Region ODS code, Total Denominat... \n", - "22 [Region name, Region ONS code, Total Achieveme... \n", - "23 [Region name, List size, Region ONS code, Year... \n", - "24 [Region name, Region ODS code, Total Denominat... \n", - "25 [Region name, Region ONS code, Year on year ch... \n", - "26 [Region name, Total Achievement Score (max 6),... \n", - "27 [Region name, Achievement Score (max 7), Regio... \n", - "28 [Region name, List size ages 1 to 2, Region OD... \n", - "29 [Region name, List size, Region ONS code, Achi... \n", - "30 [Region name, List size, Region ONS code, Achi... \n", - "\n", - " Unique_parameters Latest_release \n", - "0 {'category': ['anti-social-behaviour', 'bicycl... 2023-06-01 \n", - "1 {'category': ['bicycle-theft', 'burglary', 'cr... 2023-06-01 \n", - "2 {'outcome': ['A no further action disposal', '... 2023-06 \n", - "3 {'age_range': ['over 34', '18-24', None, '25-3... 2023-06 \n", - "4 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "5 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "6 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "7 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "8 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "9 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "10 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 
22/09/2022 \n", - "11 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "12 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "13 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "14 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "15 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "16 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "17 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "18 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "19 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "20 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "21 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "22 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "23 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "24 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "25 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "26 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "27 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "28 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "29 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 \n", - "30 {'Region ODS code': ['ENG', 'Y56', 'Y58', 'Y59... 22/09/2022 " - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "metadata_df" ] diff --git a/nbs/index.ipynb b/nbs/index.ipynb index 4fa7082..f929c49 100644 --- a/nbs/index.ipynb +++ b/nbs/index.ipynb @@ -27,7 +27,8 @@ ")\n", "\n", "import requests\n", - "import pandas as pd" + "import pandas as pd\n", + "import numpy as np" ] }, { @@ -71,7 +72,8 @@ "- pandas library\n", "- BeautifulSoup library\n", "- re (Regular Expression) module\n", - "- Math" + "- Math\n", + "- numpy" ] }, { @@ -85,7 +87,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "These scripts are aimed at retrieving dataset titles, two descriptions (long and short), column titles, unique non-numeric column values, and the release date / date of last update." + "The following shows how the functions can be used to collect metadata on the datasets available from each source. \n", + "- These scripts are aimed at retrieving dataset titles, two descriptions (long and short), column titles, unique non-numeric column values, and the release date / date of last update. \n", + "- Titles and descriptions are given as strings, columns as a list, and unique column values as a dictionary (with the key being the column title). " ] }, { @@ -93,7 +97,7 @@ "metadata": {}, "source": [ "### ONS Functions\n", - "> ONS functions include: get_ONS_datasets_titles_descriptions(), get_ONS_long_description(), get_ONS_datasets_urls(), find_ONS_cols(), find_ONS_cols_and_unique_vals(). For more information check the specific documentation for the ONS functions." + "> ONS functions include: get_ONS_datasets_titles_descriptions(), get_ONS_long_description(), get_ONS_datasets_urls(), find_ONS_cols(), find_ONS_cols_and_unique_vals()." ] }, { @@ -155,7 +159,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Nomis Functions" + "### Nomis Functions\n", + "> Nomis functions include: get_nomis_datasets_titles_descriptions(), get_nomis_dataset_parameters(), and get_nomis_last_updated()." 
] }, { "cell_type": "code", @@ -163,13 +168,29 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from database_compendium.utils.Nomis_scraper_functions import *\n", + "\n", + "# Titles, descriptions, and long descriptions\n", + "titles, descriptions, l_descriptions = get_nomis_datasets_titles_descriptions()\n", + "\n", + "# Unfortunately the Nomis API doesn't currently have a way of collecting columns from a dataset without specifying parameters\n", + "# beforehand. An empty placeholder array is used so the data fits in a combined dataframe with the data from other sources.\n", + "cols = np.empty(len(titles))\n", + "\n", + "# Dataset unique parameters\n", + "params = get_nomis_datasets_parameters()\n", + "\n", + "# Most recent release date\n", + "latest_release = get_nomis_last_updated()" + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Insolvency Functions" + "### Insolvency Functions\n", + "> The insolvency statistics are released as an Excel file once a month, which means fewer functions are needed, as almost all of the required data is in that one file. Functions include: get_insolvency_stats(), get_mis_description(), get_mis_last_updated(), and get_insolvency_unique_column_vals()." ] }, { "cell_type": "code", @@ -177,13 +198,29 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from database_compendium.utils.insolvency_stats_scrapers import *\n", + "\n", + "# Insolvency stats are given as a dictionary of dataframes, where each key is a dataset title\n", + "insolvency_stats, long_desc = get_insolvency_stats()\n", + "titles = list(insolvency_stats.keys())\n", + "\n", + "# The description and latest release date are the same for every dataset in the file\n", + "description = get_mis_description()\n", + "latest_release = get_mis_last_updated()\n", + "\n", + "# Dataset columns and unique column values\n", + "cols = list(insolvency_stats[titles[0]].columns)\n", + "col_data = get_insolvency_unique_column_vals(insolvency_stats[titles[0]])" + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Police Data Functions" + "### Police Data Functions\n", + "> The police data is a bit more awkward, as it requires either a latitude and longitude or a poly-area made up of latitude and longitude pairs. To make things easier, the get_constituency_coordinates() function was added; it returns a dictionary mapping every Westminster parliamentary constituency to four coordinates forming a very low-resolution poly-area. For records with no location, the search is done by police force instead.\n", + "> Functions include: get_constituency_coordinates(), get_street_level_crimes(), get_stop_searches(), get_crimes_no_loc(), and get_searches_no_loc()."
] }, { "cell_type": "code", @@ -191,7 +228,52 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from database_compendium.utils.police_data_scrapers import *\n", + "\n", + "# Coordinates for all UK constituencies - for a list of constituency names use constituency_coords.keys()\n", + "constituency_coords = get_constituency_coordinates()\n", + "\n", + "street_level_crimes, sl_last_updated = get_street_level_crimes(constituency_coords['Bethnal Green and Bow'], '2023-03', 'poly')\n", + "stop_searches, ss_last_updated = get_stop_searches(constituency_coords['Bethnal Green and Bow'], '2023-03', 'poly')\n", + "\n", + "no_loc_crimes = get_crimes_no_loc('metropolitan', '2023-03')\n", + "searches_no_loc = get_searches_no_loc('metropolitan', '2023-03')\n", + "\n", + "# Given a dataset, gets its unique column values - done one dataset at a time\n", + "col_data = get_unique_col_vals(street_level_crimes)\n", + "\n", + "# Columns are the keys of the unique-column-values dictionary\n", + "cols = col_data.keys()" + ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### NHS Quality and Outcomes\n", + "> As with the insolvency stats, this data comes as an Excel file.\n", + "> Functions include: get_NHS_qualityOutcomes() and get_qualityOutcomes_uniqueColumnValues()." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from database_compendium.utils.NHS_QualityOutcomes_scrapers import *\n", + "\n", + "# Returns a dictionary of dataframes.\n", + "# The latest release and long description are the same for all datasets in this file\n", + "NHS_quality_outcomes, long_description, latest_release = get_NHS_qualityOutcomes()\n", + "\n", + "# Dataset titles\n", + "titles = list(NHS_quality_outcomes.keys())\n", + "\n", + "sheet = NHS_quality_outcomes[titles[0]]\n", + "cols, uniqueParams = get_qualityOutcomes_uniqueColumnValues(sheet)" + ] }, { "cell_type": "markdown", @@ -215,6 +297,13 @@ "source": [ "Please note that the provided documentation is a general guide based on the information available in the code snippet. You might need to adjust the documentation according to the specific needs of your project and any further developments made to the code." ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For more information on the functions and how the code works, check the individual function files." + ] } ], "metadata": { diff --git a/nbs/sidebar.yml b/nbs/sidebar.yml index 94f87d4..a4a8999 100644 --- a/nbs/sidebar.yml +++ b/nbs/sidebar.yml @@ -9,6 +9,8 @@ website: - 04_NHS_Quality&Outcomes_scrapers.ipynb - 05_Embeddings_analysis.ipynb - 06_matching_columns.ipynb + - combiningEverything.ipynb + - network.html - test_env.ipynb - section: figures contents:
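Note: the Nomis cell in this diff pads `cols` with an empty placeholder array so that every source yields the same six metadata fields and can be stacked into one dataframe (the `metadata_df` whose stale output is cleared above, and presumably what the new combiningEverything.ipynb assembles). A minimal sketch of that combining step follows; `combine_metadata` is a hypothetical helper, not a function from the package, and it assumes parallel per-source lists with the column names visible in the removed metadata_df output.

```python
# Hypothetical sketch of the combining step; combine_metadata is not part of
# database_compendium. Inputs are assumed to be parallel lists, one entry per
# dataset, as produced by the index.ipynb cells above.
import pandas as pd

def combine_metadata(titles, descriptions, long_descs, cols, unique_params, releases):
    # Column names mirror the metadata_df output shown in the Nomis notebook.
    return pd.DataFrame({
        'Title': titles,
        'Description': descriptions,
        'Long_description': long_descs,
        'Columns': cols,
        'Unique_parameters': unique_params,
        'Latest_release': releases,
    })

# Dummy single-row example:
df = combine_metadata(
    ['Police Data - Street-level crimes'],
    ['Crimes at street-level within a custom area.'],
    ['The Police API Documentation for Street-Level crimes.'],
    [['category', 'location_type']],
    [{'category': ['anti-social-behaviour', 'bicycle-theft']}],
    ['2023-06-01'],
)
print(df[['Title', 'Latest_release']])
```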
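The 'poly' argument passed to get_street_level_crimes() and get_stop_searches() suggests the four constituency coordinates are sent to the data.police.uk API as a custom-area query, whose documented poly format is colon-separated lat,lng pairs. A hedged sketch of that conversion, assuming get_constituency_coordinates() yields (lat, lng) tuples; `to_poly_string` is a hypothetical helper, not part of the package.

```python
# Hypothetical helper: serialise (lat, lng) pairs into the
# "lat,lng:lat,lng:..." string accepted by data.police.uk's poly parameter.
# The tuple format of the constituency coordinates is an assumption.
from typing import Iterable, Tuple

def to_poly_string(coords: Iterable[Tuple[float, float]]) -> str:
    return ':'.join(f'{lat},{lng}' for lat, lng in coords)

# Four rough corners of a constituency-sized bounding box:
print(to_poly_string([(51.53, -0.07), (51.53, -0.03),
                      (51.51, -0.03), (51.51, -0.07)]))
# -> 51.53,-0.07:51.53,-0.03:51.51,-0.03:51.51,-0.07
```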