This repository has been archived by the owner on Feb 7, 2024. It is now read-only.

Commit

updated README
RowanTrickett committed Aug 14, 2023
1 parent c9b696d commit f276410
Showing 8 changed files with 316 additions and 634 deletions.
2 changes: 2 additions & 0 deletions database_compendium/_modidx.py
@@ -29,6 +29,8 @@
'database_compendium/utils/Nomis_scraper_functions.py')},
'database_compendium.utils.ONS_scraper_functions': { 'database_compendium.utils.ONS_scraper_functions.find_ONS_cols': ( 'ons_scraper_functions.html#find_ons_cols',
'database_compendium/utils/ONS_scraper_functions.py'),
'database_compendium.utils.ONS_scraper_functions.find_ONS_cols_and_unique_vals': ( 'ons_scraper_functions.html#find_ons_cols_and_unique_vals',
'database_compendium/utils/ONS_scraper_functions.py'),
'database_compendium.utils.ONS_scraper_functions.get_ONS_datasets_titles_descriptions': ( 'ons_scraper_functions.html#get_ons_datasets_titles_descriptions',
'database_compendium/utils/ONS_scraper_functions.py'),
'database_compendium.utils.ONS_scraper_functions.get_ONS_datasets_urls': ( 'ons_scraper_functions.html#get_ons_datasets_urls',
40 changes: 39 additions & 1 deletion database_compendium/utils/ONS_scraper_functions.py
@@ -1,7 +1,8 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/00_ONS_scraper_functions.ipynb.

# %% auto 0
__all__ = ['get_ONS_datasets_titles_descriptions', 'get_ONS_datasets_urls', 'find_ONS_cols', 'get_ONS_long_description']
__all__ = ['get_ONS_datasets_titles_descriptions', 'get_ONS_datasets_urls', 'find_ONS_cols', 'get_ONS_long_description',
           'find_ONS_cols_and_unique_vals']

# %% ../../nbs/00_ONS_scraper_functions.ipynb 4
import requests
@@ -153,3 +154,40 @@ def get_ONS_long_description():
        description_L.append(temp_desc)

    return description_L

# %% ../../nbs/00_ONS_scraper_functions.ipynb 13
def find_ONS_cols_and_unique_vals(url):
"""
Using the url provided this function:
- Checks a download is possible
- Downloads the csv file of the dataset
- Get all the column titles
- Get the unique values from columns containing non-numeric data
Check is string contains number:https://stackoverflow.com/questions/354038/how-do-i-check-if-a-string-represents-a-number-float-or-int
"""

temp = requests.get(url)
try:
temp = temp.json()
except:
return {}

if temp['downloads']:
temp_url = temp['downloads']['csv']['href']

csv_url = requests.get(temp_url).text
temp_df = pd.read_csv(StringIO(csv_url), low_memory=False)

col_data = {}

for col in temp_df.columns:
col_data[col] = None

if type(temp_df.loc[:, col][0]) == str: # Check for string data type
if not temp_df.loc[:, col][0].replace('.','', 1).isdigit(): # if the data is a string ensure that it isn't numeric
col_data[col] = list(temp_df.loc[:, col].unique())
else:
col_data = {} # This means the link didn't have a csv file href

return col_data
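
For reference, a minimal usage sketch of the new helper (not part of the commit): it assumes the package is importable and the ONS API is reachable, and it mirrors the notebook example further down, where urls[0] is the first dataset URL.

# Usage sketch (assumes the package is installed and the ONS API is reachable); not part of this commit
from database_compendium.utils.ONS_scraper_functions import (
    get_ONS_datasets_urls,
    find_ONS_cols_and_unique_vals,
)

urls = get_ONS_datasets_urls()                      # dataset URLs from the ONS API
col_data = find_ONS_cols_and_unique_vals(urls[0])   # returns {} when no CSV download is available

# Numeric columns map to None; non-numeric columns map to a list of their unique values
categorical = {col: vals for col, vals in col_data.items() if vals is not None}
print(list(col_data.keys()))
print(list(categorical.keys()))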
1 change: 0 additions & 1 deletion database_compendium/utils/generate_Metadata.py
Original file line number Diff line number Diff line change
@@ -32,7 +32,6 @@ def createMetadata():
        except:
            cols.append('')


        try:
            col_data.append(osf.find_ONS_cols_and_unique_vals(url))
        except:
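
The hunk above shows only a slice of createMetadata(). A rough sketch of how the new helper slots in, assuming the function loops over the ONS dataset URLs, that the cols entries come from find_ONS_cols, and that the cut-off except block falls back to an empty dict (none of these details are shown in this diff):

# Rough sketch of the createMetadata() integration (loop structure, the find_ONS_cols call,
# and the empty-dict fallback are assumptions; only the try/append lines appear in the hunk above)
import database_compendium.utils.ONS_scraper_functions as osf

cols, col_data = [], []
for url in osf.get_ONS_datasets_urls():
    try:
        cols.append(osf.find_ONS_cols(url))
    except:
        cols.append('')           # shown in the hunk: empty string when columns can't be fetched

    try:
        col_data.append(osf.find_ONS_cols_and_unique_vals(url))
    except:
        col_data.append({})       # assumed fallback; the except body is cut off in the diff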
67 changes: 66 additions & 1 deletion nbs/00_ONS_scraper_functions.ipynb
@@ -171,7 +171,8 @@
],
"source": [
"# The url for the first dataset in the list\n",
"get_ONS_datasets_urls()[0]"
"urls = get_ONS_datasets_urls()\n",
"urls[0]"
]
},
{
@@ -301,6 +302,70 @@
" raise"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#| export\n",
"def find_ONS_cols_and_unique_vals(url):\n",
" \"\"\"\n",
" Using the url provided this function:\n",
" - Checks a download is possible\n",
" - Downloads the csv file of the dataset\n",
" - Get all the column titles\n",
" - Get the unique values from columns containing non-numeric data\n",
"\n",
" Check is string contains number:https://stackoverflow.com/questions/354038/how-do-i-check-if-a-string-represents-a-number-float-or-int\n",
" \"\"\"\n",
"\n",
" temp = requests.get(url)\n",
" try:\n",
" temp = temp.json()\n",
" except:\n",
" return {}\n",
"\n",
" if temp['downloads']:\n",
" temp_url = temp['downloads']['csv']['href']\n",
"\n",
" csv_url = requests.get(temp_url).text\n",
" temp_df = pd.read_csv(StringIO(csv_url), low_memory=False)\n",
"\n",
" col_data = {}\n",
"\n",
" for col in temp_df.columns:\n",
" col_data[col] = None\n",
"\n",
" if type(temp_df.loc[:, col][0]) == str: # Check for string data type\n",
" if not temp_df.loc[:, col][0].replace('.','', 1).isdigit(): # if the data is a string ensure that it isn't numeric\n",
" col_data[col] = list(temp_df.loc[:, col].unique())\n",
" else:\n",
" col_data = {} # This means the link didn't have a csv file href\n",
"\n",
" return col_data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['v4_2', 'LCL', 'UCL', 'yyyy-qq', 'Time', 'uk-only', 'Geography', 'measure-of-wellbeing', 'MeasureOfWellbeing', 'wellbeing-estimate', 'Estimate', 'seasonal-adjustment', 'SeasonalAdjustment'])"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"find_ONS_cols_and_unique_vals(urls[0]).keys()"
]
},
{
"cell_type": "code",
"execution_count": null,
(The remaining changed files in this commit are not shown here.)
