This repository has been archived by the owner on Feb 7, 2024. It is now read-only.

Commit

updated README
RowanTrickett committed Aug 14, 2023
1 parent c9b696d commit f276410
Showing 8 changed files with 316 additions and 634 deletions.
2 changes: 2 additions & 0 deletions database_compendium/_modidx.py
@@ -29,6 +29,8 @@
'database_compendium/utils/Nomis_scraper_functions.py')},
'database_compendium.utils.ONS_scraper_functions': { 'database_compendium.utils.ONS_scraper_functions.find_ONS_cols': ( 'ons_scraper_functions.html#find_ons_cols',
'database_compendium/utils/ONS_scraper_functions.py'),
'database_compendium.utils.ONS_scraper_functions.find_ONS_cols_and_unique_vals': ( 'ons_scraper_functions.html#find_ons_cols_and_unique_vals',
'database_compendium/utils/ONS_scraper_functions.py'),
'database_compendium.utils.ONS_scraper_functions.get_ONS_datasets_titles_descriptions': ( 'ons_scraper_functions.html#get_ons_datasets_titles_descriptions',
'database_compendium/utils/ONS_scraper_functions.py'),
'database_compendium.utils.ONS_scraper_functions.get_ONS_datasets_urls': ( 'ons_scraper_functions.html#get_ons_datasets_urls',
40 changes: 39 additions & 1 deletion database_compendium/utils/ONS_scraper_functions.py
@@ -1,7 +1,8 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/00_ONS_scraper_functions.ipynb.

# %% auto 0
__all__ = ['get_ONS_datasets_titles_descriptions', 'get_ONS_datasets_urls', 'find_ONS_cols', 'get_ONS_long_description']
__all__ = ['get_ONS_datasets_titles_descriptions', 'get_ONS_datasets_urls', 'find_ONS_cols', 'get_ONS_long_description',
           'find_ONS_cols_and_unique_vals']

# %% ../../nbs/00_ONS_scraper_functions.ipynb 4
import requests
@@ -153,3 +154,40 @@ def get_ONS_long_description():
        description_L.append(temp_desc)

    return description_L

# %% ../../nbs/00_ONS_scraper_functions.ipynb 13
def find_ONS_cols_and_unique_vals(url):
"""
Using the url provided this function:
- Checks a download is possible
- Downloads the csv file of the dataset
- Get all the column titles
- Get the unique values from columns containing non-numeric data
Check is string contains number:https://stackoverflow.com/questions/354038/how-do-i-check-if-a-string-represents-a-number-float-or-int
"""

temp = requests.get(url)
try:
temp = temp.json()
except:
return {}

if temp['downloads']:
temp_url = temp['downloads']['csv']['href']

csv_url = requests.get(temp_url).text
temp_df = pd.read_csv(StringIO(csv_url), low_memory=False)

col_data = {}

for col in temp_df.columns:
col_data[col] = None

if type(temp_df.loc[:, col][0]) == str: # Check for string data type
if not temp_df.loc[:, col][0].replace('.','', 1).isdigit(): # if the data is a string ensure that it isn't numeric
col_data[col] = list(temp_df.loc[:, col].unique())
else:
col_data = {} # This means the link didn't have a csv file href

return col_data
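
For reference, a minimal usage sketch of the new helper (not part of the commit): it assumes the package is importable and the ONS API is reachable, and it mirrors the notebook example further down, where urls[0] is the first dataset URL.

# Usage sketch (assumes the package is installed and the ONS API is reachable); not part of this commit
from database_compendium.utils.ONS_scraper_functions import (
    get_ONS_datasets_urls,
    find_ONS_cols_and_unique_vals,
)

urls = get_ONS_datasets_urls()                      # dataset URLs from the ONS API
col_data = find_ONS_cols_and_unique_vals(urls[0])   # returns {} when no CSV download is available

# Numeric columns map to None; non-numeric columns map to a list of their unique values
categorical = {col: vals for col, vals in col_data.items() if vals is not None}
print(list(col_data.keys()))
print(list(categorical.keys()))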
1 change: 0 additions & 1 deletion database_compendium/utils/generate_Metadata.py
Original file line number Diff line number Diff line change
@@ -32,7 +32,6 @@ def createMetadata():
        except:
            cols.append('')


        try:
            col_data.append(osf.find_ONS_cols_and_unique_vals(url))
        except:
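
The hunk above shows only a slice of createMetadata(). A rough sketch of how the new helper slots in, assuming the function loops over the ONS dataset URLs, that the cols entries come from find_ONS_cols, and that the cut-off except block falls back to an empty dict (none of these details are shown in this diff):

# Rough sketch of the createMetadata() integration (loop structure, the find_ONS_cols call,
# and the empty-dict fallback are assumptions; only the try/append lines appear in the hunk above)
import database_compendium.utils.ONS_scraper_functions as osf

cols, col_data = [], []
for url in osf.get_ONS_datasets_urls():
    try:
        cols.append(osf.find_ONS_cols(url))
    except:
        cols.append('')           # shown in the hunk: empty string when columns can't be fetched

    try:
        col_data.append(osf.find_ONS_cols_and_unique_vals(url))
    except:
        col_data.append({})       # assumed fallback; the except body is cut off in the diff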
67 changes: 66 additions & 1 deletion nbs/00_ONS_scraper_functions.ipynb
@@ -171,7 +171,8 @@
],
"source": [
"# The url for the first dataset in the list\n",
"get_ONS_datasets_urls()[0]"
"urls = get_ONS_datasets_urls()\n",
"urls[0]"
]
},
{
@@ -301,6 +302,70 @@
" raise"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#| export\n",
"def find_ONS_cols_and_unique_vals(url):\n",
" \"\"\"\n",
" Using the url provided this function:\n",
" - Checks a download is possible\n",
" - Downloads the csv file of the dataset\n",
" - Get all the column titles\n",
" - Get the unique values from columns containing non-numeric data\n",
"\n",
" Check is string contains number:https://stackoverflow.com/questions/354038/how-do-i-check-if-a-string-represents-a-number-float-or-int\n",
" \"\"\"\n",
"\n",
" temp = requests.get(url)\n",
" try:\n",
" temp = temp.json()\n",
" except:\n",
" return {}\n",
"\n",
" if temp['downloads']:\n",
" temp_url = temp['downloads']['csv']['href']\n",
"\n",
" csv_url = requests.get(temp_url).text\n",
" temp_df = pd.read_csv(StringIO(csv_url), low_memory=False)\n",
"\n",
" col_data = {}\n",
"\n",
" for col in temp_df.columns:\n",
" col_data[col] = None\n",
"\n",
" if type(temp_df.loc[:, col][0]) == str: # Check for string data type\n",
" if not temp_df.loc[:, col][0].replace('.','', 1).isdigit(): # if the data is a string ensure that it isn't numeric\n",
" col_data[col] = list(temp_df.loc[:, col].unique())\n",
" else:\n",
" col_data = {} # This means the link didn't have a csv file href\n",
"\n",
" return col_data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['v4_2', 'LCL', 'UCL', 'yyyy-qq', 'Time', 'uk-only', 'Geography', 'measure-of-wellbeing', 'MeasureOfWellbeing', 'wellbeing-estimate', 'Estimate', 'seasonal-adjustment', 'SeasonalAdjustment'])"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"find_ONS_cols_and_unique_vals(urls[0]).keys()"
]
},
{
"cell_type": "code",
"execution_count": null,
(The remaining changed files in this commit are not shown here.)
