Merge pull request #43 from miha42-github/V3.0.2

Adopt alpine image, enable edgar only returns, fix bugs
miha42-github · May 4, 2024 · bb6f14a · bb6f14a
2 parents 4e17aa8 + 90324dc
commit bb6f14a
Show file tree

Hide file tree

Showing 4 changed files with 157 additions and 64 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,5 +1,6 @@
 # Use an official Python runtime as a parent image
-FROM python:3.12-slim
+# FROM python:3.12-slim
+FROM python:3.12-alpine
 
 # Set the working directory in the container to /app
 WORKDIR /app
@@ -8,8 +9,8 @@ WORKDIR /app
 ADD . /app
 
 # Install curl and create directory
-RUN apt-get update && apt-get install -y curl && mkdir -p /app/edgar_data
-# RUN apk --no-cache add curl && mkdir -p /app/edgar_data
+# RUN apt-get update && apt-get install -y curl && mkdir -p /app/edgar_data
+RUN apk --no-cache add curl curl-dev gcc musl-dev linux-headers && mkdir -p /app/edgar_data
 
 # Install any needed packages specified in requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt

diff --git a/company_dns.py b/company_dns.py
@@ -8,6 +8,7 @@
 from starlette.middleware.base import BaseHTTPMiddleware
 import uvicorn
 import logging
+import sys
 
 from lib.sic import SICQueries
 from lib.edgar import EdgarQueries
@@ -58,27 +59,10 @@ async def wikipedia_firmographics(request):
 
 # -------------------------------------------------------------- #
 # BEGIN: General query functions
+# 
 async def general_query(request):
-    try:
-        gq.query = request.path_params['company_name']
-        # Log the query request as a debug message
-        logger.debug(f'Performing general query for company name: [{request.path_params["company_name"]}]')
-        company_wiki_data = gq.get_firmographics_wikipedia()
-        # logger.debug(f'Company wiki data: {company_wiki_data}')
-        if company_wiki_data['code'] != 200:
-            logger.error(f'There were [0] results for resource [company_name].')
-            return JSONResponse(company_wiki_data)
-
-        general_company_data = gq.merge_data(company_wiki_data['data'], company_wiki_data['data']['cik'])
-        # Call check_status_and_return to check the status of the data and return the data or an error message
-        checked_data = _check_status_and_return(general_company_data, request.path_params['company_name'])
-        if 'error' in checked_data:
-            return JSONResponse(checked_data, status_code=checked_data['code'])
-        return JSONResponse(checked_data)
-    except Exception as e:
-        logger.error(f'Error: {e}')
-        general_company_data = {'error': 'A general or code error has occured', 'code': 500}
-        return JSONResponse(general_company_data, status_code=general_company_data['code'])
+    return _handle_request(request, gq, gq.get_firmographics, 'company_name')
+
 # END: General query functions
 # -------------------------------------------------------------- #
 
@@ -96,7 +80,7 @@ def _check_status_and_return(result_data, resource_name):
         return {'message': return_msg, 'code': return_code, 'data': result_data}
     return result_data
 
-def _prepare_logging(log_level=logging.INFO):
+def _prepare_logging(log_level=logging.DEBUG):
     logging.basicConfig(format='%(levelname)s:\t%(asctime)s [module: %(name)s] %(message)s', level=log_level)
     return logging.getLogger(__file__)
 
@@ -199,6 +183,6 @@ async def dispatch(self, request, call_next):
 
 if __name__ == "__main__": 
     try:
-        uvicorn.run(app, host='0.0.0.0', port=8000, log_level='info', lifespan='off')
+        uvicorn.run(app, host='0.0.0.0', port=8000, log_level='debug', lifespan='off')
     except KeyboardInterrupt:
         logger.info("Server was shut down by the user.")
diff --git a/lib/firmographics.py b/lib/firmographics.py
@@ -1,6 +1,7 @@
 from . import edgar
 from . import wikipedia
 import sys
+import unicodedata
 import urllib.parse as url_parse
 import logging
 from geopy.geocoders import ArcGIS
@@ -92,8 +93,8 @@ def get_all_summaries(self):
         my_query.query = self.query
         return my_query.get_all_details(firmographics=False)
 
-    def get_all_details(self):
-        my_query = edgar.EdgarQueries(db_file=self.db_file)
+    def get_all_details(self, flat_return=False):
+        my_query = edgar.EdgarQueries(db_file=self.db_file, flat_return=flat_return)
         my_query.query = self.query
         return my_query.get_all_details(firmographics=True)
 
@@ -122,9 +123,31 @@ def _augment_wikidata(self, wiki_return):
             wiki_return['data']['googleFinance'] = FINANCEURL + my_ticker + ':' + my_encoded_exchange
 
         return wiki_return
+
+    def _set_location_data(self, final_company):
+        """Add coordinates and a maps link to a company record.
+
+        When the record has an 'address' key, its 'address', 'city',
+        'stateProvince' and 'zipPostal' values are joined with ", " and
+        handed to self.locate(); the returned longitude and latitude are
+        stored on the record and the returned address is URL-quoted and
+        appended to MAPSURL as 'googleMaps'.  When there is no 'address'
+        key, 'longitude', 'latitude' and 'address' are all set to UKN.
+
+        Args:
+            final_company (dict): The company record; it is updated in place.
+
+        Returns:
+            dict: The same record that was passed in.
+        """
+        # Nothing to locate: mark the location attributes as unknown and stop
+        if 'address' not in final_company:
+            for location_key in ('longitude', 'latitude', 'address'):
+                final_company[location_key] = UKN
+            return final_company
+
+        # Build the single-line address that is handed to the locator
+        address_parts = [
+            final_company[part_key]
+            for part_key in ('address', 'city', 'stateProvince', 'zipPostal')
+        ]
+        lookup_string = ", ".join(address_parts)
+
+        # The fourth element returned by locate() is not used here
+        longitude, latitude, resolved_address, _ = self.locate(lookup_string)
+        final_company['longitude'] = longitude
+        final_company['latitude'] = latitude
+        final_company['googleMaps'] = MAPSURL + url_parse.quote(resolved_address)
+        return final_company
+
+    def _set_google_urls(self, final_company):
+        """Add Google news, patents and (when possible) finance links.
+
+        The URL-quoted company 'name' is appended to NEWSURL and PATENTSURL.
+        When the record has a 'tickers' entry, a finance link is built as
+        FINANCEURL + tickers[1] + ':' + quoted tickers[0].
+
+        Args:
+            final_company (dict): The company record; it is updated in place
+                and must contain a 'name' key.
+
+        Returns:
+            dict: The same record that was passed in.
+        """
+        quoted_name = url_parse.quote(final_company['name'])
+        final_company['googleNews'] = NEWSURL + quoted_name
+        final_company['googlePatents'] = PATENTSURL + quoted_name
+
+        # Without tickers there is no finance link to build
+        if 'tickers' not in final_company:
+            return final_company
+
+        # presumably tickers is [exchange, ticker] -- verify against the data source
+        ticker_info = final_company['tickers']
+        quoted_exchange = url_parse.quote(ticker_info[0])
+        final_company['googleFinance'] = FINANCEURL + ticker_info[1] + ':' + quoted_exchange
+        return final_company
 
-
-    def merge_data(self, wiki_data, cik, company_name=None):
+    def merge_data(self, wiki_data=None):
         # TODO there are potential cases where there isn't a wikipedia page, but there is EDGAR data.
         #       Therefore we need to figure out how to handle this, perhaps we can do an OR? For sure
         #       it is better to return something instead of nothing.  In fact and in general if we 
@@ -135,6 +158,12 @@ def merge_data(self, wiki_data, cik, company_name=None):
         my_function = sys._getframe(0).f_code.co_name
         my_class = self.__class__.__name__
 
+        # Set the CIK to UKN
+        cik = UKN
+
+        # If wiki_data is not None and it exists then we need to set the CIK to wiki_data['cik']
+        if wiki_data is not None and wiki_data['cik']: cik = wiki_data['cik']
+
         # If there isn't a CIK that is known we need to return with wiki_data
         wiki_return = {
                 'code': 200, 
@@ -217,22 +246,92 @@ def merge_data(self, wiki_data, cik, company_name=None):
             final_company['latitude'] = UKN
             final_company['address'] = UKN
 
-        # Phase 4 - Add google links for various services
-        my_encoded_name = url_parse.quote(final_company['name'])
-        final_company['googleNews'] = NEWSURL + my_encoded_name
-        final_company['googlePatents'] = PATENTSURL + my_encoded_name
-
-        if 'tickers' in final_company:
-            my_encoded_exchange = url_parse.quote(final_company['tickers'][0])
-            my_ticker = final_company['tickers'][1]
-            final_company['googleFinance'] = FINANCEURL + my_ticker + ':' + my_encoded_exchange
+        # Phase 4 - Add google urls
+        final_company = self._set_google_urls(final_company)
 
-
         # Return the merged result
         return {
             'code': 200, 
-            'message': 'Wikipedia data and EDGAR has been detected and merged for the company [' + wiki_data['name'] + '].',
+            'message': f'Wikipedia data and EDGAR has been detected and merged for the company [{wiki_data['name']}].',
             'module': my_class + '-> ' + my_function,
             'data': final_company,
             'dependencies': DEPENDENCIES
-        }
+        }
+
+    def get_firmographics(self):
+        """Gather company firmographics from Wikipedia and, failing that, EDGAR.
+
+        This is the main entry point of the module.  The company name is taken
+        from self.query and the flow is:
+
+        1. Query Wikipedia via get_firmographics_wikipedia().
+        2. If that response has code 200, hand its data to merge_data() and
+           return the merged result when it is truthy.
+        3. Otherwise query EDGAR via get_all_details(flat_return=True), take the
+           first entry of its 'companies' mapping, enrich it with location data
+           and Google URLs, clean up its category and return it.
+        4. If neither source yields a company, return a LookupError response
+           with code 404.
+
+        An exception raised by either lookup is logged and reported as a
+        LookupError response with code 542.
+
+        Returns:
+            dict: On success a dictionary with the following keys:
+                code (int): The HTTP status code of the response.
+                message (str): A message describing the result of the query.
+                module (str): The module and function that was executed.
+                data (dict): The firmographics data for the company.
+                dependencies (dict): A dictionary of the dependencies for the module.
+            On failure a dictionary with the keys errorType, module,
+            dependencies, code and message.
+        """
+        my_function = sys._getframe(0).f_code.co_name
+        my_class = self.__class__.__name__
+
+        lookup_err_prototype = {
+                'errorType': 'LookupError',
+                'module': my_class + '-> ' + my_function,
+                'dependencies': DEPENDENCIES
+        }
+
+        # Shared tail of every error message returned to the caller
+        retry_hint = f'Maybe you should try an alternative structure like [{self.query} Inc., {self.query} Corp., or {self.query} Corporation].'
+
+        company_firmographics = None
+
+        # Log the start of the query
+        self.logger.info(f'Attempting to gather firmographics company [{self.query}]')
+
+        # Phase 1 - Try Wikipedia first and merge in whatever merge_data can add
+        try:
+            self.logger.debug(f'Performing general query for company name: [{self.query}]')
+            wiki_response = self.get_firmographics_wikipedia()
+            if wiki_response['code'] == 200:
+                self.logger.info(f'Wikipedia results for [{self.query}] returned successfully.')
+                company_firmographics = self.merge_data(wiki_data=wiki_response['data'])
+        except Exception as e:
+            self.logger.error(f'Wikidata results for [{self.query}] returned [{e}].')
+            lookup_err_prototype['code'] = 542
+            lookup_err_prototype['message'] = f'Using general query unable to find a company by the name [{self.query}], encountered error: [{e}]. {retry_hint}'
+            return lookup_err_prototype
+        if company_firmographics:
+            return company_firmographics
+
+        # Phase 2 - Fall back to EDGAR only
+        try:
+            self.logger.debug(f'Performing EDGAR query for company name: [{self.query}]')
+            company_firmographics = self.get_all_details(flat_return=True)
+        except Exception as e:
+            self.logger.error(f'EDGAR results for [{self.query}] returned [{e}].')
+            lookup_err_prototype['code'] = 542
+            lookup_err_prototype['message'] = f'Using EDGAR query unable to find a company by the name [{self.query}], encountered error: [{e}]. {retry_hint}'
+            return lookup_err_prototype
+
+        # An empty response, or one without any companies, means nothing was found.
+        # NOTE(review): assumes the EDGAR response is a dict -- confirm against lib/edgar.py
+        companies = company_firmographics.get('companies') if company_firmographics else None
+        if not companies:
+            lookup_err_prototype['code'] = 404
+            lookup_err_prototype['message'] = f'Unable to find a company by the name [{self.query}]. {retry_hint}'
+            return lookup_err_prototype
+
+        # Phase 3 - Clean up the first company returned and enrich it with urls and location data
+        self.logger.info(f'EDGAR results for [{self.query}] returned successfully.')
+        new_company_name = next(iter(companies))
+        company_firmographics = companies[new_company_name]
+        company_firmographics = self._set_location_data(company_firmographics)
+        company_firmographics = self._set_google_urls(company_firmographics)
+
+        # Remove surrounding white space and any leading '<br>' / trailing '</br>' tags
+        # from the category.  str.lstrip/rstrip take a set of characters, not a prefix,
+        # and would also eat real leading or trailing letters such as 'b' and 'r'.
+        category = company_firmographics.get('category')
+        if isinstance(category, str):
+            if category:
+                self.logger.debug(f'Extra characters in category [{ord(category[0])}]')
+            category = category.strip()
+            while category.startswith('<br>'):
+                category = category.removeprefix('<br>').lstrip()
+            while category.endswith('</br>'):
+                category = category.removesuffix('</br>').rstrip()
+            company_firmographics['category'] = category
+
+        company_firmographics['type'] = 'Public company'
+        return {
+            'code': 200,
+            'message': f'Only EDGAR has been detected and returned for the company [{new_company_name}].',
+            'module': my_class + '-> ' + my_function,
+            'data': company_firmographics,
+            'dependencies': DEPENDENCIES
+        }
diff --git a/lib/wikipedia.py b/lib/wikipedia.py
@@ -15,9 +15,6 @@
 # Used for setting attributes consistently when unknown
 UKN = 'Unknown'
 
-# Determine how we output when executed as a CLI
-DEBUG = None
-
 # Package and data dependencies
 DEPENDENCIES = {
     'modules': {'wptools':'https://pypi.org/project/wptools/'},
@@ -106,29 +103,45 @@ def get_firmographics(self):
 
         # TODO try to do the right thing by trying different common combinations like Company, Inc.; Company Corp, etc.
         # Log the start of this process including self.query
-        self.logger.info('Starting retrieval of firmographics for [' + self.query + '] via its wikipedia page.')
-        company_page = wptools.page(self.query, silent=True)
-        if not company_page:
-            self.logger.error('A wikipedia page for [' + self.query + '] was not found.')
+        self.logger.info(f'Starting retrieval of firmographics for [{self.query}] via its wikipedia page.')
+        company_page = None
+        try:
+            company_page = wptools.page(self.query, silent=True)
+            if not company_page:
+                self.logger.error(f'A wikipedia page for [{self.query}] was not found.')
+                return lookup_error
+            # Log the completion of the page creation
+            self.logger.debug(f'Page results for [{self.query}]: {company_page}')
+        except Exception as e:
+            self.logger.error(f'A wikipedia page for [{self.query}] was not found due to [{e}].')
             return lookup_error
-        # Log the completion of the page creation
-        self.logger.debug(f'Page results for [{self.query}]: {company_page}')
 
         # Prepare to get the infobox for the company
         # Log the start of the process to get the infobox for the company
-        self.logger.info('Starting process to retrieve infobox for [' + self.query + '].')
-        parse_results = company_page.get_parse(show=False)
-        if not parse_results.data['infobox']:
-            self.logger.error('An infobox for [' + self.query + '] was not found.')
+        self.logger.info(f'Starting process to retrieve infobox for [{self.query}].')
+        parse_results = None
+        try:
+            parse_results = company_page.get_parse(show=False)
+            if not parse_results.data['infobox']:
+                self.logger.error('An infobox for [' + self.query + '] was not found.')
+                return lookup_error
+            # Log the completion of the infobox creation
+            self.logger.info('Completed infobox retrieval for [' + self.query + '].')
+        except Exception as e:
+            self.logger.error(f'An infobox for [{self.query}] was not found due to [{e}].')
             return lookup_error
-        # Log the completion of the infobox creation
-        self.logger.info('Completed infobox retrieval for [' + self.query + '].')
 
-        company_info = parse_results.data['infobox']
-        if not company_info: 
-            self.logger.error('An infobox for [' + self.query + '] was not found.')
+        # Get the company info from the infobox
+        company_info = None
+        try:
+            company_info = parse_results.data['infobox']
+            if not company_info: 
+                self.logger.error('An infobox for [' + self.query + '] was not found.')
+                return lookup_error
+            self.logger.info('Completed infobox parse for [' + self.query + '].')
+        except Exception as e:
+            self.logger.error(f'An infobox for [{self.query}] was not found due to [{e}].')
             return lookup_error
-        self.logger.info('Completed infobox parse for [' + self.query + '].')
 
         # Obtain the query results
         try:
@@ -149,10 +162,6 @@ def get_firmographics(self):
             self.logger.info('Completed wikidata retrieval for [' + self.query + '].')
         except:
             return lookup_error
-
-        # Debugging output
-        if DEBUG == 1: pprint.pprint(page_data.data['wikidata'])
-        elif DEBUG == 2: pprint.pprint(company_info)
 
         # Log the beginning of the firmographics data extraction
         self.logger.info('Starting firmographics data extraction for [' + self.query + '].')