Skip to content

Commit

Permalink
Merge pull request #43 from miha42-github/V3.0.2
Browse files Browse the repository at this point in the history
Adopt alpine image, enable edgar only returns, fix bugs
  • Loading branch information
miha42-github authored May 4, 2024
2 parents 4e17aa8 + 90324dc commit bb6f14a
Show file tree
Hide file tree
Showing 4 changed files with 157 additions and 64 deletions.
7 changes: 4 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Use an official Python runtime as a parent image
FROM python:3.12-slim
# FROM python:3.12-slim
FROM python:3.12-alpine

# Set the working directory in the container to /app
WORKDIR /app
Expand All @@ -8,8 +9,8 @@ WORKDIR /app
ADD . /app

# Install curl and create directory
RUN apt-get update && apt-get install -y curl && mkdir -p /app/edgar_data
# RUN apk --no-cache add curl && mkdir -p /app/edgar_data
# RUN apt-get update && apt-get install -y curl && mkdir -p /app/edgar_data
RUN apk --no-cache add curl curl-dev gcc musl-dev linux-headers && mkdir -p /app/edgar_data

# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
Expand Down
28 changes: 6 additions & 22 deletions company_dns.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from starlette.middleware.base import BaseHTTPMiddleware
import uvicorn
import logging
import sys

from lib.sic import SICQueries
from lib.edgar import EdgarQueries
Expand Down Expand Up @@ -58,27 +59,10 @@ async def wikipedia_firmographics(request):

# -------------------------------------------------------------- #
# BEGIN: General query functions
#
async def general_query(request):
    """Handle a general firmographics lookup for the requested company.

    Delegates to the shared ``_handle_request`` helper, handing it the
    general-query object ``gq``, its ``get_firmographics`` method and the
    resource name ``'company_name'``.

    The previous inline body fetched Wikipedia data and then called
    ``gq.merge_data`` with two positional arguments; ``merge_data`` now takes
    only ``wiki_data``, so that path is replaced by this single delegation.

    Args:
        request: The incoming request object supplied by the web framework.

    Returns:
        Whatever ``_handle_request`` returns for this request.
    """
    return _handle_request(request, gq, gq.get_firmographics, 'company_name')

# END: General query functions
# -------------------------------------------------------------- #

Expand All @@ -96,7 +80,7 @@ def _check_status_and_return(result_data, resource_name):
return {'message': return_msg, 'code': return_code, 'data': result_data}
return result_data

def _prepare_logging(log_level=logging.DEBUG):
    """Configure root logging and return a logger for this file.

    Args:
        log_level: Level passed to ``logging.basicConfig``; defaults to
            ``logging.DEBUG``.

    Returns:
        logging.Logger: The logger registered under this file's ``__file__``
        path, which is the name rendered by ``%(name)s`` in the format string.
    """
    logging.basicConfig(
        format='%(levelname)s:\t%(asctime)s [module: %(name)s] %(message)s',
        level=log_level,
    )
    return logging.getLogger(__file__)

Expand Down Expand Up @@ -199,6 +183,6 @@ async def dispatch(self, request, call_next):

if __name__ == "__main__":
    # Serve the app once on all interfaces, port 8000, with debug-level
    # server logging and lifespan handling switched off.
    try:
        uvicorn.run(app, host='0.0.0.0', port=8000, log_level='debug', lifespan='off')
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop the server; log it instead of
        # letting the traceback reach the console.
        logger.info("Server was shut down by the user.")
131 changes: 115 additions & 16 deletions lib/firmographics.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from . import edgar
from . import wikipedia
import sys
import unicodedata
import urllib.parse as url_parse
import logging
from geopy.geocoders import ArcGIS
Expand Down Expand Up @@ -92,8 +93,8 @@ def get_all_summaries(self):
my_query.query = self.query
return my_query.get_all_details(firmographics=False)

def get_all_details(self, flat_return=False):
    """Run an EDGAR details query, with firmographics, for ``self.query``.

    Args:
        flat_return (bool): Forwarded unchanged to ``edgar.EdgarQueries``.
            Defaults to False so existing callers that pass nothing are
            unaffected. NOTE(review): its effect on the result layout is
            defined in ``EdgarQueries`` -- confirm there.

    Returns:
        The value returned by ``EdgarQueries.get_all_details(firmographics=True)``.
    """
    my_query = edgar.EdgarQueries(db_file=self.db_file, flat_return=flat_return)
    my_query.query = self.query
    return my_query.get_all_details(firmographics=True)

Expand Down Expand Up @@ -122,9 +123,31 @@ def _augment_wikidata(self, wiki_return):
wiki_return['data']['googleFinance'] = FINANCEURL + my_ticker + ':' + my_encoded_exchange

return wiki_return

def _set_location_data(self, final_company):
    """Add longitude, latitude and a Google Maps link to a company record.

    When the record has an ``'address'`` key, the available address parts
    (street address, city, state/province, postal code) are joined into one
    string and passed to ``self.locate()``. The returned coordinates are
    stored and ``'googleMaps'`` is built from the returned address. Without
    an ``'address'`` key, ``'longitude'``, ``'latitude'`` and ``'address'``
    are all set to ``UKN``.

    Args:
        final_company (dict): Company record; it is updated in place.

    Returns:
        dict: The same ``final_company`` object.
    """
    if 'address' not in final_company:
        for field in ('longitude', 'latitude', 'address'):
            final_company[field] = UKN
        return final_company

    # Only 'address' is known to exist here. Missing or empty parts are
    # skipped so an incomplete record neither raises nor pads the query
    # with empty segments.
    address_parts = (
        final_company.get(key)
        for key in ('address', 'city', 'stateProvince', 'zipPostal')
    )
    my_address = ", ".join(str(part) for part in address_parts if part)

    # locate() returns a 4-tuple; the last element (raw data) is not needed.
    longitude, latitude, address, _raw_data = self.locate(my_address)
    final_company['longitude'] = longitude
    final_company['latitude'] = latitude
    final_company['googleMaps'] = MAPSURL + url_parse.quote(address)
    return final_company

def _set_google_urls(self, final_company):
    """Attach Google News, Patents and (when tickers exist) Finance links.

    The company name is URL-quoted and appended to ``NEWSURL`` and
    ``PATENTSURL``. If the record has a ``'tickers'`` entry, element 0 is
    URL-quoted as the exchange and element 1 is used as the ticker symbol
    for the ``'googleFinance'`` link.

    Args:
        final_company (dict): Company record; it is updated in place.

    Returns:
        dict: The same ``final_company`` object.
    """
    quoted_name = url_parse.quote(final_company['name'])
    final_company['googleNews'] = NEWSURL + quoted_name
    final_company['googlePatents'] = PATENTSURL + quoted_name

    # Without tickers there is nothing to build a Finance link from.
    if 'tickers' not in final_company:
        return final_company

    ticker_info = final_company['tickers']
    quoted_exchange = url_parse.quote(ticker_info[0])
    symbol = ticker_info[1]
    final_company['googleFinance'] = FINANCEURL + symbol + ':' + quoted_exchange
    return final_company


def merge_data(self, wiki_data, cik, company_name=None):
def merge_data(self, wiki_data=None):
# TODO there are potential cases where there isn't a wikipedia page, but there is EDGAR data.
# Therefore we need to figure out how to handle this, perhaps we can do an OR? For sure
# it is better to return something instead of nothing. In fact and in general if we
Expand All @@ -135,6 +158,12 @@ def merge_data(self, wiki_data, cik, company_name=None):
my_function = sys._getframe(0).f_code.co_name
my_class = self.__class__.__name__

# Set the CIK to UKN
cik = UKN

# If wiki_data is not None and it exists then we need to set the CIK to wiki_data['cik']
if wiki_data is not None and wiki_data['cik']: cik = wiki_data['cik']

# If the CIK is not known, we need to return with wiki_data
wiki_return = {
'code': 200,
Expand Down Expand Up @@ -217,22 +246,92 @@ def merge_data(self, wiki_data, cik, company_name=None):
final_company['latitude'] = UKN
final_company['address'] = UKN

# Phase 4 - Add google links for various services
my_encoded_name = url_parse.quote(final_company['name'])
final_company['googleNews'] = NEWSURL + my_encoded_name
final_company['googlePatents'] = PATENTSURL + my_encoded_name

if 'tickers' in final_company:
my_encoded_exchange = url_parse.quote(final_company['tickers'][0])
my_ticker = final_company['tickers'][1]
final_company['googleFinance'] = FINANCEURL + my_ticker + ':' + my_encoded_exchange
# Phase 4 - Add google urls
final_company = self._set_google_urls(final_company)


# Return the merged result
return {
'code': 200,
'message': 'Wikipedia data and EDGAR has been detected and merged for the company [' + wiki_data['name'] + '].',
'message': f'Wikipedia data and EDGAR has been detected and merged for the company [{wiki_data['name']}].',
'module': my_class + '-> ' + my_function,
'data': final_company,
'dependencies': DEPENDENCIES
}
}

def get_firmographics(self):
    """Gather firmographics for ``self.query`` from Wikipedia, falling back to EDGAR.

    Flow:
        1. Call ``get_firmographics_wikipedia()``. If its ``code`` is 200, its
           ``data`` is passed to ``merge_data()`` and a truthy result is
           returned unchanged.
        2. Otherwise call ``get_all_details(flat_return=True)``, take the
           first entry under ``'companies'``, add location data and Google
           URLs, clean the ``'category'`` text and set ``'type'`` to
           ``'Public company'``.
        3. If neither source yields a company, return a 404 lookup error.

        An exception raised by either lookup produces a lookup error with
        code 542.

    Returns:
        dict: On the EDGAR-only success path, keys ``code``, ``message``,
        ``module``, ``data`` and ``dependencies``. On failure, keys
        ``errorType``, ``module``, ``dependencies``, ``code`` and ``message``.
    """
    import re

    my_function = sys._getframe(0).f_code.co_name
    my_class = self.__class__.__name__
    module_name = my_class + '-> ' + my_function

    alternatives = (
        f'Maybe you should try an alternative structure like '
        f'[{self.query} Inc., {self.query} Corp., or {self.query} Corporation].'
    )

    def _lookup_error(code, message):
        # Build the error payload returned for every failure path.
        return {
            'errorType': 'LookupError',
            'module': module_name,
            'dependencies': DEPENDENCIES,
            'code': code,
            'message': message,
        }

    company_firmographics = None

    # Log the start of the query
    self.logger.info(f'Attempting to gather firmographics company [{self.query}]')

    # Stage 1: Wikipedia, merged with EDGAR by merge_data()
    try:
        self.logger.debug(f'Performing general query for company name: [{self.query}]')
        wiki_response = self.get_firmographics_wikipedia()
        if wiki_response['code'] == 200:
            self.logger.info(f'Wikipedia results for [{self.query}] returned successfully.')
            company_firmographics = self.merge_data(wiki_data=wiki_response['data'])
    except Exception as e:
        self.logger.error(f'Wikidata results for [{self.query}] returned [{e}].')
        return _lookup_error(
            542,
            f'Using general query unable to find a company by the name [{self.query}], '
            f'encountered error: [{e}]. {alternatives}'
        )
    if company_firmographics:
        return company_firmographics

    # Stage 2: EDGAR only
    try:
        self.logger.debug(f'Performing EDGAR query for company name: [{self.query}]')
        company_firmographics = self.get_all_details(flat_return=True)
    except Exception as e:
        self.logger.error(f'EDGAR results for [{self.query}] returned [{e}].')
        return _lookup_error(
            542,
            f'Using EDGAR query unable to find a company by the name [{self.query}], '
            f'encountered error: [{e}]. {alternatives}'
        )

    # A response with no 'companies' entry, or an empty one, is a miss rather
    # than a crash.
    companies = None
    if company_firmographics and 'companies' in company_firmographics:
        companies = company_firmographics['companies']
    new_company_name = next(iter(companies), None) if companies else None
    if new_company_name is None:
        return _lookup_error(
            404,
            f'Unable to find a company by the name [{self.query}]. {alternatives}'
        )

    # Clean up the first company returned and enrich it with urls and location data
    self.logger.info(f'EDGAR results for [{self.query}] returned successfully.')
    company_firmographics = companies[new_company_name]
    company_firmographics = self._set_location_data(company_firmographics)
    company_firmographics = self._set_google_urls(company_firmographics)

    category = company_firmographics.get('category')
    if isinstance(category, str):
        if category:
            self.logger.debug(f'Extra characters in category [{ord(category[0])}]')
        # str.lstrip('<br>') strips any of the characters '<', 'b', 'r', '>'
        # and would eat real leading letters. Remove only whole <br> / </br>
        # tags at either end, then the surrounding whitespace.
        br_tag = r'<\s*/?\s*br\s*/?\s*>'
        company_firmographics['category'] = re.sub(
            rf'^(?:\s*{br_tag})+|(?:{br_tag}\s*)+$',
            '',
            category,
            flags=re.IGNORECASE,
        ).strip()

    company_firmographics['type'] = 'Public company'
    return {
        'code': 200,
        'message': f'Only EDGAR has been detected and returned for the company [{new_company_name}].',
        'module': module_name,
        'data': company_firmographics,
        'dependencies': DEPENDENCIES
    }
55 changes: 32 additions & 23 deletions lib/wikipedia.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@
# Used for setting attributes consistently when unknown
UKN = 'Unknown'

# Determine how we output when executed as a CLI
DEBUG = None

# Package and data dependencies
DEPENDENCIES = {
'modules': {'wptools':'https://pypi.org/project/wptools/'},
Expand Down Expand Up @@ -106,29 +103,45 @@ def get_firmographics(self):

# TODO try to do the right thing by trying different common combinations like Company, Inc.; Company Corp, etc.
# Log the start of this process including self.query
self.logger.info('Starting retrieval of firmographics for [' + self.query + '] via its wikipedia page.')
company_page = wptools.page(self.query, silent=True)
if not company_page:
self.logger.error('A wikipedia page for [' + self.query + '] was not found.')
self.logger.info(f'Starting retrieval of firmographics for [{self.query}] via its wikipedia page.')
company_page = None
try:
company_page = wptools.page(self.query, silent=True)
if not company_page:
self.logger.error(f'A wikipedia page for [{self.query}] was not found.')
return lookup_error
# Log the completion of the page creation
self.logger.debug(f'Page results for [{self.query}]: {company_page}')
except Exception as e:
self.logger.error(f'A wikipedia page for [{self.query}] was not found due to [{e}].')
return lookup_error
# Log the completion of the page creation
self.logger.debug(f'Page results for [{self.query}]: {company_page}')

# Prepare to get the infobox for the company
# Log the start of the process to get the infobox for the company
self.logger.info('Starting process to retrieve infobox for [' + self.query + '].')
parse_results = company_page.get_parse(show=False)
if not parse_results.data['infobox']:
self.logger.error('An infobox for [' + self.query + '] was not found.')
self.logger.info(f'Starting process to retrieve infobox for [{self.query}].')
parse_results = None
try:
parse_results = company_page.get_parse(show=False)
if not parse_results.data['infobox']:
self.logger.error('An infobox for [' + self.query + '] was not found.')
return lookup_error
# Log the completion of the infobox creation
self.logger.info('Completed infobox retrieval for [' + self.query + '].')
except Exception as e:
self.logger.error(f'An infobox for [{self.query}] was not found due to [{e}].')
return lookup_error
# Log the completion of the infobox creation
self.logger.info('Completed infobox retrieval for [' + self.query + '].')

company_info = parse_results.data['infobox']
if not company_info:
self.logger.error('An infobox for [' + self.query + '] was not found.')
# Get the company info from the infobox
company_info = None
try:
company_info = parse_results.data['infobox']
if not company_info:
self.logger.error('An infobox for [' + self.query + '] was not found.')
return lookup_error
self.logger.info('Completed infobox parse for [' + self.query + '].')
except Exception as e:
self.logger.error(f'An infobox for [{self.query}] was not found due to [{e}].')
return lookup_error
self.logger.info('Completed infobox parse for [' + self.query + '].')

# Obtain the query results
try:
Expand All @@ -149,10 +162,6 @@ def get_firmographics(self):
self.logger.info('Completed wikidata retrieval for [' + self.query + '].')
except:
return lookup_error

# Debugging output
if DEBUG == 1: pprint.pprint(page_data.data['wikidata'])
elif DEBUG == 2: pprint.pprint(company_info)

# Log the beginning of the firmographics data extraction
self.logger.info('Starting firmographics data extraction for [' + self.query + '].')
Expand Down

0 comments on commit bb6f14a

Please sign in to comment.