diff --git a/example_spreadsheets/plzBeispielTabelle_kurz+PLZ.xlsx b/example_spreadsheets/plzBeispielTabelle_kurz+PLZ.xlsx new file mode 100644 index 0000000..999373b Binary files /dev/null and b/example_spreadsheets/plzBeispielTabelle_kurz+PLZ.xlsx differ diff --git a/example_spreadsheets_english_modified/plzBeispielTabelle.xlsx b/example_spreadsheets_english_modified/plzBeispielTabelle.xlsx new file mode 100644 index 0000000..1596002 Binary files /dev/null and b/example_spreadsheets_english_modified/plzBeispielTabelle.xlsx differ diff --git a/example_spreadsheets_english_modified/plzBeispielTabelle_kurz+PLZ.xlsx b/example_spreadsheets_english_modified/plzBeispielTabelle_kurz+PLZ.xlsx new file mode 100644 index 0000000..30ad6a7 Binary files /dev/null and b/example_spreadsheets_english_modified/plzBeispielTabelle_kurz+PLZ.xlsx differ diff --git a/example_spreadsheets_english_modified/plzBeispielTabelle_kurz.xlsx b/example_spreadsheets_english_modified/plzBeispielTabelle_kurz.xlsx new file mode 100644 index 0000000..d32ef1f Binary files /dev/null and b/example_spreadsheets_english_modified/plzBeispielTabelle_kurz.xlsx differ diff --git a/plzcrawler/__pycache__/plzCrawler.cpython-310.pyc b/plzcrawler/__pycache__/plzCrawler.cpython-310.pyc new file mode 100644 index 0000000..579478d Binary files /dev/null and b/plzcrawler/__pycache__/plzCrawler.cpython-310.pyc differ diff --git a/plzcrawler_modified/__pycache__/plzCrawler.cpython-310.pyc b/plzcrawler_modified/__pycache__/plzCrawler.cpython-310.pyc new file mode 100644 index 0000000..0055f65 Binary files /dev/null and b/plzcrawler_modified/__pycache__/plzCrawler.cpython-310.pyc differ diff --git a/plzcrawler_modified/plzCrawler.py b/plzcrawler_modified/plzCrawler.py new file mode 100644 index 0000000..1127568 --- /dev/null +++ b/plzcrawler_modified/plzCrawler.py @@ -0,0 +1,151 @@ +#! python3 +#plzcrawler.py - Program for adding postal codes to the address table +# https://nominatim.openstreetmap.org + +import requests +import json +from pprint import pprint +import os +import pandas as pd + +# Read street & city for all rows from .xlsx table and make them available in Python +class AddressFinder: + def __init__(self, excelTableNameString, workSheetString, tableDirectoryRawString, *args): + self.workSheetString = workSheetString + self.excelTableNameString = excelTableNameString + self.tableDirectoryRawString = tableDirectoryRawString + + self.namenListe = [] + self.strassenListe = [] + self.plzListe = [] + self.stadtListe = [] + self.placePlzListe = [] + self.placeIdListe = [] + + def openAddressTable(self): + ''' + takes the name of an Excel spreadsheet in the form of a string, + the name of the corresponding spreadsheet in the form of a string, + & the Windows table directory in the form of a raw string; + navigate to the specified windows path & open the table as pandas-dataframe; + saves table as pandas dataframe-object + ''' + # Navigate to directory containing table + dir = os.chdir(self.tableDirectoryRawString) + # print(Path.cwd()) + + openedAddressTable = pd.read_excel(open(f'{self.excelTableNameString}.xlsx', 'rb'), + sheet_name=self.workSheetString) + # print(df) + return openedAddressTable + + def getColumnLists(self, addressDataframe): + ''' + takes panda's dataframe object as an input argument; + pulls relevant columns out of input-object; + outputs selected column values in lists of equal length + ''' + + # Get column "Name" from input table (dataframe-object). + namen = addressDataframe['Name'] + namenNpArray = namen.values + self.namenListe = list(namenNpArray) + + # Get column "Street" from input table (dataframe-object). + strassen = addressDataframe['Street'] + strassenNpArray = strassen.values + self.strassenListe = list(strassenNpArray) + + # Get column "City" from input table (dataframe-object). + stadt = addressDataframe['City'] + stadtNpArray = stadt.values + self.stadtListe = list(stadtNpArray) + + return self.namenListe, self.strassenListe, self.stadtListe + + + # perform Nominatum searches on all rows + # Assumption: lists of the same length with column data from table + def searchAddress(self, strassenListe, stadtListe): + """ + takes street names & numbers, as well as city names in the form of lists of the same length; + uses nominatum tool & the individual strings within the input lists (as addresses) to perform a search in the OpenStreetMap API; + gives the zip code of the first search result or (if not found) a placeholder ('N/A') in the form of a list + """ + + + for i, strasse in enumerate(strassenListe, start=-1): + print(strasse) + stadt = stadtListe[i+1] + print(stadt) + + luckySearch=json.dumps(requests.get( + f'https://nominatim.openstreetmap.org/search.php?street={strasse}&city={stadt}&country=DE&format=jsonv2').json()) + + try: + luckySearchDic = json.loads(luckySearch)[0] #erster Eintrag in Suchergebnissen + #pprint(luckySearchDic) + placeID = luckySearchDic.get('place_id') + self.placeIdListe.append(placeID) + + placeSearch = json.dumps(requests.get( + f'https://nominatim.openstreetmap.org/details.php?place_id={placeID}' + f'&addressdetails=1&hierarchy=0&group_hierarchy=1&format=json').json()) + + placeSearchDic = json.loads(placeSearch) + placePlz = placeSearchDic.get('addresstags').get('postcode') + if placePlz != None: + self.placePlzListe.append(placePlz) + print(placePlz, end=2*'\n') + elif placePlz == None: + calcPlacePlz = placeSearchDic.get('calculated_postcode') + self.placePlzListe.append(calcPlacePlz) + print(calcPlacePlz, end=2*'\n') + else: + self.placePlzListe.append('N/A - keine PLZ gefunden') + + except AttributeError as atError: + print('An '+str(atError.__class__.__name__) + ' error occurred.\n' + 'Most likely the "addesstags" entry in the API request was an empty list.\n' + 'which means the address could not be found.', end=2*'\n') + self.placePlzListe.append('N/A - address cannot be found') + + except IndexError as inError: + print('An '+str(inError.__class__.__name__) + ' error occurred.\n' + 'Apparently no map entry was found for the address.\n' + 'This could be because the address has an unusual format.', end=2*'\n') + self.placePlzListe.append('N/A - Check address format!') + + return self.placePlzListe, self.placeIdListe + + +# Add zip code of first search result for each row in .xlsx table + def makePlzDataframe(self): + """takes column data (at least one(!), from original table) in the form of lists; + converts this to a dictionary; + outputs combined lists of column names in the form of a pandas dataframe""" + # # list of name, degree, score + # firstColumnList = self.namenListe + # secondColumnList = self.strassenListe + # thirdColumnList = self.placePlzListe + # fourthColumnList = self.stadtListe + # dictionary of lists + plzDict = { + 'Name': self.namenListe, + 'Street': self.strassenListe, + 'Zip code': self.placePlzListe, + 'City': self.stadtListe, + } + + plzDf = pd.DataFrame(plzDict) + return plzDf + + # save new table + def savePlzTable(self, inputDfName): + """takes pandas dataframe; + saves pandas dataframe under the original table name (with "+PLZ") in the form of an .xlsx table""" + + inputDfName.to_excel(f'{self.excelTableNameString}+PLZ.xlsx', sheet_name=self.workSheetString) + + + diff --git a/plzcrawler_modified/plzMaker.py b/plzcrawler_modified/plzMaker.py new file mode 100644 index 0000000..f85198e --- /dev/null +++ b/plzcrawler_modified/plzMaker.py @@ -0,0 +1,71 @@ +#! python3 +#plzMaker.py - Script to run plzcrawler.py (to add zip codes to address table) + +from plzCrawler import AddressFinder + +# Input query for table data: +print(f"""\nPlease tell me the name of your address table. +It must contain the columns "Name", "Street", "ZIP", "City". +and it must be an .xlsx document +Excel-Name:""") +ExcelName = input() + +print(f"""\nPlease tell me the name of the spreadsheet within your address table +spreadsheetname:""") +spreadsheetname = input() + +print(f"""\nPlease tell me the path to the folder in which your table is located. +The path must be specified in Windows format - e.g. 'C:\\Users\\Username\\Documents' +Pfad:""") +addressofexcelfile = input() + +print(f"""\nDANGER!: +If a table with the name '{ExcelName}+zip code' exists in the specified folder, it will be overwritten. +If you want to avoid this, please move this table to another folder first! +Do you agree that the table '{ExcelName}+zip code' may be overwritten? [y/n]""") +confirmation = input() + +crawl = None +if confirmation != 'y': + crawl = False +else: + crawl = True + +while not crawl: + print(f"""\nExecution of the program was aborted. +You can always start the program again when you are ready.""") + break +else: + try: + f = AddressFinder(ExcelName, spreadsheetname, addressofexcelfile) + openn = f.openAddressTable() + extract1, extract2, extract3 = f.getColumnLists(openn) + + find1, find2 = f.searchAddress(extract2, extract3) + + df = f.makePlzDataframe() + + + + + f.savePlzTable(df) + + print(f"\nYou can find their table including postcodes in the following directory\n" + f">>> '{f.tableDirectoryRawString}'\n" + f"The table bears the name '{f.excelTableNameString}+PLZ'") + + except FileNotFoundError as fnfError: + print('\nThe following error has occurred:\n' + + str(fnfError.__class__.__name__) + '\n' + '----------\n' + 'Apparently the specified file or path could not be found.\n' + 'The program will now exit without making any changes.\n' + 'You can restart the program execution to enter the correct data.\n', end=2 * '\n') + + except ImportError as impError: + print('\nThe following error occurred:\n' + + str(impError.__class__.__name__) + '\n' + '----------\n' + 'An incorrect name was probably specified for the spreadsheet.\n' + 'The program will now exit without making any changes.\n' + 'You can restart the program execution to enter the correct data.\n', end=2 * '\n')