Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

modified_english_spreadsheets #15

Draft
wants to merge 1 commit into
base: modifiedEnglishSpreadsheets
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added plzcrawler/__pycache__/plzCrawler.cpython-310.pyc
Binary file not shown.
Binary file not shown.
151 changes: 151 additions & 0 deletions plzcrawler_modified/plzCrawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#! python3
#plzcrawler.py - Program for adding postal codes to the address table
# https://nominatim.openstreetmap.org

import requests
import json
from pprint import pprint
import os
import pandas as pd

# Read street & city for all rows from .xlsx table and make them available in Python
class AddressFinder:
    """Look up postal codes for the rows of an Excel address table.

    The table is expected to provide the columns "Name", "Street" and "City".
    Typical call order: ``openAddressTable`` -> ``getColumnLists`` ->
    ``searchAddress`` -> ``makePlzDataframe`` -> ``savePlzTable``.

    Attributes:
        workSheetString: name of the worksheet inside the workbook.
        excelTableNameString: workbook file name without the ".xlsx" suffix.
        tableDirectoryRawString: directory that contains the workbook.
        namenListe / strassenListe / stadtListe: column values filled by
            ``getColumnLists``.
        placePlzListe / placeIdListe: results filled by ``searchAddress``.
        plzListe: initialised empty; not used by any method of this class.
    """

    # Endpoints of the public Nominatim instance queried by searchAddress.
    SEARCH_URL = 'https://nominatim.openstreetmap.org/search.php'
    DETAILS_URL = 'https://nominatim.openstreetmap.org/details.php'
    # Seconds to wait for a response; without a timeout requests can block forever.
    REQUEST_TIMEOUT = 30
    # Identify the application instead of sending the stock python-requests agent.
    # NOTE(review): check the Nominatim usage policy (request rate, use of the
    # details endpoint from scripts) before running this on large tables.
    REQUEST_HEADERS = {'User-Agent': 'plzcrawler (address table postcode lookup)'}

    def __init__(self, excelTableNameString, workSheetString, tableDirectoryRawString, *args):
        """Store the table location and initialise the empty result lists.

        Args:
            excelTableNameString: workbook file name without ".xlsx".
            workSheetString: worksheet name inside the workbook.
            tableDirectoryRawString: directory containing the workbook.
            *args: accepted for backward compatibility and ignored.
        """
        self.workSheetString = workSheetString
        self.excelTableNameString = excelTableNameString
        self.tableDirectoryRawString = tableDirectoryRawString

        self.namenListe = []
        self.strassenListe = []
        self.plzListe = []
        self.stadtListe = []
        self.placePlzListe = []
        self.placeIdListe = []

    def openAddressTable(self):
        """Open the configured worksheet and return it as a pandas DataFrame.

        Side effect: the process working directory is changed to the table
        directory. ``savePlzTable`` writes a relative file name and therefore
        relies on this to place its output next to the input table.

        Raises:
            FileNotFoundError: if the directory or the .xlsx file is missing.
        """
        os.chdir(self.tableDirectoryRawString)

        # The context manager closes the workbook handle once pandas has read it.
        with open(f'{self.excelTableNameString}.xlsx', 'rb') as tableFile:
            return pd.read_excel(tableFile, sheet_name=self.workSheetString)

    def getColumnLists(self, addressDataframe):
        """Copy the "Name", "Street" and "City" columns into lists.

        Args:
            addressDataframe: DataFrame containing those three columns.

        Returns:
            Tuple ``(namenListe, strassenListe, stadtListe)``; the same lists
            are also stored on the instance.

        Raises:
            KeyError: if one of the columns is missing from the DataFrame.
        """
        self.namenListe = list(addressDataframe['Name'].values)
        self.strassenListe = list(addressDataframe['Street'].values)
        self.stadtListe = list(addressDataframe['City'].values)

        return self.namenListe, self.strassenListe, self.stadtListe

    @staticmethod
    def _extractPostcode(placeDetails):
        """Return the postcode from a details response, or a placeholder string."""
        if not isinstance(placeDetails, dict):
            return 'N/A - address cannot be found'

        # 'addresstags' may arrive as an empty list instead of an object; only
        # a dict can carry an explicitly tagged postcode.
        addressTags = placeDetails.get('addresstags')
        taggedPlz = addressTags.get('postcode') if isinstance(addressTags, dict) else None
        if taggedPlz is not None:
            return taggedPlz

        # Fall back to the postcode calculated by the service for this place.
        calculatedPlz = placeDetails.get('calculated_postcode')
        if calculatedPlz is not None:
            return calculatedPlz

        return 'N/A - keine PLZ gefunden'

    def searchAddress(self, strassenListe, stadtListe):
        """Query Nominatim for every street/city pair and collect postcodes.

        For each row the first search hit is taken, its place details are
        fetched, and the tagged postcode (or, failing that, the calculated
        postcode) is recorded. Rows without a usable result get an 'N/A - ...'
        placeholder so the result stays aligned with the input rows.

        Results are appended to ``placePlzListe`` and ``placeIdListe``; calling
        this method twice on one instance accumulates entries.

        Args:
            strassenListe: street names (with house numbers), one per row.
            stadtListe: city names; must be at least as long as strassenListe.

        Returns:
            Tuple ``(placePlzListe, placeIdListe)``. ``placeIdListe`` holds
            ``None`` for rows where the search returned no hit.

        Raises:
            IndexError: if stadtListe is shorter than strassenListe.
        """
        for i, strasse in enumerate(strassenListe):
            print(strasse)
            stadt = stadtListe[i]
            print(stadt)

            # params= lets requests percent-encode the values, so characters
            # such as '&' or '#' in an address cannot corrupt the query string.
            searchResults = requests.get(
                self.SEARCH_URL,
                params={'street': strasse, 'city': stadt, 'country': 'DE', 'format': 'jsonv2'},
                headers=self.REQUEST_HEADERS,
                timeout=self.REQUEST_TIMEOUT,
            ).json()

            try:
                firstHit = searchResults[0]  # first entry of the search results
            except IndexError as inError:
                print('An '+str(inError.__class__.__name__) + ' error occurred.\n'
                      'Apparently no map entry was found for the address.\n'
                      'This could be because the address has an unusual format.', end=2*'\n')
                self.placePlzListe.append('N/A - Check address format!')
                # Keep the id list row-aligned with the postcode list.
                self.placeIdListe.append(None)
                continue

            placeID = firstHit.get('place_id')
            self.placeIdListe.append(placeID)

            placeDetails = requests.get(
                self.DETAILS_URL,
                params={'place_id': placeID, 'addressdetails': 1, 'hierarchy': 0,
                        'group_hierarchy': 1, 'format': 'json'},
                headers=self.REQUEST_HEADERS,
                timeout=self.REQUEST_TIMEOUT,
            ).json()

            placePlz = self._extractPostcode(placeDetails)
            self.placePlzListe.append(placePlz)
            print(placePlz, end=2*'\n')

        return self.placePlzListe, self.placeIdListe

    def makePlzDataframe(self):
        """Combine the stored column lists and postcodes into a DataFrame.

        Returns:
            DataFrame with the columns "Name", "Street", "Zip code", "City".
            The four lists must have the same length, i.e. ``getColumnLists``
            and ``searchAddress`` must have processed the same rows.
        """
        plzDict = {
            'Name': self.namenListe,
            'Street': self.strassenListe,
            'Zip code': self.placePlzListe,
            'City': self.stadtListe,
        }

        return pd.DataFrame(plzDict)

    def savePlzTable(self, inputDfName):
        """Write the DataFrame to '<table name>+PLZ.xlsx'.

        The file name is relative, so the file is created in the current
        working directory (the table directory once ``openAddressTable`` has
        run). An existing file of that name is overwritten.

        Args:
            inputDfName: the DataFrame to save.
        """
        inputDfName.to_excel(f'{self.excelTableNameString}+PLZ.xlsx', sheet_name=self.workSheetString)



71 changes: 71 additions & 0 deletions plzcrawler_modified/plzMaker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#! python3
#plzMaker.py - Script to run plzcrawler.py (to add zip codes to address table)

from plzCrawler import AddressFinder

# Interactive driver: ask for the table location, confirm that the output
# file may be overwritten, then run the AddressFinder pipeline.

# Input query for table data:
print("""\nPlease tell me the name of your address table.
It must contain the columns "Name", "Street", "ZIP", "City".
and it must be an .xlsx document
Excel-Name:""")
ExcelName = input()

print("""\nPlease tell me the name of the spreadsheet within your address table
spreadsheetname:""")
spreadsheetname = input()

print("""\nPlease tell me the path to the folder in which your table is located.
The path must be specified in Windows format - e.g. 'C:\\Users\\Username\\Documents'
Pfad:""")
addressofexcelfile = input()

# The warning names the file that AddressFinder.savePlzTable really writes
# ('<name>+PLZ.xlsx'), so the user knows which file is at risk.
print(f"""\nDANGER!:
If a table with the name '{ExcelName}+PLZ.xlsx' exists in the specified folder, it will be overwritten.
If you want to avoid this, please move this table to another folder first!
Do you agree that the table '{ExcelName}+PLZ.xlsx' may be overwritten? [y/n]""")
confirmation = input()

# Only an explicit "y" (any case, surrounding blanks ignored) starts the run.
crawl = confirmation.strip().lower() == 'y'

if not crawl:
    print("""\nExecution of the program was aborted.
You can always start the program again when you are ready.""")
else:
    try:
        # Only opening the table and reading its columns is guarded here, so
        # the messages below are not triggered by unrelated failures in the
        # network lookup.
        finder = AddressFinder(ExcelName, spreadsheetname, addressofexcelfile)
        addressTable = finder.openAddressTable()
        _names, streets, cities = finder.getColumnLists(addressTable)

    except FileNotFoundError as fnfError:
        print('\nThe following error has occurred:\n'
              + str(fnfError.__class__.__name__) + '\n'
              '----------\n'
              'Apparently the specified file or path could not be found.\n'
              'The program will now exit without making any changes.\n'
              'You can restart the program execution to enter the correct data.\n', end=2 * '\n')

    except ImportError as impError:
        # pandas raises ImportError when the optional Excel engine is missing.
        print('\nThe following error occurred:\n'
              + str(impError.__class__.__name__) + '\n'
              '----------\n'
              'A Python package needed to read .xlsx files (e.g. openpyxl) is probably not installed.\n'
              'The program will now exit without making any changes.\n'
              'You can install the package and restart the program execution.\n', end=2 * '\n')

    except (KeyError, ValueError) as tableError:
        # An unknown worksheet name surfaces as ValueError (KeyError with some
        # older engines); a missing column surfaces as KeyError.
        print('\nThe following error occurred:\n'
              + str(tableError.__class__.__name__) + '\n'
              '----------\n'
              'An incorrect name was probably specified for the spreadsheet,\n'
              'or one of the columns "Name", "Street", "City" is missing.\n'
              'The program will now exit without making any changes.\n'
              'You can restart the program execution to enter the correct data.\n', end=2 * '\n')

    else:
        finder.searchAddress(streets, cities)

        df = finder.makePlzDataframe()
        finder.savePlzTable(df)

        print(f"\nYou can find their table including postcodes in the following directory\n"
              f">>> '{finder.tableDirectoryRawString}'\n"
              f"The table bears the name '{finder.excelTableNameString}+PLZ'")