diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..002243c
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,3 @@
+PASSWORD=PASSWORD
+USERNAME=USERNAME
+DB_URI=CONNECTION_LINK_USING_${USERNAME}_AND_${PASSWORD}
diff --git a/README.md b/README.md
index f2cdcb7..0c0ecc7 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,8 @@
 # seniordesign
 Senior Design Repository for the Statefarm Automotive Fraud Project
+
+## Database Access
+Make a copy of the ``.env.example`` file and make the following changes.
+1. Remove ``.example`` from the extension
+2. Paste the username and password provided in MongoDB Atlas (if you should have access but do not, please contact @waseem-polus)
+3. Paste the connection URL provided in MongoDB Atlas. Reference the username and password with ``${VARIABLE}`` syntax so their values are embedded in the connection string
\ No newline at end of file
diff --git a/scrapers/craigslist.py b/src/scrapers/craigslist.py
similarity index 100%
rename from scrapers/craigslist.py
rename to src/scrapers/craigslist.py
diff --git a/src/scrapers/database.py b/src/scrapers/database.py
new file mode 100644
index 0000000..ed36ac8
--- /dev/null
+++ b/src/scrapers/database.py
@@ -0,0 +1,57 @@
+from dotenv import load_dotenv
+import pymongo
+import os
+from datetime import date
+
+def get_conn(db):
+    # load the environment variable containing the db uri (which includes the username and password)
+    load_dotenv()
+    db_uri = os.getenv("DB_URI")
+
+    # create a mongodb connection
+    try:
+        client = pymongo.MongoClient(db_uri)
+
+    # return a friendly error if a URI error is thrown
+    except pymongo.errors.ConfigurationError:
+        print("An invalid URI host error was received. Is the Atlas host name in your connection string (found in the .env) correct?")
+        return {"success": False, "db": 0}
+
+    # return a handle to the requested database
+    return {"success": True, "db": client.get_database(db)}
+
+def post_raw(source, title, price, location, miles, link, images=None, postBody=None, longitude=None, latitude=None, attributes=None):
+    car = {
+        "title": title,
+        "price": price,
+        "location": location,
+        "odometer": miles,
+        "link": link,
+        "source": source,
+        "scrapeDate": str(date.today())
+    }
+
+    if (images is not None):
+        car["images"] = images
+
+    if (postBody is not None):
+        car["postBody"] = postBody
+
+    if (longitude is not None):
+        car["longitude"] = longitude
+
+    if (latitude is not None):
+        car["latitude"] = latitude
+
+    if (attributes is not None):
+        for attr in attributes:
+            car[attr["label"]] = attr["value"]
+
+    # insert into the "scraped_raw" collection of the "scrape" database
+    conn = get_conn("scrape")
+
+    if (conn["success"]):
+        result = conn["db"]["scraped_raw"].insert_one(car)
+        return result.acknowledged
+    else:
+        return False
\ No newline at end of file
diff --git a/scrapers/facebook.py b/src/scrapers/facebook.py
similarity index 58%
rename from scrapers/facebook.py
rename to src/scrapers/facebook.py
index 4a78e6e..00b5e45 100644
--- a/scrapers/facebook.py
+++ b/src/scrapers/facebook.py
@@ -1,9 +1,11 @@
 from selenium import webdriver
 from bs4 import BeautifulSoup
-import pandas as pd
-from selenium.webdriver.chrome.service import Service as ChromeService
+from selenium.webdriver.chrome.options import Options
 import time
 
+import database
+
+
 #list of cities to scrape; can be expanded
 cities = [
     'nyc', 'la', 'chicago', 'houston', 'miami',
@@ -23,14 +25,23 @@
 
 # Create a new Selenium WebDriver instance
-chrome_service = ChromeService(executable_path='C:/Users/athiyam/Downloads/chromedriver-mac-arm64')
-driver = webdriver.Chrome(service=chrome_service)
+print("Setting up headless browser")
+options = Options()
+options.add_argument("--headless=new")
+
+print("Creating a new Selenium WebDriver instance")
+driver = webdriver.Chrome(options=options)
 
 # Create a list to store the scraped data
+print("Started scraping...")
 data = {}
 
 for url in urls:
     # Navigate to the URL
+    print(f"Navigating to {url}")
     driver.get(url)
+
+    print(f"Loading {url}")
+
     time.sleep(2)
 
     scroll = 2000
@@ -49,50 +60,52 @@
     soup = BeautifulSoup(html, 'html.parser')
 
     # Find all of the automotive listings on the page
-    listings = soup.find_all('div', class_='x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24')
+    car_posts = soup.find_all('div', class_='x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24')
 
     # Iterate over the listings and scrape the data
-    for listing in listings:
+    for post in car_posts:
+        print("Scraping new listing")
         try:
             # Get the title of the listing
-            title = listing.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6').text
+            title = post.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6').text
         except AttributeError:
             title = 'N/A'  # Handle missing title
 
         try:
             # Get the price of the listing
-            price = listing.find('span', class_='x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x676frb x1lkfr7t x1lbecb7 x1s688f xzsf02u').text
+            price = post.find('span', class_='x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x676frb x1lkfr7t x1lbecb7 x1s688f xzsf02u').text
         except AttributeError:
             price = 'N/A'  # Handle missing price
 
         try:
             # Get the location of the listing
-            location = listing.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84').text
+            location = post.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84').text
         except AttributeError:
             location = 'N/A'  # Handle missing location
 
         try:
             # Get the miles of the car
-            miles = listing.find_all('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84')[1].text
+            miles = post.find_all('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84')[1].text
         except (AttributeError, IndexError):
             miles = 'N/A'  # Handle missing miles
 
         try:
             # Get the link to the listing
-            link = 'https://www.facebook.com' + listing.find('a', class_='x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1lku1pv')['href']
+            link = 'https://www.facebook.com' + post.find('a', class_='x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1lku1pv')['href']
         except (AttributeError, TypeError):
             link = 'N/A'  # Handle missing link
-
+
         # Add the data to the list
         if (title, price, location, miles, link) not in data:
             data[(title, price, location, miles, link)] = True
+            postSuccess = database.post_raw("facebook", title, price, location, miles, link)
+            if (postSuccess):
+                print("Saved to DB")
+            else:
+                print("Failed to save to DB")
+        else:
+            print("Listing is a duplicate")
 
-# Close the Selenium WebDriver instance
-driver.quit()
-# Create a Pandas DataFrame from the scraped data
-df = pd.DataFrame(list(data.keys()), columns=['Title', 'Price', 'Location', 'Miles', 'Link'])
-df.dropna(how='all', inplace=True)
-
-# Write the DataFrame to an Excel file
-df.to_excel('facebook_marketplace_automotive_postings.xlsx', index=False)
\ No newline at end of file
+# Close the Selenium WebDriver instance
+driver.quit()
\ No newline at end of file
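
For reference, a filled-in ``.env`` might look like the sketch below. The username, password, and cluster host are made-up placeholders, not real values; the ``${USERNAME}`` and ``${PASSWORD}`` references are expanded when the file is loaded, per the ``${VARIABLE}`` syntax the README describes.

    USERNAME=fraud-scraper
    PASSWORD=s3cr3t-p4ss
    DB_URI=mongodb+srv://${USERNAME}:${PASSWORD}@cluster0.abc123.mongodb.net/?retryWrites=true&w=majority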
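And a minimal sketch of how a scraper can exercise the new ``database.post_raw`` helper (run from ``src/scrapers`` so that ``import database`` resolves; every listing value below is invented for illustration):

    import database

    # hypothetical listing values, for illustration only
    saved = database.post_raw(
        source="craigslist",
        title="2014 Honda Civic LX",
        price="$8,500",
        location="Detroit, MI",
        miles="92,000",
        link="https://example.org/listing/123",
        attributes=[{"label": "transmission", "value": "automatic"}],
    )
    print("Saved to DB" if saved else "Failed to save to DB")

Each entry in ``attributes`` becomes its own field on the stored document (``car[attr["label"]] = attr["value"]``), so optional listing details can be added without changing the schema.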