Merge pull request #33 from lryanle/10/setup-db
10/setup db
waseem-polus authored Oct 31, 2023
2 parents 3674042 + d6f41d2 commit faac772
Showing 5 changed files with 99 additions and 20 deletions.
3 changes: 3 additions & 0 deletions .env.example
@@ -0,0 +1,3 @@
PASSWORD=PASSWORD
USERNAME=USERNAME
DB_URI=CONNECTION_LINK_USING_${USERNAME}_AND_${PASSWORD}
6 changes: 6 additions & 0 deletions README.md
@@ -1,2 +1,8 @@
# seniordesign
Senior Design Repository for the Statefarm Automotive Fraud Project

## Database Access
Make a copy of the ``.env.example`` file and make the following changes.
1. Remove ``.example`` from the file name
2. Paste the username and password provided in MongoDB Atlas (if you should have access but do not, please contact @waseem-polus)
3. Paste the connection URL provided in MongoDB Atlas. Reference the username and password using ``${VARIABLE}`` syntax so their values are embedded in the URI (a hypothetical filled-in example follows below)
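
For illustration only, a completed ``.env`` might look like the sketch below. The username, password, and cluster host are made-up placeholders (the real values come from MongoDB Atlas), and the ``${USERNAME}``/``${PASSWORD}`` references assume the variable-expansion syntax described above:

PASSWORD=s3cr3t-placeholder
USERNAME=smare_user
DB_URI=mongodb+srv://${USERNAME}:${PASSWORD}@examplecluster.abc12.mongodb.net/?retryWrites=true&w=majority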
File renamed without changes.
57 changes: 57 additions & 0 deletions src/scrapers/database.py
@@ -0,0 +1,57 @@
from dotenv import load_dotenv
import pymongo
import os
from datetime import date

def get_conn(db):
    # load the environment variable containing the db uri (which includes the username and password)
    load_dotenv()
    db_uri = os.getenv("DB_URI")

    # create a mongodb connection
    try:
        client = pymongo.MongoClient(db_uri)

    # return a friendly error if a URI error is thrown
    except pymongo.errors.ConfigurationError:
        print("An invalid URI host error was received. Is your Atlas host name correct in your connection string (found in the .env file)?")
        return {"success": False, "db": 0}

    # return a handle to the requested database
    return {"success": True, "db": client.get_database(db)}

def post_raw(source, title, price, location, miles, link, images=None, postBody=None, longitude=None, latitude=None, attributes=None):
    car = {
        "title": title,
        "price": price,
        "location": location,
        "odometer": miles,
        "link": link,
        "source": source,
        "scrapeDate": str(date.today())
    }

    if images is not None:
        car["images"] = images

    if postBody is not None:
        car["postBody"] = postBody

    if longitude is not None:
        car["longitude"] = longitude

    if latitude is not None:
        car["latitude"] = latitude

    if attributes is not None:
        for attr in attributes:
            car[attr["label"]] = attr["value"]

    # insert into the "scraped_raw" collection of the "scrape" database
    conn = get_conn("scrape")

    if conn["success"]:
        result = conn["db"]["scraped_raw"].insert_one(car)
        return result.acknowledged
    else:
        return False
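
As a usage sketch (not part of this commit), a scraper module could call ``post_raw`` with keyword arguments. The listing values and the ``attributes`` entries below are invented, but they show the ``{"label": ..., "value": ...}`` shape that the function flattens into top-level document fields:

import database

# hypothetical listing data; only source, title, price, location, miles, and link are required
saved = database.post_raw(
    source="facebook",
    title="2014 Honda Civic LX",
    price="$8,500",
    location="Dallas, TX",
    miles="92,000",
    link="https://www.facebook.com/marketplace/item/0000000000",
    attributes=[
        {"label": "transmission", "value": "automatic"},
        {"label": "fuel", "value": "gas"},
    ],
)
print("Saved to DB" if saved else "Failed to save to DB")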
53 changes: 33 additions & 20 deletions scrapers/facebook.py → src/scrapers/facebook.py
@@ -1,9 +1,11 @@
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
import time

import database


# list of cities to scrape; can be expanded
cities = [
    'nyc', 'la', 'chicago', 'houston', 'miami',
@@ -23,14 +25,23 @@

# Create a new Selenium WebDriver instance

chrome_service = ChromeService(executable_path='C:/Users/athiyam/Downloads/chromedriver-mac-arm64')
driver = webdriver.Chrome(service=chrome_service)
print("Setting up headless browser")
options = Options()
options.add_argument("--headless=new")

print("Creating a new Selenium WebDriver instance")
driver = webdriver.Chrome(options=options)

# Track listings that have already been scraped so duplicates are skipped
print("Started scraping...")
data = {}
for url in urls:
    # Navigate to the URL
    print(f"Navigating to {url}")
    driver.get(url)

    print(f"Loading {url}")

    time.sleep(2)
    scroll = 2000
@@ -49,50 +60,52 @@
    soup = BeautifulSoup(html, 'html.parser')

    # Find all of the automotive listings on the page
    listings = soup.find_all('div', class_='x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24')
    car_posts = soup.find_all('div', class_='x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24')

    # Iterate over the listings and scrape the data
    for listing in listings:
    for post in car_posts:
        print("Scraping new listing")
        try:
            # Get the title of the listing
            title = listing.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6').text
            title = post.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6').text
        except AttributeError:
            title = 'N/A' # Handle missing title

        try:
            # Get the price of the listing
            price = listing.find('span', class_='x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x676frb x1lkfr7t x1lbecb7 x1s688f xzsf02u').text
            price = post.find('span', class_='x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x676frb x1lkfr7t x1lbecb7 x1s688f xzsf02u').text
        except AttributeError:
            price = 'N/A' # Handle missing price

        try:
            # Get the location of the listing
            location = listing.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84').text
            location = post.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84').text
        except AttributeError:
            location = 'N/A' # Handle missing location

        try:
            # Get the miles of the car
            miles = listing.find_all('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84')[1].text
            miles = post.find_all('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84')[1].text
        except (AttributeError, IndexError):
            miles = 'N/A' # Handle missing miles

        try:
            # Get the link to the listing
            link = 'https://www.facebook.com' + listing.find('a', class_='x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1lku1pv')['href']
            link = 'https://www.facebook.com' + post.find('a', class_='x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1lku1pv')['href']
        except (AttributeError, TypeError):
            link = 'N/A' # Handle missing link

        # Skip duplicates and post new listings to the database
        if (title, price, location, miles, link) not in data:
            data[(title, price, location, miles, link)] = True
            postSuccess = database.post_raw("facebook", title, price, location, miles, link)
            if postSuccess:
                print("Saved to DB")
            else:
                print("Failed to save to DB")
        else:
            print("Listing is a duplicate")

# Close the Selenium WebDriver instance
driver.quit()

# Create a Pandas DataFrame from the scraped data
df = pd.DataFrame(list(data.keys()), columns=['Title', 'Price', 'Location', 'Miles', 'Link'])
df.dropna(how='all', inplace=True)

# Write the DataFrame to an Excel file
df.to_excel('facebook_marketplace_automotive_postings.xlsx', index=False)
# Close the Selenium WebDriver instance
driver.quit()

1 comment on commit faac772

@vercel vercel bot commented on faac772 Oct 31, 2023

Successfully deployed to the following URLs:

seniordesign – ./

seniordesign-git-main-lryanle.vercel.app
seniordesign-lryanle.vercel.app
smare.vercel.app
smare.lryanle.com
