Merge pull request #33 from lryanle/10/setup-db
10/setup db
waseem-polus authored Oct 31, 2023
2 parents 3674042 + d6f41d2 commit faac772
Showing 5 changed files with 99 additions and 20 deletions.
3 changes: 3 additions & 0 deletions .env.example
@@ -0,0 +1,3 @@
PASSWORD=PASSWORD
USERNAME=USERNAME
DB_URI=CONNECTION_LINK_USING_${USERNAME}_AND_${PASSWORD}
6 changes: 6 additions & 0 deletions README.md
@@ -1,2 +1,8 @@
# seniordesign
Senior Design Repository for the Statefarm Automotive Fraud Project

## Database Access
Make a copy of the ``.env.example`` file and make the following changes.
1. Remove ``.example`` from the file name
2. Paste the username and password provided in MongoDB Atlas (if you should have access but do not, please contact @waseem-polus)
3. Paste the connection URL provided in MongoDB Atlas. Reference the username and password using ``${VARIABLE}`` syntax so their values are embedded in the URI (a hypothetical filled-in example follows below)
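
For illustration only, a completed ``.env`` might look like the sketch below. The username, password, and cluster host are made-up placeholders (the real values come from MongoDB Atlas), and the ``${USERNAME}``/``${PASSWORD}`` references assume the variable-expansion syntax described above:

PASSWORD=s3cr3t-placeholder
USERNAME=smare_user
DB_URI=mongodb+srv://${USERNAME}:${PASSWORD}@examplecluster.abc12.mongodb.net/?retryWrites=true&w=majority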
File renamed without changes.
57 changes: 57 additions & 0 deletions src/scrapers/database.py
@@ -0,0 +1,57 @@
from dotenv import load_dotenv
import pymongo
import os
from datetime import date

def get_conn(db):
    # load the environment variable containing the db uri (which includes the username and password)
    load_dotenv()
    db_uri = os.getenv("DB_URI")

    # create a mongodb connection
    try:
        client = pymongo.MongoClient(db_uri)

    # return a friendly error if a URI error is thrown
    except pymongo.errors.ConfigurationError:
        print("An invalid URI host error was received. Is your Atlas host name correct in your connection string (found in the .env file)?")
        return {"success": False, "db": 0}

    # return a handle to the requested database
    return {"success": True, "db": client.get_database(db)}

def post_raw(source, title, price, location, miles, link, images=None, postBody=None, longitude=None, latitude=None, attributes=None):
    car = {
        "title": title,
        "price": price,
        "location": location,
        "odometer": miles,
        "link": link,
        "source": source,
        "scrapeDate": str(date.today())
    }

    if images is not None:
        car["images"] = images

    if postBody is not None:
        car["postBody"] = postBody

    if longitude is not None:
        car["longitude"] = longitude

    if latitude is not None:
        car["latitude"] = latitude

    if attributes is not None:
        for attr in attributes:
            car[attr["label"]] = attr["value"]

    # insert into the "scraped_raw" collection of the "scrape" database
    conn = get_conn("scrape")

    if conn["success"]:
        result = conn["db"]["scraped_raw"].insert_one(car)
        return result.acknowledged
    else:
        return False
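
As a usage sketch (not part of this commit), a scraper module could call ``post_raw`` with keyword arguments. The listing values and the ``attributes`` entries below are invented, but they show the ``{"label": ..., "value": ...}`` shape that the function flattens into top-level document fields:

import database

# hypothetical listing data; only source, title, price, location, miles, and link are required
saved = database.post_raw(
    source="facebook",
    title="2014 Honda Civic LX",
    price="$8,500",
    location="Dallas, TX",
    miles="92,000",
    link="https://www.facebook.com/marketplace/item/0000000000",
    attributes=[
        {"label": "transmission", "value": "automatic"},
        {"label": "fuel", "value": "gas"},
    ],
)
print("Saved to DB" if saved else "Failed to save to DB")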
53 changes: 33 additions & 20 deletions scrapers/facebook.py → src/scrapers/facebook.py
@@ -1,9 +1,11 @@
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
import time

import database


# list of cities to scrape; can be expanded
cities = [
    'nyc', 'la', 'chicago', 'houston', 'miami',
@@ -23,14 +25,23 @@

# Create a new Selenium WebDriver instance

chrome_service = ChromeService(executable_path='C:/Users/athiyam/Downloads/chromedriver-mac-arm64')
driver = webdriver.Chrome(service=chrome_service)
print("Setting up headless browser")
options = Options()
options.add_argument("--headless=new")

print("Creating a new Selenium WebDriver instance")
driver = webdriver.Chrome(options=options)

# Track listings that have already been scraped so duplicates are skipped
print("Started scraping...")
data = {}
for url in urls:
    # Navigate to the URL
    print(f"Navigating to {url}")
    driver.get(url)

    print(f"Loading {url}")

    time.sleep(2)
    scroll = 2000
@@ -49,50 +60,52 @@
    soup = BeautifulSoup(html, 'html.parser')

    # Find all of the automotive listings on the page
    listings = soup.find_all('div', class_='x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24')
    car_posts = soup.find_all('div', class_='x9f619 x78zum5 x1r8uery xdt5ytf x1iyjqo2 xs83m0k x1e558r4 x150jy0e x1iorvi4 xjkvuk6 xnpuxes x291uyu x1uepa24')

    # Iterate over the listings and scrape the data
    for listing in listings:
    for post in car_posts:
        print("Scraping new listing")
        try:
            # Get the title of the listing
            title = listing.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6').text
            title = post.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6').text
        except AttributeError:
            title = 'N/A' # Handle missing title

        try:
            # Get the price of the listing
            price = listing.find('span', class_='x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x676frb x1lkfr7t x1lbecb7 x1s688f xzsf02u').text
            price = post.find('span', class_='x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x676frb x1lkfr7t x1lbecb7 x1s688f xzsf02u').text
        except AttributeError:
            price = 'N/A' # Handle missing price

        try:
            # Get the location of the listing
            location = listing.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84').text
            location = post.find('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84').text
        except AttributeError:
            location = 'N/A' # Handle missing location

        try:
            # Get the miles of the car
            miles = listing.find_all('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84')[1].text
            miles = post.find_all('span', class_='x1lliihq x6ikm8r x10wlt62 x1n2onr6 xlyipyv xuxw1ft x1j85h84')[1].text
        except (AttributeError, IndexError):
            miles = 'N/A' # Handle missing miles

        try:
            # Get the link to the listing
            link = 'https://www.facebook.com' + listing.find('a', class_='x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1lku1pv')['href']
            link = 'https://www.facebook.com' + post.find('a', class_='x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1lku1pv')['href']
        except (AttributeError, TypeError):
            link = 'N/A' # Handle missing link

        # Skip duplicates and post new listings to the database
        if (title, price, location, miles, link) not in data:
            data[(title, price, location, miles, link)] = True
            postSuccess = database.post_raw("facebook", title, price, location, miles, link)
            if postSuccess:
                print("Saved to DB")
            else:
                print("Failed to save to DB")
        else:
            print("Listing is a duplicate")

# Close the Selenium WebDriver instance
driver.quit()

# Create a Pandas DataFrame from the scraped data
df = pd.DataFrame(list(data.keys()), columns=['Title', 'Price', 'Location', 'Miles', 'Link'])
df.dropna(how='all', inplace=True)

# Write the DataFrame to an Excel file
df.to_excel('facebook_marketplace_automotive_postings.xlsx', index=False)
# Close the Selenium WebDriver instance
driver.quit()

1 comment on commit faac772

@vercel vercel bot commented on faac772 Oct 31, 2023

Successfully deployed to the following URLs:

seniordesign – ./

seniordesign-git-main-lryanle.vercel.app
seniordesign-lryanle.vercel.app
smare.vercel.app
smare.lryanle.com
