Feat: Site Mapping (#46)
* wip: add site mapping

* chore: cleanup
jaypyles authored Nov 17, 2024
1 parent 3a0762f commit 7d80ff5
Showing 35 changed files with 848 additions and 344 deletions.
19 changes: 19 additions & 0 deletions api/backend/job/__init__.py
@@ -0,0 +1,19 @@
from .job import (
query,
insert,
update_job,
delete_jobs,
get_jobs_per_day,
get_queued_job,
average_elements_per_link,
)

__all__ = [
"query",
"insert",
"update_job",
"delete_jobs",
"get_jobs_per_day",
"get_queued_job",
"average_elements_per_link",
]
2 changes: 1 addition & 1 deletion api/backend/job.py → api/backend/job/job.py
@@ -6,8 +6,8 @@
from pymongo import DESCENDING

# LOCAL
from api.backend.models import FetchOptions
from api.backend.database import get_job_collection
from api.backend.job.models.job_options import FetchOptions

LOG = logging.getLogger(__name__)

Empty file.
14 changes: 14 additions & 0 deletions api/backend/job/models/job_options.py
@@ -0,0 +1,14 @@
from pydantic import BaseModel
from typing import Any, Optional
from api.backend.job.models.site_map import SiteMap


class FetchOptions(BaseModel):
chat: Optional[bool] = None


class JobOptions(BaseModel):
multi_page_scrape: bool = False
custom_headers: dict[str, Any] = {}
proxies: list[str] = []
site_map: Optional[SiteMap] = None
14 changes: 14 additions & 0 deletions api/backend/job/models/site_map.py
@@ -0,0 +1,14 @@
from pydantic import BaseModel
from typing import Literal


class Action(BaseModel):
type: Literal["click", "input"]
xpath: str
name: str
input: str = ""
do_once: bool = True


class SiteMap(BaseModel):
actions: list[Action]
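
The two models above define the site map contract: an ordered list of click/input actions keyed by XPath, with do_once controlling whether an action repeats on later passes. A minimal construction sketch follows; the XPaths and values are invented for illustration.

# Illustrative sketch only: build a SiteMap and attach it to JobOptions.
from api.backend.job.models.job_options import JobOptions
from api.backend.job.models.site_map import Action, SiteMap

site_map = SiteMap(
    actions=[
        # One-shot action: type a query into a search box on the first pass only.
        Action(type="input", xpath="//input[@name='q']", name="search", input="laptops"),
        # Repeatable action: keep clicking "load more" on every pass.
        Action(type="click", xpath="//button[@id='load-more']", name="load_more", do_once=False),
    ]
)

options = JobOptions(multi_page_scrape=False, site_map=site_map)
print(options)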
30 changes: 30 additions & 0 deletions api/backend/job/scraping/scraping_utils.py
@@ -0,0 +1,30 @@
import time
from typing import cast

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def scrape_content(driver: webdriver.Chrome, pages: set[tuple[str, str]]):
_ = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.TAG_NAME, "body"))
)

last_height = cast(int, driver.execute_script("return document.body.scrollHeight"))
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

time.sleep(3) # Wait for the page to load
new_height = cast(
int, driver.execute_script("return document.body.scrollHeight")
)

if new_height == last_height:
break

last_height = new_height

pages.add((driver.page_source, driver.current_url))
return driver.page_source
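
scrape_content waits for the body element, scrolls until document.body.scrollHeight stops changing, then records (page_source, current_url) into pages. A minimal sketch of driving it directly, assuming a plain local Chrome driver (the project wires up its own seleniumwire driver elsewhere):

# Illustrative sketch only: drive scrape_content against a single page.
from selenium import webdriver

from api.backend.job.scraping.scraping_utils import scrape_content

driver = webdriver.Chrome()  # assumes a local chromedriver on PATH
pages: set[tuple[str, str]] = set()

try:
    driver.get("https://example.com")
    html = scrape_content(driver, pages)  # scrolls until the page height stops growing
finally:
    driver.quit()

# pages now holds the (page_source, current_url) tuple captured after scrolling.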
Empty file.
94 changes: 94 additions & 0 deletions api/backend/job/site_mapping/site_mapping.py
@@ -0,0 +1,94 @@
from api.backend.job.models.site_map import Action, SiteMap
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from typing import Any
import logging
import time
from copy import deepcopy

from api.backend.job.scraping.scraping_utils import scrape_content
from selenium.webdriver.support.ui import WebDriverWait
from seleniumwire.inspect import TimeoutException
from seleniumwire.webdriver import Chrome
from selenium.webdriver.support import expected_conditions as EC

LOG = logging.getLogger(__name__)


def clear_done_actions(site_map: dict[str, Any]):
"""Clear all actions that have been clicked."""
cleared_site_map = deepcopy(site_map)

cleared_site_map["actions"] = [
action for action in cleared_site_map["actions"] if not action["do_once"]
]

return cleared_site_map


def handle_input(action: Action, driver: webdriver.Chrome):
try:
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.XPATH, action.xpath))
)
LOG.info(f"Sending keys: {action.input} to element: {element}")

element.send_keys(action.input)

except NoSuchElementException:
LOG.info(f"Element not found: {action.xpath}")
return False

except TimeoutException:
LOG.info(f"Timeout waiting for element: {action.xpath}")
return False

except Exception as e:
LOG.info(f"Error handling input: {e}")
return False

return True


def handle_click(action: Action, driver: webdriver.Chrome):
try:
element = driver.find_element(By.XPATH, action.xpath)
LOG.info(f"Clicking element: {element}")

element.click()

except NoSuchElementException:
LOG.info(f"Element not found: {action.xpath}")
return False

return True


ACTION_MAP = {
"click": handle_click,
"input": handle_input,
}


async def handle_site_mapping(
site_map_dict: dict[str, Any],
driver: Chrome,
pages: set[tuple[str, str]],
):
site_map = SiteMap(**site_map_dict)
LOG.info(f"Handling site map: {site_map}")

for action in site_map.actions:
action_handler = ACTION_MAP[action.type]
if not action_handler(action, driver):
return

time.sleep(2)

_ = scrape_content(driver, pages)

cleared_site_map_dict = clear_done_actions(site_map_dict)

if cleared_site_map_dict["actions"]:
await handle_site_mapping(cleared_site_map_dict, driver, pages)
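
handle_site_mapping executes each action in order, scrapes the resulting page, then recurses with the one-shot actions stripped, so only do_once=False actions keep firing until one of them fails (for example a "load more" button that eventually disappears). A small sketch of the pruning step, with invented action dicts:

# Illustrative sketch only: clear_done_actions prunes one-shot actions between passes.
from api.backend.job.site_mapping.site_mapping import clear_done_actions

site_map = {
    "actions": [
        {"type": "input", "xpath": "//input[@name='q']", "name": "search", "input": "laptops", "do_once": True},
        {"type": "click", "xpath": "//button[@id='load-more']", "name": "load_more", "input": "", "do_once": False},
    ]
}

remaining = clear_done_actions(site_map)
# Only the repeatable "load_more" click survives, so the recursive call in
# handle_site_mapping keeps clicking it until the element can no longer be found.
assert [action["name"] for action in remaining["actions"]] == ["load_more"]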
12 changes: 4 additions & 8 deletions api/backend/models.py
@@ -2,12 +2,14 @@
from typing import Any, Optional, Union
from datetime import datetime

# LOCAL
from api.backend.job.models.job_options import JobOptions

# PDM
import pydantic


class FetchOptions(pydantic.BaseModel):
chat: Optional[bool] = None



class Element(pydantic.BaseModel):
@@ -22,12 +24,6 @@ class CapturedElement(pydantic.BaseModel):
name: str


class JobOptions(pydantic.BaseModel):
multi_page_scrape: bool = False
custom_headers: Optional[dict[str, Any]] = {}
proxies: Optional[list[str]] = []


class RetrieveScrapeJobs(pydantic.BaseModel):
user: str

9 changes: 2 additions & 7 deletions api/backend/routers/job_router.py
@@ -12,22 +12,17 @@
from fastapi.responses import JSONResponse, StreamingResponse

# LOCAL
from api.backend.job import (
query,
insert,
update_job,
delete_jobs,
)
from api.backend.job import query, insert, update_job, delete_jobs
from api.backend.models import (
UpdateJobs,
DownloadJob,
FetchOptions,
DeleteScrapeJobs,
Job,
)
from api.backend.schemas import User
from api.backend.auth.auth_utils import get_current_user
from api.backend.utils import clean_text
from api.backend.job.models.job_options import FetchOptions

LOG = logging.getLogger(__name__)

41 changes: 17 additions & 24 deletions api/backend/scraping.py
@@ -1,19 +1,20 @@
import logging
from typing import Any, Optional
import time
import random

from bs4 import BeautifulSoup
from lxml import etree
from seleniumwire import webdriver
from lxml.etree import _Element # type: ignore [reportPrivateImport]
from lxml.etree import _Element # pyright: ignore [reportPrivateUsage]
from fake_useragent import UserAgent
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options as ChromeOptions
from urllib.parse import urlparse, urljoin
from api.backend.models import Element, CapturedElement
from api.backend.job.site_mapping.site_mapping import (
handle_site_mapping,
)
from api.backend.job.scraping.scraping_utils import scrape_content
from api.backend.job.models.site_map import SiteMap

LOG = logging.getLogger(__name__)

@@ -95,6 +96,7 @@ async def make_site_request(
pages: set[tuple[str, str]] = set(),
original_url: str = "",
proxies: Optional[list[str]] = [],
site_map: Optional[dict[str, Any]] = None,
) -> None:
"""Make basic `GET` request to site using Selenium."""
# Check if URL has already been visited
@@ -114,27 +116,16 @@
final_url = driver.current_url
visited_urls.add(url)
visited_urls.add(final_url)
_ = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.TAG_NAME, "body"))
)

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
page_source = scrape_content(driver, pages)

time.sleep(3) # Wait for the page to load
new_height = driver.execute_script("return document.body.scrollHeight")

if new_height == last_height:
break

last_height = new_height

final_height = driver.execute_script("return document.body.scrollHeight")

page_source = driver.page_source
LOG.debug(f"Page source for url: {url}\n{page_source}")
pages.add((page_source, final_url))
if site_map:
LOG.info("Site map: %s", site_map)
_ = await handle_site_mapping(
site_map,
driver,
pages,
)
finally:
driver.quit()

@@ -192,6 +183,7 @@ async def scrape(
headers: Optional[dict[str, Any]],
multi_page_scrape: bool = False,
proxies: Optional[list[str]] = [],
site_map: Optional[SiteMap] = None,
):
visited_urls: set[str] = set()
pages: set[tuple[str, str]] = set()
@@ -204,6 +196,7 @@
pages=pages,
original_url=url,
proxies=proxies,
site_map=site_map,
)

elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()
1 change: 1 addition & 0 deletions api/backend/worker/job_worker.py
@@ -24,6 +24,7 @@ async def process_job():
job["job_options"]["custom_headers"],
job["job_options"]["multi_page_scrape"],
job["job_options"]["proxies"],
job["job_options"]["site_map"],
)
LOG.info(
f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"
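
The worker change is a single extra argument: job_options["site_map"] is now forwarded to scrape. A sketch of the job_options shape this implies (keys taken from the worker call and the JobOptions model; values are invented):

# Illustrative sketch only: the job_options document shape the worker now expects.
job_options = {
    "multi_page_scrape": False,
    "custom_headers": {},
    "proxies": [],
    "site_map": {
        "actions": [
            {"type": "input", "xpath": "//input[@name='q']", "name": "search", "input": "laptops", "do_once": True},
            {"type": "click", "xpath": "//button[@type='submit']", "name": "submit", "input": "", "do_once": True},
        ]
    },
}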
3 changes: 3 additions & 0 deletions docker-compose.dev.yml
@@ -10,5 +10,8 @@ services:
- "$PWD/package-lock.json:/app/package-lock.json"
- "$PWD/tsconfig.json:/app/tsconfig.json"
scraperr_api:
environment:
- LOG_LEVEL=INFO
volumes:
- "$PWD/api:/project/api"
- "$PWD/scraping:/project/scraping"