-
Notifications
You must be signed in to change notification settings - Fork 63
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* wip: add site mapping * chore: cleanup
- Loading branch information
Showing
35 changed files
with
848 additions
and
344 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
from .job import ( | ||
query, | ||
insert, | ||
update_job, | ||
delete_jobs, | ||
get_jobs_per_day, | ||
get_queued_job, | ||
average_elements_per_link, | ||
) | ||
|
||
# Public API of this package: the job helpers re-exported from ``.job``.
__all__ = [
    "query",
    "insert",
    "update_job",
    "delete_jobs",
    "get_jobs_per_day",
    "get_queued_job",
    "average_elements_per_link",
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
from pydantic import BaseModel | ||
from typing import Any, Optional | ||
from api.backend.job.models.site_map import SiteMap | ||
|
||
|
||
class FetchOptions(BaseModel):
    """Optional flags supplied when fetching.

    NOTE(review): the exact meaning of ``chat`` is not visible in this
    module -- confirm against the caller that consumes it.
    """

    # Tri-state flag: True / False, or None when the caller did not specify.
    chat: Optional[bool] = None
|
||
|
||
class JobOptions(BaseModel):
    """Per-job scraping options; every field has a default."""

    # Off by default.
    multi_page_scrape: bool = False

    # NOTE(review): the mutable defaults below rely on pydantic copying field
    # defaults per instance -- confirm for the pinned pydantic version.
    custom_headers: dict[str, Any] = {}
    proxies: list[str] = []

    # Optional scripted interaction plan (see ``SiteMap``); None when absent.
    site_map: Optional[SiteMap] = None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
from pydantic import BaseModel | ||
from typing import Literal | ||
|
||
|
||
class Action(BaseModel):
    """A single scripted browser interaction inside a ``SiteMap``."""

    # Kind of interaction; only these two literals are accepted.
    type: Literal["click", "input"]
    # XPath expression identifying the target element.
    xpath: str
    # Label for the action.
    name: str
    # Text associated with an "input" action; empty by default.
    input: str = ""
    # Defaults to True, i.e. the action is meant to be performed a single time.
    do_once: bool = True
|
||
|
||
class SiteMap(BaseModel):
    """An ordered list of ``Action`` steps to perform on a page."""

    # Required: the actions, in the order they are listed.
    actions: list[Action]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import time | ||
from typing import cast | ||
|
||
from selenium import webdriver | ||
from selenium.webdriver.common.by import By | ||
from selenium.webdriver.support import expected_conditions as EC | ||
from selenium.webdriver.support.ui import WebDriverWait | ||
|
||
|
||
def scrape_content(
    driver: webdriver.Chrome,
    pages: set[tuple[str, str]],
    *,
    scroll_pause: float = 3,
    max_scrolls: "int | None" = None,
):
    """Scroll the current page to the bottom and record its final HTML.

    Waits up to 10 seconds for ``<body>`` to be present, then repeatedly
    scrolls to the bottom until ``document.body.scrollHeight`` stops growing
    (or ``max_scrolls`` scrolls have been performed).

    Args:
        driver: Browser whose current page is scraped.
        pages: Accumulator; a ``(page_source, current_url)`` tuple is added.
        scroll_pause: Seconds to sleep after each scroll so newly loaded
            content can affect the page height. Defaults to the original 3.
        max_scrolls: Upper bound on scroll iterations. ``None`` (default)
            keeps the original behaviour of scrolling until the height is
            stable, which never terminates on endlessly growing pages.

    Returns:
        The page source that was added to ``pages``.
    """
    _ = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )

    height_script = "return document.body.scrollHeight"

    # scrollHeight is a number, so the cast documents it as int.
    last_height = cast(int, driver.execute_script(height_script))
    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls_done += 1

        time.sleep(scroll_pause)  # Wait for the page to load
        new_height = cast(int, driver.execute_script(height_script))

        if new_height == last_height:
            break

        last_height = new_height

    # Read the source once so the stored and the returned snapshot are the same.
    page_source = driver.page_source
    pages.add((page_source, driver.current_url))
    return page_source
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
from api.backend.job.models.site_map import Action, SiteMap | ||
from selenium import webdriver | ||
from selenium.common.exceptions import NoSuchElementException | ||
from selenium.webdriver.common.by import By | ||
from typing import Any | ||
import logging | ||
import time | ||
from copy import deepcopy | ||
|
||
from api.backend.job.scraping.scraping_utils import scrape_content | ||
from selenium.webdriver.support.ui import WebDriverWait | ||
from seleniumwire.inspect import TimeoutException | ||
from seleniumwire.webdriver import Chrome | ||
from selenium.webdriver.support import expected_conditions as EC | ||
|
||
LOG = logging.getLogger(__name__) | ||
|
||
|
||
def clear_done_actions(site_map: dict[str, Any]):
    """Return a deep copy of ``site_map`` with its one-shot actions removed.

    Args:
        site_map: Raw site-map dict with an ``"actions"`` list of action
            dicts. It is not modified.

    Returns:
        A new dict whose ``"actions"`` list keeps only the actions whose
        ``"do_once"`` value is falsy, i.e. the ones to repeat on the next pass.
    """
    cleared_site_map = deepcopy(site_map)

    # An action dict that omits "do_once" is treated as do-once, matching the
    # Action model's default of True; indexing the key directly raised KeyError
    # for dicts that the SiteMap model itself accepts.
    cleared_site_map["actions"] = [
        action
        for action in cleared_site_map["actions"]
        if not action.get("do_once", True)
    ]

    return cleared_site_map
|
||
|
||
def handle_input(action: Action, driver: webdriver.Chrome):
    """Send ``action.input`` as keystrokes to the element at ``action.xpath``.

    Waits up to 10 seconds for the element to become clickable first.

    Returns:
        True when the keys were sent. False when the element was not found,
        the wait timed out, or any other exception occurred; each failure is
        logged at INFO level and not re-raised.
    """
    try:
        waiter = WebDriverWait(driver, 10)
        target = waiter.until(EC.element_to_be_clickable((By.XPATH, action.xpath)))
        LOG.info(f"Sending keys: {action.input} to element: {target}")
        target.send_keys(action.input)
    except NoSuchElementException:
        LOG.info(f"Element not found: {action.xpath}")
        succeeded = False
    except TimeoutException:
        LOG.info(f"Timeout waiting for element: {action.xpath}")
        succeeded = False
    except Exception as e:
        LOG.info(f"Error handling input: {e}")
        succeeded = False
    else:
        succeeded = True

    return succeeded
|
||
|
||
def handle_click(action: Action, driver: webdriver.Chrome):
    """Click the element located by ``action.xpath``.

    The element is looked up immediately, without waiting.

    Returns:
        True when the click was issued. False when the element does not
        exist or the click itself failed; each failure is logged at INFO
        level and not re-raised, mirroring ``handle_input``.
    """
    try:
        element = driver.find_element(By.XPATH, action.xpath)
        LOG.info(f"Clicking element: {element}")

        element.click()

    except NoSuchElementException:
        LOG.info(f"Element not found: {action.xpath}")
        return False

    except Exception as e:
        # click() can fail after a successful lookup (element covered, not
        # interactable, gone stale). Report failure like handle_input does
        # instead of letting the exception abort the caller.
        LOG.info(f"Error handling click: {e}")
        return False

    return True
|
||
|
||
# Dispatch table: each ``Action.type`` literal mapped to the handler for it.
ACTION_MAP = dict(click=handle_click, input=handle_input)
|
||
|
||
async def handle_site_mapping(
    site_map_dict: dict[str, Any],
    driver: Chrome,
    pages: set[tuple[str, str]],
):
    """Run the site map's actions, scraping the page after each full pass.

    Each pass validates the dict as a ``SiteMap``, performs its actions in
    order with a 2 second pause after each, then scrapes the resulting page
    into ``pages``. Do-once actions are then dropped and the remaining ones
    are run again, until none remain or an action handler reports failure.

    Args:
        site_map_dict: Raw site-map dict; it is not modified.
        driver: Browser the actions are performed on.
        pages: Accumulator passed through to ``scrape_content``.

    Returns:
        None. If a handler returns False the function stops immediately,
        without scraping for that pass.
    """
    # Loop rather than recurse: the original awaited itself once per pass, so
    # a repeatable action that kept succeeding grew the call stack without
    # bound.
    current_site_map = site_map_dict
    while True:
        site_map = SiteMap(**current_site_map)
        LOG.info(f"Handling site map: {site_map}")

        for action in site_map.actions:
            action_handler = ACTION_MAP[action.type]
            if not action_handler(action, driver):
                return

            # NOTE(review): time.sleep blocks the event loop while this
            # coroutine runs; kept as-is to preserve the original timing.
            time.sleep(2)

        _ = scrape_content(driver, pages)

        current_site_map = clear_done_actions(current_site_map)

        if not current_site_map["actions"]:
            return
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.