
Commit

feat: add proxies (#39)
jaypyles authored Nov 10, 2024
1 parent 266b91e commit 1cdffd9
Showing 23 changed files with 284 additions and 86 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/docker-image.yml
@@ -1,4 +1,6 @@
 name: ci
+requires:
+  - unit-tests
 on:
   push:
     branches: ["master"]
25 changes: 25 additions & 0 deletions .github/workflows/unit-tests.yml
@@ -0,0 +1,25 @@
+name: Unit Tests
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  unit-tests:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install pdm
+        run: pip install pdm
+
+      - name: Install project dependencies
+        run: pdm install
+
+      - name: Run tests
+        run: PYTHONPATH=. pdm run pytest api/backend/tests
5 changes: 3 additions & 2 deletions api/backend/models.py
@@ -23,8 +23,9 @@ class CapturedElement(pydantic.BaseModel):
 
 
 class JobOptions(pydantic.BaseModel):
-    multi_page_scrape: bool
-    custom_headers: Optional[dict[str, Any]]
+    multi_page_scrape: bool = False
+    custom_headers: Optional[dict[str, Any]] = {}
+    proxies: Optional[list[str]] = []
 
 
 class RetrieveScrapeJobs(pydantic.BaseModel):
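Every JobOptions field now carries a default, so a bare JobOptions() validates and pre-existing callers that omit proxies keep working. A minimal sketch of the updated model in use (the proxy address is the same placeholder the new test uses):

from api.backend.models import JobOptions

# All fields now default, so existing callers that omit proxies still validate.
options = JobOptions()
assert options.multi_page_scrape is False
assert options.proxies == []

# Proxies are passed as plain "host:port" strings.
options = JobOptions(proxies=["127.0.0.1:8080"])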
3 changes: 1 addition & 2 deletions api/backend/routers/job_router.py
@@ -5,7 +5,6 @@
 import csv
 import logging
 import random
-from typing import Optional
 
 # PDM
 from fastapi import Depends, APIRouter
@@ -27,7 +26,7 @@
     Job,
 )
 from api.backend.schemas import User
-from api.backend.auth.auth_utils import get_current_user, EMPTY_USER
+from api.backend.auth.auth_utils import get_current_user
 from api.backend.utils import clean_text
 
 LOG = logging.getLogger(__name__)
28 changes: 24 additions & 4 deletions api/backend/scraping.py
@@ -1,6 +1,7 @@
 import logging
 from typing import Any, Optional
 import time
+import random
 
 from bs4 import BeautifulSoup
 from lxml import etree
@@ -12,7 +13,6 @@
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.chrome.options import Options as ChromeOptions
 from urllib.parse import urlparse, urljoin
-
 from api.backend.models import Element, CapturedElement
 
 LOG = logging.getLogger(__name__)
@@ -60,15 +60,31 @@ def _interceptor(request: Any):
     return _interceptor
 
 
-def create_driver():
+def create_driver(proxies: Optional[list[str]] = []):
     ua = UserAgent()
     chrome_options = ChromeOptions()
     chrome_options.add_argument("--headless")
     chrome_options.add_argument("--no-sandbox")
     chrome_options.add_argument("--disable-dev-shm-usage")
     chrome_options.add_argument(f"user-agent={ua.random}")
 
-    return webdriver.Chrome(options=chrome_options)
+    sw_options = {}
+    if proxies:
+        selected_proxy = proxies[random.randint(0, len(proxies) - 1)]
+        LOG.info(f"Using proxy: {selected_proxy}")
+
+        sw_options = {
+            "proxy": {
+                "https": f"https://{selected_proxy}",
+                "http": f"http://{selected_proxy}",
+            }
+        }
+
+    driver = webdriver.Chrome(
+        options=chrome_options,
+        seleniumwire_options=sw_options,
+    )
+    return driver
 
 
 async def make_site_request(
@@ -78,13 +94,14 @@ async def make_site_request(
     visited_urls: set[str] = set(),
     pages: set[tuple[str, str]] = set(),
     original_url: str = "",
+    proxies: Optional[list[str]] = [],
 ) -> None:
     """Make basic `GET` request to site using Selenium."""
     # Check if URL has already been visited
     if url in visited_urls:
         return
 
-    driver = create_driver()
+    driver = create_driver(proxies)
     driver.implicitly_wait(10)
 
     if headers:
@@ -93,6 +110,7 @@
     try:
         LOG.info(f"Visiting URL: {url}")
         driver.get(url)
+
         final_url = driver.current_url
         visited_urls.add(url)
         visited_urls.add(final_url)
@@ -173,6 +191,7 @@ async def scrape(
     xpaths: list[Element],
     headers: Optional[dict[str, Any]],
     multi_page_scrape: bool = False,
+    proxies: Optional[list[str]] = [],
 ):
     visited_urls: set[str] = set()
     pages: set[tuple[str, str]] = set()
@@ -184,6 +203,7 @@
         visited_urls=visited_urls,
         pages=pages,
         original_url=url,
+        proxies=proxies,
     )
 
     elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()
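create_driver now picks one proxy at random per driver instance and routes both HTTP and HTTPS through it via selenium-wire's proxy option. The indexing expression proxies[random.randint(0, len(proxies) - 1)] behaves the same as random.choice(proxies) for a non-empty list; a standalone sketch of the option-building step, assuming the selenium-wire dict shape shown above:

import random
from typing import Optional

def build_seleniumwire_options(proxies: Optional[list[str]] = None) -> dict:
    # Mirrors the new create_driver logic: one randomly chosen proxy per driver.
    if not proxies:
        return {}
    selected_proxy = random.choice(proxies)  # idiomatic equivalent of the randint indexing
    return {
        "proxy": {
            "https": f"https://{selected_proxy}",
            "http": f"http://{selected_proxy}",
        }
    }

One caveat worth noting: the mutable default proxies: Optional[list[str]] = [] is shared across calls; it is only read here, so it is harmless, but None is the safer conventional default.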
6 changes: 4 additions & 2 deletions api/backend/tests/factories/job_factory.py
@@ -5,12 +5,14 @@
 fake = Faker()
 
 
-def create_job():
+def create_job(
+    job_options: JobOptions = JobOptions(multi_page_scrape=False, custom_headers={})
+):
     return Job(
         id=uuid.uuid4().hex,
         url="https://example.com",
         elements=[Element(name="test", xpath="xpath")],
-        job_options=JobOptions(multi_page_scrape=False, custom_headers={}),
+        job_options=job_options,
     )
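The factory keeps its old behavior when called bare but can now inject options, which the new scraping test relies on; for example:

from api.backend.models import JobOptions
from api.backend.tests.factories.job_factory import create_job

# Bare call: same defaults as before this commit.
job = create_job()

# Parameterized call: inject a proxy list for proxy-specific tests.
job = create_job(
    job_options=JobOptions(
        multi_page_scrape=False, custom_headers={}, proxies=["127.0.0.1:8080"]
    )
)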
16 changes: 13 additions & 3 deletions api/backend/tests/job/test_download_job.py
@@ -9,12 +9,18 @@
 
 mocked_job = create_completed_job().model_dump()
 mock_results = [mocked_job]
+mocked_random_int = 123456
 
 
 @pytest.mark.asyncio
-@patch("api.backend.app.query")
-async def test_download(mock_query: AsyncMock):
+@patch("api.backend.routers.job_router.query")
+@patch("api.backend.routers.job_router.random.randint")
+async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
     # Ensure the mock returns immediately
     mock_query.return_value = mock_results
+    mock_randint.return_value = mocked_random_int
 
     # Create a DownloadJob instance
     download_job = DownloadJob(ids=[mocked_job["id"]])
 
     # Make a POST request to the /download endpoint
@@ -26,5 +32,9 @@ async def test_download(mock_query: AsyncMock):
 
     # Check the content of the CSV
     csv_content = response.content.decode("utf-8")
-    expected_csv = f"id,url,element_name,xpath,text,user,time_created\r\n{mocked_job['id']},https://example.com,element_name,//div,example,{mocked_job['user']},{mocked_job['time_created']}\r\n"
+    expected_csv = (
+        f'"id","url","element_name","xpath","text","user","time_created"\r\n'
+        f'"{mocked_job["id"]}-{mocked_random_int}","https://example.com","element_name","//div","example",'
+        f'"{mocked_job["user"]}","{mocked_job["time_created"]}"\r\n'
+    )
     assert csv_content == expected_csv
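The new expected string implies the download endpoint now quotes every CSV field and appends a random integer to each row id, which is why the test patches random.randint. A sketch of how Python's csv module can produce that framing; the writer configuration is inferred from the quoting in the test, not shown in this commit, and the row values are hypothetical:

import csv
import io

buf = io.StringIO()
writer = csv.writer(buf, quoting=csv.QUOTE_ALL)  # quotes every field; rows end with \r\n by default
writer.writerow(["id", "url", "element_name", "xpath", "text", "user", "time_created"])
# Hypothetical values; the "-123456" suffix mirrors the mocked randint above.
writer.writerow(["abc123-123456", "https://example.com", "element_name", "//div", "example", "user", "now"])
print(buf.getvalue())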
Empty file.
33 changes: 33 additions & 0 deletions api/backend/tests/scraping/test_scraping.py
@@ -0,0 +1,33 @@
+import pytest
+from unittest.mock import AsyncMock, patch, MagicMock
+from api.backend.tests.factories.job_factory import create_job
+from api.backend.models import JobOptions
+from api.backend.scraping import create_driver
+
+
+mocked_job = create_job(
+    job_options=JobOptions(
+        multi_page_scrape=False, custom_headers={}, proxies=["127.0.0.1:8080"]
+    )
+).model_dump()
+
+
+@pytest.mark.asyncio
+@patch("seleniumwire.webdriver.Chrome.get")
+async def test_proxy(mock_get: AsyncMock):
+    # Mock the response of the requests.get call
+    mock_response = MagicMock()
+    mock_get.return_value = mock_response
+
+    driver = create_driver(proxies=["127.0.0.1:8080"])
+    assert driver is not None
+
+    # Simulate a request
+    driver.get("http://example.com")
+    response = driver.last_request
+
+    # Check if the proxy header is set correctly
+    if response:
+        assert response.headers["Proxy"] == "127.0.0.1:8080"
+
+    driver.quit()
1 change: 1 addition & 0 deletions api/backend/worker/job_worker.py
@@ -23,6 +23,7 @@ async def process_job():
         [Element(**j) for j in job["elements"]],
         job["job_options"]["custom_headers"],
         job["job_options"]["multi_page_scrape"],
+        job["job_options"]["proxies"],
     )
     LOG.info(
         f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"
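Note that job["job_options"]["proxies"] assumes every stored job document already has the new key; jobs queued before this commit would raise KeyError here. A defensive variant (an assumption on my part, not what this commit does) would fall back to an empty list:

# Hypothetical hardening for pre-existing job documents, not part of this commit:
scraped = await scrape(
    job["url"],
    [Element(**j) for j in job["elements"]],
    job["job_options"].get("custom_headers"),
    job["job_options"].get("multi_page_scrape", False),
    job["job_options"].get("proxies", []),
)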
2 changes: 1 addition & 1 deletion docker-compose.dev.yml
@@ -19,4 +19,4 @@ services:
     ports:
       - "8000:8000"
     volumes:
-      - "$PWD/api:/project/app/api"
+      - "$PWD/api:/project/api"
@@ -1,38 +1,84 @@
+import { RawJobOptions } from "@/types/job";
 import { Box, FormControlLabel, Checkbox, TextField } from "@mui/material";
 import { Dispatch, SetStateAction } from "react";
 
-import { JobOptions } from "@/types/job";
 
 export type JobSubmitterOptionsProps = {
-  jobOptions: JobOptions;
-  setJobOptions: Dispatch<SetStateAction<JobOptions>>;
+  jobOptions: RawJobOptions;
+  setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
   customJSONSelected: boolean;
   setCustomJSONSelected: Dispatch<SetStateAction<boolean>>;
+  handleSelectProxies: () => void;
+  proxiesSelected: boolean;
 };
 
 export const JobSubmitterOptions = ({
   jobOptions,
   setJobOptions,
   customJSONSelected,
   setCustomJSONSelected,
+  handleSelectProxies,
+  proxiesSelected,
 }: JobSubmitterOptionsProps) => {
+  const handleMultiPageScrapeChange = () => {
+    setJobOptions((prevJobOptions) => ({
+      ...prevJobOptions,
+      multi_page_scrape: !prevJobOptions.multi_page_scrape,
+    }));
+  };
+
+  const handleProxiesChange = (e: React.ChangeEvent<HTMLInputElement>) => {
+    setJobOptions((prevJobOptions) => ({
+      ...prevJobOptions,
+      proxies: e.target.value,
+    }));
+  };
+
+  const handleCustomHeadersChange = (
+    e: React.ChangeEvent<HTMLInputElement>
+  ) => {
+    setJobOptions((prevJobOptions) => ({
+      ...prevJobOptions,
+      custom_headers: e.target.value,
+    }));
+  };
+
   return (
     <Box bgcolor="background.paper" className="flex flex-col mb-2 rounded-md">
       <div id="options" className="p-2 flex flex-row space-x-2">
         <FormControlLabel
           label="Multi-Page Scrape"
           className="mr-0"
           control={
             <Checkbox
               checked={jobOptions.multi_page_scrape}
-              onChange={() =>
-                setJobOptions((prevJobOptions) => ({
-                  ...prevJobOptions,
-                  multi_page_scrape: !prevJobOptions.multi_page_scrape,
-                }))
-              }
+              onChange={handleMultiPageScrapeChange}
             />
           }
         ></FormControlLabel>
+        <FormControlLabel
+          label="Proxies"
+          control={
+            <Checkbox
+              checked={proxiesSelected}
+              onChange={handleSelectProxies}
+            />
+          }
+        ></FormControlLabel>
+        {proxiesSelected ? (
+          <div id="proxies">
+            <TextField
+              InputLabelProps={{ shrink: false }}
+              fullWidth
+              multiline={false}
+              variant="outlined"
+              value={jobOptions.proxies || ""}
+              onChange={handleProxiesChange}
+              inputProps={{
+                style: { whiteSpace: "nowrap", overflowX: "auto" },
+              }}
+            />
+          </div>
+        ) : null}
         <FormControlLabel
           label="Custom Headers (JSON)"
           control={
@@ -58,14 +104,8 @@ export const JobSubmitterOptions = ({
               minRows={4}
               variant="outlined"
               value={jobOptions.custom_headers || ""}
-              onChange={(e) =>
-                setJobOptions((prevJobOptions) => ({
-                  ...prevJobOptions,
-                  custom_headers: e.target.value,
-                }))
-              }
+              onChange={handleCustomHeadersChange}
               style={{ maxHeight: "20vh", overflow: "auto" }}
-              className="mt-2"
             />
           </div>
         ) : null}