diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index a498ba2..024fc6a 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -1,4 +1,6 @@
 name: ci
+requires:
+  - unit-tests
 on:
   push:
     branches: ["master"]
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
new file mode 100644
index 0000000..c824f72
--- /dev/null
+++ b/.github/workflows/unit-tests.yml
@@ -0,0 +1,25 @@
+name: Unit Tests
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  unit-tests:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install pdm
+        run: pip install pdm
+
+      - name: Install project dependencies
+        run: pdm install
+
+      - name: Run tests
+        run: PYTHONPATH=. pdm run pytest api/backend/tests
diff --git a/api/backend/models.py b/api/backend/models.py
index 8bc7066..fb10d70 100644
--- a/api/backend/models.py
+++ b/api/backend/models.py
@@ -23,8 +23,9 @@ class CapturedElement(pydantic.BaseModel):
 
 
 class JobOptions(pydantic.BaseModel):
-    multi_page_scrape: bool
-    custom_headers: Optional[dict[str, Any]]
+    multi_page_scrape: bool = False
+    custom_headers: Optional[dict[str, Any]] = {}
+    proxies: Optional[list[str]] = []
 
 
 class RetrieveScrapeJobs(pydantic.BaseModel):
diff --git a/api/backend/routers/job_router.py b/api/backend/routers/job_router.py
index 35caa3f..d5884ca 100644
--- a/api/backend/routers/job_router.py
+++ b/api/backend/routers/job_router.py
@@ -5,7 +5,6 @@
 import csv
 import logging
 import random
-from typing import Optional
 
 # PDM
 from fastapi import Depends, APIRouter
@@ -27,7 +26,7 @@
     Job,
 )
 from api.backend.schemas import User
-from api.backend.auth.auth_utils import get_current_user, EMPTY_USER
+from api.backend.auth.auth_utils import get_current_user
 from api.backend.utils import clean_text
 
 LOG = logging.getLogger(__name__)
diff --git a/api/backend/scraping.py b/api/backend/scraping.py
index d106402..9418403 100644
--- a/api/backend/scraping.py
+++ b/api/backend/scraping.py
@@ -1,6 +1,7 @@
 import logging
 from typing import Any, Optional
 import time
+import random
 
 from bs4 import BeautifulSoup
 from lxml import etree
@@ -12,7 +13,6 @@
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.chrome.options import Options as ChromeOptions
 from urllib.parse import urlparse, urljoin
-
 from api.backend.models import Element, CapturedElement
 
 LOG = logging.getLogger(__name__)
@@ -60,7 +60,7 @@ def _interceptor(request: Any):
     return _interceptor
 
 
-def create_driver():
+def create_driver(proxies: Optional[list[str]] = []):
     ua = UserAgent()
     chrome_options = ChromeOptions()
     chrome_options.add_argument("--headless")
@@ -68,7 +68,23 @@
     chrome_options.add_argument("--disable-dev-shm-usage")
     chrome_options.add_argument(f"user-agent={ua.random}")
 
-    return webdriver.Chrome(options=chrome_options)
+    sw_options = {}
+    if proxies:
+        selected_proxy = proxies[random.randint(0, len(proxies) - 1)]
+        LOG.info(f"Using proxy: {selected_proxy}")
+
+        sw_options = {
+            "proxy": {
+                "https": f"https://{selected_proxy}",
+                "http": f"http://{selected_proxy}",
+            }
+        }
+
+    driver = webdriver.Chrome(
+        options=chrome_options,
+        seleniumwire_options=sw_options,
+    )
+    return driver
 
 
 async def make_site_request(
@@ -78,13 +94,14 @@ async def make_site_request(
     visited_urls: set[str] = set(),
     pages: set[tuple[str, str]] = set(),
     original_url: str = "",
+    proxies: Optional[list[str]] = [],
 ) -> None:
     """Make basic `GET` request to site using Selenium."""
     # Check if URL has already been visited
     if url in visited_urls:
         return
 
-    driver = create_driver()
+    driver = create_driver(proxies)
     driver.implicitly_wait(10)
 
     if headers:
@@ -93,6 +110,7 @@
     try:
         LOG.info(f"Visiting URL: {url}")
         driver.get(url)
+
         final_url = driver.current_url
         visited_urls.add(url)
         visited_urls.add(final_url)
@@ -173,6 +191,7 @@ async def scrape(
     xpaths: list[Element],
     headers: Optional[dict[str, Any]],
     multi_page_scrape: bool = False,
+    proxies: Optional[list[str]] = [],
 ):
     visited_urls: set[str] = set()
     pages: set[tuple[str, str]] = set()
@@ -184,6 +203,7 @@
         visited_urls=visited_urls,
         pages=pages,
         original_url=url,
+        proxies=proxies,
     )
 
     elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()
diff --git a/api/backend/tests/factories/job_factory.py b/api/backend/tests/factories/job_factory.py
index 88a28b4..59ac566 100644
--- a/api/backend/tests/factories/job_factory.py
+++ b/api/backend/tests/factories/job_factory.py
@@ -5,12 +5,14 @@
 fake = Faker()
 
 
-def create_job():
+def create_job(
+    job_options: JobOptions = JobOptions(multi_page_scrape=False, custom_headers={})
+):
     return Job(
         id=uuid.uuid4().hex,
         url="https://example.com",
         elements=[Element(name="test", xpath="xpath")],
-        job_options=JobOptions(multi_page_scrape=False, custom_headers={}),
+        job_options=job_options,
     )
 
 
diff --git a/api/backend/tests/job/test_download_job.py b/api/backend/tests/job/test_download_job.py
index 5259c11..8567dc0 100644
--- a/api/backend/tests/job/test_download_job.py
+++ b/api/backend/tests/job/test_download_job.py
@@ -9,12 +9,18 @@
 mocked_job = create_completed_job().model_dump()
 mock_results = [mocked_job]
+mocked_random_int = 123456
 
 
 @pytest.mark.asyncio
-@patch("api.backend.app.query")
-async def test_download(mock_query: AsyncMock):
+@patch("api.backend.routers.job_router.query")
+@patch("api.backend.routers.job_router.random.randint")
+async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
+    # Ensure the mock returns immediately
     mock_query.return_value = mock_results
+    mock_randint.return_value = mocked_random_int
+
+    # Create a DownloadJob instance
     download_job = DownloadJob(ids=[mocked_job["id"]])
 
     # Make a POST request to the /download endpoint
@@ -26,5 +32,9 @@ mock_results
 
     # Check the content of the CSV
     csv_content = response.content.decode("utf-8")
-    expected_csv = f"id,url,element_name,xpath,text,user,time_created\r\n{mocked_job['id']},https://example.com,element_name,//div,example,{mocked_job['user']},{mocked_job['time_created']}\r\n"
+    expected_csv = (
+        f'"id","url","element_name","xpath","text","user","time_created"\r\n'
+        f'"{mocked_job["id"]}-{mocked_random_int}","https://example.com","element_name","//div","example",'
+        f'"{mocked_job["user"]}","{mocked_job["time_created"]}"\r\n'
+    )
     assert csv_content == expected_csv
diff --git a/api/backend/tests/scraping/__init__.py b/api/backend/tests/scraping/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/api/backend/tests/scraping/test_scraping.py b/api/backend/tests/scraping/test_scraping.py
new file mode 100644
index 0000000..28cb9bf
--- /dev/null
+++ b/api/backend/tests/scraping/test_scraping.py
@@ -0,0 +1,33 @@
+import pytest
+from unittest.mock import AsyncMock, patch, MagicMock
+from api.backend.tests.factories.job_factory import create_job
+from api.backend.models import JobOptions
+from api.backend.scraping import create_driver
+
+
+mocked_job = create_job(
+    job_options=JobOptions(
+        multi_page_scrape=False, custom_headers={}, proxies=["127.0.0.1:8080"]
+    )
+).model_dump()
+
+
+@pytest.mark.asyncio
+@patch("seleniumwire.webdriver.Chrome.get")
+async def test_proxy(mock_get: AsyncMock):
+    # Mock the response of the requests.get call
+    mock_response = MagicMock()
+    mock_get.return_value = mock_response
+
+    driver = create_driver(proxies=["127.0.0.1:8080"])
+    assert driver is not None
+
+    # Simulate a request
+    driver.get("http://example.com")
+    response = driver.last_request
+
+    # Check if the proxy header is set correctly
+    if response:
+        assert response.headers["Proxy"] == "127.0.0.1:8080"
+
+    driver.quit()
diff --git a/api/backend/worker/job_worker.py b/api/backend/worker/job_worker.py
index 16be253..6ae5c16 100644
--- a/api/backend/worker/job_worker.py
+++ b/api/backend/worker/job_worker.py
@@ -23,6 +23,7 @@ async def process_job():
             [Element(**j) for j in job["elements"]],
             job["job_options"]["custom_headers"],
             job["job_options"]["multi_page_scrape"],
+            job["job_options"]["proxies"],
         )
         LOG.info(
             f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"
diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml
index dcb3b3b..0c47476 100644
--- a/docker-compose.dev.yml
+++ b/docker-compose.dev.yml
@@ -19,4 +19,4 @@ services:
     ports:
       - "8000:8000"
     volumes:
-      - "$PWD/api:/project/app/api"
+      - "$PWD/api:/project/api"
diff --git a/src/components/submit/job-submitter/job-submitter-options/job-submitter-options.tsx b/src/components/submit/job-submitter/job-submitter-options/job-submitter-options.tsx
index f29e5df..a199126 100644
--- a/src/components/submit/job-submitter/job-submitter-options/job-submitter-options.tsx
+++ b/src/components/submit/job-submitter/job-submitter-options/job-submitter-options.tsx
@@ -1,13 +1,14 @@
+import { RawJobOptions } from "@/types/job";
 import { Box, FormControlLabel, Checkbox, TextField } from "@mui/material";
 import { Dispatch, SetStateAction } from "react";
 
-import { JobOptions } from "@/types/job";
-
 export type JobSubmitterOptionsProps = {
-  jobOptions: JobOptions;
-  setJobOptions: Dispatch<SetStateAction<JobOptions>>;
+  jobOptions: RawJobOptions;
+  setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
   customJSONSelected: boolean;
   setCustomJSONSelected: Dispatch<SetStateAction<boolean>>;
+  handleSelectProxies: () => void;
+  proxiesSelected: boolean;
 };
 
 export const JobSubmitterOptions = ({
@@ -15,24 +16,69 @@ export const JobSubmitterOptions = ({
   setJobOptions,
   customJSONSelected,
   setCustomJSONSelected,
+  handleSelectProxies,
+  proxiesSelected,
 }: JobSubmitterOptionsProps) => {
+  const handleMultiPageScrapeChange = () => {
+    setJobOptions((prevJobOptions) => ({
+      ...prevJobOptions,
+      multi_page_scrape: !prevJobOptions.multi_page_scrape,
+    }));
+  };
+
+  const handleProxiesChange = (e: React.ChangeEvent<HTMLInputElement>) => {
+    setJobOptions((prevJobOptions) => ({
+      ...prevJobOptions,
+      proxies: e.target.value,
+    }));
+  };
+
+  const handleCustomHeadersChange = (
+    e: React.ChangeEvent<HTMLInputElement | HTMLTextAreaElement>
+  ) => {
+    setJobOptions((prevJobOptions) => ({
+      ...prevJobOptions,
+      custom_headers: e.target.value,
+    }));
+  };
+
   return (
-              setJobOptions((prevJobOptions) => ({
-                ...prevJobOptions,
-                multi_page_scrape: !prevJobOptions.multi_page_scrape,
-              }))
-            }
+              onChange={handleMultiPageScrapeChange}
+            />
+          }
+        >
+          }
         >
+        {proxiesSelected ? (
+
+
+
+        ) : null}
-              setJobOptions((prevJobOptions) => ({
-                ...prevJobOptions,
-                custom_headers: e.target.value,
-              }))
-            }
+            onChange={handleCustomHeadersChange}
             style={{ maxHeight: "20vh", overflow: "auto" }}
-            className="mt-2"
           />
         ) : null}
diff --git a/src/components/submit/job-submitter/job-submitter.tsx b/src/components/submit/job-submitter/job-submitter.tsx
index 1e7fc6e..83b92e9 100644
--- a/src/components/submit/job-submitter/job-submitter.tsx
+++ b/src/components/submit/job-submitter/job-submitter.tsx
@@ -4,11 +4,12 @@ import React, { useEffect, useState, Dispatch } from "react";
 import { Element } from "@/types";
 import { useAuth } from "@/contexts/AuthContext";
 import { useRouter } from "next/router";
-import { Constants } from "@/lib";
-
+import { RawJobOptions } from "@/types/job";
+import { parseJobOptions, validateURL } from "@/lib";
 import { JobSubmitterHeader } from "./job-submitter-header";
 import { JobSubmitterInput } from "./job-submitter-input";
 import { JobSubmitterOptions } from "./job-submitter-options";
+import { ApiService } from "@/services";
 
 interface StateProps {
   submittedURL: string;
@@ -25,22 +26,20 @@ interface Props {
   stateProps: StateProps;
 }
 
-interface JobOptions {
-  multi_page_scrape: boolean;
-  custom_headers: null | string;
-}
+const initialJobOptions: RawJobOptions = {
+  multi_page_scrape: false,
+  custom_headers: null,
+  proxies: null,
+};
 
 export const JobSubmitter = ({ stateProps }: Props) => {
   const { user } = useAuth();
   const router = useRouter();
-
   const { job_options } = router.query;
 
   const {
     submittedURL,
-    setSubmittedURL,
     rows,
-    isValidURL,
     setIsValidUrl,
     setSnackbarMessage,
     setSnackbarOpen,
@@ -49,22 +48,16 @@ export const JobSubmitter = ({ stateProps }: Props) => {
   const [urlError, setUrlError] = useState(null);
   const [loading, setLoading] = useState(false);
-  const [jobOptions, setJobOptions] = useState<JobOptions>({
-    multi_page_scrape: false,
-    custom_headers: null,
-  });
+  const [jobOptions, setJobOptions] =
+    useState<RawJobOptions>(initialJobOptions);
   const [customJSONSelected, setCustomJSONSelected] = useState(false);
+  const [proxiesSelected, setProxiesSelected] = useState(false);
 
-  function validateURL(url: string): boolean {
-    try {
-      new URL(url);
-      return true;
-    } catch (_) {
-      return false;
-    }
-  }
+  const handleSelectProxies = () => {
+    setProxiesSelected(!proxiesSelected);
+  };
 
-  const handleSubmit = () => {
+  const handleSubmit = async () => {
     if (!validateURL(submittedURL)) {
       setIsValidUrl(false);
       setUrlError("Please enter a valid URL.");
@@ -76,6 +69,7 @@ export const JobSubmitter = ({ stateProps }: Props) => {
     setLoading(true);
 
     let customHeaders;
+
     try {
       customHeaders = jobOptions.custom_headers ?
         JSON.parse(jobOptions.custom_headers)
@@ -88,21 +82,14 @@ export const JobSubmitter = ({ stateProps }: Props) => {
       return;
     }
 
-    fetch(`${Constants.DOMAIN}/api/submit-scrape-job`, {
-      method: "POST",
-      headers: { "content-type": "application/json" },
-      body: JSON.stringify({
-        url: submittedURL,
-        elements: rows,
-        user: user?.email,
-        time_created: new Date().toISOString(),
-        job_options: {
-          ...jobOptions,
-          custom_headers: customHeaders,
-        },
-      }),
-    })
-      .then((response) => {
+    await ApiService.submitJob(
+      submittedURL,
+      rows,
+      user,
+      jobOptions,
+      customHeaders
+    )
+      .then(async (response) => {
         if (!response.ok) {
           return response.json().then((error) => {
             throw new Error(error.error);
           });
         }
@@ -126,26 +113,15 @@
       .finally(() => setLoading(false));
   };
 
+  // Parse the job options from the query string
   useEffect(() => {
     if (job_options) {
-      const jsonOptions = JSON.parse(job_options as string);
-      const newJobOptions: JobOptions = {
-        multi_page_scrape: false,
-        custom_headers: null,
-      };
-
-      if (
-        jsonOptions.custom_headers &&
-        Object.keys(jsonOptions.custom_headers).length
-      ) {
-        setCustomJSONSelected(true);
-        newJobOptions.custom_headers = JSON.stringify(
-          jsonOptions.custom_headers
-        );
-      }
-
-      newJobOptions.multi_page_scrape = jsonOptions.multi_page_scrape;
-      setJobOptions(newJobOptions);
+      parseJobOptions(
+        job_options as string,
+        setCustomJSONSelected,
+        setProxiesSelected,
+        setJobOptions
+      );
     }
   }, [job_options]);
 
@@ -165,6 +141,8 @@
             setJobOptions={setJobOptions}
             customJSONSelected={customJSONSelected}
             setCustomJSONSelected={setCustomJSONSelected}
+            handleSelectProxies={handleSelectProxies}
+            proxiesSelected={proxiesSelected}
           />
diff --git a/src/lib/helpers/index.ts b/src/lib/helpers/index.ts
new file mode 100644
index 0000000..3898b27
--- /dev/null
+++ b/src/lib/helpers/index.ts
@@ -0,0 +1,2 @@
+export * from "./parse-job-options";
+export * from "./validate-url";
diff --git a/src/lib/helpers/parse-job-options.ts b/src/lib/helpers/parse-job-options.ts
new file mode 100644
index 0000000..e5c22bc
--- /dev/null
+++ b/src/lib/helpers/parse-job-options.ts
@@ -0,0 +1,36 @@
+import { Dispatch, SetStateAction } from "react";
+
+import { RawJobOptions } from "@/types";
+
+export const parseJobOptions = (
+  job_options: string,
+  setCustomJSONSelected: Dispatch<SetStateAction<boolean>>,
+  setProxiesSelected: Dispatch<SetStateAction<boolean>>,
+  setJobOptions: Dispatch<SetStateAction<RawJobOptions>>
+) => {
+  if (job_options) {
+    const jsonOptions = JSON.parse(job_options as string);
+    const newJobOptions: RawJobOptions = {
+      multi_page_scrape: false,
+      custom_headers: null,
+      proxies: null,
+    };
+
+    if (
+      jsonOptions.custom_headers &&
+      Object.keys(jsonOptions.custom_headers).length
+    ) {
+      setCustomJSONSelected(true);
+      newJobOptions.custom_headers = JSON.stringify(jsonOptions.custom_headers);
+    }
+
+    newJobOptions.multi_page_scrape = jsonOptions.multi_page_scrape;
+
+    if (jsonOptions.proxies) {
+      setProxiesSelected(true);
+      newJobOptions.proxies = jsonOptions.proxies.join(",");
+    }
+
+    setJobOptions(newJobOptions);
+  }
+};
diff --git a/src/lib/helpers/validate-url.ts b/src/lib/helpers/validate-url.ts
new file mode 100644
index 0000000..f0c5007
--- /dev/null
+++ b/src/lib/helpers/validate-url.ts
@@ -0,0 +1,8 @@
+export function validateURL(url: string): boolean {
+  try {
+    new URL(url);
+    return true;
+  } catch (_) {
+    return false;
+  }
+}
diff --git a/src/lib/index.ts b/src/lib/index.ts
index e8a6cf9..e248e9d 100644
--- a/src/lib/index.ts
+++ b/src/lib/index.ts
@@ -1,2 +1,3 @@
 export * from "./constants";
 export * from "./utils";
+export * from "./helpers";
diff --git a/src/services/api-service/api-service.ts b/src/services/api-service/api-service.ts
new file mode 100644
index 0000000..f4aa55d
--- /dev/null
+++ b/src/services/api-service/api-service.ts
@@ -0,0 +1,5 @@
+import * as functions from "./functions";
+
+export const ApiService = {
+  ...functions,
+};
diff --git a/src/services/api-service/functions/index.ts b/src/services/api-service/functions/index.ts
new file mode 100644
index 0000000..8d20937
--- /dev/null
+++ b/src/services/api-service/functions/index.ts
@@ -0,0 +1 @@
+export * from "./submit-job";
diff --git a/src/services/api-service/functions/submit-job.ts b/src/services/api-service/functions/submit-job.ts
new file mode 100644
index 0000000..9e82a71
--- /dev/null
+++ b/src/services/api-service/functions/submit-job.ts
@@ -0,0 +1,25 @@
+import { Constants } from "@/lib";
+
+export const submitJob = async (
+  submittedURL: string,
+  rows: any[],
+  user: any,
+  jobOptions: any,
+  customHeaders: any
+) => {
+  return await fetch(`${Constants.DOMAIN}/api/submit-scrape-job`, {
+    method: "POST",
+    headers: { "content-type": "application/json" },
+    body: JSON.stringify({
+      url: submittedURL,
+      elements: rows,
+      user: user?.email,
+      time_created: new Date().toISOString(),
+      job_options: {
+        ...jobOptions,
+        custom_headers: customHeaders,
+        proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [],
+      },
+    }),
+  });
+};
diff --git a/src/services/api-service/index.ts b/src/services/api-service/index.ts
new file mode 100644
index 0000000..1fcaddd
--- /dev/null
+++ b/src/services/api-service/index.ts
@@ -0,0 +1 @@
+export * from "./api-service";
diff --git a/src/services/index.ts b/src/services/index.ts
new file mode 100644
index 0000000..1fcaddd
--- /dev/null
+++ b/src/services/index.ts
@@ -0,0 +1 @@
+export * from "./api-service";
diff --git a/src/types/job.ts b/src/types/job.ts
index 062bf32..93bc381 100644
--- a/src/types/job.ts
+++ b/src/types/job.ts
@@ -15,4 +15,11 @@ export interface Job {
 export type JobOptions = {
   multi_page_scrape: boolean;
   custom_headers: null | string;
+  proxies: string[];
+};
+
+export type RawJobOptions = {
+  multi_page_scrape: boolean;
+  custom_headers: string | null;
+  proxies: string | null;
 };
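
Taken together, the backend half of this diff threads an optional proxies list from JobOptions through scrape() and make_site_request() into create_driver(), which picks one entry at random and hands it to Selenium Wire via its proxy option, while the frontend collects the proxies as a comma-separated string and splits it in submitJob. The sketch below is a rough, illustrative way to exercise the new option from Python, mirroring the positional call in job_worker.process_job; the URL, element, and proxy address are placeholder values, not anything defined by this change.

# Rough usage sketch (assumed usage, not part of this diff): run a one-off
# scrape with the new proxies option, mirroring job_worker.process_job.
import asyncio

from api.backend.models import Element, JobOptions
from api.backend.scraping import scrape

job_options = JobOptions(
    multi_page_scrape=False,
    custom_headers={},
    proxies=["127.0.0.1:8080"],  # placeholder host:port, not a real proxy
)

async def main() -> None:
    results = await scrape(
        "https://example.com",                  # target URL (placeholder)
        [Element(name="title", xpath="//h1")],  # elements to capture (placeholder)
        job_options.custom_headers,             # custom request headers
        job_options.multi_page_scrape,          # multi-page scrape toggle
        job_options.proxies,                    # new: proxy pool handed to create_driver()
    )
    print(results)

if __name__ == "__main__":
    asyncio.run(main())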