diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index 1db5a20..78538b8 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -7,7 +7,7 @@ on:
 
 jobs:
   build:
-    if: ${{ github.event.workflow_run.conclusion == 'success' && github.ref == 'refs/heads/master' && github.event_name != 'pull_request' }}
+    if: ${{ github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.head_branch == 'master' }}
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index c824f72..80baa93 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -15,6 +15,9 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4
 
+      - name: Set env
+        run: echo "ENV=test" >> $GITHUB_ENV
+
       - name: Install pdm
         run: pip install pdm
 
diff --git a/.gitignore b/.gitignore
index 4eec8bd..c193d81 100644
--- a/.gitignore
+++ b/.gitignore
@@ -187,3 +187,4 @@ cython_debug/
 postgres_data
 .vscode
 ollama
+data
\ No newline at end of file
diff --git a/api/backend/app.py b/api/backend/app.py
index 9f130b1..d0ced1b 100644
--- a/api/backend/app.py
+++ b/api/backend/app.py
@@ -13,6 +13,7 @@
 from api.backend.routers.job_router import job_router
 from api.backend.routers.log_router import log_router
 from api.backend.routers.stats_router import stats_router
+from api.backend.database.startup import init_database
 
 log_level = os.getenv("LOG_LEVEL")
 LOG_LEVEL = get_log_level(log_level)
@@ -41,3 +42,10 @@
 app.include_router(job_router)
 app.include_router(log_router)
 app.include_router(stats_router)
+
+
+@app.on_event("startup")
+async def startup_event():
+    if os.getenv("ENV") != "test":
+        init_database()
+    LOG.info("Starting up...")
diff --git a/api/backend/auth/auth_router.py b/api/backend/auth/auth_router.py
index ae3ab45..164a82f 100644
--- a/api/backend/auth/auth_router.py
+++ b/api/backend/auth/auth_router.py
@@ -7,7 +7,6 @@
 
 # LOCAL
 from api.backend.schemas import User, Token, UserCreate
-from api.backend.database import get_user_collection
 from api.backend.auth.auth_utils import (
     ACCESS_TOKEN_EXPIRE_MINUTES,
     get_current_user,
@@ -16,6 +15,8 @@
     create_access_token,
 )
 
+from api.backend.database.common import update
+
 auth_router = APIRouter()
 
 
@@ -43,12 +44,13 @@ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends(
 
 @auth_router.post("/auth/signup", response_model=User)
 async def create_user(user: UserCreate):
-    users_collection = get_user_collection()
     hashed_password = get_password_hash(user.password)
     user_dict = user.model_dump()
     user_dict["hashed_password"] = hashed_password
     del user_dict["password"]
-    _ = await users_collection.insert_one(user_dict)
+
+    query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)"
+    _ = update(query, (user_dict["email"], hashed_password, user_dict["full_name"]))
 
     return user_dict
 
diff --git a/api/backend/auth/auth_utils.py b/api/backend/auth/auth_utils.py
index d57c392..9cc6bba 100644
--- a/api/backend/auth/auth_utils.py
+++ b/api/backend/auth/auth_utils.py
@@ -15,7 +15,8 @@
 
 # LOCAL
 from api.backend.schemas import User, UserInDB, TokenData
-from api.backend.database import get_user_collection
+
+from api.backend.database.common import query
 
 LOG = logging.getLogger(__name__)
 
@@ -40,8 +41,9 @@ def get_password_hash(password: str):
 
 
 async def get_user(email: str):
-    user_collection = get_user_collection()
-    user = await user_collection.find_one({"email": email})
+    user_query = "SELECT * FROM users WHERE email = ?"
+    results = query(user_query, (email,))
+    user = results[0] if results else None
 
     if not user:
         return
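The `ENV=test` variable exported in the unit-tests workflow is what the new startup hook in `app.py` checks before calling `init_database()`, so CI never touches the on-disk database. The auth endpoints now run plain parameterized SQL; indexing the query result before the `None` check would raise `IndexError` for unknown users, hence the guarded lookup above. A minimal sketch of the signup/lookup pair, using an in-memory database so it runs standalone (table layout copied from `INIT_QUERY` later in this diff):

```python
import sqlite3

connection = sqlite3.connect(":memory:")
connection.row_factory = sqlite3.Row

# Table layout mirrors INIT_QUERY from schema.py in this diff.
connection.execute(
    """
    CREATE TABLE IF NOT EXISTS users (
        email STRING PRIMARY KEY NOT NULL,
        hashed_password STRING NOT NULL,
        full_name STRING,
        disabled BOOLEAN
    )
    """
)

# Parameterized INSERT, as in create_user(); "?" placeholders keep user
# input out of the SQL string.
connection.execute(
    "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)",
    ("user@example.com", "<bcrypt-hash>", "Example User"),
)

# Lookup, as in get_user(); an empty result list means "no such user".
rows = connection.execute(
    "SELECT * FROM users WHERE email = ?", ("user@example.com",)
).fetchall()
user = dict(rows[0]) if rows else None
print(user)
```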
diff --git a/api/backend/constants.py b/api/backend/constants.py
new file mode 100644
index 0000000..33c1b30
--- /dev/null
+++ b/api/backend/constants.py
@@ -0,0 +1 @@
+DATABASE_PATH = "data/database.db"
diff --git a/api/backend/database.py b/api/backend/database.py
deleted file mode 100644
index 4af96ea..0000000
--- a/api/backend/database.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# STL
-import os
-from typing import Any
-
-# PDM
-from dotenv import load_dotenv
-from motor.motor_asyncio import AsyncIOMotorClient
-
-_ = load_dotenv()
-
-MONGODB_URI = os.getenv("MONGODB_URI")
-
-
-def get_user_collection():
-    client: AsyncIOMotorClient[dict[str, Any]] = AsyncIOMotorClient(MONGODB_URI)
-    db = client["scrape"]
-    return db["users"]
-
-
-def get_job_collection():
-    client: AsyncIOMotorClient[dict[str, Any]] = AsyncIOMotorClient(MONGODB_URI)
-    db = client["scrape"]
-    return db["jobs"]
diff --git a/api/backend/database/__init__.py b/api/backend/database/__init__.py
new file mode 100644
index 0000000..796c08f
--- /dev/null
+++ b/api/backend/database/__init__.py
@@ -0,0 +1,3 @@
+from .common import insert, QUERIES, update
+
+__all__ = ["insert", "QUERIES", "update"]
diff --git a/api/backend/database/common.py b/api/backend/database/common.py
new file mode 100644
index 0000000..8fcfff3
--- /dev/null
+++ b/api/backend/database/common.py
@@ -0,0 +1,92 @@
+import sqlite3
+from typing import Any, Optional
+from api.backend.constants import DATABASE_PATH
+from api.backend.utils import format_json, format_sql_row_to_python
+from api.backend.database.schema import INIT_QUERY
+from api.backend.database.queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
+import logging
+
+LOG = logging.getLogger(__name__)
+
+
+def connect():
+    connection = sqlite3.connect(DATABASE_PATH)
+    connection.set_trace_callback(print)
+    cursor = connection.cursor()
+    return cursor
+
+
+def insert(query: str, values: tuple[Any, ...]):
+    connection = sqlite3.connect(DATABASE_PATH)
+    cursor = connection.cursor()
+    copy = list(values)
+    format_json(copy)
+
+    try:
+        _ = cursor.execute(query, copy)
+        connection.commit()
+    except sqlite3.Error as e:
+        LOG.error(f"An error occurred: {e}")
+    finally:
+        cursor.close()
+        connection.close()
+
+
+def query(query: str, values: Optional[tuple[Any, ...]] = None):
+    connection = sqlite3.connect(DATABASE_PATH)
+    connection.row_factory = sqlite3.Row
+    cursor = connection.cursor()
+    rows = []
+    try:
+        if values:
+            _ = cursor.execute(query, values)
+        else:
+            _ = cursor.execute(query)
+
+        rows = cursor.fetchall()
+
+    finally:
+        cursor.close()
+        connection.close()
+
+    formatted_rows: list[dict[str, Any]] = []
+
+    for row in rows:
+        row = dict(row)
+        formatted_row = format_sql_row_to_python(row)
+        formatted_rows.append(formatted_row)
+
+    return formatted_rows
+
+
+def update(query: str, values: Optional[tuple[Any, ...]] = None):
+    connection = sqlite3.connect(DATABASE_PATH)
+    cursor = connection.cursor()
+
+    copy = None
+
+    if values:
+        copy = list(values)
+        format_json(copy)
+
+    try:
+        if copy:
+            res = cursor.execute(query, copy)
+        else:
+            res = cursor.execute(query)
+        connection.commit()
+        return res.rowcount
+    except sqlite3.Error as e:
+        LOG.error(f"An error occurred: {e}")
+    finally:
+        cursor.close()
+        connection.close()
+
+    return 0
+
+
+QUERIES = {
+    "init": INIT_QUERY,
+    "insert_job": JOB_INSERT_QUERY,
+    "delete_job": DELETE_JOB_QUERY,
+}
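Each helper in `common.py` opens its own connection, commits, and closes, so callers never share connection state. Two behaviors worth noting: `insert` and `update` log and swallow `sqlite3.Error`, so a failed write surfaces only as a `None`/`0` return, and `connect()` installs a trace callback that prints every SQL statement to stdout. The expected call pattern, assuming the module layout introduced in this diff (the `data/` directory behind `DATABASE_PATH` must already exist, which the compose volume below provides):

```python
from api.backend.database.common import insert, query, update

# One-shot write; dict/list values are serialized to JSON strings by
# format_json before binding.
insert(
    "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)",
    ("user@example.com", "<hash>", "Example User"),
)

# Reads come back as a list of dicts, with JSON columns already decoded.
rows = query("SELECT * FROM users WHERE email = ?", ("user@example.com",))
print(rows)

# Writes report their rowcount, or 0 when a sqlite3.Error was swallowed.
affected = update(
    "UPDATE users SET disabled = ? WHERE email = ?", (True, "user@example.com")
)
print(affected)
```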
diff --git a/api/backend/database/queries/__init__.py b/api/backend/database/queries/__init__.py
new file mode 100644
index 0000000..eecb5e7
--- /dev/null
+++ b/api/backend/database/queries/__init__.py
@@ -0,0 +1,3 @@
+from .queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
+
+__all__ = ["JOB_INSERT_QUERY", "DELETE_JOB_QUERY"]
diff --git a/api/backend/database/queries/queries.py b/api/backend/database/queries/queries.py
new file mode 100644
index 0000000..0ae2e84
--- /dev/null
+++ b/api/backend/database/queries/queries.py
@@ -0,0 +1,9 @@
+JOB_INSERT_QUERY = """
+INSERT INTO jobs
+(id, url, elements, user, time_created, result, status, chat, job_options)
+VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+"""
+
+DELETE_JOB_QUERY = """
+DELETE FROM jobs WHERE id IN ()
+"""
diff --git a/api/backend/database/schema/__init__.py b/api/backend/database/schema/__init__.py
new file mode 100644
index 0000000..e0c282a
--- /dev/null
+++ b/api/backend/database/schema/__init__.py
@@ -0,0 +1,3 @@
+from .schema import INIT_QUERY
+
+__all__ = ["INIT_QUERY"]
diff --git a/api/backend/database/schema/schema.py b/api/backend/database/schema/schema.py
new file mode 100644
index 0000000..c173905
--- /dev/null
+++ b/api/backend/database/schema/schema.py
@@ -0,0 +1,20 @@
+INIT_QUERY = """
+CREATE TABLE IF NOT EXISTS jobs (
+    id STRING PRIMARY KEY NOT NULL,
+    url STRING NOT NULL,
+    elements JSON NOT NULL,
+    user STRING,
+    time_created DATETIME NOT NULL,
+    result JSON NOT NULL,
+    status STRING NOT NULL,
+    chat JSON,
+    job_options JSON
+);
+
+CREATE TABLE IF NOT EXISTS users (
+    email STRING PRIMARY KEY NOT NULL,
+    hashed_password STRING NOT NULL,
+    full_name STRING,
+    disabled BOOLEAN
+);
+"""
diff --git a/api/backend/database/startup.py b/api/backend/database/startup.py
new file mode 100644
index 0000000..4e34fde
--- /dev/null
+++ b/api/backend/database/startup.py
@@ -0,0 +1,15 @@
+from api.backend.database.common import connect, QUERIES
+import logging
+
+LOG = logging.getLogger(__name__)
+
+
+def init_database():
+    cursor = connect()
+
+    for query in QUERIES["init"].strip().split(";"):
+        if query.strip():
+            LOG.info(f"Executing query: {query}")
+            _ = cursor.execute(query)
+
+    cursor.close()
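Two SQLite quirks in this batch: `STRING` is not one of SQLite's recognized type names, so those columns get NUMERIC affinity rather than TEXT under the affinity rules, and `DELETE_JOB_QUERY` ships with an empty `IN ()` that is never executable as written; `delete_jobs()` below builds its own `IN` clause instead. `init_database()` splits `INIT_QUERY` on `;` and executes each statement by hand, which `sqlite3.Connection.executescript` does natively. An equivalent one-call sketch, assuming the modules from this diff:

```python
import sqlite3

from api.backend.constants import DATABASE_PATH
from api.backend.database.schema import INIT_QUERY

# Equivalent to the split-on-";" loop in startup.py: executescript handles
# statement splitting and commits the whole DDL batch.
connection = sqlite3.connect(DATABASE_PATH)
connection.executescript(INIT_QUERY)
connection.close()
```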
diff --git a/api/backend/job/__init__.py b/api/backend/job/__init__.py
index 2dfc67e..d5b8989 100644
--- a/api/backend/job/__init__.py
+++ b/api/backend/job/__init__.py
@@ -1,5 +1,4 @@
 from .job import (
-    query,
     insert,
     update_job,
     delete_jobs,
@@ -9,7 +8,6 @@
 )
 
 __all__ = [
-    "query",
     "insert",
     "update_job",
     "delete_jobs",
diff --git a/api/backend/job/job.py b/api/backend/job/job.py
index 1688d0f..cb2cc98 100644
--- a/api/backend/job/job.py
+++ b/api/backend/job/job.py
@@ -1,161 +1,97 @@
 # STL
 import logging
-from typing import Any, Optional
-
-# PDM
-from pymongo import DESCENDING
+from typing import Any
 
 # LOCAL
-from api.backend.database import get_job_collection
-from api.backend.job.models.job_options import FetchOptions
+from api.backend.utils import format_list_for_query
+from api.backend.database.common import (
+    insert as common_insert,
+    query as common_query,
+    QUERIES,
+    update as common_update,
+)
 
 LOG = logging.getLogger(__name__)
 
 
 async def insert(item: dict[str, Any]) -> None:
-    collection = get_job_collection()
-    i = await collection.insert_one(item)
-    LOG.info(f"Inserted item: {i}")
+    common_insert(
+        QUERIES["insert_job"],
+        (
+            item["id"],
+            item["url"],
+            item["elements"],
+            item["user"],
+            item["time_created"],
+            item["result"],
+            item["status"],
+            item["chat"],
+            item["job_options"],
+        ),
+    )
+    LOG.info(f"Inserted item: {item}")
 
 
 async def get_queued_job():
-    collection = get_job_collection()
-    return await collection.find_one(
-        {"status": "Queued"}, sort=[("created_at", DESCENDING)]
+    query = (
+        "SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1"
     )
-
-
-async def query(
-    filter: dict[str, Any], fetch_options: Optional[FetchOptions] = None
-) -> list[dict[str, Any]]:
-    collection = get_job_collection()
-    cursor = collection.find(filter)
-    results: list[dict[str, Any]] = []
-
-    async for document in cursor:
-        del document["_id"]
-
-        if fetch_options and not fetch_options.chat and document.get("chat"):
-            del document["chat"]
-
-        results.append(document)
-
-    return results
+    res = common_query(query)
+    LOG.info(f"Got queued job: {res}")
+    return res[0] if res else None
 
 
 async def update_job(ids: list[str], field: str, value: Any):
-    collection = get_job_collection()
-    for id in ids:
-        _ = await collection.update_one(
-            {"id": id},
-            {"$set": {field: value}},
-        )
+    query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}"
+    res = common_update(query, tuple([value] + ids))
+    LOG.info(f"Updated job: {res}")
 
 
 async def delete_jobs(jobs: list[str]):
-    collection = get_job_collection()
-    result = await collection.delete_many({"id": {"$in": jobs}})
-    LOG.info(f"{result.deleted_count} documents deleted")
+    if not jobs:
+        LOG.info("No jobs to delete.")
+        return False
+
+    query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}"
+    res = common_update(query, tuple(jobs))
 
-    return True if result.deleted_count > 0 else False
+    return res > 0
 
 
 async def average_elements_per_link(user: str):
-    collection = get_job_collection()
-    pipeline = [
-        {"$match": {"status": "Completed", "user": user}},
-        {
-            "$addFields": {
-                "time_created_date": {
-                    "$cond": {
-                        "if": {"$eq": [{"$type": "$time_created"}, "date"]},
-                        "then": "$time_created",
-                        "else": {
-                            "$convert": {
-                                "input": "$time_created",
-                                "to": "date",
-                                "onError": None,
-                                "onNull": None,
-                            }
-                        },
-                    }
-                }
-            }
-        },
-        {
-            "$project": {
-                "date": {
-                    "$dateToString": {
-                        "format": "%Y-%m-%d",
-                        "date": "$time_created_date",
-                    }
-                },
-                "num_elements": {"$size": "$elements"},
-            }
-        },
-        {
-            "$group": {
-                "_id": "$date",
-                "average_elements": {"$avg": "$num_elements"},
-                "count": {"$sum": 1},
-            }
-        },
-        {"$sort": {"_id": 1}},
-    ]
-    cursor = collection.aggregate(pipeline)
-    results: list[dict[str, Any]] = []
-
-    async for document in cursor:
-        results.append(
-            {
-                "date": document["_id"],
-                "average_elements": document["average_elements"],
-                "count": document["count"],
-            }
-        )
+    job_query = """
+    SELECT
+        DATE(time_created) AS date,
+        AVG(json_array_length(elements)) AS average_elements,
+        COUNT(*) AS count
+    FROM
+        jobs
+    WHERE
+        status = 'Completed' AND user = ?
+    GROUP BY
+        DATE(time_created)
+    ORDER BY
+        date ASC;
+    """
+    results = common_query(job_query, (user,))
 
     return results
 
 
 async def get_jobs_per_day(user: str):
-    collection = get_job_collection()
-    pipeline = [
-        {"$match": {"status": "Completed", "user": user}},
-        {
-            "$addFields": {
-                "time_created_date": {
-                    "$cond": {
-                        "if": {"$eq": [{"$type": "$time_created"}, "date"]},
-                        "then": "$time_created",
-                        "else": {
-                            "$convert": {
-                                "input": "$time_created",
-                                "to": "date",
-                                "onError": None,
-                                "onNull": None,
-                            }
-                        },
-                    }
-                }
-            }
-        },
-        {
-            "$project": {
-                "date": {
-                    "$dateToString": {
-                        "format": "%Y-%m-%d",
-                        "date": "$time_created_date",
-                    }
-                }
-            }
-        },
-        {"$group": {"_id": "$date", "job_count": {"$sum": 1}}},
-        {"$sort": {"_id": 1}},
-    ]
-    cursor = collection.aggregate(pipeline)
-
-    results: list[dict[str, Any]] = []
-    async for document in cursor:
-        results.append({"date": document["_id"], "job_count": document["job_count"]})
+    job_query = """
+    SELECT
+        DATE(time_created) AS date,
+        COUNT(*) AS job_count
+    FROM
+        jobs
+    WHERE
+        status = 'Completed' AND user = ?
+    GROUP BY
+        DATE(time_created)
+    ORDER BY
+        date ASC;
+    """
+    results = common_query(job_query, (user,))
 
     return results
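The Mongo aggregation pipelines collapse into plain `GROUP BY` queries, but `update_job` interpolates the column name into the SQL string with an f-string; `?` placeholders can bind values, never identifiers, so `field` must not come from user input. A hedged sketch of the same function with a column whitelist (the `ALLOWED_FIELDS` set is hypothetical, not part of this diff):

```python
from typing import Any

from api.backend.utils import format_list_for_query
from api.backend.database.common import update as common_update

# Hypothetical whitelist; placeholders cannot bind column names, so the
# f-string below is only safe for known identifiers.
ALLOWED_FIELDS = {"status", "result", "chat"}


async def update_job(ids: list[str], field: str, value: Any) -> None:
    if field not in ALLOWED_FIELDS:
        raise ValueError(f"refusing to update column: {field}")

    query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}"
    common_update(query, tuple([value] + ids))
```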
diff --git a/api/backend/job/models/__init__.py b/api/backend/job/models/__init__.py
index e69de29..95afebc 100644
--- a/api/backend/job/models/__init__.py
+++ b/api/backend/job/models/__init__.py
@@ -0,0 +1,3 @@
+from .job_options import JobOptions
+
+__all__ = ["JobOptions"]
diff --git a/api/backend/job/site_mapping/site_mapping.py b/api/backend/job/site_mapping/site_mapping.py
index 94ff4b1..d1bd22d 100644
--- a/api/backend/job/site_mapping/site_mapping.py
+++ b/api/backend/job/site_mapping/site_mapping.py
@@ -77,7 +77,6 @@ async def handle_site_mapping(
     pages: set[tuple[str, str]],
 ):
     site_map = SiteMap(**site_map_dict)
-    LOG.info(f"Handling site map: {site_map}")
 
     for action in site_map.actions:
         action_handler = ACTION_MAP[action.type]
diff --git a/api/backend/models.py b/api/backend/models.py
index b52a704..f58bba7 100644
--- a/api/backend/models.py
+++ b/api/backend/models.py
@@ -9,9 +9,6 @@
 
 import pydantic
 
-
-
-
 class Element(pydantic.BaseModel):
     name: str
     xpath: str
diff --git a/api/backend/routers/job_router.py b/api/backend/routers/job_router.py
index 2c12ac9..e3546f5 100644
--- a/api/backend/routers/job_router.py
+++ b/api/backend/routers/job_router.py
@@ -12,7 +12,7 @@
 from fastapi.responses import JSONResponse, StreamingResponse
 
 # LOCAL
-from api.backend.job import query, insert, update_job, delete_jobs
+from api.backend.job import insert, update_job, delete_jobs
 from api.backend.models import (
     UpdateJobs,
     DownloadJob,
@@ -21,9 +21,11 @@
 )
 from api.backend.schemas import User
 from api.backend.auth.auth_utils import get_current_user
-from api.backend.utils import clean_text
+from api.backend.utils import clean_text, format_list_for_query
 from api.backend.job.models.job_options import FetchOptions
 
+from api.backend.database.common import query
+
 LOG = logging.getLogger(__name__)
 
 job_router = APIRouter()
@@ -46,6 +48,7 @@ async def submit_scrape_job(job: Job):
 
         return JSONResponse(content={"id": job.id})
     except Exception as e:
+        LOG.error(f"Exception occurred: {traceback.format_exc()}")
         return JSONResponse(content={"error": str(e)}, status_code=500)
 
 
@@ -54,8 +57,11 @@ async def retrieve_scrape_jobs(
     fetch_options: FetchOptions, user: User = Depends(get_current_user)
 ):
     LOG.info(f"Retrieving jobs for account: {user.email}")
+    ATTRIBUTES = "chat" if fetch_options.chat else "*"
+
     try:
-        results = await query({"user": user.email}, fetch_options=fetch_options)
+        job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?"
+        results = query(job_query, (user.email,))
         return JSONResponse(content=jsonable_encoder(results[::-1]))
     except Exception as e:
         LOG.error(f"Exception occurred: {e}")
@@ -67,8 +73,8 @@ async def job(id: str, user: User = Depends(get_current_user)):
     LOG.info(f"Retrieving jobs for account: {user.email}")
 
     try:
-        filter = {"user": user.email, "id": id}
-        results = await query(filter)
+        job_query = "SELECT * FROM jobs WHERE user = ? AND id = ?"
+        results = query(job_query, (user.email, id))
         return JSONResponse(content=jsonable_encoder(results))
     except Exception as e:
         LOG.error(f"Exception occurred: {e}")
@@ -80,7 +86,10 @@ async def download(download_job: DownloadJob):
     LOG.info(f"Downloading job with ids: {download_job.ids}")
 
     try:
-        results = await query({"id": {"$in": download_job.ids}})
+        job_query = (
+            f"SELECT * FROM jobs WHERE id IN {format_list_for_query(download_job.ids)}"
+        )
+        results = query(job_query, tuple(download_job.ids))
 
         csv_buffer = StringIO()
         csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
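The router now builds its SELECTs inline; `format_list_for_query` (defined in `utils.py` below) expands an id list into one `?` per element, so the ids themselves still travel as bind parameters rather than being spliced into the SQL:

```python
from api.backend.utils import format_list_for_query

ids = ["a1", "b2", "c3"]
job_query = f"SELECT * FROM jobs WHERE id IN {format_list_for_query(ids)}"
print(job_query)  # SELECT * FROM jobs WHERE id IN (?,?,?)
# results = query(job_query, tuple(ids))  # ids bound separately, as in download()
```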
await query({"user": user.email}, fetch_options=fetch_options) + job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?" + results = query(job_query, (user.email,)) return JSONResponse(content=jsonable_encoder(results[::-1])) except Exception as e: LOG.error(f"Exception occurred: {e}") @@ -67,8 +73,8 @@ async def job(id: str, user: User = Depends(get_current_user)): LOG.info(f"Retrieving jobs for account: {user.email}") try: - filter = {"user": user.email, "id": id} - results = await query(filter) + job_query = "SELECT * FROM jobs WHERE user = ? AND id = ?" + results = query(job_query, (user.email, id)) return JSONResponse(content=jsonable_encoder(results)) except Exception as e: LOG.error(f"Exception occurred: {e}") @@ -80,7 +86,10 @@ async def download(download_job: DownloadJob): LOG.info(f"Downloading job with ids: {download_job.ids}") try: - results = await query({"id": {"$in": download_job.ids}}) + job_query = ( + f"SELECT * FROM jobs WHERE id IN {format_list_for_query(download_job.ids)}" + ) + results = query(job_query, tuple(download_job.ids)) csv_buffer = StringIO() csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL) diff --git a/api/backend/tests/scraping/test_scraping.py b/api/backend/tests/scraping/test_scraping.py index 28cb9bf..78d5d51 100644 --- a/api/backend/tests/scraping/test_scraping.py +++ b/api/backend/tests/scraping/test_scraping.py @@ -1,15 +1,10 @@ import pytest +import logging from unittest.mock import AsyncMock, patch, MagicMock -from api.backend.tests.factories.job_factory import create_job -from api.backend.models import JobOptions from api.backend.scraping import create_driver - -mocked_job = create_job( - job_options=JobOptions( - multi_page_scrape=False, custom_headers={}, proxies=["127.0.0.1:8080"] - ) -).model_dump() +logging.basicConfig(level=logging.DEBUG) +LOG = logging.getLogger(__name__) @pytest.mark.asyncio @@ -26,8 +21,7 @@ async def test_proxy(mock_get: AsyncMock): driver.get("http://example.com") response = driver.last_request - # Check if the proxy header is set correctly if response: - assert response.headers["Proxy"] == "127.0.0.1:8080" + assert response.headers["Proxy-Connection"] == "keep-alive" driver.quit() diff --git a/api/backend/utils.py b/api/backend/utils.py index 7d6c60e..19ae4cc 100644 --- a/api/backend/utils.py +++ b/api/backend/utils.py @@ -1,5 +1,8 @@ -from typing import Optional +from typing import Any, Optional import logging +import json + +LOG = logging.getLogger(__name__) def clean_text(text: str): @@ -17,3 +20,30 @@ def get_log_level(level_name: Optional[str]) -> int: level = getattr(logging, level_name, logging.INFO) return level + + +def format_list_for_query(ids: list[str]): + return ( + f"({','.join(['?' 
diff --git a/docker-compose.yml b/docker-compose.yml
index 35e7c14..4fe4f46 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -23,7 +23,6 @@ services:
       dockerfile: docker/api/Dockerfile
     environment:
       - LOG_LEVEL=INFO
-      - MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB
      - SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string)
       - ALGORITHM=HS256 # authentication encoding algorithm
       - ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token expire minutes
@@ -31,17 +30,9 @@ ports:
       - 8000:8000
     volumes:
+      - "$PWD/data:/project/data"
      - /var/run/docker.sock:/var/run/docker.sock
     networks:
       - web
-  mongo:
-    container_name: webscrape-mongo
-    image: mongo
-    restart: always
-    environment:
-      MONGO_INITDB_ROOT_USERNAME: root
-      MONGO_INITDB_ROOT_PASSWORD: example
-    networks:
-      - web
 
 networks:
   web:
diff --git a/docker/frontend/Dockerfile b/docker/frontend/Dockerfile
index 8e07482..a719918 100644
--- a/docker/frontend/Dockerfile
+++ b/docker/frontend/Dockerfile
@@ -1,5 +1,5 @@
 # Build next dependencies
-FROM node:latest
+FROM node:23.1
 
 WORKDIR /app
 COPY package*.json ./
@@ -15,6 +15,4 @@
 COPY src /app/src
 
 RUN npm run build
-EXPOSE 3000
-
-# CMD [ "npm", "run" ]
+EXPOSE 3000
\ No newline at end of file
diff --git a/src/components/submit/job-submitter/site-map/site-map-input/site-map-input.module.css b/src/components/submit/job-submitter/site-map/site-map-input/site-map-input.module.css
index b83bdc8..9da74f8 100644
--- a/src/components/submit/job-submitter/site-map/site-map-input/site-map-input.module.css
+++ b/src/components/submit/job-submitter/site-map/site-map-input/site-map-input.module.css
@@ -7,9 +7,10 @@
   border-radius: 0.375rem;
   transition: transform 0.2s ease-in-out;
   transform: scale(1);
-  &:hover {
-    transform: scale(1.05);
-  }
+}
+
+.button:hover {
+  transform: scale(1.05);
 }
 
 .remove {
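With the Mongo service gone, persistence is just the bind-mounted `data/` directory (newly git-ignored above). A quick smoke check, assuming the compose volume maps `$PWD/data` to the SQLite path the API uses:

```python
import sqlite3

# Run from the host after `docker compose up`: the bind mount means the
# container's database file appears at ./data/database.db.
connection = sqlite3.connect("data/database.db")
tables = connection.execute(
    "SELECT name FROM sqlite_master WHERE type = 'table'"
).fetchall()
print(tables)  # expect [('jobs',), ('users',)] once init_database() has run
connection.close()
```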