Refactor: Remove Proxy Dependency (#44)
jaypyles authored Nov 12, 2024
1 parent 1dfd3ca commit b3bf780
Showing 12 changed files with 77 additions and 164 deletions.
77 changes: 1 addition & 76 deletions README.md
@@ -13,7 +13,7 @@ Scraperr is a self-hosted web application that allows users to scrape data from

From the table, users can download an Excel sheet of the job's results, along with an option to rerun the job.

View the [docs](https://scraperr-docs.pages.dev).
View the [docs](https://scraperr-docs.pages.dev) for a quickstart guide and more information.

## Features

@@ -64,87 +64,12 @@ View the [docs](https://scraperr-docs.pages.dev).

![chat](https://github.com/jaypyles/www-scrape/blob/master/docs/chat_page.png)

## Installation

1. Clone the repository:

```sh
git clone https://github.com/jaypyles/scraperr.git
```

2. Set environment variables and labels in `docker-compose.yml`.

```yaml
scraperr:
  labels:
    - "traefik.enable=true"
    - "traefik.http.routers.scraperr.rule=Host(`localhost`)" # change this to your domain, if not running on localhost
    - "traefik.http.routers.scraperr.entrypoints=web" # websecure if using https
    - "traefik.http.services.scraperr.loadbalancer.server.port=3000"

scraperr_api:
  environment:
    - LOG_LEVEL=INFO
    - MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB
    - SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string)
    - ALGORITHM=HS256 # authentication encoding algorithm
    - ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token lifetime, in minutes
  labels:
    - "traefik.enable=true"
    - "traefik.http.routers.scraperr_api.rule=Host(`localhost`) && PathPrefix(`/api`)" # change this to your domain, if not running on localhost
    - "traefik.http.routers.scraperr_api.entrypoints=web" # websecure if using https
    - "traefik.http.middlewares.api-stripprefix.stripprefix.prefixes=/api"
    - "traefik.http.routers.scraperr_api.middlewares=api-stripprefix"
    - "traefik.http.services.scraperr_api.loadbalancer.server.port=8000"

mongo:
  environment:
    MONGO_INITDB_ROOT_USERNAME: root
    MONGO_INITDB_ROOT_PASSWORD: example
```
Don't want to use `traefik`? This configuration carries over to other reverse proxies, as long as the API is proxied to the `/api` path of the frontend's host (a standalone sketch of the equivalent routing follows the steps below). Scraperr cannot currently run without a reverse proxy, because `next.js` does not support runtime client-side environment variables.

3. Deploy:

```sh
make up
```

The app ships with its own `traefik` configuration to use out of the box, but it can just as easily sit behind any other reverse proxy, including one you already run.
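
For reference, the routing that the container labels above express can also be written as a standalone proxy config. Below is a minimal, hypothetical sketch using Traefik's file provider; the service names and ports are assumed from the compose file above, and it only shows the two routes any replacement proxy must reproduce:

```yaml
# Hypothetical Traefik dynamic config (file provider) mirroring the labels above.
http:
  routers:
    scraperr:
      rule: "Host(`localhost`)" # frontend: everything that is not /api
      entryPoints:
        - web
      service: scraperr
    scraperr_api:
      rule: "Host(`localhost`) && PathPrefix(`/api`)"
      entryPoints:
        - web
      middlewares:
        - api-stripprefix # the API does not expect the /api prefix itself
      service: scraperr_api
  middlewares:
    api-stripprefix:
      stripPrefix:
        prefixes:
          - "/api"
  services:
    scraperr:
      loadBalancer:
        servers:
          - url: "http://scraperr:3000"
    scraperr_api:
      loadBalancer:
        servers:
          - url: "http://scraperr_api:8000"
```

In nginx or Caddy terms, the same requirement reads: forward `/api/*` to port 8000 with the prefix stripped, and everything else to port 3000.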

## Usage

1. Open the application in your browser at `http://localhost`.
2. Enter the URL you want to scrape in the URL field.
3. Add elements to scrape by specifying a name and the corresponding XPath (for example, a `title` element with the XPath `//h1`).
4. Click the "Submit" button to queue the URL to be scraped.
5. View the queue in the "Previous Jobs" section.

## API Endpoints

Use this service as an API for your own projects. Because the API is built with FastAPI, interactive API docs are available at `/docs`.

![docs](https://github.com/jaypyles/www-scrape/blob/master/docs/docs_page.png)

## AI

Scraperr currently supports either a self-hosted Ollama instance or OpenAI's models with your own API key. Setup is as simple as setting either the Ollama URL or the OpenAI API key in the API service's environment variables in `docker-compose.yml`:

```yaml
scraperr_api:
  environment:
    - OLLAMA_URL=http://ollama:11434
    - OLLAMA_MODEL=llama3.1
    # or
    - OPENAI_KEY=<your_key>
    - OPENAI_MODEL=gpt-3.5-turbo
```

The model names are taken from the documentation of their respective providers.

## Troubleshooting

Q: When running Scraperr, I'm met with "404 Page not found".
2 changes: 1 addition & 1 deletion api/backend/app.py
@@ -25,7 +25,7 @@

LOG = logging.getLogger(__name__)

app = FastAPI(title="api")
app = FastAPI(title="api", root_path="/api")
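# root_path tells FastAPI that clients reach the app under the /api prefix,
# so the generated OpenAPI schema and /docs links resolve correctly.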

app.add_middleware(
CORSMiddleware,
8 changes: 0 additions & 8 deletions docker-compose.dev.yml
@@ -2,12 +2,6 @@ version: "3"
services:
  scraperr:
    command: ["npm", "run", "dev"]
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.scraperr.rule=Host(`localhost`)"
      - "traefik.http.routers.scraperr.entrypoints=web"
      - "traefik.http.services.scraperr.loadbalancer.server.port=3000"
      - "traefik.http.routers.scraperr.tls=false"
    volumes:
      - "$PWD/src:/app/src"
      - "$PWD/public:/app/public"
@@ -16,7 +10,5 @@ services:
- "$PWD/package-lock.json:/app/package-lock.json"
- "$PWD/tsconfig.json:/app/tsconfig.json"
scraperr_api:
ports:
- "8000:8000"
volumes:
- "$PWD/api:/project/api"
35 changes: 7 additions & 28 deletions docker-compose.yml
@@ -6,11 +6,11 @@ services:
      dockerfile: docker/frontend/Dockerfile
    container_name: scraperr
    command: ["npm", "run", "start"]
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.scraperr.rule=Host(`localhost`)" # change this to your domain, if not running on localhost
      - "traefik.http.routers.scraperr.entrypoints=web" # websecure if using https
      - "traefik.http.services.scraperr.loadbalancer.server.port=3000"
    environment:
      - NEXT_PUBLIC_API_URL=http://localhost:8000 # your API URL
      - SERVER_URL=http://scraperr_api:8000 # your docker container API URL
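      # NEXT_PUBLIC_* values are inlined into the browser bundle by next.js at
      # build time; SERVER_URL is read at runtime by server-side code that talks
      # to the API over the internal Docker network.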
    ports:
      - 80:3000
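      # with traefik removed, the frontend is published directly on host port 80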
    networks:
      - web
  scraperr_api:
@@ -21,36 +21,15 @@
      dockerfile: docker/api/Dockerfile
    environment:
      - LOG_LEVEL=INFO
      - OLLAMA_URL=http://ollama:11434
      - OLLAMA_MODEL=phi3
      - MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB
      - SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string)
      - ALGORITHM=HS256 # authentication encoding algorithm
      - ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token lifetime, in minutes
    container_name: scraperr_api
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.scraperr_api.rule=Host(`localhost`) && PathPrefix(`/api`)" # change this to your domain, if not running on localhost
      - "traefik.http.routers.scraperr_api.entrypoints=web" # websecure if using https
      - "traefik.http.middlewares.api-stripprefix.stripprefix.prefixes=/api"
      - "traefik.http.routers.scraperr_api.middlewares=api-stripprefix"
      - "traefik.http.services.scraperr_api.loadbalancer.server.port=8000"
    networks:
      - web
  traefik:
    image: traefik:latest
    container_name: traefik
    command:
      - "--providers.docker=true"
      - "--entrypoints.web.address=:80"
      - "--entrypoints.websecure.address=:443"
    ports:
      - 80:80
      - 443:443
      - 8000:8000
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /var/run/docker.sock:/var/run/docker.sock
    networks:
      - web
  mongo:
28 changes: 17 additions & 11 deletions src/components/jobs/JobTable.tsx
@@ -48,11 +48,14 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
  const router = useRouter();

  const handleDownload = async (ids: string[]) => {
    const response = await fetch(`${Constants.DOMAIN}/api/download`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ ids: ids }),
    });
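    // NEXT_PUBLIC_API_URL is inlined by next.js at build time, so the browser
    // bundle carries the API base URL directly rather than resolving it at runtime.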
    const response = await fetch(
      `${process.env.NEXT_PUBLIC_API_URL}/api/download`,
      {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ ids: ids }),
      }
    );

    if (response.ok) {
      const blob = await response.blob();
@@ -104,11 +107,14 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
  };

  const handleDeleteSelected = async () => {
    const response = await fetch(`${Constants.DOMAIN}/api/delete-scrape-jobs`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ ids: Array.from(selectedJobs) }),
    });
    const response = await fetch(
      `${process.env.NEXT_PUBLIC_API_URL}/api/delete-scrape-jobs`,
      {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ ids: Array.from(selectedJobs) }),
      }
    );

    if (response.ok) {
      setJobs((jobs) =>
@@ -142,7 +148,7 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
      value: value,
    };

    await fetch(`${Constants.DOMAIN}/api/update`, {
    await fetch(`${process.env.NEXT_PUBLIC_API_URL}/api/update`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
6 changes: 3 additions & 3 deletions src/contexts/AuthContext.tsx
@@ -25,7 +25,7 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => {
    const token = Cookies.get("token");
    if (token) {
      axios
        .get(`${Constants.DOMAIN}/api/auth/users/me`, {
        .get(`${process.env.NEXT_PUBLIC_API_URL}/api/auth/users/me`, {
          headers: { Authorization: `Bearer ${token}` },
        })
        .then((response) => {
@@ -43,7 +43,7 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => {
params.append("username", email);
params.append("password", password);
const response = await axios.post(
`${Constants.DOMAIN}/api/auth/token`,
`${process.env.NEXT_PUBLIC_API_URL}/api/auth/token`,
params
);
Cookies.set("token", response.data.access_token, {
@@ -54,7 +54,7 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => {
sameSite: "Lax",
});
const userResponse = await axios.get(
`${Constants.DOMAIN}/api/auth/users/me`,
`${process.env.NEXT_PUBLIC_API_URL}/api/auth/users/me`,
{
headers: { Authorization: `Bearer ${response.data.access_token}` },
}
34 changes: 20 additions & 14 deletions src/lib/utils.ts
@@ -11,7 +11,7 @@ export const fetchJobs = async (
  fetchOptions: fetchOptions = {}
) => {
  const token = Cookies.get("token");
  await fetch(`/api/retrieve-scrape-jobs`, {
  await fetch(`${process.env.NEXT_PUBLIC_API_URL}/api/retrieve-scrape-jobs`, {
    method: "POST",
    headers: {
      "content-type": "application/json",
@@ -29,12 +29,15 @@
export const fetchJob = async (id: string) => {
  const token = Cookies.get("token");
  try {
    const response = await fetch(`/api/job/${id}`, {
      headers: {
        "content-type": "application/json",
        Authorization: `Bearer ${token}`,
      },
    });
    const response = await fetch(
      `${process.env.NEXT_PUBLIC_API_URL}/api/job/${id}`,
      {
        headers: {
          "content-type": "application/json",
          Authorization: `Bearer ${token}`,
        },
      }
    );
    const data = await response.json();
    return data;
  } catch (error) {
@@ -48,12 +51,15 @@ export const checkAI = async (
) => {
  const token = Cookies.get("token");
  try {
    const response = await fetch(`/api/ai/check`, {
      headers: {
        "content-type": "application/json",
        Authorization: `Bearer ${token}`,
      },
    });
    const response = await fetch(
      `${process.env.NEXT_PUBLIC_API_URL}/api/ai/check`,
      {
        headers: {
          "content-type": "application/json",
          Authorization: `Bearer ${token}`,
        },
      }
    );
    const data = await response.json();
    setAiEnabled(data);
  } catch (error) {
@@ -69,7 +75,7 @@ export const updateJob = async (ids: string[], field: string, value: any) => {
    field: field,
    value: value,
  };
  await fetch(`/api/update`, {
  await fetch(`${process.env.NEXT_PUBLIC_API_URL}/api/update`, {
    method: "POST",
    headers: {
      "content-type": "application/json",
2 changes: 1 addition & 1 deletion src/pages/chat.tsx
@@ -81,7 +81,7 @@ const AI: React.FC = () => {
      }. The following messages will pertain to the content of the scraped job.`,
    };

    const response = await fetch("/api/ai", {
    const response = await fetch(`${process.env.NEXT_PUBLIC_API_URL}/api/ai`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
4 changes: 2 additions & 2 deletions src/pages/jobs.tsx
@@ -22,15 +22,15 @@ export const getServerSideProps: GetServerSideProps = async (context) => {
  if (token) {
    try {
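      // getServerSideProps runs inside the container, so it reaches the API
      // through the internal Docker network via SERVER_URL instead of the public URL.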
      const userResponse = await axios.get(
        `http://scraperr_api:8000/api/auth/users/me`,
        `${process.env.SERVER_URL}/api/auth/users/me`,
        {
          headers: { Authorization: `Bearer ${token}` },
        }
      );
      user = userResponse.data;

      const jobsResponse = await axios.post(
        `http://scraperr_api:8000/api/retrieve-scrape-jobs`,
        `${process.env.SERVER_URL}/api/retrieve-scrape-jobs`,
        { user: user.email },
        {
          headers: {
4 changes: 3 additions & 1 deletion src/pages/logs.tsx
@@ -6,7 +6,9 @@ interface logs {

export async function getStaticProps() {
  try {
    const response = await fetch(`http://scraperr_api:8000/initial_logs`);
    const response = await fetch(
      `${process.env.NEXT_PUBLIC_API_URL}/initial_logs`
    );
    const logJson: logs = await response.json();
    const initialLogs = logJson.logs;

8 changes: 4 additions & 4 deletions src/pages/statistics.tsx
@@ -30,7 +30,7 @@ export const getServerSideProps: GetServerSideProps = async (context) => {
  if (token) {
    try {
      const averageElementResponse = await fetch(
        `http://scraperr_api:8000/statistics/get-average-element-per-link`,
        `${process.env.SERVER_URL}/statistics/get-average-element-per-link`,
        {
          headers: { Authorization: `Bearer ${token}` },
        }
@@ -39,7 +39,7 @@
      averageElement = await averageElementResponse.json();

      const averageJobResponse = await fetch(
        `http://scraperr_api:8000/statistics/get-average-jobs-per-day`,
        `${process.env.SERVER_URL}/statistics/get-average-jobs-per-day`,
        {
          headers: { Authorization: `Bearer ${token}` },
        }
@@ -76,7 +76,7 @@ const Statistics: React.FC<StatProps> = ({ averageElement, averageJob }) => {
  const fetchElementsData = async () => {
    try {
      const response = await fetch(
        `${Constants.DOMAIN}/api/statistics/get-average-element-per-link`,
        `${process.env.NEXT_PUBLIC_API_URL}/api/statistics/get-average-element-per-link`,
        {
          headers: {
            "Content-Type": "application/json",
@@ -94,7 +94,7 @@
  const fetchJobsData = async () => {
    try {
      const response = await fetch(
        `${Constants.DOMAIN}/api/statistics/get-average-jobs-per-day`,
        `${process.env.NEXT_PUBLIC_API_URL}/api/statistics/get-average-jobs-per-day`,
        {
          headers: {
            "Content-Type": "application/json",