Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge #1

Merged
merged 11 commits into from
Jun 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 6 additions & 9 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
FROM apify/actor-node-puppeteer-chrome:latest
COPY package*.json ./
COPY yarn.lock ./
FROM apify/actor-node-playwright:latest
COPY --chown=myuser package*.json ./
COPY --chown=myuser yarn.lock ./

RUN yarn install
RUN yarn create playwright

USER root
# Create the directory and set permissions
RUN mkdir -p /home/myuser/storage/key_value_stores/__CRAWLEE_MIGRATING_KEY_VALUE_STORE__ && chown -R myuser:myuser /home/myuser/storage

# Switch back to the node user
USER myuser
COPY --chown=myuser . ./

COPY . ./
ENTRYPOINT yarn start
28 changes: 12 additions & 16 deletions index.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import express from "express";
import { Actor } from "apify";
import { PuppeteerCrawler, RequestList, sleep } from "crawlee";
import { PlaywrightCrawler, RequestList, sleep } from "crawlee";

const app = express();
const port = 3000;
Expand All @@ -18,29 +18,25 @@ const initializeCrawler = async () => {
});
const requestQueue = await Actor.openRequestQueue();

const crawler = new PuppeteerCrawler({
const crawler = new PlaywrightCrawler({
requestList,
requestQueue,
useSessionPool: false,
useSessionPool: true,
persistCookiesPerSession: false,
headless: true,
keepAlive: true,
minConcurrency: 5,
maxConcurrency: 15,
launchContext: {
launchOptions: {
defaultViewport: {
width: 1512,
height: 982,
},
},
userAgent:
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
},
maxConcurrency: 30,
requestHandler: async ({ request, page }) => {
await page.waitForNavigation({ waitUntil: "load" });
await page.route("**/*", (route) => {
if (route.request().resourceType() === "image") {
route.abort();
} else {
route.continue();
}
});

await sleep(2000);
await sleep(2000)

await page.evaluate(() => {
return window.scrollBy(0, window.innerHeight);
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,6 @@
"apify": "^3.2.0",
"crawlee": "^3.10.1",
"express": "^4.19.2",
"puppeteer": "^22.9.0"
"playwright": "^1.44.1"
}
}
23 changes: 21 additions & 2 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@
tiny-typed-emitter "^2.1.0"
tslib "^2.4.0"

"@crawlee/browser@3.10.1":
"@crawlee/browser@3.10.1", "@crawlee/browser@^3.10.1":
version "3.10.1"
resolved "https://registry.yarnpkg.com/@crawlee/browser/-/browser-3.10.1.tgz#a7d5416a95b2accfbd751bbcbc8ff4808a09f901"
integrity sha512-FgQbWQcIe787w8HhKYGjf4j3E64OmOJYkoKJa/KdWfSJ03PozAc3Bu8Kw1dSO8J0OOK8JpoBO5SnqazXcbjuzg==
Expand Down Expand Up @@ -252,7 +252,7 @@
proper-lockfile "^4.1.2"
tslib "^2.4.0"

"@crawlee/playwright@3.10.1":
"@crawlee/playwright@3.10.1", "@crawlee/playwright@^3.10.1":
version "3.10.1"
resolved "https://registry.yarnpkg.com/@crawlee/playwright/-/playwright-3.10.1.tgz#874f8d375023a06ab5361ee457ac3415a7f143c8"
integrity sha512-NjYlqSVJO31zwoKrjIgce23BYl1rs9+nLPAM9Ppf5fFtEgMPONRCrHSg8WzICXrqT3yX0vU2lxKLTF2qwBdsGQ==
Expand Down Expand Up @@ -1402,6 +1402,11 @@ fs-extra@^11.0.0, fs-extra@^11.2.0:
jsonfile "^6.0.1"
universalify "^2.0.0"

fsevents@2.3.2:
version "2.3.2"
resolved "https://registry.yarnpkg.com/fsevents/-/fsevents-2.3.2.tgz#8a526f78b8fdf4623b709e0b975c52c24c02fd1a"
integrity sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==

function-bind@^1.1.2:
version "1.1.2"
resolved "https://registry.yarnpkg.com/function-bind/-/function-bind-1.1.2.tgz#2c02d864d97f3ea6c8830c464cbd11ab6eab7a1c"
Expand Down Expand Up @@ -2277,6 +2282,20 @@ pkg-dir@^4.2.0:
dependencies:
find-up "^4.0.0"

playwright-core@1.44.1:
version "1.44.1"
resolved "https://registry.yarnpkg.com/playwright-core/-/playwright-core-1.44.1.tgz#53ec975503b763af6fc1a7aa995f34bc09ff447c"
integrity sha512-wh0JWtYTrhv1+OSsLPgFzGzt67Y7BE/ZS3jEqgGBlp2ppp1ZDj8c+9IARNW4dwf1poq5MgHreEM2KV/GuR4cFA==

playwright@^1.44.1:
version "1.44.1"
resolved "https://registry.yarnpkg.com/playwright/-/playwright-1.44.1.tgz#5634369d777111c1eea9180430b7a184028e7892"
integrity sha512-qr/0UJ5CFAtloI3avF95Y0L1xQo6r3LQArLIg/z/PoGJ6xa+EwzrwO5lpNr/09STxdHuUoP2mvuELJS+hLdtgg==
dependencies:
playwright-core "1.44.1"
optionalDependencies:
fsevents "2.3.2"

progress@2.0.3:
version "2.0.3"
resolved "https://registry.yarnpkg.com/progress/-/progress-2.0.3.tgz#7e8cf8d8f5b8f239c1bc68beb4eb78567d572ef8"
Expand Down
Loading