-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
113 lines (93 loc) · 2.97 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import { randomUUID } from "node:crypto";

import express from "express";
import { Actor } from "apify";
import { PlaywrightCrawler, RequestList, sleep } from "crawlee";
// Fixed local port the HTTP server listens on.
const port = 3000;

// Page HTML produced by the crawler, keyed by the request's uniqueKey.
// The request handler writes entries; getContent reads and removes them.
const contentMap = new Map();

// Express application; accepts both JSON and URL-encoded request bodies.
const app = express();
app.use(express.json(), express.urlencoded({ extended: true }));

// Initialise the Apify Actor SDK before any code below uses it.
await Actor.init();
/**
 * Builds the Playwright crawler that renders pages for the HTTP endpoint and
 * starts it in the background.
 *
 * For every handled request the crawler waits, scrolls the page twice, reads
 * the resulting HTML and stores it in `contentMap` under the request's
 * `uniqueKey`.
 *
 * @returns {Promise<PlaywrightCrawler>} The crawler, already started. Its
 *   `run()` promise is not awaited here; with `keepAlive: true` it is
 *   expected to stay pending while the server is up.
 */
const initializeCrawler = async () => {
  const requestList = await RequestList.open("my-list", [], {
    keepDuplicateUrls: true,
  });
  const requestQueue = await Actor.openRequestQueue();

  const crawler = new PlaywrightCrawler({
    requestList,
    requestQueue,
    useSessionPool: true,
    persistCookiesPerSession: false,
    headless: true,
    keepAlive: true,
    minConcurrency: 5,
    maxConcurrency: 30,
    preNavigationHooks: [
      // Register image blocking BEFORE navigation so it also applies to the
      // initial page load, not only to requests issued after the handler runs.
      async ({ page }) => {
        await page.route("**/*", (route) =>
          // Return the promise so a failed abort/continue is not left floating.
          route.request().resourceType() === "image"
            ? route.abort()
            : route.continue(),
        );
      },
    ],
    requestHandler: async ({ request, page }) => {
      // Wait, then scroll down one viewport twice with short pauses —
      // presumably to trigger lazily loaded content (confirm with the author).
      await sleep(2000);
      await page.evaluate(() => window.scrollBy(0, window.innerHeight));
      await sleep(152);
      await page.evaluate(() => window.scrollBy(0, window.innerHeight));
      await sleep(263);

      const content = await page.content();
      console.log(`Title: ${await page.title()}`);
      console.log(`Content: ${content}`);

      // Publish the HTML for whoever is polling on this request's key.
      contentMap.set(request.uniqueKey, content);
      // NOTE(review): the crawler may already mark requests as handled on its
      // own — confirm whether this explicit call is still needed.
      await requestQueue.markRequestHandled(request);
    },
  });

  // Run in the background, but never leave the promise without a rejection
  // handler: an unhandled rejection would otherwise take the process down
  // without a useful message.
  crawler.run().catch((error) => {
    console.error("Crawler stopped unexpectedly:", error);
  });

  return crawler;
};
/**
 * Enqueues a single URL under an explicit unique key.
 *
 * @param {{ addRequests: Function }} queue - Object exposing `addRequests`.
 * @param {string} url - Address to enqueue.
 * @param {string} uniqueKey - Key identifying this particular request.
 * @returns {Promise<void>} Resolves once `addRequests` has completed.
 */
const addToQueue = async (queue, url, uniqueKey) => {
  const request = { url, uniqueKey };
  const batch = [request];
  await queue.addRequests(batch);
};
/**
 * Polls `contentMap` until content for `uniqueKey` is available, then removes
 * the entry and returns it.
 *
 * The map is checked once up front and once more after each of the
 * `maxRetries` waits, so content that arrives during the final wait is still
 * picked up instead of being reported as a timeout.
 *
 * @param {string} uniqueKey - Key the crawler stores the content under.
 * @param {number} [maxRetries=20] - Number of waits before giving up.
 * @param {number} [delay=1000] - Milliseconds to wait between checks.
 * @returns {Promise<*>} The stored content (may be an empty string).
 * @throws {Error} When nothing was stored within `maxRetries * delay` ms.
 */
const getContent = async (uniqueKey, maxRetries = 20, delay = 1000) => {
  for (let attempt = 0; ; attempt++) {
    // Use has() rather than truthiness: an empty string is valid content.
    if (contentMap.has(uniqueKey)) {
      const content = contentMap.get(uniqueKey);
      contentMap.delete(uniqueKey); // entries are single-use; free the memory
      return content;
    }
    // Written as a negated `<` so a NaN maxRetries terminates immediately.
    if (!(attempt < maxRetries)) {
      throw new Error("Failed to fetch the content in time");
    }
    await sleep(delay);
  }
};
/**
 * POST / — render a page and return its HTML.
 *
 * Reads `url` from the request body (JSON or URL-encoded), enqueues it under
 * a fresh unique key via the object stored in the app's "queue" setting, then
 * waits for the content stored under that key.
 *
 * Responses:
 *   200 — `{ url, content }`
 *   400 — `url` missing, not a string, or not an absolute http(s) URL
 *   500 — enqueueing failed or the content did not arrive in time
 */
app.post("/", async (req, res) => {
  try {
    // req.body can be undefined when no body parser handled the request;
    // treat that as a missing url instead of crashing into the 500 path.
    const { url } = req.body ?? {};
    if (!url) {
      return res
        .status(400)
        .json({ error: 'A "url" field is required in the request body' });
    }

    // The URL is handed to a real browser, so only allow web addresses:
    // reject non-strings, unparsable values and schemes such as file:.
    let parsedUrl = null;
    if (typeof url === "string") {
      try {
        parsedUrl = new URL(url);
      } catch {
        parsedUrl = null;
      }
    }
    const isWebUrl =
      parsedUrl !== null &&
      (parsedUrl.protocol === "http:" || parsedUrl.protocol === "https:");
    if (!isWebUrl) {
      return res
        .status(400)
        .json({ error: '"url" must be an absolute http(s) URL' });
    }

    const queue = req.app.get("queue");
    // A UUID cannot realistically collide between concurrent requests,
    // unlike a stringified Math.random() value.
    const uniqueKey = randomUUID();
    console.log(`Adding URL to queue: ${url}`);
    await addToQueue(queue, url, uniqueKey);
    const content = await getContent(uniqueKey);
    return res.json({ url, content });
  } catch (error) {
    console.error("Error processing request:", error);
    return res.status(500).json({ error: "Internal server error" });
  }
});
/**
 * Boots the service: initialises the crawler, stores it in the app's "queue"
 * setting for the route handler, and starts listening on `port`.
 *
 * Any failure — during crawler initialisation or while binding the port —
 * is logged and terminates the process with exit code 1.
 *
 * @returns {Promise<void>}
 */
const startServer = async () => {
  // Single exit path shared by start-up exceptions and listen errors.
  const fail = (error) => {
    console.error("Error initializing server:", error);
    process.exit(1);
  };

  try {
    app.set("queue", await initializeCrawler());
    const server = app.listen(port, (error) => {
      // NOTE(review): some Express versions appear to invoke this callback
      // with the listen error; the "error" listener below reports it, so do
      // not announce a running server in that case.
      if (error) {
        return;
      }
      console.log(`Server is running on http://localhost:${port}`);
    });
    // Listen failures (e.g. port already in use) are emitted on the server
    // object and would otherwise bypass the surrounding try/catch.
    server.on("error", fail);
  } catch (error) {
    fail(error);
  }
};
// Entry point: initialise the crawler and start the HTTP server.
// The promise is not awaited here; startServer wraps its work in try/catch.
startServer();