Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow configuring what status codes will cause session retirement #1423

Merged
merged 3 commits into from
Jul 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions packages/core/src/constants.ts

This file was deleted.

1 change: 0 additions & 1 deletion packages/core/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
export * from './errors';
export * from './autoscaling';
export * from './configuration';
export * from './constants';
export * from './crawlers';
export * from './enqueue_links';
export * from './events';
Expand Down
31 changes: 22 additions & 9 deletions packages/core/src/session_pool/session.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,15 @@
import type { Log } from '@apify/log';
import { cryptoRandomObjectId } from '@apify/utilities';
import type { Dictionary, Cookie as CookieObject, BrowserLikeResponse } from '@crawlee/types';
import type { BrowserLikeResponse, Cookie as CookieObject, Dictionary } from '@crawlee/types';
import type { IncomingMessage } from 'node:http';
import { EventEmitter } from 'node:events';
import ow from 'ow';
import type { Cookie } from 'tough-cookie';
import { CookieJar } from 'tough-cookie';
import { STATUS_CODES_BLOCKED } from '../constants';
import { log as defaultLog } from '../log';
import { EVENT_SESSION_RETIRED } from './events';
import { browserPoolCookieToToughCookie, getCookiesFromResponse, getDefaultCookieExpirationDate, toughCookieToBrowserPoolCookie } from '../cookie_utils';

// CONSTANTS
const DEFAULT_SESSION_MAX_AGE_SECS = 3000;

/**
* Persistable {@link Session} state.
*/
Expand Down Expand Up @@ -134,7 +130,7 @@ export class Session {
sessionPool,
id = `session_${cryptoRandomObjectId(10)}`,
cookieJar = new CookieJar(),
maxAgeSecs = DEFAULT_SESSION_MAX_AGE_SECS,
maxAgeSecs = 3000,
userData = {},
maxErrorScore = 3,
errorScoreDecrement = 0.5,
Expand Down Expand Up @@ -264,11 +260,28 @@ export class Session {
* by retiring the session when such code is received. Optionally the default status
* codes can be extended in the second parameter.
* @param statusCode HTTP status code.
* @param [blockedStatusCodes] Custom HTTP status codes that indicate blocking on a particular website.
* @returns Whether the session was retired.
*/
retireOnBlockedStatusCodes(statusCode: number, blockedStatusCodes: number[] = []): boolean {
const isBlocked = STATUS_CODES_BLOCKED.concat(blockedStatusCodes).includes(statusCode);
retireOnBlockedStatusCodes(statusCode: number): boolean;

/**
* With certain status codes: `401`, `403` or `429` we can be certain
* that the target website is blocking us. This function helps to do this conveniently
* by retiring the session when such code is received. Optionally the default status
* codes can be extended in the second parameter.
* @param statusCode HTTP status code.
* @param [additionalBlockedStatusCodes]
* Custom HTTP status codes that indicate blocking on a particular website.
*
* **This parameter is deprecated and will be removed in the next major version.**
* @returns Whether the session was retired.
* @deprecated The parameter `additionalBlockedStatusCodes` is deprecated and will be removed in the next major version.
*/
retireOnBlockedStatusCodes(statusCode: number, additionalBlockedStatusCodes?: number[]): boolean;

retireOnBlockedStatusCodes(statusCode: number, additionalBlockedStatusCodes: number[] = []): boolean {
// @ts-expect-error Accessing protected property `blockedStatusCodes` of the session pool
const isBlocked = this.sessionPool.blockedStatusCodes.concat(additionalBlockedStatusCodes).includes(statusCode);
if (isBlocked) {
this.retire();
}
Expand Down
36 changes: 22 additions & 14 deletions packages/core/src/session_pool/session_pool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,13 @@ export interface SessionPoolOptions {
*/
createSessionFunction?: CreateSession;

/**
* Specifies which response status codes are considered blocked.
* A session connected to such a request will be marked as retired.
* @default [401, 403, 429]
*/
blockedStatusCodes?: number[];

/** @internal */
log?: Log;
}
Expand Down Expand Up @@ -110,17 +117,18 @@ export interface SessionPoolOptions {
* @category Scaling
*/
export class SessionPool extends EventEmitter {
log: Log;
maxPoolSize: number;
createSessionFunction: CreateSession;
keyValueStore!: KeyValueStore;
sessions: Session[] = [];
sessionMap = new Map<string, Session>();
sessionOptions: SessionOptions;
persistStateKeyValueStoreId?: string;
persistStateKey: string;
private _listener!: () => Promise<void>;
private events: EventManager;
protected log: Log;
protected maxPoolSize: number;
protected createSessionFunction: CreateSession;
protected keyValueStore!: KeyValueStore;
protected sessions: Session[] = [];
protected sessionMap = new Map<string, Session>();
protected sessionOptions: SessionOptions;
protected persistStateKeyValueStoreId?: string;
protected persistStateKey: string;
protected _listener!: () => Promise<void>;
protected events: EventManager;
protected readonly blockedStatusCodes: number[];

/**
* @internal
Expand All @@ -134,22 +142,22 @@ export class SessionPool extends EventEmitter {
persistStateKey: ow.optional.string,
createSessionFunction: ow.optional.function,
sessionOptions: ow.optional.object,
blockedStatusCodes: ow.optional.array.ofType(ow.number),
log: ow.optional.object,
}));

const {
maxPoolSize = 1000,

persistStateKeyValueStoreId,
persistStateKey = 'SDK_SESSION_POOL_STATE',

createSessionFunction,
sessionOptions = {},

blockedStatusCodes = [401, 403, 429],
log = defaultLog,
} = options;

this.config = config;
this.blockedStatusCodes = blockedStatusCodes;
this.events = config.getEventManager();
this.log = log.child({ prefix: 'SessionPool' });

Expand Down
2 changes: 2 additions & 0 deletions test/core/crawlers/basic_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -970,6 +970,7 @@ describe('BasicCrawler', () => {
});
await crawler.run();

// @ts-expect-error private symbol
expect(crawler.sessionPool.maxPoolSize).toEqual(10);
});

Expand Down Expand Up @@ -998,6 +999,7 @@ describe('BasicCrawler', () => {

await crawler.run();
expect(events.listenerCount(EventType.PERSIST_STATE)).toEqual(0);
// @ts-expect-error private symbol
expect(crawler.sessionPool.maxPoolSize).toEqual(10);
});
});
Expand Down
9 changes: 4 additions & 5 deletions test/core/crawlers/browser_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ import {
Request,
RequestList,
Session,
STATUS_CODES_BLOCKED,
} from '@crawlee/puppeteer';
import { gotScraping } from 'got-scraping';
import { sleep } from '@crawlee/utils';
Expand Down Expand Up @@ -390,7 +389,7 @@ describe('BrowserCrawler', () => {

test('should throw on "blocked" status codes', async () => {
const baseUrl = 'https://example.com/';
const sources = STATUS_CODES_BLOCKED.map((statusCode) => {
const sources = [401, 403, 429].map((statusCode) => {
return {
url: baseUrl + statusCode,
userData: { statusCode },
Expand Down Expand Up @@ -423,7 +422,7 @@ describe('BrowserCrawler', () => {

await crawler.run();

expect(failedRequests.length).toBe(STATUS_CODES_BLOCKED.length);
expect(failedRequests.length).toBe(3);
failedRequests.forEach((fr) => {
const [msg] = fr.errorMessages;
expect(msg).toContain(`Request blocked - received ${fr.userData.statusCode} status code.`);
Expand All @@ -433,7 +432,7 @@ describe('BrowserCrawler', () => {

test('should throw on "blocked" status codes (retire session)', async () => {
const baseUrl = 'https://example.com/';
const sources = STATUS_CODES_BLOCKED.map((statusCode) => {
const sources = [401, 403, 429].map((statusCode) => {
return {
url: baseUrl + statusCode,
userData: { statusCode },
Expand Down Expand Up @@ -466,7 +465,7 @@ describe('BrowserCrawler', () => {

await crawler.run();

expect(failedRequests.length).toBe(STATUS_CODES_BLOCKED.length);
expect(failedRequests.length).toBe(3);
failedRequests.forEach((fr) => {
const [msg] = fr.errorMessages;
expect(msg).toContain(`Request blocked - received ${fr.userData.statusCode} status code.`);
Expand Down
6 changes: 4 additions & 2 deletions test/core/crawlers/cheerio_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import {
Request,
RequestList,
Session,
STATUS_CODES_BLOCKED,
} from '@crawlee/cheerio';
import express from 'express';
import fs from 'fs';
Expand Down Expand Up @@ -858,6 +857,7 @@ describe('CheerioCrawler', () => {

await cheerioCrawler.run();

// @ts-expect-error private symbol
const { sessions } = cheerioCrawler.sessionPool;
expect(sessions.length).toBe(4);
sessions.forEach((session) => {
Expand All @@ -878,7 +878,7 @@ describe('CheerioCrawler', () => {
});

test('should retire session on "blocked" status codes', async () => {
for (const code of STATUS_CODES_BLOCKED) {
for (const code of [401, 403, 429]) {
const failed: Request[] = [];
const sessions: Session[] = [];
const crawler = new CheerioCrawler({
Expand All @@ -899,7 +899,9 @@ describe('CheerioCrawler', () => {
});
await crawler.run();

// @ts-expect-error private symbol
expect(crawler.sessionPool.sessions.length).toBe(4);
// @ts-expect-error private symbol
// eslint-disable-next-line no-loop-func
crawler.sessionPool.sessions.forEach((session) => {
// @ts-expect-error Accessing private prop
Expand Down
5 changes: 3 additions & 2 deletions test/core/session_pool/session.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { EVENT_SESSION_RETIRED, ProxyConfiguration, Session, SessionPool, STATUS_CODES_BLOCKED } from '@crawlee/core';
import { EVENT_SESSION_RETIRED, ProxyConfiguration, Session, SessionPool } from '@crawlee/core';
import type { Dictionary } from '@crawlee/utils';
import { entries, sleep } from '@crawlee/utils';

Expand Down Expand Up @@ -178,7 +178,8 @@ describe('Session - testing session behaviour ', () => {
expect(session.retireOnBlockedStatusCodes(200)).toBeFalsy();
expect(session.retireOnBlockedStatusCodes(400)).toBeFalsy();
expect(session.retireOnBlockedStatusCodes(500)).toBeFalsy();
STATUS_CODES_BLOCKED.forEach((status) => {
// @ts-expect-error Accessing protected prop
sessionPool.blockedStatusCodes.forEach((status) => {
const sess = new Session({ sessionPool });
let isCalled;
const call = () => { isCalled = true; };
Expand Down
Loading