Skip to content

Commit

Permalink
feat: allow configuring what status codes will cause session retireme…
Browse files Browse the repository at this point in the history
…nt (#1423)
  • Loading branch information
B4nan authored Jul 27, 2022
1 parent 2505fd7 commit cd3795f
Show file tree
Hide file tree
Showing 9 changed files with 104 additions and 37 deletions.
2 changes: 0 additions & 2 deletions packages/core/src/constants.ts

This file was deleted.

1 change: 0 additions & 1 deletion packages/core/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
export * from './errors';
export * from './autoscaling';
export * from './configuration';
export * from './constants';
export * from './crawlers';
export * from './enqueue_links';
export * from './events';
Expand Down
31 changes: 22 additions & 9 deletions packages/core/src/session_pool/session.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,15 @@
import type { Log } from '@apify/log';
import { cryptoRandomObjectId } from '@apify/utilities';
import type { Dictionary, Cookie as CookieObject, BrowserLikeResponse } from '@crawlee/types';
import type { BrowserLikeResponse, Cookie as CookieObject, Dictionary } from '@crawlee/types';
import type { IncomingMessage } from 'node:http';
import { EventEmitter } from 'node:events';
import ow from 'ow';
import type { Cookie } from 'tough-cookie';
import { CookieJar } from 'tough-cookie';
import { STATUS_CODES_BLOCKED } from '../constants';
import { log as defaultLog } from '../log';
import { EVENT_SESSION_RETIRED } from './events';
import { browserPoolCookieToToughCookie, getCookiesFromResponse, getDefaultCookieExpirationDate, toughCookieToBrowserPoolCookie } from '../cookie_utils';

// CONSTANTS
const DEFAULT_SESSION_MAX_AGE_SECS = 3000;

/**
* Persistable {@link Session} state.
*/
Expand Down Expand Up @@ -134,7 +130,7 @@ export class Session {
sessionPool,
id = `session_${cryptoRandomObjectId(10)}`,
cookieJar = new CookieJar(),
maxAgeSecs = DEFAULT_SESSION_MAX_AGE_SECS,
maxAgeSecs = 3000,
userData = {},
maxErrorScore = 3,
errorScoreDecrement = 0.5,
Expand Down Expand Up @@ -264,11 +260,28 @@ export class Session {
* by retiring the session when such code is received. Optionally the default status
* codes can be extended in the second parameter.
* @param statusCode HTTP status code.
* @param [blockedStatusCodes] Custom HTTP status codes that means blocking on particular website.
* @returns Whether the session was retired.
*/
retireOnBlockedStatusCodes(statusCode: number, blockedStatusCodes: number[] = []): boolean {
const isBlocked = STATUS_CODES_BLOCKED.concat(blockedStatusCodes).includes(statusCode);
retireOnBlockedStatusCodes(statusCode: number): boolean;

/**
* With certain status codes: `401`, `403` or `429` we can be certain
* that the target website is blocking us. This function helps to do this conveniently
* by retiring the session when such code is received. Optionally the default status
* codes can be extended in the second parameter.
* @param statusCode HTTP status code.
* @param [additionalBlockedStatusCodes]
* Custom HTTP status codes that mean blocking on a particular website.
*
* **This parameter is deprecated and will be removed in the next major version.**
* @returns Whether the session was retired.
* @deprecated The parameter `additionalBlockedStatusCodes` is deprecated and will be removed in the next major version.
*/
retireOnBlockedStatusCodes(statusCode: number, additionalBlockedStatusCodes?: number[]): boolean;

retireOnBlockedStatusCodes(statusCode: number, additionalBlockedStatusCodes: number[] = []): boolean {
// @ts-expect-error
const isBlocked = this.sessionPool.blockedStatusCodes.concat(additionalBlockedStatusCodes).includes(statusCode);
if (isBlocked) {
this.retire();
}
Expand Down
36 changes: 22 additions & 14 deletions packages/core/src/session_pool/session_pool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,13 @@ export interface SessionPoolOptions {
*/
createSessionFunction?: CreateSession;

/**
* Specifies which response status codes are considered blocked.
* A session connected to such a request will be marked as retired.
* @default [401, 403, 429]
*/
blockedStatusCodes?: number[];

/** @internal */
log?: Log;
}
Expand Down Expand Up @@ -110,17 +117,18 @@ export interface SessionPoolOptions {
* @category Scaling
*/
export class SessionPool extends EventEmitter {
log: Log;
maxPoolSize: number;
createSessionFunction: CreateSession;
keyValueStore!: KeyValueStore;
sessions: Session[] = [];
sessionMap = new Map<string, Session>();
sessionOptions: SessionOptions;
persistStateKeyValueStoreId?: string;
persistStateKey: string;
private _listener!: () => Promise<void>;
private events: EventManager;
protected log: Log;
protected maxPoolSize: number;
protected createSessionFunction: CreateSession;
protected keyValueStore!: KeyValueStore;
protected sessions: Session[] = [];
protected sessionMap = new Map<string, Session>();
protected sessionOptions: SessionOptions;
protected persistStateKeyValueStoreId?: string;
protected persistStateKey: string;
protected _listener!: () => Promise<void>;
protected events: EventManager;
protected readonly blockedStatusCodes: number[];

/**
* @internal
Expand All @@ -134,22 +142,22 @@ export class SessionPool extends EventEmitter {
persistStateKey: ow.optional.string,
createSessionFunction: ow.optional.function,
sessionOptions: ow.optional.object,
blockedStatusCodes: ow.optional.array.ofType(ow.number),
log: ow.optional.object,
}));

const {
maxPoolSize = 1000,

persistStateKeyValueStoreId,
persistStateKey = 'SDK_SESSION_POOL_STATE',

createSessionFunction,
sessionOptions = {},

blockedStatusCodes = [401, 403, 429],
log = defaultLog,
} = options;

this.config = config;
this.blockedStatusCodes = blockedStatusCodes;
this.events = config.getEventManager();
this.log = log.child({ prefix: 'SessionPool' });

Expand Down
2 changes: 2 additions & 0 deletions test/core/crawlers/basic_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -970,6 +970,7 @@ describe('BasicCrawler', () => {
});
await crawler.run();

// @ts-expect-error private symbol
expect(crawler.sessionPool.maxPoolSize).toEqual(10);
});

Expand Down Expand Up @@ -998,6 +999,7 @@ describe('BasicCrawler', () => {

await crawler.run();
expect(events.listenerCount(EventType.PERSIST_STATE)).toEqual(0);
// @ts-expect-error private symbol
expect(crawler.sessionPool.maxPoolSize).toEqual(10);
});
});
Expand Down
9 changes: 4 additions & 5 deletions test/core/crawlers/browser_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ import {
Request,
RequestList,
Session,
STATUS_CODES_BLOCKED,
} from '@crawlee/puppeteer';
import { gotScraping } from 'got-scraping';
import { sleep } from '@crawlee/utils';
Expand Down Expand Up @@ -390,7 +389,7 @@ describe('BrowserCrawler', () => {

test('should throw on "blocked" status codes', async () => {
const baseUrl = 'https://example.com/';
const sources = STATUS_CODES_BLOCKED.map((statusCode) => {
const sources = [401, 403, 429].map((statusCode) => {
return {
url: baseUrl + statusCode,
userData: { statusCode },
Expand Down Expand Up @@ -423,7 +422,7 @@ describe('BrowserCrawler', () => {

await crawler.run();

expect(failedRequests.length).toBe(STATUS_CODES_BLOCKED.length);
expect(failedRequests.length).toBe(3);
failedRequests.forEach((fr) => {
const [msg] = fr.errorMessages;
expect(msg).toContain(`Request blocked - received ${fr.userData.statusCode} status code.`);
Expand All @@ -433,7 +432,7 @@ describe('BrowserCrawler', () => {

test('should throw on "blocked" status codes (retire session)', async () => {
const baseUrl = 'https://example.com/';
const sources = STATUS_CODES_BLOCKED.map((statusCode) => {
const sources = [401, 403, 429].map((statusCode) => {
return {
url: baseUrl + statusCode,
userData: { statusCode },
Expand Down Expand Up @@ -466,7 +465,7 @@ describe('BrowserCrawler', () => {

await crawler.run();

expect(failedRequests.length).toBe(STATUS_CODES_BLOCKED.length);
expect(failedRequests.length).toBe(3);
failedRequests.forEach((fr) => {
const [msg] = fr.errorMessages;
expect(msg).toContain(`Request blocked - received ${fr.userData.statusCode} status code.`);
Expand Down
6 changes: 4 additions & 2 deletions test/core/crawlers/cheerio_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import {
Request,
RequestList,
Session,
STATUS_CODES_BLOCKED,
} from '@crawlee/cheerio';
import express from 'express';
import fs from 'fs';
Expand Down Expand Up @@ -858,6 +857,7 @@ describe('CheerioCrawler', () => {

await cheerioCrawler.run();

// @ts-expect-error private symbol
const { sessions } = cheerioCrawler.sessionPool;
expect(sessions.length).toBe(4);
sessions.forEach((session) => {
Expand All @@ -878,7 +878,7 @@ describe('CheerioCrawler', () => {
});

test('should retire session on "blocked" status codes', async () => {
for (const code of STATUS_CODES_BLOCKED) {
for (const code of [401, 403, 429]) {
const failed: Request[] = [];
const sessions: Session[] = [];
const crawler = new CheerioCrawler({
Expand All @@ -899,7 +899,9 @@ describe('CheerioCrawler', () => {
});
await crawler.run();

// @ts-expect-error private symbol
expect(crawler.sessionPool.sessions.length).toBe(4);
// @ts-expect-error private symbol
// eslint-disable-next-line no-loop-func
crawler.sessionPool.sessions.forEach((session) => {
// @ts-expect-error Accessing private prop
Expand Down
5 changes: 3 additions & 2 deletions test/core/session_pool/session.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { EVENT_SESSION_RETIRED, ProxyConfiguration, Session, SessionPool, STATUS_CODES_BLOCKED } from '@crawlee/core';
import { EVENT_SESSION_RETIRED, ProxyConfiguration, Session, SessionPool } from '@crawlee/core';
import type { Dictionary } from '@crawlee/utils';
import { entries, sleep } from '@crawlee/utils';

Expand Down Expand Up @@ -178,7 +178,8 @@ describe('Session - testing session behaviour ', () => {
expect(session.retireOnBlockedStatusCodes(200)).toBeFalsy();
expect(session.retireOnBlockedStatusCodes(400)).toBeFalsy();
expect(session.retireOnBlockedStatusCodes(500)).toBeFalsy();
STATUS_CODES_BLOCKED.forEach((status) => {
// @ts-expect-error
sessionPool.blockedStatusCodes.forEach((status) => {
const sess = new Session({ sessionPool });
let isCalled;
const call = () => { isCalled = true; };
Expand Down
Loading

0 comments on commit cd3795f

Please sign in to comment.