Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow configuring crawler statistics #2213

Merged
merged 13 commits into from
Dec 20, 2023
Merged
13 changes: 12 additions & 1 deletion packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import type {
SessionPoolOptions,
Source,
StatisticState,
StatisticsOptions,
} from '@crawlee/core';
import {
AutoscaledPool,
Expand Down Expand Up @@ -338,6 +339,12 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
* WARNING: these options are not guaranteed to be stable and may change or be removed at any time.
*/
experiments?: CrawlerExperiments;

/**
* Customize the way statistics collecting works, such as logging interval or
* whether to output them to the Key-Value store.
*/
statisticsOptions?: StatisticsOptions;
}

/**
Expand Down Expand Up @@ -524,6 +531,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
// internal
log: ow.optional.object,
experiments: ow.optional.object,

statisticsOptions: ow.optional.object,
};

/**
Expand Down Expand Up @@ -569,6 +578,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext

statusMessageLoggingInterval = 10,
statusMessageCallback,

statisticsOptions,
} = options;

this.requestList = requestList;
Expand Down Expand Up @@ -650,7 +661,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
this.maxSessionRotations = maxSessionRotations;
this.handledRequestsCount = 0;
this.stats = new Statistics({ logMessage: `${log.getOptions().prefix} request statistics:`, config });
this.stats = new Statistics({ logMessage: `${log.getOptions().prefix} request statistics:`, config, ...statisticsOptions });
this.sessionPoolOptions = {
...sessionPoolOptions,
log,
Expand Down
58 changes: 52 additions & 6 deletions packages/core/src/crawlers/statistics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,13 @@ const errorTrackerConfig = {
showFullMessage: false,
};

/**
* Override persistence-related options provided in {@apilink StatisticsOptions} for a single method call
*/
interface PersistenceOptionsOverrides {
enablePersistence?: boolean;
}

/**
* The statistics class provides an interface to collecting and logging run
* statistics for requests.
Expand Down Expand Up @@ -92,6 +99,7 @@ export class Statistics {
private instanceStart!: number;
private logInterval: unknown;
private events: EventManager;
private enablePersistence: boolean;

/**
* @internal
Expand All @@ -102,13 +110,15 @@ export class Statistics {
logMessage: ow.optional.string,
keyValueStore: ow.optional.object,
config: ow.optional.object,
enablePersistence: ow.optional.boolean,
}));

const {
logIntervalSecs = 60,
logMessage = 'Statistics',
keyValueStore,
config = Configuration.getGlobalConfig(),
enablePersistence = true,
} = options;

this.logIntervalMillis = logIntervalSecs * 1000;
Expand All @@ -117,6 +127,7 @@ export class Statistics {
this.listener = this.persistState.bind(this);
this.events = config.getEventManager();
this.config = config;
this.enablePersistence = enablePersistence;

// initialize by "resetting"
this.reset();
Expand Down Expand Up @@ -155,7 +166,10 @@ export class Statistics {
this._teardown();
}

async resetStore() {
async resetStore(opts?: PersistenceOptionsOverrides) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lets not use abbreviations unless needed (e.g. when there is a collision)

Suggested change
async resetStore(opts?: PersistenceOptionsOverrides) {
async resetStore(options?: PersistenceOptionsOverrides) {

if (!this.enablePersistence && !opts?.enablePersistence) {
return;
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i like to keep one blank line after control statements like if/for/while/etc, this applies to other places in the PR too (e.g. the persistState method), take it as a rule of thumb

Suggested change
}
}

if (!this.keyValueStore) {
return;
}
Expand Down Expand Up @@ -247,13 +261,14 @@ export class Statistics {
async startCapturing() {
this.keyValueStore ??= await KeyValueStore.open(null, { config: this.config });

await this._maybeLoadStatistics();

if (this.state.crawlerStartedAt === null) {
this.state.crawlerStartedAt = new Date();
}

this.events.on(EventType.PERSIST_STATE, this.listener);
if (this.enablePersistence) {
await this._maybeLoadStatistics();
this.events.on(EventType.PERSIST_STATE, this.listener);
}

this.logInterval = setInterval(() => {
this.log.info(this.logMessage, {
Expand Down Expand Up @@ -285,7 +300,10 @@ export class Statistics {
/**
* Persist internal state to the key value store
*/
async persistState() {
async persistState(opts?: PersistenceOptionsOverrides) {
if (!this.enablePersistence && !opts?.enablePersistence) {
return;
}
// this might be called before startCapturing was called without using await, should not crash
if (!this.keyValueStore) {
return;
Expand Down Expand Up @@ -382,11 +400,39 @@ export class Statistics {
}
}

interface StatisticsOptions {
/**
* Configuration for the {@apilink Statistics} instance used by the crawler
*/
export interface StatisticsOptions {
/**
* Interval in seconds to log the current statistics
* @default 60
*/
logIntervalSecs?: number;

/**
* Message to log with the current statistics
* @default 'Statistics'
*/
logMessage?: string;

/**
* Key value store instance to persist the statistics.
* If not provided, the default one will be used when capturing starts
*/
keyValueStore?: KeyValueStore;

/**
* Configuration instance to use
* @default Configuration.getGlobalConfig()
*/
config?: Configuration;

/**
* Use this flag to disable or enable periodic statistics persistence to key value store.
* @default true
*/
enablePersistence?: boolean;
}

/**
Expand Down
36 changes: 33 additions & 3 deletions packages/core/src/session_pool/session_pool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,19 @@ export interface SessionPoolOptions {

/** @internal */
log?: Log;

/**
* If set to true, `SessionPool` will periodically persist its state to the key-value store.
* @default true
*/
enablePersistence?: boolean;
}

/**
* Override persistence-related options provided in {@apilink SessionPoolOptions} for a single method call
*/
interface PersistenceOptionsOverrides {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to have a single interface for both since it's exactly the same. Let's export it and reuse it instead of creating two local implementations.

I also don't like the name very much, will try to come up with something else, I don't have any ideas right now :]

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've restructured it a bit, so that the same interface is used in the constructor and options parameter. Does that look okay?

I'm also not sure where to actually put this interface in the project hierarchy. I don't see a matching module for utilities/types shared by both statistics and session pool

enablePersistence?: boolean;
}

/**
Expand Down Expand Up @@ -138,6 +151,8 @@ export class SessionPool extends EventEmitter {
protected _listener!: () => Promise<void>;
protected events: EventManager;
protected readonly blockedStatusCodes: number[];
protected enablePersistence: boolean;
protected isInitialized = false;

/**
* @internal
Expand All @@ -153,6 +168,7 @@ export class SessionPool extends EventEmitter {
sessionOptions: ow.optional.object,
blockedStatusCodes: ow.optional.array.ofType(ow.number),
log: ow.optional.object,
enablePersistence: ow.optional.boolean,
}));

const {
Expand All @@ -163,12 +179,14 @@ export class SessionPool extends EventEmitter {
sessionOptions = {},
blockedStatusCodes = [401, 403, 429],
log = defaultLog,
enablePersistence = true,
} = options;

this.config = config;
this.blockedStatusCodes = blockedStatusCodes;
this.events = config.getEventManager();
this.log = log.child({ prefix: 'SessionPool' });
this.enablePersistence = enablePersistence;

// Pool Configuration
this.maxPoolSize = maxPoolSize;
Expand Down Expand Up @@ -206,7 +224,12 @@ export class SessionPool extends EventEmitter {
* It is called automatically by the {@apilink SessionPool.open} function.
*/
async initialize(): Promise<void> {
if (this.isInitialized) return;
this.keyValueStore = await KeyValueStore.open(this.persistStateKeyValueStoreId, { config: this.config });
if (!this.enablePersistence) {
this.isInitialized = true;
return;
}

if (!this.persistStateKeyValueStoreId) {
// eslint-disable-next-line max-len
Expand All @@ -219,6 +242,7 @@ export class SessionPool extends EventEmitter {
this._listener = this.persistState.bind(this);

this.events.on(EventType.PERSIST_STATE, this._listener);
this.isInitialized = true;
}

/**
Expand Down Expand Up @@ -290,7 +314,10 @@ export class SessionPool extends EventEmitter {
return this._createSession();
}

async resetStore() {
async resetStore(opts?: PersistenceOptionsOverrides) {
if (!this.enablePersistence && !opts?.enablePersistence) {
return;
}
await this.keyValueStore?.setValue(this.persistStateKey, null);
}

Expand All @@ -310,7 +337,10 @@ export class SessionPool extends EventEmitter {
* Persists the current state of the `SessionPool` into the default {@apilink KeyValueStore}.
* The state is persisted automatically in regular intervals.
*/
async persistState(): Promise<void> {
async persistState(opts?: PersistenceOptionsOverrides): Promise<void> {
if (!this.enablePersistence && !opts?.enablePersistence) {
return;
}
this.log.debug('Persisting state', {
persistStateKeyValueStoreId: this.persistStateKeyValueStoreId,
persistStateKey: this.persistStateKey,
Expand All @@ -331,7 +361,7 @@ export class SessionPool extends EventEmitter {
* SessionPool should not work before initialization.
*/
protected _throwIfNotInitialized() {
if (!this._listener) throw new Error('SessionPool is not initialized.');
if (!this.isInitialized) throw new Error('SessionPool is not initialized.');
}

/**
Expand Down
Loading