Skip to content

Commit

Permalink
chore: maybe this fixes docs?
Browse files Browse the repository at this point in the history
  • Loading branch information
vladfrangu committed Mar 26, 2024
1 parent 53209f8 commit af21cc3
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 0 deletions.
17 changes: 17 additions & 0 deletions packages/core/src/storages/request_provider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,23 @@ export abstract class RequestProvider implements IStorage {
return new Request(requestOptions as unknown as RequestOptions);
}

/**
 * Returns the next request in the queue to be processed, or `null` if there are no more pending requests.
 *
 * Once you successfully finish processing the request, you need to call
 * {@apilink RequestQueue.markRequestHandled}
 * to mark the request as handled in the queue. If there was an error while processing the request,
 * call {@apilink RequestQueue.reclaimRequest} instead,
 * so that the queue will give the request to some other consumer in another call to the `fetchNextRequest` function.
 *
 * Note that a `null` return value does not mean that queue processing has finished —
 * it only means there are currently no pending requests.
 * To check whether all requests in the queue were processed,
 * use {@apilink RequestQueue.isFinished} instead.
 *
 * @returns
 *   The request object, or `null` if there are no more pending requests.
 */
abstract fetchNextRequest<T extends Dictionary = Dictionary>(options?: RequestOptions): Promise<Request<T> | null>;

/**
Expand Down
22 changes: 22 additions & 0 deletions packages/core/src/storages/request_queue.ts
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,12 @@ export class RequestQueue extends RequestProvider {
return super.markRequestHandled(...args);
}

/**
* Reclaims a failed request back to the queue, so that it can be returned for processing later again
* by another call to {@apilink RequestQueue.fetchNextRequest}.
* The request record in the queue is updated using the provided `request` parameter.
* For example, this lets you store the number of retries or error messages for the request.
*/
override async reclaimRequest(...args: Parameters<RequestProvider['reclaimRequest']>) {
checkStorageAccess();

Expand Down Expand Up @@ -359,6 +365,22 @@ export class RequestQueue extends RequestProvider {
this.lastActivity = new Date();
}

/**
 * Opens a request queue and resolves to an instance of the {@apilink RequestQueue} class.
 *
 * A {@apilink RequestQueue} holds the URLs to crawl, persisted either on the local
 * filesystem or in the cloud. It is designed for deep crawls that begin from a handful
 * of start URLs and recursively enqueue links discovered on visited pages; both
 * breadth-first and depth-first orderings are supported by the underlying structure.
 *
 * See the {@apilink RequestQueue} class for further details and code examples.
 *
 * @param [queueIdOrName]
 *   ID or name of the request queue to open. When `null` or `undefined`,
 *   the default request queue associated with the crawler run is returned.
 * @param [options] Open Request Queue options.
 */
static override async open(...args: Parameters<typeof RequestProvider.open>): Promise<RequestQueue> {
    const opened = super.open(...args);
    // The base class resolves to a RequestProvider; narrow it to this subclass for callers.
    return opened as Promise<RequestQueue>;
}
Expand Down
69 changes: 69 additions & 0 deletions packages/core/src/storages/request_queue_v2.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,53 @@ const MAX_CACHED_REQUESTS = 2_000_000;
*/
const RECENTLY_HANDLED_CACHE_SIZE = 1000;

/**
* Represents a queue of URLs to crawl, which is used for deep crawling of websites
* where you start with several URLs and then recursively
* follow links to other pages. The data structure supports both breadth-first and depth-first crawling orders.
*
* Each URL is represented using an instance of the {@apilink Request} class.
* The queue can only contain unique URLs. More precisely, it can only contain {@apilink Request} instances
* with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden.
* To add a single URL multiple times to the queue,
* corresponding {@apilink Request} objects will need to have different `uniqueKey` properties.
*
* Do not instantiate this class directly, use the {@apilink RequestQueue.open} function instead.
*
* `RequestQueue` is used by {@apilink BasicCrawler}, {@apilink CheerioCrawler}, {@apilink PuppeteerCrawler}
* and {@apilink PlaywrightCrawler} as a source of URLs to crawl.
* Unlike {@apilink RequestList}, `RequestQueue` supports dynamic adding and removing of requests.
* On the other hand, the queue is not optimized for operations that add or remove a large number of URLs in a batch.
*
* `RequestQueue` stores its data either on local disk or in the Apify Cloud,
* depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variable is set.
*
* If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the queue data is stored in
* that directory in an SQLite database file.
*
* If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the
* [Apify Request Queue](https://docs.apify.com/storage/request-queue)
* cloud storage. Note that you can force usage of the cloud storage also by passing the `forceCloud`
* option to {@apilink RequestQueue.open} function,
* even if the `APIFY_LOCAL_STORAGE_DIR` variable is set.
*
* **Example usage:**
*
* ```javascript
* // Open the default request queue associated with the crawler run
* const queue = await RequestQueue.open();
*
* // Open a named request queue
* const queueWithName = await RequestQueue.open('some-name');
*
 * // Enqueue a few requests
* await queue.addRequest({ url: 'http://example.com/aaa' });
* await queue.addRequest({ url: 'http://example.com/bbb' });
* await queue.addRequest({ url: 'http://example.com/foo/bar' }, { forefront: true });
* ```
* @category Sources
 */
class RequestQueue extends RequestProvider {
private _listHeadAndLockPromise: Promise<void> | null = null;

Expand Down Expand Up @@ -143,6 +190,12 @@ class RequestQueue extends RequestProvider {
return request;
}

/**
* Reclaims a failed request back to the queue, so that it can be returned for processing later again
* by another call to {@apilink RequestQueue.fetchNextRequest}.
* The request record in the queue is updated using the provided `request` parameter.
* For example, this lets you store the number of retries or error messages for the request.
*/
override async reclaimRequest(...args: Parameters<RequestProvider['reclaimRequest']>): ReturnType<RequestProvider['reclaimRequest']> {
checkStorageAccess();

Expand Down Expand Up @@ -350,6 +403,22 @@ class RequestQueue extends RequestProvider {
}
}

/**
 * Opens a request queue and resolves to an instance of the {@apilink RequestQueue} class.
 *
 * A {@apilink RequestQueue} holds the URLs to crawl, stored either on the local
 * filesystem or in the cloud. It is intended for deep crawls of websites, where a
 * few starting URLs are expanded by recursively following links to other pages;
 * the data structure supports both breadth-first and depth-first crawl orders.
 *
 * For more details and code examples, see the {@apilink RequestQueue} class.
 *
 * @param [queueIdOrName]
 *   ID or name of the request queue to open. If `null` or `undefined`,
 *   the default request queue associated with the crawler run is used.
 * @param [options] Open Request Queue options.
 */
static override async open(...args: Parameters<typeof RequestProvider.open>): Promise<RequestQueue> {
    const queuePromise = super.open(...args);
    // Base class types this as RequestProvider; callers of this subclass expect RequestQueue.
    return queuePromise as Promise<RequestQueue>;
}
Expand Down

0 comments on commit af21cc3

Please sign in to comment.