From 2000141a45410b3d557fec3f4d294e89918c0c59 Mon Sep 17 00:00:00 2001 From: Borewit Date: Fri, 2 Aug 2024 16:36:46 +0200 Subject: [PATCH] Add support for web streams for `fileTypeStream` (#649) Co-authored-by: Sindre Sorhus <sindresorhus@gmail.com> --- core.d.ts | 16 ++++++++++++++++ core.js | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ index.d.ts | 15 ++++++--------- index.js | 10 +++++++--- package.json | 1 + readme.md | 2 +- test.js | 53 ++++++++++++++++++++++++++-------------------------- 7 files changed, 106 insertions(+), 40 deletions(-) diff --git a/core.d.ts b/core.d.ts index a9378060..a8c15319 100644 --- a/core.d.ts +++ b/core.d.ts @@ -475,6 +475,17 @@ export declare class TokenizerPositionError extends Error { constructor(message?: string); } +export type AnyWebReadableByteStreamWithFileType = AnyWebReadableStream<Uint8Array> & { + readonly fileType?: FileTypeResult; +}; + +/** +Returns a `Promise` which resolves to the original readable stream argument, but with an added `fileType` property, which is an object like the one returned from `fileTypeFromFile()`. + +This method can be handy to put in a stream pipeline, but it comes with a price. Internally `stream()` builds up a buffer of `sampleSize` bytes, used as a sample, to determine the file type. The sample size impacts the file detection resolution. A smaller sample size will result in lower probability of the best file type detection. +*/ +export function fileTypeStream(webStream: AnyWebReadableStream<Uint8Array>, options?: StreamOptions): Promise<AnyWebReadableByteStreamWithFileType>; + export declare class FileTypeParser { detectors: Iterable<Detector>; @@ -494,4 +505,9 @@ export declare class FileTypeParser { Works the same way as {@link fileTypeFromBlob}, additionally taking into account custom detectors (if any were provided to the constructor). */ fromBlob(blob: Blob): Promise<FileTypeResult | undefined>; + + /** + Works the same way as {@link fileTypeStream}, additionally taking into account custom detectors (if any were provided to the constructor). 
+ */ + toDetectionStream(webStream: AnyWebReadableStream<Uint8Array>, options?: StreamOptions): Promise<AnyWebReadableByteStreamWithFileType>; } diff --git a/core.js b/core.js index bb730370..50809027 100644 --- a/core.js +++ b/core.js @@ -51,6 +51,10 @@ export async function fileTypeFromTokenizer(tokenizer) { return new FileTypeParser().fromTokenizer(tokenizer); } +export async function fileTypeStream(webStream) { + return new FileTypeParser().toDetectionStream(webStream); +} + export class FileTypeParser { constructor(options) { this.detectors = options?.customDetectors; @@ -104,6 +108,51 @@ export class FileTypeParser { } } + async toDetectionStream(stream, options) { + const {sampleSize = reasonableDetectionSizeInBytes} = options; + let detectedFileType; + let firstChunk; + + const reader = stream.getReader({mode: 'byob'}); + try { + // Read the first chunk from the stream + const {value: chunk, done} = await reader.read(new Uint8Array(sampleSize)); + firstChunk = chunk; + if (!done && chunk) { + try { + // Attempt to detect the file type from the chunk + detectedFileType = await this.fromBuffer(chunk.slice(0, sampleSize)); + } catch (error) { + if (!(error instanceof strtok3.EndOfStreamError)) { + throw error; // Re-throw non-EndOfStreamError + } + + detectedFileType = undefined; + } + } + + firstChunk = chunk; + } finally { + reader.releaseLock(); // Ensure the reader is released + } + + // Create a new ReadableStream to manage locking issues + const transformStream = new TransformStream({ + async start(controller) { + controller.enqueue(firstChunk); // Enqueue the initial chunk + }, + transform(chunk, controller) { + // Pass through the chunks without modification + controller.enqueue(chunk); + }, + }); + + const newStream = stream.pipeThrough(transformStream); + newStream.fileType = detectedFileType; + + return newStream; + } + check(header, options) { return _check(this.buffer, header, options); } diff --git a/index.d.ts b/index.d.ts index b080e763..49b1700d 100644 --- a/index.d.ts +++ b/index.d.ts @@
-3,7 +3,7 @@ Typings for Node.js specific entry point. */ import type {Readable as NodeReadableStream} from 'node:stream'; -import type {FileTypeResult, StreamOptions, AnyWebReadableStream, Detector} from './core.js'; +import type {FileTypeResult, StreamOptions, AnyWebReadableStream, Detector, AnyWebReadableByteStreamWithFileType} from './core.js'; import {FileTypeParser} from './core.js'; export type ReadableStreamWithFileType = NodeReadableStream & { @@ -14,8 +14,6 @@ export type ReadableStreamWithFileType = NodeReadableStream & { Extending `FileTypeParser` with Node.js engine specific functions. */ export declare class NodeFileTypeParser extends FileTypeParser { - constructor(options?: {customDetectors?: Iterable<Detector>}); - /** @param stream - Node.js `stream.Readable` or web `ReadableStream`. */ @@ -27,6 +25,7 @@ export declare class NodeFileTypeParser extends FileTypeParser { Works the same way as {@link fileTypeStream}, additionally taking into account custom detectors (if any were provided to the constructor). */ toDetectionStream(readableStream: NodeReadableStream, options?: StreamOptions): Promise<ReadableStreamWithFileType>; + toDetectionStream(webStream: AnyWebReadableStream<Uint8Array>, options?: StreamOptions): Promise<AnyWebReadableByteStreamWithFileType>; } /** @@ -66,11 +65,8 @@ Internally `stream()` builds up a buffer of `sampleSize` bytes, used as a sample The sample size impacts the file detection resolution. A smaller sample size will result in lower probability of the best file type detection. -**Note:** This method is only available when using Node.js. -**Note:** Requires Node.js 14 or later. - -@param readableStream - A [readable stream](https://nodejs.org/api/stream.html#stream_class_stream_readable) containing a file to examine. -@param options - Maybe used to override the default sample-size. 
+@param readableStream - A [web `ReadableStream`](https://developer.mozilla.org/en-US/docs/Web/API/ReadableStream) or [Node.js `stream.Readable`](https://nodejs.org/api/stream.html#stream_class_stream_readable), streaming a file to examine. +@param options - May be used to override the default sample size. @returns A `Promise` which resolves to the original readable stream argument, but with an added `fileType` property, which is an object like the one returned from `fileTypeFromFile()`. @example @@ -87,7 +83,8 @@ if (stream2.fileType?.mime === 'image/jpeg') { // stream2 can be used to stream the JPEG image (from the very beginning of the stream) } ``` - */ +*/ export function fileTypeStream(readableStream: NodeReadableStream, options?: StreamOptions): Promise<ReadableStreamWithFileType>; +export function fileTypeStream(webStream: AnyWebReadableStream<Uint8Array>, options?: StreamOptions): Promise<AnyWebReadableByteStreamWithFileType>; export * from './core.js'; diff --git a/index.js b/index.js index 56dd18af..1d414482 100644 --- a/index.js +++ b/index.js @@ -3,6 +3,7 @@ Node.js specific entry point. */ import {ReadableStream as WebReadableStream} from 'node:stream/web'; +import {pipeline, PassThrough} from 'node:stream'; import * as strtok3 from 'strtok3'; import {FileTypeParser, reasonableDetectionSizeInBytes} from './core.js'; @@ -26,7 +27,10 @@ export class NodeFileTypeParser extends FileTypeParser { } async toDetectionStream(readableStream, options = {}) { - const {default: stream} = await import('node:stream'); + if (readableStream instanceof WebReadableStream) { + return super.toDetectionStream(readableStream, options); + } + const {sampleSize = reasonableDetectionSizeInBytes} = options; return new Promise((resolve, reject) => { @@ -36,8 +40,8 @@ (async () => { try { // Set up output stream - const pass = new stream.PassThrough(); - const outputStream = stream.pipeline ?
stream.pipeline(readableStream, pass, () => {}) : readableStream.pipe(pass); + const pass = new PassThrough(); + const outputStream = pipeline ? pipeline(readableStream, pass, () => {}) : readableStream.pipe(pass); // Read the input stream and detect the filetype const chunk = readableStream.read(sampleSize) ?? readableStream.read() ?? new Uint8Array(0); diff --git a/package.json b/package.json index 50ebc8ec..d77454a9 100644 --- a/package.json +++ b/package.json @@ -209,6 +209,7 @@ "vsdx" ], "dependencies": { + "get-stream": "^9.0.1", "strtok3": "^8.0.0", "token-types": "^6.0.0", "uint8array-extras": "^1.3.0" diff --git a/readme.md b/readme.md index 497844f4..21e540ae 100644 --- a/readme.md +++ b/readme.md @@ -170,7 +170,7 @@ Or `undefined` when there is no match. #### stream -Type: [`stream.Readable`](https://nodejs.org/api/stream.html#stream_class_stream_readable) +Type: [Web `ReadableStream`](https://developer.mozilla.org/en-US/docs/Web/API/ReadableStream) or [Node.js `stream.Readable`](https://nodejs.org/api/stream.html#stream_class_stream_readable) A readable stream representing file data. 
diff --git a/test.js b/test.js index 4e25f4f2..3d4761a9 100644 --- a/test.js +++ b/test.js @@ -2,12 +2,14 @@ import process from 'node:process'; import path from 'node:path'; import {fileURLToPath} from 'node:url'; import fs from 'node:fs'; +import {readFile} from 'node:fs/promises'; import stream from 'node:stream'; import test from 'ava'; import {readableNoopStream} from 'noop-stream'; import {Parser as ReadmeParser} from 'commonmark'; import * as strtok3 from 'strtok3/core'; import {areUint8ArraysEqual} from 'uint8array-extras'; +import {getStreamAsArrayBuffer} from 'get-stream'; import { fileTypeFromBuffer, fileTypeFromStream as fileTypeNodeFromStream, @@ -26,7 +28,7 @@ const missingTests = new Set([ ]); const [nodeMajorVersion] = process.versions.node.split('.').map(Number); -const nodeVersionSupportingByeBlobStream = 20; +const nodeVersionSupportingByteBlobStream = 20; const types = [...supportedExtensions].filter(ext => !missingTests.has(ext)); @@ -337,38 +339,34 @@ async function testFileNodeFromStream(t, ext, name) { t.is(typeof fileType.mime, 'string', 'fileType.mime'); } -async function loadEntireFile(readable) { - const chunks = []; - let totalLength = 0; - - for await (const chunk of readable) { - chunks.push(chunk); - totalLength += chunk.length; - } - - const entireFile = new Uint8Array(totalLength); - - let offset = 0; - for (const chunk of chunks) { - entireFile.set(new Uint8Array(chunk), offset); - offset += chunk.length; - } - - return entireFile; +async function getStreamAsUint8Array(stream) { + return new Uint8Array(await getStreamAsArrayBuffer(stream)); } -async function testStream(t, ext, name) { +async function testStreamWithNodeStream(t, ext, name) { const fixtureName = `${(name ?? 
'fixture')}.${ext}`; const file = path.join(__dirname, 'fixture', fixtureName); const readableStream = await fileTypeStream(fs.createReadStream(file)); const fileStream = fs.createReadStream(file); - const [bufferA, bufferB] = await Promise.all([loadEntireFile(readableStream), loadEntireFile(fileStream)]); + const [bufferA, bufferB] = await Promise.all([getStreamAsUint8Array(readableStream), getStreamAsUint8Array(fileStream)]); t.true(areUint8ArraysEqual(bufferA, bufferB)); } +async function testStreamWithWebStream(t, ext, name) { + const fixtureName = `${(name ?? 'fixture')}.${ext}`; + const file = path.join(__dirname, 'fixture', fixtureName); + // Read the file into a buffer + const fileBuffer = await readFile(file); + // Create a Blob from the buffer + const blob = new Blob([fileBuffer]); + const webStream = await fileTypeStream(blob.stream()); + const webStreamResult = await getStreamAsUint8Array(webStream); + t.true(areUint8ArraysEqual(fileBuffer, webStreamResult)); +} + test('Test suite must be able to detect Node.js major version', t => { t.is(typeof nodeMajorVersion, 'number', 'Detected Node.js major version should be a number'); }); @@ -382,13 +380,14 @@ for (const type of types) { _test(`${name}.${type} ${i++} .fileTypeFromFile() method - same fileType`, testFromFile, type, name); _test(`${name}.${type} ${i++} .fileTypeFromBuffer() method - same fileType`, testFromBuffer, type, name); - if (nodeMajorVersion >= nodeVersionSupportingByeBlobStream) { + if (nodeMajorVersion >= nodeVersionSupportingByteBlobStream) { // Blob requires to stream to BYOB ReadableStream, requiring Node.js ≥ 20 _test(`${name}.${type} ${i++} .fileTypeFromBlob() method - same fileType`, testFromBlob, type, name); + test(`${name}.${type} ${i++} .fileTypeStream() - identical Web Streams`, testStreamWithWebStream, type, name); } _test(`${name}.${type} ${i++} .fileTypeFromStream() Node.js method - same fileType`, testFileNodeFromStream, type, name); - test(`${name}.${type} ${i++} 
.fileTypeStream() - identical streams`, testStream, type, name); + _test(`${name}.${type} ${i++} .fileTypeStream() - identical Node.js Readable streams`, testStreamWithNodeStream, type, name); } } else { const fixtureName = `fixture.${type}`; @@ -397,7 +396,7 @@ for (const type of types) { _test(`${type} ${i++} .fileTypeFromFile()`, testFromFile, type); _test(`${type} ${i++} .fileTypeFromBuffer()`, testFromBuffer, type); _test(`${type} ${i++} .fileTypeFromStream() Node.js`, testFileNodeFromStream, type); - test(`${type} ${i++} .fileTypeStream() - identical streams`, testStream, type); + test(`${type} ${i++} .fileTypeStream() - identical streams`, testStreamWithNodeStream, type); } if (Object.prototype.hasOwnProperty.call(falsePositives, type)) { @@ -427,7 +426,7 @@ test('.fileTypeStream() method - short stream', async t => { t.is(newStream.fileType, undefined); // Test usability of returned stream - const bufferB = await loadEntireFile(newStream); + const bufferB = await getStreamAsUint8Array(newStream); t.deepEqual(bufferA, bufferB); }); @@ -708,7 +707,7 @@ const tokenizerPositionChanger = tokenizer => { tokenizer.readBuffer(buffer, {length: 1, mayBeLess: true}); }; -if (nodeMajorVersion >= nodeVersionSupportingByeBlobStream) { +if (nodeMajorVersion >= nodeVersionSupportingByteBlobStream) { // Blob requires to stream to BYOB ReadableStream, requiring Node.js ≥ 20 test('fileTypeFromBlob should detect custom file type "unicorn" using custom detectors', async t => { @@ -849,7 +848,7 @@ test('fileTypeFromTokenizer should return undefined when a custom detector chang const header = 'UNICORN FILE\n'; const uint8ArrayContent = new TextEncoder().encode(header); - // Include the unicormDetector here to verify it's not used after the tokenizer.position changed + // Include the unicornDetector here to verify it's not used after the tokenizer.position changed const customDetectors = [tokenizerPositionChanger, unicornDetector]; const parser = new 
NodeFileTypeParser({customDetectors});