From d7fb48e75cbc7d4b71160bbb835b63cde79d16f3 Mon Sep 17 00:00:00 2001
From: Borewit
Date: Mon, 29 Jul 2024 10:19:26 +0200
Subject: [PATCH] Add detecting Web Stream to default (core) entry point

---
 core.d.ts  | 26 +++++++++++++++++++++++
 core.js    | 48 +++++++++++++++++++++++++++++++++++++++++++
 index.d.ts |  8 +++++---
 index.js   | 10 ++++++---
 test.js    | 60 ++++++++++++++++++++++++++++++++++++++++++++++--------
 5 files changed, 137 insertions(+), 15 deletions(-)

diff --git a/core.d.ts b/core.d.ts
index a9378060..fca77504 100644
--- a/core.d.ts
+++ b/core.d.ts
@@ -475,6 +475,27 @@ export declare class TokenizerPositionError extends Error {
 	constructor(message?: string);
 }
 
+export type AnyWebReadableByteStreamWithFileType = AnyWebReadableStream<Uint8Array> & {
+	readonly fileType?: FileTypeResult;
+};
+
+/**
+Returns a `Promise` which resolves to the original readable stream argument, but with an added `fileType` property, which is an object like the one returned from `fileTypeFromFile()`.
+
+This method can be handy to put in between a stream, but it comes at a cost.
+Internally `fileTypeStream()` builds up a buffer of `sampleSize` bytes, used as a sample, to determine the file type.
+The sample size impacts the file detection resolution.
+A smaller sample size will result in a lower probability of detecting the correct file type.
+
+**Note:** This method is also available from the Node.js entry point, where it additionally accepts a Node.js readable stream.
+**Note:** Requires Node.js 14 or later.
+
+@param webStream - A [Web Stream](https://developer.mozilla.org/en-US/docs/Web/API/ReadableStream) containing a file to examine.
+@param options - May be used to override the default sample size.
+@returns A `Promise` which resolves to the original web stream argument, but with an added `fileType` property, which is an object like the one returned from `fileTypeFromFile()`.
+*/
+export function fileTypeStream(webStream: AnyWebReadableStream<Uint8Array>, options?: StreamOptions): Promise<AnyWebReadableByteStreamWithFileType>;
+
 export declare class FileTypeParser {
 	detectors: Iterable<Detector>;
 
@@ -494,4 +515,9 @@ export declare class FileTypeParser {
 	Works the same way as {@link fileTypeFromBlob}, additionally taking into account custom detectors (if any were provided to the constructor).
 	*/
 	fromBlob(blob: Blob): Promise<FileTypeResult | undefined>;
+
+	/**
+	Works the same way as {@link fileTypeStream}, additionally taking into account custom detectors (if any were provided to the constructor).
+	*/
+	toDetectionStream(webStream: AnyWebReadableStream<Uint8Array>, options?: StreamOptions): Promise<AnyWebReadableByteStreamWithFileType>;
 }
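Usage sketch (not part of the patch): the new core declaration above accepts a Web `ReadableStream`. The snippet below feeds it a `Blob` stream, mirroring what the new test does; the fixture path is hypothetical, and a byte-stream source is assumed (Node.js >= 20 for `Blob#stream()`) because the implementation samples the stream with a BYOB reader.

```js
import {readFile} from 'node:fs/promises';
import {fileTypeStream} from 'file-type';

// Hypothetical fixture path; any file will do.
const fileBuffer = await readFile('fixture/fixture.png');

// Blob#stream() returns a Web byte stream on Node.js >= 20, which satisfies the BYOB reader used internally.
const streamWithFileType = await fileTypeStream(new Blob([fileBuffer]).stream());

console.log(streamWithFileType.fileType);
//=> {ext: 'png', mime: 'image/png'} (or `undefined` if no signature matched)

// The returned stream still yields the complete file, including the sampled bytes.
```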
diff --git a/core.js b/core.js
index bb730370..8ef5e044 100644
--- a/core.js
+++ b/core.js
@@ -51,6 +51,10 @@ export async function fileTypeFromTokenizer(tokenizer) {
 	return new FileTypeParser().fromTokenizer(tokenizer);
 }
 
+export async function fileTypeStream(webStream, options) {
+	return new FileTypeParser().toDetectionStream(webStream, options);
+}
+
 export class FileTypeParser {
 	constructor(options) {
 		this.detectors = options?.customDetectors;
@@ -104,6 +108,50 @@ export class FileTypeParser {
 		}
 	}
 
+	async toDetectionStream(webStream, options = {}) {
+		const {sampleSize = reasonableDetectionSizeInBytes} = options;
+
+		// Initialize a BYOB reader from the web stream; this requires a byte-stream source
+		const reader = webStream.getReader({mode: 'byob'});
+		const pass = new TransformStream();
+		const writer = pass.writable.getWriter();
+		let detectedFileType;
+
+		// Read the first chunk, used as the sample for file type detection
+		const {value: chunk, done} = await reader.read(new Uint8Array(sampleSize));
+		if (done || !chunk) {
+			detectedFileType = undefined;
+		} else {
+			try {
+				detectedFileType = await this.fromBuffer(chunk.slice(0, sampleSize));
+			} catch (error) {
+				if (error instanceof strtok3.EndOfStreamError) {
+					detectedFileType = undefined;
+				} else {
+					throw error;
+				}
+			}
+		}
+
+		// Write the initial chunk into the pass-through stream
+		writer.write(chunk);
+
+		// Forward the remaining data from the reader to the writer
+		(async function pump() {
+			const {value, done} = await reader.read(new Uint8Array(512 * 1024));
+			if (done) {
+				return writer.close();
+			}
+
+			await writer.write(value);
+			return pump();
+		})();
+
+		// Attach the detected file type to the output stream
+		pass.readable.fileType = detectedFileType;
+		return pass.readable;
+	}
+
 	check(header, options) {
 		return _check(this.buffer, header, options);
 	}
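A second sketch (not part of the patch): `FileTypeParser#toDetectionStream` honours the same `sampleSize` option, which bounds how many bytes the BYOB read above samples before the rest of the data is piped through unchanged. The input is assumed to be a Web byte stream, for example a `Blob` stream on Node.js >= 20, and the fixture path is hypothetical.

```js
import {readFile} from 'node:fs/promises';
import {FileTypeParser} from 'file-type';

// Assumed input: a Web byte stream over a hypothetical fixture file.
const webByteStream = new Blob([await readFile('fixture/fixture.mp4')]).stream();

// A larger sample helps formats whose signature sits beyond the default sample window,
// at the cost of buffering more data before the detection result becomes available.
const parser = new FileTypeParser();
const detectionStream = await parser.toDetectionStream(webByteStream, {sampleSize: 64 * 1024});

console.log(detectionStream.fileType);
//=> {ext: 'mp4', mime: 'video/mp4'} (or `undefined` when the sample matches no known signature)
```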
diff --git a/index.d.ts b/index.d.ts
index b080e763..edb1980f 100644
--- a/index.d.ts
+++ b/index.d.ts
@@ -3,7 +3,7 @@ Typings for Node.js specific entry point.
 */
 
 import type {Readable as NodeReadableStream} from 'node:stream';
-import type {FileTypeResult, StreamOptions, AnyWebReadableStream, Detector} from './core.js';
+import type {FileTypeResult, StreamOptions, AnyWebReadableStream, Detector, AnyWebReadableByteStreamWithFileType} from './core.js';
 import {FileTypeParser} from './core.js';
 
 export type ReadableStreamWithFileType = NodeReadableStream & {
@@ -27,6 +27,7 @@ export declare class NodeFileTypeParser extends FileTypeParser {
 	Works the same way as {@link fileTypeStream}, additionally taking into account custom detectors (if any were provided to the constructor).
 	*/
 	toDetectionStream(readableStream: NodeReadableStream, options?: StreamOptions): Promise<ReadableStreamWithFileType>;
+	toDetectionStream(webStream: AnyWebReadableStream<Uint8Array>, options?: StreamOptions): Promise<AnyWebReadableByteStreamWithFileType>;
 }
 
 /**
@@ -70,10 +71,10 @@ A smaller sample size will result in lower probability of the best file type detection.
 **Note:** Requires Node.js 14 or later.
 
 @param readableStream - A [readable stream](https://nodejs.org/api/stream.html#stream_class_stream_readable) containing a file to examine.
-@param options - Maybe used to override the default sample-size.
-@returns A `Promise` which resolves to the original readable stream argument, but with an added `fileType` property, which is an object like the one returned from `fileTypeFromFile()`.
+@param options - May be used to override the default sample size.
+@returns A `Promise` which resolves to the original readable stream argument, but with an added `fileType` property, which is an object like the one returned from `fileTypeFromFile()`.
 
 @example
 ```
 import got from 'got';
 import {fileTypeStream} from 'file-type';
@@ -89,5 +90,6 @@ if (stream2.fileType?.mime === 'image/jpeg') {
 ```
 */
 export function fileTypeStream(readableStream: NodeReadableStream, options?: StreamOptions): Promise<ReadableStreamWithFileType>;
+export function fileTypeStream(webStream: AnyWebReadableStream<Uint8Array>, options?: StreamOptions): Promise<AnyWebReadableByteStreamWithFileType>;
 
 export * from './core.js';
diff --git a/index.js b/index.js
index 56dd18af..1d414482 100644
--- a/index.js
+++ b/index.js
@@ -3,6 +3,7 @@ Node.js specific entry point.
 */
 
 import {ReadableStream as WebReadableStream} from 'node:stream/web';
+import {pipeline, PassThrough} from 'node:stream';
 import * as strtok3 from 'strtok3';
 import {FileTypeParser, reasonableDetectionSizeInBytes} from './core.js';
 
@@ -26,7 +27,10 @@ export class NodeFileTypeParser extends FileTypeParser {
 	}
 
 	async toDetectionStream(readableStream, options = {}) {
-		const {default: stream} = await import('node:stream');
+		if (readableStream instanceof WebReadableStream) {
+			return super.toDetectionStream(readableStream, options);
+		}
+
 		const {sampleSize = reasonableDetectionSizeInBytes} = options;
 
 		return new Promise((resolve, reject) => {
@@ -36,8 +40,8 @@ export class NodeFileTypeParser extends FileTypeParser {
 			(async () => {
 				try {
 					// Set up output stream
-					const pass = new stream.PassThrough();
-					const outputStream = stream.pipeline ? stream.pipeline(readableStream, pass, () => {}) : readableStream.pipe(pass);
+					const pass = new PassThrough();
+					const outputStream = pipeline ? pipeline(readableStream, pass, () => {}) : readableStream.pipe(pass);
 
 					// Read the input stream and detect the filetype
 					const chunk = readableStream.read(sampleSize) ?? readableStream.read() ?? new Uint8Array(0);
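Behavioural sketch for the Node.js entry point (not part of the patch): with the `instanceof WebReadableStream` check above, the same `fileTypeStream` call now accepts either stream flavour, dispatching Web streams to the core implementation and keeping the PassThrough/pipeline path for Node.js readables. The fixture path and Node.js >= 20 (for a BYOB-capable `Blob#stream()`) are assumptions.

```js
import fs from 'node:fs';
import {readFile} from 'node:fs/promises';
import {fileTypeStream} from 'file-type';

// Node.js Readable: handled by the PassThrough/pipeline branch in index.js.
const fromNodeStream = await fileTypeStream(fs.createReadStream('fixture/fixture.gif'));

// Web ReadableStream: recognized via `instanceof WebReadableStream` and forwarded to core.js.
const fromWebStream = await fileTypeStream(new Blob([await readFile('fixture/fixture.gif')]).stream());

console.log(fromNodeStream.fileType?.ext, fromWebStream.fileType?.ext);
//=> 'gif' 'gif'
```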
diff --git a/test.js b/test.js
index 4e25f4f2..acaa527e 100644
--- a/test.js
+++ b/test.js
@@ -2,6 +2,7 @@ import process from 'node:process';
 import path from 'node:path';
 import {fileURLToPath} from 'node:url';
 import fs from 'node:fs';
+import {readFile} from 'node:fs/promises';
 import stream from 'node:stream';
 import test from 'ava';
 import {readableNoopStream} from 'noop-stream';
@@ -26,7 +27,7 @@ const missingTests = new Set([
 ]);
 
 const [nodeMajorVersion] = process.versions.node.split('.').map(Number);
-const nodeVersionSupportingByeBlobStream = 20;
+const nodeVersionSupportingByteBlobStream = 20;
 
 const types = [...supportedExtensions].filter(ext => !missingTests.has(ext));
 
@@ -337,7 +338,7 @@ async function testFileNodeFromStream(t, ext, name) {
 	t.is(typeof fileType.mime, 'string', 'fileType.mime');
 }
 
-async function loadEntireFile(readable) {
+async function loadEntireFileFromNodeReadable(readable) {
 	const chunks = [];
 	let totalLength = 0;
 
@@ -357,18 +358,58 @@ async function loadEntireFile(readable) {
 	return entireFile;
 }
 
-async function testStream(t, ext, name) {
+async function testStreamWithNodeStream(t, ext, name) {
 	const fixtureName = `${(name ?? 'fixture')}.${ext}`;
 	const file = path.join(__dirname, 'fixture', fixtureName);
 
 	const readableStream = await fileTypeStream(fs.createReadStream(file));
 	const fileStream = fs.createReadStream(file);
 
-	const [bufferA, bufferB] = await Promise.all([loadEntireFile(readableStream), loadEntireFile(fileStream)]);
+	const [bufferA, bufferB] = await Promise.all([loadEntireFileFromNodeReadable(readableStream), loadEntireFileFromNodeReadable(fileStream)]);
 
 	t.true(areUint8ArraysEqual(bufferA, bufferB));
 }
 
+async function loadEntireFileFromWebStream(webStream) {
+	const reader = webStream.getReader();
+	const chunks = [];
+	let totalLength = 0;
+	let bytesRead = 0;
+
+	do {
+		const {done, value} = await reader.read();
+		if (done) {
+			break;
+		}
+
+		chunks.push(value);
+		bytesRead = value.byteLength;
+		totalLength += bytesRead;
+	} while (bytesRead > 0);
+
+	// Concatenate all chunks into a single Uint8Array
+	const entireFile = new Uint8Array(totalLength);
+	let offset = 0;
+	for (const chunk of chunks) {
+		entireFile.set(chunk, offset);
+		offset += chunk.byteLength;
+	}
+
+	return entireFile;
+}
+
+async function testStreamWithWebStream(t, ext, name) {
+	const fixtureName = `${(name ?? 'fixture')}.${ext}`;
+	const file = path.join(__dirname, 'fixture', fixtureName);
+	// Read the file into a buffer
+	const fileBuffer = await readFile(file);
+	// Create a Blob from the buffer
+	const blob = new Blob([fileBuffer]);
+	const webStream = await fileTypeStream(blob.stream());
+	const webStreamResult = await loadEntireFileFromWebStream(webStream);
+	t.true(areUint8ArraysEqual(fileBuffer, webStreamResult));
+}
+
 test('Test suite must be able to detect Node.js major version', t => {
 	t.is(typeof nodeMajorVersion, 'number', 'Detected Node.js major version should be a number');
 });
@@ -382,13 +423,14 @@ for (const type of types) {
 			_test(`${name}.${type} ${i++} .fileTypeFromFile() method - same fileType`, testFromFile, type, name);
 			_test(`${name}.${type} ${i++} .fileTypeFromBuffer() method - same fileType`, testFromBuffer, type, name);
 
-			if (nodeMajorVersion >= nodeVersionSupportingByeBlobStream) {
+			if (nodeMajorVersion >= nodeVersionSupportingByteBlobStream) {
 				// Blob requires to stream to BYOB ReadableStream, requiring Node.js ≥ 20
 				_test(`${name}.${type} ${i++} .fileTypeFromBlob() method - same fileType`, testFromBlob, type, name);
+				test(`${name}.${type} ${i++} .fileTypeStream() - identical Web Streams`, testStreamWithWebStream, type, name);
 			}
 
 			_test(`${name}.${type} ${i++} .fileTypeFromStream() Node.js method - same fileType`, testFileNodeFromStream, type, name);
-			test(`${name}.${type} ${i++} .fileTypeStream() - identical streams`, testStream, type, name);
+			_test(`${name}.${type} ${i++} .fileTypeStream() - identical Node.js Readable streams`, testStreamWithNodeStream, type, name);
 		}
 	} else {
 		const fixtureName = `fixture.${type}`;
@@ -397,7 +439,7 @@ for (const type of types) {
 		_test(`${type} ${i++} .fileTypeFromFile()`, testFromFile, type);
 		_test(`${type} ${i++} .fileTypeFromBuffer()`, testFromBuffer, type);
 		_test(`${type} ${i++} .fileTypeFromStream() Node.js`, testFileNodeFromStream, type);
-		test(`${type} ${i++} .fileTypeStream() - identical streams`, testStream, type);
+		test(`${type} ${i++} .fileTypeStream() - identical streams`, testStreamWithNodeStream, type);
 	}
 
 	if (Object.prototype.hasOwnProperty.call(falsePositives, type)) {
@@ -427,7 +469,7 @@ test('.fileTypeStream() method - short stream', async t => {
 	t.is(newStream.fileType, undefined);
 
 	// Test usability of returned stream
-	const bufferB = await loadEntireFile(newStream);
+	const bufferB = await loadEntireFileFromNodeReadable(newStream);
 
 	t.deepEqual(bufferA, bufferB);
 });
@@ -708,7 +750,7 @@ const tokenizerPositionChanger = tokenizer => {
 	tokenizer.readBuffer(buffer, {length: 1, mayBeLess: true});
 };
 
-if (nodeMajorVersion >= nodeVersionSupportingByeBlobStream) {
+if (nodeMajorVersion >= nodeVersionSupportingByteBlobStream) {
 	// Blob requires to stream to BYOB ReadableStream, requiring Node.js ≥ 20
 	test('fileTypeFromBlob should detect custom file type "unicorn" using custom detectors', async t => {