Skip to content

Commit

Permalink
Add support for web streams for fileTypeStream (#649)
Browse files Browse the repository at this point in the history
Co-authored-by: Sindre Sorhus <sindresorhus@gmail.com>
  • Loading branch information
Borewit and sindresorhus authored Aug 2, 2024
1 parent 7c3bea1 commit 2000141
Show file tree
Hide file tree
Showing 7 changed files with 106 additions and 40 deletions.
16 changes: 16 additions & 0 deletions core.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,17 @@ export declare class TokenizerPositionError extends Error {
constructor(message?: string);
}

/**
A web `ReadableStream` of `Uint8Array` chunks, augmented with a read-only `fileType` property.

`fileType` is `undefined` when no file type could be determined from the sampled bytes.
*/
export type AnyWebReadableByteStreamWithFileType = AnyWebReadableStream<Uint8Array> & {
	readonly fileType?: FileTypeResult;
};

/**
Returns a `Promise` which resolves to the original readable stream argument, but with an added `fileType` property, which is an object like the one returned from `fileTypeFromFile()`.

This method can be handy to put in a stream pipeline, but it comes with a price. Internally `stream()` builds up a buffer of `sampleSize` bytes, used as a sample, to determine the file type. The sample size impacts the file detection resolution. A smaller sample size will result in lower probability of the best file type detection.

@param webStream - A web `ReadableStream` streaming the file to examine.
@param options - May be used to override the default sample size.
@returns The original stream argument, with an added `fileType` property.
*/
export function fileTypeStream(webStream: AnyWebReadableStream<Uint8Array>, options?: StreamOptions): Promise<AnyWebReadableByteStreamWithFileType>;

export declare class FileTypeParser {
detectors: Iterable<Detector>;

Expand All @@ -494,4 +505,9 @@ export declare class FileTypeParser {
Works the same way as {@link fileTypeFromBlob}, additionally taking into account custom detectors (if any were provided to the constructor).
*/
fromBlob(blob: Blob): Promise<FileTypeResult | undefined>;

/**
Works the same way as {@link fileTypeStream}, additionally taking into account custom detectors (if any were provided to the constructor).
*/
toDetectionStream(webStream: AnyWebReadableStream<Uint8Array>, options?: StreamOptions): Promise<AnyWebReadableByteStreamWithFileType>;
}
49 changes: 49 additions & 0 deletions core.js
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ export async function fileTypeFromTokenizer(tokenizer) {
return new FileTypeParser().fromTokenizer(tokenizer);
}

/**
Detect the file type of a web `ReadableStream`, resolving to the same stream with an added `fileType` property.

Convenience wrapper around `FileTypeParser#toDetectionStream` without custom detectors.

@param webStream - A web `ReadableStream`, streaming a file to examine.
@param options - May be used to override the default sample size.
@returns A `Promise` which resolves to the original readable stream argument, but with an added `fileType` property.
*/
export async function fileTypeStream(webStream, options = {}) {
	// Forward `options` — the declared signature accepts it, and dropping it made
	// `sampleSize` overrides impossible (and crashed on destructuring downstream).
	return new FileTypeParser().toDetectionStream(webStream, options);
}

export class FileTypeParser {
constructor(options) {
this.detectors = options?.customDetectors;
Expand Down Expand Up @@ -104,6 +108,51 @@ export class FileTypeParser {
}
}

async toDetectionStream(stream, options) {
const {sampleSize = reasonableDetectionSizeInBytes} = options;
let detectedFileType;
let firstChunk;

const reader = stream.getReader({mode: 'byob'});
try {
// Read the first chunk from the stream
const {value: chunk, done} = await reader.read(new Uint8Array(sampleSize));
firstChunk = chunk;
if (!done && chunk) {
try {
// Attempt to detect the file type from the chunk
detectedFileType = await this.fromBuffer(chunk.slice(0, sampleSize));
} catch (error) {
if (!(error instanceof strtok3.EndOfStreamError)) {
throw error; // Re-throw non-EndOfStreamError
}

detectedFileType = undefined;
}
}

firstChunk = chunk;
} finally {
reader.releaseLock(); // Ensure the reader is released
}

// Create a new ReadableStream to manage locking issues
const transformStream = new TransformStream({
async start(controller) {
controller.enqueue(firstChunk); // Enqueue the initial chunk
},
transform(chunk, controller) {
// Pass through the chunks without modification
controller.enqueue(chunk);
},
});

const newStream = stream.pipeThrough(transformStream);
newStream.fileType = detectedFileType;

return newStream;
}

check(header, options) {
return _check(this.buffer, header, options);
}
Expand Down
15 changes: 6 additions & 9 deletions index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Typings for Node.js specific entry point.
*/

import type {Readable as NodeReadableStream} from 'node:stream';
import type {FileTypeResult, StreamOptions, AnyWebReadableStream, Detector} from './core.js';
import type {FileTypeResult, StreamOptions, AnyWebReadableStream, Detector, AnyWebReadableByteStreamWithFileType} from './core.js';
import {FileTypeParser} from './core.js';

export type ReadableStreamWithFileType = NodeReadableStream & {
Expand All @@ -14,8 +14,6 @@ export type ReadableStreamWithFileType = NodeReadableStream & {
Extending `FileTypeParser` with Node.js engine specific functions.
*/
export declare class NodeFileTypeParser extends FileTypeParser {
constructor(options?: {customDetectors?: Iterable<Detector>});

/**
@param stream - Node.js `stream.Readable` or web `ReadableStream`.
*/
Expand All @@ -27,6 +25,7 @@ export declare class NodeFileTypeParser extends FileTypeParser {
Works the same way as {@link fileTypeStream}, additionally taking into account custom detectors (if any were provided to the constructor).
*/
toDetectionStream(readableStream: NodeReadableStream, options?: StreamOptions): Promise<ReadableStreamWithFileType>;
toDetectionStream(webStream: AnyWebReadableStream<Uint8Array>, options?: StreamOptions): Promise<AnyWebReadableByteStreamWithFileType>;
}

/**
Expand Down Expand Up @@ -66,11 +65,8 @@ Internally `stream()` builds up a buffer of `sampleSize` bytes, used as a sample
The sample size impacts the file detection resolution.
A smaller sample size will result in lower probability of the best file type detection.
**Note:** This method is only available when using Node.js.
**Note:** Requires Node.js 14 or later.
@param readableStream - A [readable stream](https://nodejs.org/api/stream.html#stream_class_stream_readable) containing a file to examine.
@param options - Maybe used to override the default sample-size.
@param readableStream - A [web `ReadableStream`](https://developer.mozilla.org/en-US/docs/Web/API/ReadableStream) or [Node.js `stream.Readable`](https://nodejs.org/api/stream.html#stream_class_stream_readable), streaming a file to examine.
@param options - May be used to override the default sample size.
@returns A `Promise` which resolves to the original readable stream argument, but with an added `fileType` property, which is an object like the one returned from `fileTypeFromFile()`.
@example
Expand All @@ -87,7 +83,8 @@ if (stream2.fileType?.mime === 'image/jpeg') {
// stream2 can be used to stream the JPEG image (from the very beginning of the stream)
}
```
*/
*/
export function fileTypeStream(readableStream: NodeReadableStream, options?: StreamOptions): Promise<ReadableStreamWithFileType>;
export function fileTypeStream(webStream: AnyWebReadableStream<Uint8Array>, options?: StreamOptions): Promise<AnyWebReadableByteStreamWithFileType>;

export * from './core.js';
10 changes: 7 additions & 3 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Node.js specific entry point.
*/

import {ReadableStream as WebReadableStream} from 'node:stream/web';
import {pipeline, PassThrough} from 'node:stream';
import * as strtok3 from 'strtok3';
import {FileTypeParser, reasonableDetectionSizeInBytes} from './core.js';

Expand All @@ -26,7 +27,10 @@ export class NodeFileTypeParser extends FileTypeParser {
}

async toDetectionStream(readableStream, options = {}) {
const {default: stream} = await import('node:stream');
if (readableStream instanceof WebReadableStream) {
return super.toDetectionStream(readableStream, options);
}

const {sampleSize = reasonableDetectionSizeInBytes} = options;

return new Promise((resolve, reject) => {
Expand All @@ -36,8 +40,8 @@ export class NodeFileTypeParser extends FileTypeParser {
(async () => {
try {
// Set up output stream
const pass = new stream.PassThrough();
const outputStream = stream.pipeline ? stream.pipeline(readableStream, pass, () => {}) : readableStream.pipe(pass);
const pass = new PassThrough();
const outputStream = pipeline ? pipeline(readableStream, pass, () => {}) : readableStream.pipe(pass);

// Read the input stream and detect the filetype
const chunk = readableStream.read(sampleSize) ?? readableStream.read() ?? new Uint8Array(0);
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@
"vsdx"
],
"dependencies": {
"get-stream": "^9.0.1",
"strtok3": "^8.0.0",
"token-types": "^6.0.0",
"uint8array-extras": "^1.3.0"
Expand Down
2 changes: 1 addition & 1 deletion readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ Or `undefined` when there is no match.

#### stream

Type: [`stream.Readable`](https://nodejs.org/api/stream.html#stream_class_stream_readable)
Type: [Web `ReadableStream`](https://developer.mozilla.org/en-US/docs/Web/API/ReadableStream) or [Node.js `stream.Readable`](https://nodejs.org/api/stream.html#stream_class_stream_readable)

A readable stream representing file data.

Expand Down
53 changes: 26 additions & 27 deletions test.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@ import process from 'node:process';
import path from 'node:path';
import {fileURLToPath} from 'node:url';
import fs from 'node:fs';
import {readFile} from 'node:fs/promises';
import stream from 'node:stream';
import test from 'ava';
import {readableNoopStream} from 'noop-stream';
import {Parser as ReadmeParser} from 'commonmark';
import * as strtok3 from 'strtok3/core';
import {areUint8ArraysEqual} from 'uint8array-extras';
import {getStreamAsArrayBuffer} from 'get-stream';
import {
fileTypeFromBuffer,
fileTypeFromStream as fileTypeNodeFromStream,
Expand All @@ -26,7 +28,7 @@ const missingTests = new Set([
]);

const [nodeMajorVersion] = process.versions.node.split('.').map(Number);
const nodeVersionSupportingByeBlobStream = 20;
const nodeVersionSupportingByteBlobStream = 20;

const types = [...supportedExtensions].filter(ext => !missingTests.has(ext));

Expand Down Expand Up @@ -337,38 +339,34 @@ async function testFileNodeFromStream(t, ext, name) {
t.is(typeof fileType.mime, 'string', 'fileType.mime');
}

async function loadEntireFile(readable) {
const chunks = [];
let totalLength = 0;

for await (const chunk of readable) {
chunks.push(chunk);
totalLength += chunk.length;
}

const entireFile = new Uint8Array(totalLength);

let offset = 0;
for (const chunk of chunks) {
entireFile.set(new Uint8Array(chunk), offset);
offset += chunk.length;
}

return entireFile;
// Drain an entire (Node.js or web) stream and return its content as a single Uint8Array.
async function getStreamAsUint8Array(stream) {
	const arrayBuffer = await getStreamAsArrayBuffer(stream);
	return new Uint8Array(arrayBuffer);
}

async function testStream(t, ext, name) {
// Assert that fileTypeStream() on a Node.js Readable yields a stream whose
// content is byte-identical to reading the fixture file directly.
async function testStreamWithNodeStream(t, ext, name) {
	const fixtureName = `${name ?? 'fixture'}.${ext}`;
	const file = path.join(__dirname, 'fixture', fixtureName);

	const detectionStream = await fileTypeStream(fs.createReadStream(file));
	const referenceStream = fs.createReadStream(file);

	const [detectedBytes, referenceBytes] = await Promise.all([
		getStreamAsUint8Array(detectionStream),
		getStreamAsUint8Array(referenceStream),
	]);

	t.true(areUint8ArraysEqual(detectedBytes, referenceBytes));
}

// Assert that fileTypeStream() on a web ReadableStream (obtained from a Blob)
// yields exactly the fixture file's bytes.
async function testStreamWithWebStream(t, ext, name) {
	const fixtureName = `${name ?? 'fixture'}.${ext}`;
	const file = path.join(__dirname, 'fixture', fixtureName);

	// Load the fixture into memory, then expose it as a web stream via Blob.
	const fileBuffer = await readFile(file);
	const blob = new Blob([fileBuffer]);

	const webStream = await fileTypeStream(blob.stream());
	const streamedBytes = await getStreamAsUint8Array(webStream);

	t.true(areUint8ArraysEqual(fileBuffer, streamedBytes));
}

test('Test suite must be able to detect Node.js major version', t => {
t.is(typeof nodeMajorVersion, 'number', 'Detected Node.js major version should be a number');
});
Expand All @@ -382,13 +380,14 @@ for (const type of types) {

_test(`${name}.${type} ${i++} .fileTypeFromFile() method - same fileType`, testFromFile, type, name);
_test(`${name}.${type} ${i++} .fileTypeFromBuffer() method - same fileType`, testFromBuffer, type, name);
if (nodeMajorVersion >= nodeVersionSupportingByeBlobStream) {
if (nodeMajorVersion >= nodeVersionSupportingByteBlobStream) {
		// Streaming a Blob requires a BYOB ReadableStream, which needs Node.js ≥ 20
_test(`${name}.${type} ${i++} .fileTypeFromBlob() method - same fileType`, testFromBlob, type, name);
test(`${name}.${type} ${i++} .fileTypeStream() - identical Web Streams`, testStreamWithWebStream, type, name);
}

_test(`${name}.${type} ${i++} .fileTypeFromStream() Node.js method - same fileType`, testFileNodeFromStream, type, name);
test(`${name}.${type} ${i++} .fileTypeStream() - identical streams`, testStream, type, name);
_test(`${name}.${type} ${i++} .fileTypeStream() - identical Node.js Readable streams`, testStreamWithNodeStream, type, name);
}
} else {
const fixtureName = `fixture.${type}`;
Expand All @@ -397,7 +396,7 @@ for (const type of types) {
_test(`${type} ${i++} .fileTypeFromFile()`, testFromFile, type);
_test(`${type} ${i++} .fileTypeFromBuffer()`, testFromBuffer, type);
_test(`${type} ${i++} .fileTypeFromStream() Node.js`, testFileNodeFromStream, type);
test(`${type} ${i++} .fileTypeStream() - identical streams`, testStream, type);
test(`${type} ${i++} .fileTypeStream() - identical streams`, testStreamWithNodeStream, type);
}

if (Object.prototype.hasOwnProperty.call(falsePositives, type)) {
Expand Down Expand Up @@ -427,7 +426,7 @@ test('.fileTypeStream() method - short stream', async t => {
t.is(newStream.fileType, undefined);

// Test usability of returned stream
const bufferB = await loadEntireFile(newStream);
const bufferB = await getStreamAsUint8Array(newStream);
t.deepEqual(bufferA, bufferB);
});

Expand Down Expand Up @@ -708,7 +707,7 @@ const tokenizerPositionChanger = tokenizer => {
tokenizer.readBuffer(buffer, {length: 1, mayBeLess: true});
};

if (nodeMajorVersion >= nodeVersionSupportingByeBlobStream) {
if (nodeMajorVersion >= nodeVersionSupportingByteBlobStream) {
	// Streaming a Blob requires a BYOB ReadableStream, which needs Node.js ≥ 20

test('fileTypeFromBlob should detect custom file type "unicorn" using custom detectors', async t => {
Expand Down Expand Up @@ -849,7 +848,7 @@ test('fileTypeFromTokenizer should return undefined when a custom detector chang
const header = 'UNICORN FILE\n';
const uint8ArrayContent = new TextEncoder().encode(header);

// Include the unicormDetector here to verify it's not used after the tokenizer.position changed
// Include the unicornDetector here to verify it's not used after the tokenizer.position changed
const customDetectors = [tokenizerPositionChanger, unicornDetector];
const parser = new NodeFileTypeParser({customDetectors});

Expand Down

0 comments on commit 2000141

Please sign in to comment.