diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 867c5522..f7656b56 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -4,7 +4,10 @@ If you're adding support for a new file type, please follow the below steps: - Add a fixture file named `fixture.<extension>` to the `fixture` directory. - Add the file extension to the `extensions` array in `supported.js`. - Add the file's MIME type to the `types` array in `supported.js`. -- Add the file type detection logic to the `core.js` file +- Add the file type detection logic to the `core.js` file. +- Determine the appropriate detection confidence category: + - `detectConfident()`: Detections with a high degree of certainty in identifying the correct file type. + - `detectImprecise()`: Detections with limited supporting data, resulting in a higher likelihood of false positives. - Respect the sequence: - Signature with shorter sample size (counted from offset 0 until the last required byte position) will be executed first. - Only the initial determination for the file type counts for the sequence. diff --git a/core.d.ts b/core.d.ts index 543c452b..2c84e1a9 100644 --- a/core.d.ts +++ b/core.d.ts @@ -162,7 +162,10 @@ console.log(fileType); // {ext: 'unicorn', mime: 'application/unicorn'} @param fileType - The file type detected by standard or previous custom detectors, or `undefined` if no match is found. @returns The detected file type, or `undefined` if no match is found. 
*/ -export type Detector = (tokenizer: ITokenizer, fileType?: FileTypeResult) => Promise<FileTypeResult | undefined>; +export type Detector = { + id: string; + detect: (tokenizer: ITokenizer, fileType?: FileTypeResult) => Promise<FileTypeResult | undefined>; +}; export type FileTypeOptions = { customDetectors?: Iterable<Detector>; diff --git a/core.js b/core.js index c3b73764..8794e7cd 100644 --- a/core.js +++ b/core.js @@ -154,7 +154,9 @@ export async function fileTypeStream(webStream, options) { export class FileTypeParser { constructor(options) { - this.detectors = [...(options?.customDetectors ?? []), this.parse]; + this.detectors = [...(options?.customDetectors ?? []), + {id: 'core', detect: this.detectConfident}, + {id: 'core.imprecise', detect: this.detectImprecise}]; this.tokenizerOptions = { abortSignal: options?.signal, }; @@ -165,7 +167,7 @@ export class FileTypeParser { // Iterate through all file-type detectors for (const detector of this.detectors) { - const fileType = await detector(tokenizer); + const fileType = await detector.detect(tokenizer); if (fileType) { return fileType; } @@ -256,7 +258,8 @@ export class FileTypeParser { return this.check(stringToBytes(header), options); } - parse = async tokenizer => { + // Detections with a high degree of certainty in identifying the correct file type + detectConfident = async tokenizer => { this.buffer = new Uint8Array(reasonableDetectionSizeInBytes); // Keep reading until EOF if the file size is unknown. 
@@ -346,7 +349,7 @@ export class FileTypeParser { if (this.check([0xEF, 0xBB, 0xBF])) { // UTF-8-BOM // Strip off UTF-8-BOM this.tokenizer.ignore(3); - return this.parse(tokenizer); + return this.detectConfident(tokenizer); } if (this.check([0x47, 0x49, 0x46])) { @@ -1406,39 +1409,6 @@ export class FileTypeParser { return undefined; // Some unknown text based format } - // -- Unsafe signatures -- - - if ( - this.check([0x0, 0x0, 0x1, 0xBA]) - || this.check([0x0, 0x0, 0x1, 0xB3]) - ) { - return { - ext: 'mpg', - mime: 'video/mpeg', - }; - } - - if (this.check([0x00, 0x01, 0x00, 0x00, 0x00])) { - return { - ext: 'ttf', - mime: 'font/ttf', - }; - } - - if (this.check([0x00, 0x00, 0x01, 0x00])) { - return { - ext: 'ico', - mime: 'image/x-icon', - }; - } - - if (this.check([0x00, 0x00, 0x02, 0x00])) { - return { - ext: 'cur', - mime: 'image/x-icon', - }; - } - if (this.check([0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1])) { // Detected Microsoft Compound File Binary File (MS-CFB) Format. return { @@ -1644,6 +1614,45 @@ export class FileTypeParser { mime: 'application/pgp-encrypted', }; } + }; + + // Detections with limited supporting data, resulting in a higher likelihood of false positives + detectImprecise = async tokenizer => { + this.buffer = new Uint8Array(reasonableDetectionSizeInBytes); + + // Read initial sample size of 8 bytes + await tokenizer.peekBuffer(this.buffer, {length: Math.min(8, tokenizer.fileInfo.size), mayBeLess: true}); + + if ( + this.check([0x0, 0x0, 0x1, 0xBA]) + || this.check([0x0, 0x0, 0x1, 0xB3]) + ) { + return { + ext: 'mpg', + mime: 'video/mpeg', + }; + } + + if (this.check([0x00, 0x01, 0x00, 0x00, 0x00])) { + return { + ext: 'ttf', + mime: 'font/ttf', + }; + } + + if (this.check([0x00, 0x00, 0x01, 0x00])) { + return { + ext: 'ico', + mime: 'image/x-icon', + }; + } + + if (this.check([0x00, 0x00, 0x02, 0x00])) { + return { + ext: 'cur', + mime: 'image/x-icon', + }; + } // Check MPEG 1 or 2 Layer 3 header, or 'layer 0' for ADTS (MPEG 
sync-word 0xFFE) if (this.buffer.length >= 2 && this.check([0xFF, 0xE0], {offset: 0, mask: [0xFF, 0xE0]})) { diff --git a/readme.md b/readme.md index 698cc65a..1a8526e1 100644 --- a/readme.md +++ b/readme.md @@ -364,8 +364,9 @@ Below is an example of a custom detector array. This can be passed to the `FileT ```js import {FileTypeParser} from 'file-type'; -const customDetectors = [ - async tokenizer => { +const unicornDetector = { + id: 'unicorn', // May be used to recognize the detector in the detector list + async detect(tokenizer) { const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" in ASCII decimal const buffer = new Uint8Array(unicornHeader.length); @@ -375,11 +376,11 @@ const customDetectors = [ } return undefined; - }, -]; + } +} const buffer = new Uint8Array([85, 78, 73, 67, 79, 82, 78]); -const parser = new FileTypeParser({customDetectors}); +const parser = new FileTypeParser({customDetectors: [unicornDetector]}); const fileType = await parser.fromBuffer(buffer); console.log(fileType); // {ext: 'unicorn', mime: 'application/unicorn'} ``` diff --git a/test.js b/test.js index 54445f18..edd3c7aa 100644 --- a/test.js +++ b/test.js @@ -688,22 +688,31 @@ test('corrupt MKV throws', async t => { }); // Create a custom detector for the just made up "unicorn" file type -const unicornDetector = async tokenizer => { - const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" as decimal string - const buffer = new Uint8Array(7); - await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true}); - if (unicornHeader.every((value, index) => value === buffer[index])) { - return {ext: 'unicorn', mime: 'application/unicorn'}; - } +const unicornDetector = { + id: 'mock.unicorn', + async detect(tokenizer) { + const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" as decimal string + const buffer = new Uint8Array(7); + await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true}); + if 
(unicornHeader.every((value, index) => value === buffer[index])) { + return {ext: 'unicorn', mime: 'application/unicorn'}; + } - return undefined; + return undefined; + }, }; -const mockPngDetector = _tokenizer => ({ext: 'mockPng', mime: 'image/mockPng'}); +const mockPngDetector = { + id: 'mock.png', + detect: () => ({ext: 'mockPng', mime: 'image/mockPng'}), +}; -const tokenizerPositionChanger = tokenizer => { - const buffer = new Uint8Array(1); - tokenizer.readBuffer(buffer, {length: 1, mayBeLess: true}); +const tokenizerPositionChanger = { + id: 'mock.dirtyTokenizer', + detect(tokenizer) { + const buffer = new Uint8Array(1); + tokenizer.readBuffer(buffer, {length: 1, mayBeLess: true}); + }, }; if (nodeMajorVersion >= nodeVersionSupportingByteBlobStream) {