Separate confident and imprecise detectors, introduce detector identification (#717)

Co-authored-by: Sindre Sorhus <sindresorhus@gmail.com>
sindresorhus · Jan 7, 2025 · 356bce8
1 parent 4db407d
commit 356bce8
Show file tree

Hide file tree

Showing 5 changed files with 81 additions and 56 deletions.
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
@@ -4,7 +4,10 @@ If you're adding support for a new file type, please follow the below steps:
 - Add a fixture file named `fixture.<extension>` to the `fixture` directory.
 - Add the file extension to the `extensions` array in `supported.js`.
 - Add the file's MIME type to the `types` array in `supported.js`.
-- Add the file type detection logic to the `core.js` file
+- Add the file type detection logic to the `core.js` file.
+- Determine the appropriate detection confidence category:
+	- `detectConfident()`: Detections with a high degree of certainty in identifying the correct file type.
+	- `detectImprecise()`: Detections with limited supporting data, resulting in a higher likelihood of false positives.
 - Respect the sequence:
 	- Signature with shorter sample size (counted from offset 0 until the last required byte position) will be executed first.
 	- Only the initial determination for the file type counts for the sequence.

diff --git a/core.d.ts b/core.d.ts
@@ -162,7 +162,10 @@ console.log(fileType); // {ext: 'unicorn', mime: 'application/unicorn'}
 @param fileType - The file type detected by standard or previous custom detectors, or `undefined` if no match is found.
 @returns The detected file type, or `undefined` if no match is found.
 */
-export type Detector = (tokenizer: ITokenizer, fileType?: FileTypeResult) => Promise<FileTypeResult | undefined>;
+export type Detector = {
+	id: string;
+	detect: (tokenizer: ITokenizer, fileType?: FileTypeResult) => Promise<FileTypeResult | undefined>;
+};
 
 export type FileTypeOptions = {
 	customDetectors?: Iterable<Detector>;

diff --git a/core.js b/core.js
@@ -154,7 +154,9 @@ export async function fileTypeStream(webStream, options) {
 
 export class FileTypeParser {
 	constructor(options) {
-		this.detectors = [...(options?.customDetectors ?? []), this.parse];
+		this.detectors = [...(options?.customDetectors ?? []),
+			{id: 'core', detect: this.detectConfident},
+			{id: 'core.imprecise', detect: this.detectImprecise}];
 		this.tokenizerOptions = {
 			abortSignal: options?.signal,
 		};
@@ -165,7 +167,7 @@ export class FileTypeParser {
 
 		// Iterate through all file-type detectors
 		for (const detector of this.detectors) {
-			const fileType = await detector(tokenizer);
+			const fileType = await detector.detect(tokenizer);
 			if (fileType) {
 				return fileType;
 			}
@@ -256,7 +258,8 @@ export class FileTypeParser {
 		return this.check(stringToBytes(header), options);
 	}
 
-	parse = async tokenizer => {
+	// Detections with a high degree of certainty in identifying the correct file type
+	detectConfident = async tokenizer => {
 		this.buffer = new Uint8Array(reasonableDetectionSizeInBytes);
 
 		// Keep reading until EOF if the file size is unknown.
@@ -346,7 +349,7 @@ export class FileTypeParser {
 		if (this.check([0xEF, 0xBB, 0xBF])) { // UTF-8-BOM
 			// Strip off UTF-8-BOM
 			this.tokenizer.ignore(3);
-			return this.parse(tokenizer);
+			return this.detectConfident(tokenizer);
 		}
 
 		if (this.check([0x47, 0x49, 0x46])) {
@@ -1406,39 +1409,6 @@ export class FileTypeParser {
 			return undefined; // Some unknown text based format
 		}
 
-		// -- Unsafe signatures --
-
-		if (
-			this.check([0x0, 0x0, 0x1, 0xBA])
-			|| this.check([0x0, 0x0, 0x1, 0xB3])
-		) {
-			return {
-				ext: 'mpg',
-				mime: 'video/mpeg',
-			};
-		}
-
-		if (this.check([0x00, 0x01, 0x00, 0x00, 0x00])) {
-			return {
-				ext: 'ttf',
-				mime: 'font/ttf',
-			};
-		}
-
-		if (this.check([0x00, 0x00, 0x01, 0x00])) {
-			return {
-				ext: 'ico',
-				mime: 'image/x-icon',
-			};
-		}
-
-		if (this.check([0x00, 0x00, 0x02, 0x00])) {
-			return {
-				ext: 'cur',
-				mime: 'image/x-icon',
-			};
-		}
-
 		if (this.check([0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1])) {
 			// Detected Microsoft Compound File Binary File (MS-CFB) Format.
 			return {
@@ -1644,6 +1614,45 @@ export class FileTypeParser {
 				mime: 'application/pgp-encrypted',
 			};
 		}
+	};
+
+	// Detections with limited supporting data, resulting in a higher likelihood of false positives
+	detectImprecise = async tokenizer => {
+		this.buffer = new Uint8Array(reasonableDetectionSizeInBytes);
+
+		// Read initial sample size of 8 bytes
+		await tokenizer.peekBuffer(this.buffer, {length: Math.min(8, tokenizer.fileInfo.size), mayBeLess: true});
+
+		if (
+			this.check([0x0, 0x0, 0x1, 0xBA])
+			|| this.check([0x0, 0x0, 0x1, 0xB3])
+		) {
+			return {
+				ext: 'mpg',
+				mime: 'video/mpeg',
+			};
+		}
+
+		if (this.check([0x00, 0x01, 0x00, 0x00, 0x00])) {
+			return {
+				ext: 'ttf',
+				mime: 'font/ttf',
+			};
+		}
+
+		if (this.check([0x00, 0x00, 0x01, 0x00])) {
+			return {
+				ext: 'ico',
+				mime: 'image/x-icon',
+			};
+		}
+
+		if (this.check([0x00, 0x00, 0x02, 0x00])) {
+			return {
+				ext: 'cur',
+				mime: 'image/x-icon',
+			};
+		}
 
 		// Check MPEG 1 or 2 Layer 3 header, or 'layer 0' for ADTS (MPEG sync-word 0xFFE)
 		if (this.buffer.length >= 2 && this.check([0xFF, 0xE0], {offset: 0, mask: [0xFF, 0xE0]})) {

diff --git a/readme.md b/readme.md
@@ -364,8 +364,9 @@ Below is an example of a custom detector array. This can be passed to the `FileT
 ```js
 import {FileTypeParser} from 'file-type';
 
-const customDetectors = [
-	async tokenizer => {
+const unicornDetector = {
+	id: 'unicorn', // May be used to recognize the detector in the detector list
+  	async detect(tokenizer) {
 		const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" in ASCII decimal
 
 		const buffer = new Uint8Array(unicornHeader.length);
@@ -375,11 +376,11 @@ const customDetectors = [
 		}
 
 		return undefined;
-	},
-];
+	}
+}
 
 const buffer = new Uint8Array([85, 78, 73, 67, 79, 82, 78]);
-const parser = new FileTypeParser({customDetectors});
+const parser = new FileTypeParser({customDetectors: [unicornDetector]});
 const fileType = await parser.fromBuffer(buffer);
 console.log(fileType); // {ext: 'unicorn', mime: 'application/unicorn'}
 ```

diff --git a/test.js b/test.js
@@ -688,22 +688,31 @@ test('corrupt MKV throws', async t => {
 });
 
 // Create a custom detector for the just made up "unicorn" file type
-const unicornDetector = async tokenizer => {
-	const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" as decimal string
-	const buffer = new Uint8Array(7);
-	await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true});
-	if (unicornHeader.every((value, index) => value === buffer[index])) {
-		return {ext: 'unicorn', mime: 'application/unicorn'};
-	}
+const unicornDetector = {
+	id: 'mock.unicorn',
+	async detect(tokenizer) {
+		const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" as decimal string
+		const buffer = new Uint8Array(7);
+		await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true});
+		if (unicornHeader.every((value, index) => value === buffer[index])) {
+			return {ext: 'unicorn', mime: 'application/unicorn'};
+		}
 
-	return undefined;
+		return undefined;
+	},
 };
 
-const mockPngDetector = _tokenizer => ({ext: 'mockPng', mime: 'image/mockPng'});
+const mockPngDetector = {
+	id: 'mock.png',
+	detect: () => ({ext: 'mockPng', mime: 'image/mockPng'}),
+};
 
-const tokenizerPositionChanger = tokenizer => {
-	const buffer = new Uint8Array(1);
-	tokenizer.readBuffer(buffer, {length: 1, mayBeLess: true});
+const tokenizerPositionChanger = {
+	id: 'mock.dirtyTokenizer',
+	detect(tokenizer) {
+		const buffer = new Uint8Array(1);
+		tokenizer.readBuffer(buffer, {length: 1, mayBeLess: true});
+	},
 };
 
 if (nodeMajorVersion >= nodeVersionSupportingByteBlobStream) {