Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix some nonutf8 encodings #23

Merged
merged 7 commits into from
Jul 30, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 39 additions & 9 deletions index.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
var fs = require('fs');
var path = require("path");
var max_bytes = 512;
var MAX_BYTES = 512;

module.exports = function(bytes, size) {
var file = bytes;
Expand All @@ -13,7 +13,7 @@ module.exports = function(bytes, size) {
}
var descriptor = fs.openSync(file, 'r');
try {
bytes = new Buffer(max_bytes);
bytes = new Buffer(MAX_BYTES);
size = fs.readSync(descriptor, bytes, 0, bytes.length, 0);
} finally {
fs.closeSync(descriptor);
Expand All @@ -27,7 +27,7 @@ module.exports = function(bytes, size) {

fs.open(file, 'r', function(err, descriptor){
if (err) return callback(err);
var bytes = new Buffer(max_bytes);
var bytes = new Buffer(MAX_BYTES);
// Read the file with no encoding for raw buffer access.
fs.read(descriptor, bytes, 0, bytes.length, 0, function(err, size, bytes){
fs.close(descriptor, function(err2){
Expand All @@ -48,10 +48,40 @@ function isBinaryCheck(bytes, size) {
return false;

var suspicious_bytes = 0;
var total_bytes = Math.min(size, max_bytes);
var total_bytes = Math.min(size, MAX_BYTES);

// UTF-8 BOM
if (size >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
// UTF-8 BOM. This isn't binary.
return false;
}

// UTF-32 BE BOM
if (size >= 4 && bytes[0] === 0x00 && bytes[1] === 0x00 && bytes[2] == 0xFE && bytes[3] == 0xFF) {
return false;
}

// UTF-32 LE BOM
if (size >= 4 && bytes[0] == 0xFF && bytes[1] == 0xFE && bytes[2] === 0x00 && bytes[3] === 0x00) {
return false;
}

// GB 18030 BOM
if (size >= 4 && bytes[0] == 0x84 && bytes[1] == 0x31 && bytes[2] == 0x95 && bytes[3] == 0x33) {
return false;
}

// PDF
if (total_bytes >= 4 && bytes[0] == 0x25 && bytes[1] == 0x50 && bytes[2] == 0x44 && bytes[3] == 0x46) {
return true;
}

// UTF-16 BE BOM
if (size >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF) {
return false;
}

// UTF-16 LE BOM
if (size >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE) {
return false;
}

Expand All @@ -64,20 +94,20 @@ function isBinaryCheck(bytes, size) {
if (bytes[i] > 193 && bytes[i] < 224 && i + 1 < total_bytes) {
i++;
if (bytes[i] > 127 && bytes[i] < 192) {
continue;
continue;
}
}
else if (bytes[i] > 223 && bytes[i] < 240 && i + 2 < total_bytes) {
i++;
if (bytes[i] > 127 && bytes[i] < 192 && bytes[i + 1] > 127 && bytes[i + 1] < 192) {
i++;
continue;
i++;
continue;
}
}
suspicious_bytes++;
// Read at least 32 bytes before making a decision
if (i > 32 && (suspicious_bytes * 100) / total_bytes > 10) {
return true;
return true;
}
}
}
Expand Down
Binary file removed test/fixtures/04_HelloWorld.pdf
Binary file not shown.
File renamed without changes.
1 change: 1 addition & 0 deletions test/fixtures/encodings/big5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
BIG5_TW �Тעբ��s�X�c�餤�����
22 changes: 22 additions & 0 deletions test/fixtures/encodings/big5_B.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
OpenVPN HOWTO ���媩
�@�̡Gliyi Ķ �峹�X�B�Gopenvpn �o���ɶ��G2005-09-16 �I���G3435 �r��G �i�p �� �j�j
OpenVPN HOWTO

����
�����ɴy�z�@�Ө嫬��Home��Office���q�H��OpenVPN���t�m�C�o��HOWTO�|�F�@�ӧ��㪺�t�m��ҡA�bman page��U�������@�ӧ�²�檺�Ҥl�C

��HOWTO�����٦��p�U�榡�G

PDF
PostScript

���[������
��L���@�ǫܦn�����ɤ�HOWTO �����P���ҤU�t�mOpenVPN�ӧ@�C

�򥻪��G�D(Tunnel)����
OpenVPN�i�H�Ыب�ذ򥻪��G�D�����G

Routed IP tunnels -- �A�Ω󤣻ݼs�����I���IIP(point-to-point)�q�H�C��_���������G�D�Ӳ���o�󦳮IJv�ǦӥB����t�m�C��HOWTO���ɲ[�\�FRouted IP tunnels�C
Bridged Ethernet Tunnels(���������G�D) -- ��Ω�IP��ij�ΫDIP��ij���G�D�C�o���������G�D��A�X��ϥμs��(broadcast)�����ΡA��p�Y��Windows���������C�t�m�_�ӵy�L�_���ǡC������������G�D��Mini-HOWTO�C
Routed IP tunnel HOWTO
�ڭ̷|���մy�z�@�ӧ��㪺�t�ΰt�m�A�����A�Ψ쨾����AVPN�ANAT�H�ΥL�̩��������ۤ����p�A�ڭ̤��|�t�ߪ��@�����@���������QVPN�]�m�C
Binary file added test/fixtures/encodings/bom_utf-16.txt
Binary file not shown.
Binary file added test/fixtures/encodings/bom_utf-16le.txt
Binary file not shown.
Binary file added test/fixtures/encodings/bom_utf-32.txt
Binary file not shown.
Binary file added test/fixtures/encodings/bom_utf-32le.txt
Binary file not shown.
1 change: 1 addition & 0 deletions test/fixtures/encodings/bom_utf-8.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
UTF-8 chinese UTF8格式的中文,包含中文标点符号“”。看看能不能看清楚
1 change: 1 addition & 0 deletions test/fixtures/encodings/test-gb.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
��ͨ
1 change: 1 addition & 0 deletions test/fixtures/encodings/test-gb2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
��ͨͨ��ͨͨ����
1 change: 1 addition & 0 deletions test/fixtures/encodings/test-kr.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
�׷��� �̷��� ���� ���� �ø������ϴ� ����
18 changes: 18 additions & 0 deletions test/fixtures/encodings/test-latin.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@

-*- coding: latin-1 -*-

Mit freundlichen Gr��en
mit freundlichen Gr��en

Das ist ein �pfel.
Was k�nnen Sie jetzt machen?

Machen wir eine �bung!
Wor�ber?
Dar�ber.

Das ist euro: �
Euro: �!

�Clinux

2 changes: 2 additions & 0 deletions test/fixtures/encodings/test-shishi.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ʩʦʵʫʿ,ʶʳʯʨʬʷ,ʾʷʵ,ʱʰʮʭʺʪʯʨʬ,ʼʹʯʨʬʴ,ʵʷʫ.
ʫʦʧʨʩʺʪʫʿʫʬʮʭʮʯʰʱʲʳʴʵʶʷʸʹʺʻʼʽʾʫʿ
Binary file added test/fixtures/encodings/test-utf16be.txt
Binary file not shown.
1 change: 1 addition & 0 deletions test/fixtures/encodings/utf8cn.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
UTF-8 chinese UTF8格式的中文,包含中文标点符号“”。看看能不能看清楚
1 change: 1 addition & 0 deletions test/fixtures/encodings/utf_8.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
中文
File renamed without changes.
File renamed without changes.
File renamed without changes
Binary file added test/fixtures/pdf.pdf
Binary file not shown.
File renamed without changes.
File renamed without changes.
File renamed without changes
71 changes: 42 additions & 29 deletions test/index.js
Original file line number Diff line number Diff line change
@@ -1,70 +1,65 @@
var assert = require("assert");
var fs = require("fs");
var path = require("path");
var isBinaryFile = require("../index");

var FIXTURE_PATH = "./test/fixtures";

describe('isBinaryFile', function() {
it('should fail on a binary program', function() {
assert(isBinaryFile("./test/fixtures/01_grep"));
assert(isBinaryFile(path.join(FIXTURE_PATH, "grep")));

var bytes = fs.readFileSync("./test/fixtures/01_grep");
var stat = fs.lstatSync("./test/fixtures/01_grep");
var bytes = fs.readFileSync(path.join(FIXTURE_PATH, "grep"));
var stat = fs.lstatSync(path.join(FIXTURE_PATH, "grep"));
assert(isBinaryFile(bytes, stat.size));
});

it('should not fail on an extensionless script', function() {
assert(!isBinaryFile("./test/fixtures/02_perl_script"));
assert(!isBinaryFile(path.join(FIXTURE_PATH, "perl_script")));

var bytes = fs.readFileSync("./test/fixtures/02_perl_script");
var stat = fs.lstatSync("./test/fixtures/02_perl_script");
var bytes = fs.readFileSync(path.join(FIXTURE_PATH, "perl_script"));
var stat = fs.lstatSync(path.join(FIXTURE_PATH, "perl_script"));
assert(!isBinaryFile(bytes, stat.size));
});

it('should not fail on a russian text', function() {
assert(!isBinaryFile("./test/fixtures/03_russian_file.rst"));
assert(!isBinaryFile(path.join(FIXTURE_PATH, "russian_file.rst")));

var bytes = fs.readFileSync("./test/fixtures/03_russian_file.rst");
var stat = fs.lstatSync("./test/fixtures/03_russian_file.rst");
var bytes = fs.readFileSync(path.join(FIXTURE_PATH, "russian_file.rst"));
var stat = fs.lstatSync(path.join(FIXTURE_PATH, "russian_file.rst"));
assert(!isBinaryFile(bytes, stat.size));
});

it('should not fail on a PDF', function() {
assert(isBinaryFile("./test/fixtures/04_HelloWorld.pdf"));

var bytes = fs.readFileSync("./test/fixtures/04_HelloWorld.pdf");
var stat = fs.lstatSync("./test/fixtures/04_HelloWorld.pdf");
assert(isBinaryFile(bytes, stat.size));
});

it('should not fail on a zero-byte file', function() {
assert(!isBinaryFile("./test/fixtures/05_null_file.gif"));
assert(!isBinaryFile(path.join(FIXTURE_PATH, "null_file.gif")));

var bytes = fs.readFileSync("./test/fixtures/05_null_file.gif");
var stat = fs.lstatSync("./test/fixtures/05_null_file.gif");
var bytes = fs.readFileSync(path.join(FIXTURE_PATH, "null_file.gif"));
var stat = fs.lstatSync(path.join(FIXTURE_PATH, "null_file.gif"));
assert(!isBinaryFile(bytes, stat.size));
});

it('should not fail on a gif', function() {
assert(isBinaryFile("./test/fixtures/06_trunks.gif"));
assert(isBinaryFile(path.join(FIXTURE_PATH, "trunks.gif")));

var bytes = fs.readFileSync("./test/fixtures/06_trunks.gif");
var stat = fs.lstatSync("./test/fixtures/06_trunks.gif");
var bytes = fs.readFileSync(path.join(FIXTURE_PATH, "trunks.gif"));
var stat = fs.lstatSync(path.join(FIXTURE_PATH, "trunks.gif"));
assert(isBinaryFile(bytes, stat.size));
});

it('should not fail on some UTF8 lua file', function() {
assert(!isBinaryFile("./test/fixtures/07_no.lua"));
assert(!isBinaryFile(path.join(FIXTURE_PATH, "no.lua")));

var bytes = fs.readFileSync("./test/fixtures/07_no.lua");
var stat = fs.lstatSync("./test/fixtures/07_no.lua");
var bytes = fs.readFileSync(path.join(FIXTURE_PATH, "no.lua"));
var stat = fs.lstatSync(path.join(FIXTURE_PATH, "no.lua"));
assert(!isBinaryFile(bytes, stat.size));
});

it('should fail a directory', function() {
assert(!isBinaryFile("./test/fixtures/08_dir"));
assert(!isBinaryFile(path.join(FIXTURE_PATH, "dir")));
});

it('should fail a directory with async', function(done) {
isBinaryFile("./test/fixtures/08_dir", function(err, result) {
isBinaryFile(path.join(FIXTURE_PATH, "dir"), function(err, result) {
assert(!err);
assert(!result);
done();
Expand All @@ -73,7 +68,7 @@ describe('isBinaryFile', function() {

it('should not fail with async', function(done) {
assert.doesNotThrow(function() {
isBinaryFile("./test/fixtures/06_trunks.gif", function(err, result) {
isBinaryFile(path.join(FIXTURE_PATH, "trunks.gif"), function(err, result) {
assert(!err);
assert(result);
done();
Expand All @@ -83,4 +78,22 @@ describe('isBinaryFile', function() {
});
});
});

// A PDF must be classified as binary through both entry points:
// the path-based call and the (buffer, size) call.
it('should fail on a PDF', function() {
  var pdfPath = path.join(FIXTURE_PATH, "pdf.pdf");

  // Path form: the library opens and reads the file itself.
  assert(isBinaryFile(pdfPath));

  // Buffer form: the caller supplies the raw bytes and the file size.
  var contents = fs.readFileSync(pdfPath);
  var info = fs.lstatSync(pdfPath);
  assert(isBinaryFile(contents, info.size));
});

// Walks every fixture in the encodings directory and asserts that each one is
// reported as text (not binary).
//
// Changes from the previous version:
//  - `it.only` replaced by `it`, so the rest of the suite runs again.
//  - `encodingDir` and `files` are declared with `var` instead of leaking
//    as implicit globals.
//  - The debugging `console.log` is replaced by an assertion message that
//    names the offending fixture on failure.
it('should pass non-UTF8 files', function() {
  var encodingDir = path.join(FIXTURE_PATH, "encodings");
  var files = fs.readdirSync(encodingDir);

  files.forEach(function(file) {
    // Fixtures whose names contain "big5", "gb" or "kr" are excluded from the
    // check. NOTE(review): presumably these encodings are not yet detected
    // as text -- confirm before removing the exclusion.
    if (!/big5/.test(file) && !/gb/.test(file) && !/kr/.test(file)) {
      assert(
        !isBinaryFile(path.join(encodingDir, file)),
        "expected " + file + " to be detected as text"
      );
    }
  });
});
});