Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Co-authored-by: Jarred Sumner <709451+Jarred-Sumner@users.noreply.github.com>
  • Loading branch information
Jarred-Sumner and Jarred-Sumner authored Dec 7, 2023
1 parent 798f548 commit 653e293
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 24 deletions.
36 changes: 31 additions & 5 deletions src/bun.js/webcore/blob.zig
Original file line number Diff line number Diff line change
Expand Up @@ -3815,15 +3815,17 @@ pub const Blob = struct {
return this.store != null and this.store.?.data == .file;
}

pub fn toStringWithBytes(this: *Blob, global: *JSGlobalObject, buf: []const u8, comptime lifetime: Lifetime) JSValue {
pub fn toStringWithBytes(this: *Blob, global: *JSGlobalObject, buf_: []const u8, comptime lifetime: Lifetime) JSValue {
// null == unknown
// false == can't be
const could_be_all_ascii = this.is_all_ascii orelse this.store.?.is_all_ascii;

const buf = strings.withoutUTF8BOM(buf_);

if (could_be_all_ascii == null or !could_be_all_ascii.?) {
// if toUTF16Alloc returns null, it means there are no non-ASCII characters
// instead of erroring, invalid characters will become a U+FFFD replacement character
if (strings.toUTF16AllocAllowBOM(bun.default_allocator, buf, false, true) catch unreachable) |external| {
if (strings.toUTF16Alloc(bun.default_allocator, buf, false) catch unreachable) |external| {
if (lifetime != .temporary)
this.setIsASCIIFlag(false);

Expand All @@ -3850,21 +3852,36 @@ pub const Blob = struct {
// we don't need to clone
.clone => {
this.store.?.ref();
// we don't need to worry about UTF-8 BOM in this case because the store owns the memory.
return ZigString.init(buf).external(global, this.store.?, Store.external);
},
.transfer => {
var store = this.store.?;
std.debug.assert(store.data == .bytes);
this.transfer();
// we don't need to worry about UTF-8 BOM in this case because the store owns the memory.
return ZigString.init(buf).external(global, store, Store.external);
},
// strings are immutable
// sharing isn't really a thing
.share => {
this.store.?.ref();
// we don't need to worry about UTF-8 BOM in this case because the store owns the memory.
return ZigString.init(buf).external(global, this.store.?, Store.external);
},
.temporary => {
// if there was a UTF-8 BOM, we need to clone the buffer because
// external doesn't support this case here yet.
if (buf.len != buf_.len) {
var out = bun.String.createLatin1(buf);
defer {
bun.default_allocator.free(buf_);
out.deref();
}

return out.toJS(global);
}

return ZigString.init(buf).toExternalValue(global);
},
}
Expand Down Expand Up @@ -3894,7 +3911,8 @@ pub const Blob = struct {
return toJSONWithBytes(this, global, view_, lifetime);
}

pub fn toJSONWithBytes(this: *Blob, global: *JSGlobalObject, buf: []const u8, comptime lifetime: Lifetime) JSValue {
pub fn toJSONWithBytes(this: *Blob, global: *JSGlobalObject, buf_: []const u8, comptime lifetime: Lifetime) JSValue {
const buf = strings.withoutUTF8BOM(buf_);
if (buf.len == 0) return global.createSyntaxErrorInstance("Unexpected end of JSON input", .{});
// null == unknown
// false == can't be
Expand All @@ -3905,7 +3923,7 @@ pub const Blob = struct {
var stack_fallback = std.heap.stackFallback(4096, bun.default_allocator);
const allocator = stack_fallback.get();
// if toUTF16Alloc returns null, it means there are no non-ASCII characters
if (strings.toUTF16AllocAllowBOM(allocator, buf, false, true) catch null) |external| {
if (strings.toUTF16Alloc(allocator, buf, false) catch null) |external| {
if (comptime lifetime != .temporary) this.setIsASCIIFlag(false);
const result = ZigString.init16(external).toJSONObject(global);
allocator.free(external);
Expand Down Expand Up @@ -4534,11 +4552,19 @@ pub const InternalBlob = struct {
was_string: bool = false,

pub fn toStringOwned(this: *@This(), globalThis: *JSC.JSGlobalObject) JSValue {
if (strings.toUTF16AllocAllowBOM(globalThis.allocator(), this.bytes.items, false, true) catch &[_]u16{}) |out| {
const bytes_without_bom = strings.withoutUTF8BOM(this.bytes.items);
if (strings.toUTF16Alloc(globalThis.allocator(), bytes_without_bom, false) catch &[_]u16{}) |out| {
const return_value = ZigString.toExternalU16(out.ptr, out.len, globalThis);
return_value.ensureStillAlive();
this.deinit();
return return_value;
} else if
// If there was a UTF8 BOM, we clone it
(bytes_without_bom.len != this.bytes.items.len) {
defer this.deinit();
var out = bun.String.createLatin1(this.bytes.items[3..]);
defer out.deref();
return out.toJS(globalThis);
} else {
var str = ZigString.init(this.toOwnedSlice());
str.mark();
Expand Down
22 changes: 3 additions & 19 deletions src/string_immutable.zig
Original file line number Diff line number Diff line change
Expand Up @@ -1313,13 +1313,9 @@ pub fn copyLatin1IntoASCII(dest: []u8, src: []const u8) void {

const utf8_bom = [_]u8{ 0xef, 0xbb, 0xbf };

pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool) !?[]u16 {
return toUTF16AllocAllowBOM(allocator, bytes, fail_if_invalid, false);
}

pub fn withoutUTF8BOM(bytes: []const u8) []const u8 {
if (bytes.len > 3 and strings.eqlComptime(bytes[0..3], utf8_bom)) {
return bytes[3..];
if (strings.hasPrefixComptime(bytes, utf8_bom)) {
return bytes[utf8_bom.len..];
} else {
return bytes;
}
Expand All @@ -1328,20 +1324,8 @@ pub fn withoutUTF8BOM(bytes: []const u8) []const u8 {
/// Convert a UTF-8 string to a UTF-16 string IF there are any non-ascii characters
/// If there are no non-ascii characters, this returns null
/// This is intended to be used for strings that go to JavaScript
pub fn toUTF16AllocAllowBOM(allocator: std.mem.Allocator, bytes_: []const u8, comptime fail_if_invalid: bool, comptime allow_bom: bool) !?[]u16 {
var bytes = bytes_;
pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool) !?[]u16 {
if (strings.firstNonASCII(bytes)) |i| {
if (comptime allow_bom) {
// we could avoid the allocation here when it's otherwise ASCII. But
// it gets really complicated because most memory allocators need
// the head pointer to be the allocated one so if we instead return
// a non-head pointer and try to free that the allocator might not
// be able to free it, and we would have a big problem.
if (i == 0 and bytes.len > 3 and strings.eqlComptime(bytes[0..3], utf8_bom)) {
bytes = bytes[3..];
}
}

const output_: ?std.ArrayList(u16) = if (comptime bun.FeatureFlags.use_simdutf) simd: {
const trimmed = bun.simdutf.trim.utf8(bytes);

Expand Down
32 changes: 32 additions & 0 deletions test/js/web/fetch/utf8-bom.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,39 @@
import { describe, expect, it, test } from "bun:test";

describe("UTF-8 BOM should be ignored", () => {
// A body consisting solely of the UTF-8 BOM bytes should read as an empty
// string through text() and should be rejected by json().
test("handles empty strings", async () => {
  // A Response body can only be consumed once. Build a fresh Response for each
  // read so that json() is rejected because of the payload itself rather than
  // because text() already consumed the body.
  const makeResponse = () => new Response(new Blob([Buffer.from([0xef, 0xbb, 0xbf])]));

  expect(await makeResponse().text()).toHaveLength(0);
  expect(async () => await makeResponse().json()).toThrow();
});

// A UTF-8 BOM followed by a single emoji should read as just the emoji through
// text(), and should be rejected by json() because a bare emoji is not JSON.
test("handles UTF8 BOM + emoji", async () => {
  // A Response body can only be consumed once. Build a fresh Response for each
  // read so that json() is rejected because of the payload itself rather than
  // because text() already consumed the body.
  const makeResponse = () => new Response(new Blob([Buffer.from([0xef, 0xbb, 0xbf]), Buffer.from("🌎")]));

  // "🌎" lies outside the Basic Multilingual Plane, so it occupies two UTF-16
  // code units; a length of 2 means the BOM contributed nothing to the string.
  expect(await makeResponse().text()).toHaveLength(2);
  expect(async () => await makeResponse().json()).toThrow();
});

describe("Blob", () => {
// Blob payloads that start with "\uFEFF" (the byte order mark code point) and
// also contain a non-ASCII character (the emoji). Each case expects the leading
// "\uFEFF" to be absent from the decoded result.
describe("with emoji", () => {
// text(): the returned string must equal the payload without the leading "\uFEFF".
it("in text()", async () => {
const blob = new Blob(["\uFEFFHello, World! 🌎"], { type: "text/plain" });
expect(await blob.text()).toBe("Hello, World! 🌎");
});

// json(): the leading "\uFEFF" must not prevent parsing, and the emoji inside
// the string value must be preserved.
it("in json()", async () => {
const blob = new Blob(['\uFEFF{"hello":"World 🌎"}'], { type: "application/json" });
expect(await blob.json()).toStrictEqual({ "hello": "World 🌎" } as any);
});

// formData(): the first key must be "hello", not "\uFEFFhello", and its value
// must keep the emoji.
it("in formData()", async () => {
const blob = new Blob(["\uFEFFhello=world 🌎"], { type: "application/x-www-form-urlencoded" });
const formData = await blob.formData();
expect(formData.get("hello")).toBe("world 🌎");
});
});

it("in text()", async () => {
const blob = new Blob(["\uFEFFHello, World!"], { type: "text/plain" });
expect(await blob.text()).toBe("Hello, World!");
Expand Down

0 comments on commit 653e293

Please sign in to comment.