From b42f8847437bc5e9a619db4dbb14137f588d661d Mon Sep 17 00:00:00 2001 From: Sirui Mu Date: Fri, 19 Jan 2024 08:23:11 +0800 Subject: [PATCH] [CIR][CIRGen] Support wide string literals (#399) This commit supports the codegen of wide string literals, including `wchar_t` string literals, `char16_t` string literals, and `char32_t` string literals. I'm not following the proposal in #374. The clang frontend doesn't record the literal string. It only records the encoded code units for wide string literals. So I believe that a dedicated string attribute with an encoding tag as described in #374 may not be that helpful as I thought. --- clang/lib/CIR/CodeGen/CIRGenModule.cpp | 33 ++++++++++++++++++++++++-- clang/test/CIR/CodeGen/wide-string.cpp | 26 ++++++++++++++++++++ 2 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 clang/test/CIR/CodeGen/wide-string.cpp diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 7eab7323d70f..9b7f11d37ce2 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -1103,8 +1103,37 @@ CIRGenModule::getConstantArrayFromStringLiteral(const StringLiteral *E) { return builder.getString(Str, eltTy, finalSize); } - assert(0 && "not implemented"); - return {}; + auto arrayTy = + getTypes().ConvertType(E->getType()).dyn_cast(); + assert(arrayTy && "string literals must be emitted as an array type"); + + auto arrayEltTy = arrayTy.getEltType().dyn_cast(); + assert(arrayEltTy && + "string literal elements must be emitted as integral type"); + + auto arraySize = arrayTy.getSize(); + auto literalSize = E->getLength(); + + // Collect the code units. + SmallVector elementValues; + elementValues.reserve(arraySize); + for (unsigned i = 0; i < literalSize; ++i) + elementValues.push_back(E->getCodeUnit(i)); + elementValues.resize(arraySize); + + // If the string is full of null bytes, emit a #cir.zero instead. + if (std::all_of(elementValues.begin(), elementValues.end(), + [](uint32_t x) { return x == 0; })) + return builder.getZeroAttr(arrayTy); + + // Otherwise emit a constant array holding the characters. + SmallVector elements; + elements.reserve(arraySize); + for (uint64_t i = 0; i < arraySize; ++i) + elements.push_back(mlir::cir::IntAttr::get(arrayEltTy, elementValues[i])); + + auto elementsAttr = mlir::ArrayAttr::get(builder.getContext(), elements); + return builder.getConstArray(elementsAttr, arrayTy); } // TODO(cir): this could be a common AST helper for both CIR and LLVM codegen. diff --git a/clang/test/CIR/CodeGen/wide-string.cpp b/clang/test/CIR/CodeGen/wide-string.cpp new file mode 100644 index 000000000000..e7fc719647ab --- /dev/null +++ b/clang/test/CIR/CodeGen/wide-string.cpp @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir-enable -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s + +const char16_t *test_utf16() { + return u"你好世界"; +} + +// CHECK: cir.global "private" constant internal @{{.+}} = #cir.const_array<[#cir.int<20320> : !u16i, #cir.int<22909> : !u16i, #cir.int<19990> : !u16i, #cir.int<30028> : !u16i, #cir.int<0> : !u16i]> : !cir.array + +const char32_t *test_utf32() { + return U"你好世界"; +} + +// CHECK: cir.global "private" constant internal @{{.+}} = #cir.const_array<[#cir.int<20320> : !u32i, #cir.int<22909> : !u32i, #cir.int<19990> : !u32i, #cir.int<30028> : !u32i, #cir.int<0> : !u32i]> : !cir.array + +const char16_t *test_zero16() { + return u"\0\0\0\0"; +} + +// CHECK: cir.global "private" constant internal @{{.+}} = #cir.zero : !cir.array + +const char32_t *test_zero32() { + return U"\0\0\0\0"; +} + +// CHECK: cir.global "private" constant internal @{{.+}} = #cir.zero : !cir.array