From 53fcf071f05564221c40820d5af0406402ebd7fe Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 26 Sep 2023 20:06:08 +0200
Subject: [PATCH 01/12] Add native support for BFloat16.

---
 base/boot.jl             |  2 ++
 src/abi_x86_64.cpp       |  7 +++++--
 src/aotcompile.cpp       |  8 +++++--
 src/ccall.cpp            |  2 +-
 src/cgutils.cpp          |  2 ++
 src/codegen.cpp          |  3 +++
 src/intrinsics.cpp       |  2 +-
 src/jitlayers.cpp        |  8 ++++---
 src/jl_exported_data.inc |  1 +
 src/jltypes.c            |  2 ++
 src/julia.h              |  1 +
 src/julia_internal.h     |  2 ++
 src/runtime_intrinsics.c | 45 ++++++++++++++++++++++++++++++++++++++++
 src/staticdata.c         |  3 ++-
 14 files changed, 78 insertions(+), 10 deletions(-)

diff --git a/base/boot.jl b/base/boot.jl
index 637b16e04c13e..7f7f4cf02422d 100644
--- a/base/boot.jl
+++ b/base/boot.jl
@@ -217,6 +217,8 @@ primitive type Float16 <: AbstractFloat 16 end
 primitive type Float32 <: AbstractFloat 32 end
 primitive type Float64 <: AbstractFloat 64 end
 
+primitive type BFloat16 <: AbstractFloat 16 end
+
 #primitive type Bool <: Integer 8 end
 abstract type AbstractChar end
 primitive type Char <: AbstractChar 32 end
diff --git a/src/abi_x86_64.cpp b/src/abi_x86_64.cpp
index c3d12417e6de8..5938e1e5778a2 100644
--- a/src/abi_x86_64.cpp
+++ b/src/abi_x86_64.cpp
@@ -118,7 +118,8 @@ struct Classification {
 void classifyType(Classification& accum, jl_datatype_t *dt, uint64_t offset) const
 {
     // Floating point types
-    if (dt == jl_float64_type || dt == jl_float32_type) {
+    if (dt == jl_float64_type || dt == jl_float32_type || dt == jl_float16_type ||
+        dt == jl_bfloat16_type) {
         accum.addField(offset, Sse);
     }
     // Misc types
@@ -239,7 +240,9 @@ Type *preferred_llvm_type(jl_datatype_t *dt, bool isret, LLVMContext &ctx) const
                 types[0] = Type::getIntNTy(ctx, nbits);
             break;
         case Sse:
-            if (size <= 4)
+            if (size <= 2)
+                types[0] = Type::getHalfTy(ctx);
+            else if (size <= 4)
                 types[0] = Type::getFloatTy(ctx);
             else
                 types[0] = Type::getDoubleTy(ctx);
diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 3a54e2729ff5f..e3417a4c0dca1 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -497,7 +497,6 @@ static void reportWriterError(const ErrorInfoBase &E)
     jl_safe_printf("ERROR: failed to emit output file %s\n", err.c_str());
 }
 
-#if JULIA_FLOAT16_ABI == 1
 static void injectCRTAlias(Module &M, StringRef name, StringRef alias, FunctionType *FT)
 {
     Function *target = M.getFunction(alias);
@@ -514,7 +513,7 @@ static void injectCRTAlias(Module &M, StringRef name, StringRef alias, FunctionT
     auto val = builder.CreateCall(target, CallArgs);
     builder.CreateRet(val);
 }
-#endif
+
 void multiversioning_preannotate(Module &M);
 
 // See src/processor.h for documentation about this table. Corresponds to jl_image_shard_t.
@@ -1061,6 +1060,11 @@ static AOTOutputs add_output_impl(Module &M, TargetMachine &SourceTM, ShardTimer
 #else
             emitFloat16Wrappers(M, false);
 #endif
+
+            injectCRTAlias(M, "__truncsfbf2", "julia__truncsfbf2", // Float32 -> BFloat16
+                    FunctionType::get(Type::getBFloatTy(M.getContext()), { Type::getFloatTy(M.getContext()) }, false));
+            injectCRTAlias(M, "__truncdfbf2", "julia__truncdfbf2", // Float64 -> BFloat16; same name as the JIT alias
+                    FunctionType::get(Type::getBFloatTy(M.getContext()), { Type::getDoubleTy(M.getContext()) }, false));
         }
         timers.optimize.stopTimer();
     }
diff --git a/src/ccall.cpp b/src/ccall.cpp
index 118803cef1b10..cb069b58c3761 100644
--- a/src/ccall.cpp
+++ b/src/ccall.cpp
@@ -1127,7 +1127,7 @@ std::string generate_func_sig(const char *fname)
                 // see pull req #978. need to annotate signext/zeroext for
                 // small integer arguments.
                 jl_datatype_t *bt = (jl_datatype_t*)tti;
-                if (jl_datatype_size(bt) < 4 && bt != jl_float16_type) {
+                if (jl_datatype_size(bt) < 4) {
                     if (jl_signed_type && jl_subtype(tti, (jl_value_t*)jl_signed_type))
                         ab.addAttribute(Attribute::SExt);
                     else
diff --git a/src/cgutils.cpp b/src/cgutils.cpp
index 7dfa509357e5a..91be89ddbe395 100644
--- a/src/cgutils.cpp
+++ b/src/cgutils.cpp
@@ -665,6 +665,8 @@ static Type *bitstype_to_llvm(jl_value_t *bt, LLVMContext &ctxt, bool llvmcall =
         return getFloatTy(ctxt);
     if (bt == (jl_value_t*)jl_float64_type)
         return getDoubleTy(ctxt);
+    if (bt == (jl_value_t*)jl_bfloat16_type)
+        return getBFloatTy(ctxt);
     if (jl_is_llvmpointer_type(bt)) {
         jl_value_t *as_param = jl_tparam1(bt);
         int as;
diff --git a/src/codegen.cpp b/src/codegen.cpp
index b6d18b23c930e..20f2dfe28165f 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -125,6 +125,9 @@ auto getFloatTy(LLVMContext &ctxt) {
 auto getDoubleTy(LLVMContext &ctxt) {
     return Type::getDoubleTy(ctxt);
 }
+auto getBFloatTy(LLVMContext &ctxt) {
+    return Type::getBFloatTy(ctxt); // LLVM bfloat type owned by the given context
+}
 auto getFP128Ty(LLVMContext &ctxt) {
     return Type::getFP128Ty(ctxt);
 }
diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp
index 3e7ace18a1749..1bb68674990b7 100644
--- a/src/intrinsics.cpp
+++ b/src/intrinsics.cpp
@@ -165,7 +165,7 @@ static Type *INTT(Type *t, const DataLayout &DL)
         return getInt64Ty(ctxt);
     if (t == getFloatTy(ctxt))
         return getInt32Ty(ctxt);
-    if (t == getHalfTy(ctxt))
+    if (t == getHalfTy(ctxt) || t == getBFloatTy(ctxt))
         return getInt16Ty(ctxt);
     unsigned nb = t->getPrimitiveSizeInBits();
     assert(t != getVoidTy(ctxt) && nb > 0);
diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp
index f0360c6addc95..6c356759cc066 100644
--- a/src/jitlayers.cpp
+++ b/src/jitlayers.cpp
@@ -1727,16 +1727,18 @@ JuliaOJIT::JuliaOJIT()
     ExternalJD.addToLinkOrder(GlobalJD, orc::JITDylibLookupFlags::MatchExportedSymbolsOnly);
     ExternalJD.addToLinkOrder(JD, orc::JITDylibLookupFlags::MatchExportedSymbolsOnly);
 
-#if JULIA_FLOAT16_ABI == 1
     orc::SymbolAliasMap jl_crt = {
+#if JULIA_FLOAT16_ABI == 1
         { mangle("__gnu_h2f_ieee"), { mangle("julia__gnu_h2f_ieee"), JITSymbolFlags::Exported } },
         { mangle("__extendhfsf2"),  { mangle("julia__gnu_h2f_ieee"), JITSymbolFlags::Exported } },
         { mangle("__gnu_f2h_ieee"), { mangle("julia__gnu_f2h_ieee"), JITSymbolFlags::Exported } },
         { mangle("__truncsfhf2"),   { mangle("julia__gnu_f2h_ieee"), JITSymbolFlags::Exported } },
-        { mangle("__truncdfhf2"),   { mangle("julia__truncdfhf2"),   JITSymbolFlags::Exported } }
+        { mangle("__truncdfhf2"),   { mangle("julia__truncdfhf2"),   JITSymbolFlags::Exported } },
+#endif
+        { mangle("__truncsfbf2"),   { mangle("julia__truncsfbf2"),   JITSymbolFlags::Exported } },
+        { mangle("__truncdfbf2"),   { mangle("julia__truncdfbf2"),   JITSymbolFlags::Exported } },
     };
     cantFail(GlobalJD.define(orc::symbolAliases(jl_crt)));
-#endif
 
 #ifdef MSAN_EMUTLS_WORKAROUND
     orc::SymbolMap msan_crt;
diff --git a/src/jl_exported_data.inc b/src/jl_exported_data.inc
index 2acde218a104c..aa23b9d7b8205 100644
--- a/src/jl_exported_data.inc
+++ b/src/jl_exported_data.inc
@@ -42,6 +42,7 @@
     XX(jl_float16_type) \
     XX(jl_float32_type) \
     XX(jl_float64_type) \
+    XX(jl_bfloat16_type) \
     XX(jl_floatingpoint_type) \
     XX(jl_function_type) \
     XX(jl_binding_type) \
diff --git a/src/jltypes.c b/src/jltypes.c
index 998f3fe47f157..33b52158488a3 100644
--- a/src/jltypes.c
+++ b/src/jltypes.c
@@ -3403,6 +3403,8 @@ void post_boot_hooks(void)
     //XX(float32);
     jl_float64_type = (jl_datatype_t*)core("Float64");
     //XX(float64);
+    jl_bfloat16_type = (jl_datatype_t*)core("BFloat16");
+    //XX(bfloat16);
     jl_floatingpoint_type = (jl_datatype_t*)core("AbstractFloat");
     jl_number_type  = (jl_datatype_t*)core("Number");
     jl_signed_type  = (jl_datatype_t*)core("Signed");
diff --git a/src/julia.h b/src/julia.h
index 07f8459d37238..a357bdf558360 100644
--- a/src/julia.h
+++ b/src/julia.h
@@ -848,6 +848,7 @@ extern JL_DLLIMPORT jl_datatype_t *jl_uint64_type JL_GLOBALLY_ROOTED;
 extern JL_DLLIMPORT jl_datatype_t *jl_float16_type JL_GLOBALLY_ROOTED;
 extern JL_DLLIMPORT jl_datatype_t *jl_float32_type JL_GLOBALLY_ROOTED;
 extern JL_DLLIMPORT jl_datatype_t *jl_float64_type JL_GLOBALLY_ROOTED;
+extern JL_DLLIMPORT jl_datatype_t *jl_bfloat16_type JL_GLOBALLY_ROOTED;
 extern JL_DLLIMPORT jl_datatype_t *jl_floatingpoint_type JL_GLOBALLY_ROOTED;
 extern JL_DLLIMPORT jl_datatype_t *jl_number_type JL_GLOBALLY_ROOTED;
 extern JL_DLLIMPORT jl_datatype_t *jl_void_type JL_GLOBALLY_ROOTED;  // deprecated
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 41f976b8585f3..9dff8e75cb2f5 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -1663,6 +1663,8 @@ jl_sym_t *_jl_symbol(const char *str, size_t len) JL_NOTSAFEPOINT;
 JL_DLLEXPORT float julia__gnu_h2f_ieee(uint16_t param) JL_NOTSAFEPOINT;
 JL_DLLEXPORT uint16_t julia__gnu_f2h_ieee(float param) JL_NOTSAFEPOINT;
 JL_DLLEXPORT uint16_t julia__truncdfhf2(double param) JL_NOTSAFEPOINT;
+JL_DLLEXPORT float julia__truncsfbf2(float param) JL_NOTSAFEPOINT;
+JL_DLLEXPORT float julia__truncdfbf2(double param) JL_NOTSAFEPOINT;
 //JL_DLLEXPORT double julia__extendhfdf2(uint16_t n) JL_NOTSAFEPOINT;
 //JL_DLLEXPORT int32_t julia__fixhfsi(uint16_t n) JL_NOTSAFEPOINT;
 //JL_DLLEXPORT int64_t julia__fixhfdi(uint16_t n) JL_NOTSAFEPOINT;
diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c
index ed320aa9a6c35..287d22314c027 100644
--- a/src/runtime_intrinsics.c
+++ b/src/runtime_intrinsics.c
@@ -217,6 +217,51 @@ JL_DLLEXPORT uint16_t julia__truncdfhf2(double param)
     return float_to_half(res);
 }
 
+JL_DLLEXPORT float julia__truncsfbf2(float param) JL_NOTSAFEPOINT
+{
+    // convert a Float32 to BFloat16 bits (the top half of the binary32 encoding)
+    uint16_t result;
+
+    if (isnan(param))
+        result = 0x7fc0; // fixed quiet NaN; rounding a NaN's bits could carry out of the mantissa
+    else {
+        uint32_t bits;
+        memcpy(&bits, &param, sizeof(bits)); // pun via memcpy, not a strict-aliasing-violating cast
+        // round to nearest even: bit 16 is the lowest kept bit, bit 15 the round bit and
+        // bits 14..0 are sticky. this sum carries into bit 16 exactly when the discarded
+        // half is above 0x8000, or equals 0x8000 while the kept bit is odd.
+        bits += 0x7fff + ((bits >> 16) & 1);
+        result = (uint16_t)(bits >> 16);
+    }
+
+    // on x86, bfloat16 needs to be returned in XMM. only GCC 13 provides the necessary ABI
+    // support in the form of the __bf16 type; older versions only provide __bfloat16 which
+    // is simply a typedef for short (i16). so use float, which is passed in XMM too.
+    uint32_t result_32bit = (uint32_t)result;
+    float ret;
+    memcpy(&ret, &result_32bit, sizeof(ret));
+    return ret;
+}
+
+JL_DLLEXPORT float julia__truncdfbf2(double param) JL_NOTSAFEPOINT
+{
+    float res = (float)param;
+    uint32_t resi;
+    memcpy(&resi, &res, sizeof(res));
+
+    // Convert a Float64 to BFloat16 by rounding to Float32 first. bfloat16 uses the
+    // same exponent as float32, so no special handling for subnormals is needed; only
+    // double rounding has to be avoided. 16 mantissa bits get dropped, so a float is
+    // a bfloat16 tie exactly when its low 16 bits are 0x8000.
+
+    if ((resi & 0xffffu) == 0x8000u) { // if we are halfway between 2 bfloat16 values
+        // adjust the value by 1 ULP in the direction that will make bfloat16(res) give the right answer
+        resi += (fabs(res) < fabs(param)) - (fabs(param) < fabs(res));
+        memcpy(&res, &resi, sizeof(res));
+    }
+    return julia__truncsfbf2(res);
+}
+
 //JL_DLLEXPORT double julia__extendhfdf2(uint16_t n) { return (double)julia__gnu_h2f_ieee(n); }
 //JL_DLLEXPORT int32_t julia__fixhfsi(uint16_t n) { return (int32_t)julia__gnu_h2f_ieee(n); }
 //JL_DLLEXPORT int64_t julia__fixhfdi(uint16_t n) { return (int64_t)julia__gnu_h2f_ieee(n); }
diff --git a/src/staticdata.c b/src/staticdata.c
index 536ca4cd6c3aa..df5652a5719c4 100644
--- a/src/staticdata.c
+++ b/src/staticdata.c
@@ -99,7 +99,7 @@ extern "C" {
 // TODO: put WeakRefs on the weak_refs list during deserialization
 // TODO: handle finalizers
 
-#define NUM_TAGS    159
+#define NUM_TAGS    160
 
 // An array of references that need to be restored from the sysimg
 // This is a manually constructed dual of the gvars array, which would be produced by codegen for Julia code, for C.
@@ -194,6 +194,7 @@ jl_value_t **const*const get_tags(void) {
         INSERT_TAG(jl_float16_type);
         INSERT_TAG(jl_float32_type);
         INSERT_TAG(jl_float64_type);
+        INSERT_TAG(jl_bfloat16_type);
         INSERT_TAG(jl_floatingpoint_type);
         INSERT_TAG(jl_number_type);
         INSERT_TAG(jl_signed_type);

From e033a427365e0eacf75ce27299ed40f46e6f617a Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Mon, 2 Oct 2023 12:58:07 +0200
Subject: [PATCH 02/12] Extend Float16 demote pass to BFloat16.

---
 src/llvm-demote-float16.cpp | 38 ++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/src/llvm-demote-float16.cpp b/src/llvm-demote-float16.cpp
index 740055730fb90..7eb591fd07d94 100644
--- a/src/llvm-demote-float16.cpp
+++ b/src/llvm-demote-float16.cpp
@@ -1,8 +1,9 @@
 // This file is a part of Julia. License is MIT: https://julialang.org/license
 
-// This pass finds floating-point operations on 16-bit (half precision) values, and replaces
-// them by equivalent operations on 32-bit (single precision) values surrounded by a fpext
-// and fptrunc. This ensures that the exact semantics of IEEE floating-point are preserved.
+// This pass finds floating-point operations on 16-bit values (half precision and bfloat),
+// and replaces them by equivalent operations on 32-bit (single precision) values surrounded
+// by a fpext and fptrunc. This ensures that the exact semantics of IEEE floating-point are
+// preserved.
 //
 // Without this pass, back-ends that do not natively support half-precision (e.g. x86_64)
 // similarly pattern-match half-precision operations with single-precision equivalents, but
@@ -71,10 +72,17 @@ static bool have_fp16(Function &caller, const Triple &TT) {
     return false;
 }
 
+static bool have_bf16(Function &caller, const Triple &TT) {
+    // TODO
+    return false;
+}
+
 static bool demoteFloat16(Function &F)
 {
     auto TT = Triple(F.getParent()->getTargetTriple());
-    if (have_fp16(F, TT))
+    auto has_fp16 = have_fp16(F, TT);
+    auto has_bf16 = have_bf16(F, TT);
+    if (has_fp16 && has_bf16)
         return false;
 
     auto &ctx = F.getContext();
@@ -82,14 +90,17 @@ static bool demoteFloat16(Function &F)
     SmallVector<Instruction *, 0> erase;
     for (auto &BB : F) {
         for (auto &I : BB) {
-            // extend Float16 operands to Float32
+            // check whether there's any 16-bit floating point operands to extend
             bool Float16 = I.getType()->getScalarType()->isHalfTy();
-            for (size_t i = 0; !Float16 && i < I.getNumOperands(); i++) {
+            bool BFloat16 = I.getType()->getScalarType()->isBFloatTy();
+            for (size_t i = 0; !BFloat16 && !Float16 && i < I.getNumOperands(); i++) {
                 Value *Op = I.getOperand(i);
-                if (Op->getType()->getScalarType()->isHalfTy())
+                if (!has_fp16 && Op->getType()->getScalarType()->isHalfTy())
                     Float16 = true;
+                else if (!has_bf16 && Op->getType()->getScalarType()->isBFloatTy())
+                    BFloat16 = true;
             }
-            if (!Float16)
+            if (!Float16 && !BFloat16)
                 continue;
 
             switch (I.getOpcode()) {
@@ -113,11 +124,16 @@ static bool demoteFloat16(Function &F)
 
             IRBuilder<> builder(&I);
 
-            // extend Float16 operands to Float32
+            // extend 16-bit floating point operands
             SmallVector<Value *, 2> Operands(I.getNumOperands());
             for (size_t i = 0; i < I.getNumOperands(); i++) {
                 Value *Op = I.getOperand(i);
-                if (Op->getType()->getScalarType()->isHalfTy()) {
+                if (!has_fp16 && Op->getType()->getScalarType()->isHalfTy()) {
+                    // extend Float16 to Float32
+                    ++TotalExt;
+                    Op = builder.CreateFPExt(Op, Op->getType()->getWithNewType(T_float32));
+                } else if (!has_bf16 && Op->getType()->getScalarType()->isBFloatTy()) {
+                    // extend BFloat16 to Float32
                     ++TotalExt;
                     Op = builder.CreateFPExt(Op, Op->getType()->getWithNewType(T_float32));
                 }
@@ -125,7 +141,7 @@ static bool demoteFloat16(Function &F)
             }
 
             // recreate the instruction if any operands changed,
-            // truncating the result back to Float16
+            // truncating the result back to the original type
             Value *NewI;
             ++TotalChanged;
             switch (I.getOpcode()) {

From 4e7706d749eb5b2871af53228d00272dcfe7eb7f Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Mon, 2 Oct 2023 13:00:38 +0200
Subject: [PATCH 03/12] Don't test BFloat16.

---
 test/numbers.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/numbers.jl b/test/numbers.jl
index be661da6783fe..a9d126aa33d5a 100644
--- a/test/numbers.jl
+++ b/test/numbers.jl
@@ -2901,6 +2901,7 @@ end
     let float_types = Set()
         allsubtypes!(Base, AbstractFloat, float_types)
         allsubtypes!(Core, AbstractFloat, float_types)
+        filter!(!isequal(Core.BFloat16), float_types)   # defined externally
         @test !isempty(float_types)
 
         for T in float_types

From 3cbda01d3c5d0a08cc0cd90d0751b9a703b7154b Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Mon, 2 Oct 2023 15:42:22 +0200
Subject: [PATCH 04/12] Add test and AVX detection.

---
 src/llvm-demote-float16.cpp |  16 +++++-
 test/llvmpasses/float16.ll  | 104 +++++++++++++++++++++++++++++++++++-
 2 files changed, 117 insertions(+), 3 deletions(-)

diff --git a/src/llvm-demote-float16.cpp b/src/llvm-demote-float16.cpp
index 7eb591fd07d94..976b63d5f6b26 100644
--- a/src/llvm-demote-float16.cpp
+++ b/src/llvm-demote-float16.cpp
@@ -73,7 +73,21 @@ static bool have_fp16(Function &caller, const Triple &TT) {
 }
 
 static bool have_bf16(Function &caller, const Triple &TT) {
-    // TODO
+    Attribute FSAttr = caller.getFnAttribute("target-features");
+    StringRef FS = "";
+    if (FSAttr.isValid())
+        FS = FSAttr.getValueAsString();
+    else if (jl_ExecutionEngine)
+        FS = jl_ExecutionEngine->getTargetFeatureString();
+    // else probably called from opt, just do nothing
+    if (TT.getArch() == Triple::x86_64) {
+        if (FS.find("+avx512bf16") != llvm::StringRef::npos){
+            return true;
+        }
+    }
+    if (caller.hasFnAttribute("julia.hasbf16")) {
+        return true;
+    }
     return false;
 }
 
diff --git a/test/llvmpasses/float16.ll b/test/llvmpasses/float16.ll
index b442a39b0050c..1ea96e55aa6b8 100644
--- a/test/llvmpasses/float16.ll
+++ b/test/llvmpasses/float16.ll
@@ -3,9 +3,9 @@
 
 ; RUN: opt -enable-new-pm=1 --opaque-pointers=1  --load-pass-plugin=libjulia-codegen%shlibext -passes='DemoteFloat16' -S %s | FileCheck %s
 
-define half @demotehalf_test(half %a, half %b) #0 {
+define half @demote_half_test(half %a, half %b) #0 {
 top:
-; CHECK-LABEL: @demotehalf_test(
+; CHECK-LABEL: @demote_half_test(
 ; CHECK-NEXT:  top:
 ; CHECK-NEXT:    %0 = fpext half %a to float
 ; CHECK-NEXT:    %1 = fpext half %b to float
@@ -101,5 +101,105 @@ top:
   ret half %13
 }
 
+define bfloat @demote_bfloat_test(bfloat %a, bfloat %b) #0 {
+top:
+; CHECK-LABEL: @demote_bfloat_test(
+; CHECK-NEXT:  top:
+; CHECK-NEXT:    %0 = fpext bfloat %a to float
+; CHECK-NEXT:    %1 = fpext bfloat %b to float
+; CHECK-NEXT:    %2 = fadd float %0, %1
+; CHECK-NEXT:    %3 = fptrunc float %2 to bfloat
+; CHECK-NEXT:    %4 = fpext bfloat %3 to float
+; CHECK-NEXT:    %5 = fpext bfloat %b to float
+; CHECK-NEXT:    %6 = fadd float %4, %5
+; CHECK-NEXT:    %7 = fptrunc float %6 to bfloat
+; CHECK-NEXT:    %8 = fpext bfloat %7 to float
+; CHECK-NEXT:    %9 = fpext bfloat %b to float
+; CHECK-NEXT:    %10 = fadd float %8, %9
+; CHECK-NEXT:    %11 = fptrunc float %10 to bfloat
+; CHECK-NEXT:    %12 = fpext bfloat %11 to float
+; CHECK-NEXT:    %13 = fpext bfloat %b to float
+; CHECK-NEXT:    %14 = fmul float %12, %13
+; CHECK-NEXT:    %15 = fptrunc float %14 to bfloat
+; CHECK-NEXT:    %16 = fpext bfloat %15 to float
+; CHECK-NEXT:    %17 = fpext bfloat %b to float
+; CHECK-NEXT:    %18 = fdiv float %16, %17
+; CHECK-NEXT:    %19 = fptrunc float %18 to bfloat
+; CHECK-NEXT:    %20 = insertelement <2 x bfloat> undef, bfloat %a, i32 0
+; CHECK-NEXT:    %21 = insertelement <2 x bfloat> %20, bfloat %b, i32 1
+; CHECK-NEXT:    %22 = insertelement <2 x bfloat> undef, bfloat %b, i32 0
+; CHECK-NEXT:    %23 = insertelement <2 x bfloat> %22, bfloat %b, i32 1
+; CHECK-NEXT:    %24 = fpext <2 x bfloat> %21 to <2 x float>
+; CHECK-NEXT:    %25 = fpext <2 x bfloat> %23 to <2 x float>
+; CHECK-NEXT:    %26 = fadd <2 x float> %24, %25
+; CHECK-NEXT:    %27 = fptrunc <2 x float> %26 to <2 x bfloat>
+; CHECK-NEXT:    %28 = extractelement <2 x bfloat> %27, i32 0
+; CHECK-NEXT:    %29 = extractelement <2 x bfloat> %27, i32 1
+; CHECK-NEXT:    %30 = fpext bfloat %28 to float
+; CHECK-NEXT:    %31 = fpext bfloat %29 to float
+; CHECK-NEXT:    %32 = fadd float %30, %31
+; CHECK-NEXT:    %33 = fptrunc float %32 to bfloat
+; CHECK-NEXT:    %34 = fpext bfloat %33 to float
+; CHECK-NEXT:    %35 = fpext bfloat %19 to float
+; CHECK-NEXT:    %36 = fadd float %34, %35
+; CHECK-NEXT:    %37 = fptrunc float %36 to bfloat
+; CHECK-NEXT:    ret bfloat %37
+;
+  %0 = fadd bfloat %a, %b
+  %1 = fadd bfloat %0, %b
+  %2 = fadd bfloat %1, %b
+  %3 = fmul bfloat %2, %b
+  %4 = fdiv bfloat %3, %b
+  %5 = insertelement <2 x bfloat> undef, bfloat %a, i32 0
+  %6 = insertelement <2 x bfloat> %5, bfloat %b, i32 1
+  %7 = insertelement <2 x bfloat> undef, bfloat %b, i32 0
+  %8 = insertelement <2 x bfloat> %7, bfloat %b, i32 1
+  %9 = fadd <2 x bfloat> %6, %8
+  %10 = extractelement <2 x bfloat> %9, i32 0
+  %11 = extractelement <2 x bfloat> %9, i32 1
+  %12 = fadd bfloat %10, %11
+  %13 = fadd bfloat %12, %4
+  ret bfloat %13
+}
+
+define bfloat @native_bfloat_test(bfloat %a, bfloat %b) #1 {
+; CHECK-LABEL: @native_bfloat_test(
+; CHECK-NEXT   top:
+; CHECK-NEXT     %0 = fadd bfloat %a, %b
+; CHECK-NEXT     %1 = fadd bfloat %0, %b
+; CHECK-NEXT     %2 = fadd bfloat %1, %b
+; CHECK-NEXT     %3 = fmul bfloat %2, %b
+; CHECK-NEXT     %4 = fdiv bfloat %3, %b
+; CHECK-NEXT     %5 = insertelement <2 x bfloat> undef, bfloat %a, i32 0
+; CHECK-NEXT     %6 = insertelement <2 x bfloat> %5, bfloat %b, i32 1
+; CHECK-NEXT     %7 = insertelement <2 x bfloat> undef, bfloat %b, i32 0
+; CHECK-NEXT     %8 = insertelement <2 x bfloat> %7, bfloat %b, i32 1
+; CHECK-NEXT     %9 = fadd <2 x bfloat> %6, %8
+; CHECK-NEXT     %10 = extractelement <2 x bfloat> %9, i32 0
+; CHECK-NEXT     %11 = extractelement <2 x bfloat> %9, i32 1
+; CHECK-NEXT     %12 = fadd bfloat %10, %11
+; CHECK-NEXT     %13 = fadd bfloat %12, %4
+; CHECK-NEXT     ret bfloat %13
+;
+top:
+  %0 = fadd bfloat %a, %b
+  %1 = fadd bfloat %0, %b
+  %2 = fadd bfloat %1, %b
+  %3 = fmul bfloat %2, %b
+  %4 = fdiv bfloat %3, %b
+  %5 = insertelement <2 x bfloat> undef, bfloat %a, i32 0
+  %6 = insertelement <2 x bfloat> %5, bfloat %b, i32 1
+  %7 = insertelement <2 x bfloat> undef, bfloat %b, i32 0
+  %8 = insertelement <2 x bfloat> %7, bfloat %b, i32 1
+  %9 = fadd <2 x bfloat> %6, %8
+  %10 = extractelement <2 x bfloat> %9, i32 0
+  %11 = extractelement <2 x bfloat> %9, i32 1
+  %12 = fadd bfloat %10, %11
+  %13 = fadd bfloat %12, %4
+  ret bfloat %13
+}
+
 attributes #0 = { "target-features"="-avx512fp16" }
 attributes #1 = { "target-features"="+avx512fp16" }
+attributes #2 = { "target-features"="-avx512bf16" }
+attributes #3 = { "target-features"="+avx512bf16" }

From 4d6036b0c798d225ff64313af8639d9f0e4b6b10 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Mon, 2 Oct 2023 15:48:50 +0200
Subject: [PATCH 05/12] Multiversioning.

---
 src/llvm-multiversioning.cpp | 10 ++++++----
 src/processor.h              |  2 ++
 src/processor_x86.cpp        |  7 +++++++
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index da24882d85d6f..22f956294ddd3 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -50,7 +50,7 @@ extern Optional<bool> always_have_fma(Function&, const Triple &TT);
 
 namespace {
 constexpr uint32_t clone_mask =
-    JL_TARGET_CLONE_LOOP | JL_TARGET_CLONE_SIMD | JL_TARGET_CLONE_MATH | JL_TARGET_CLONE_CPU | JL_TARGET_CLONE_FLOAT16;
+    JL_TARGET_CLONE_LOOP | JL_TARGET_CLONE_SIMD | JL_TARGET_CLONE_MATH | JL_TARGET_CLONE_CPU | JL_TARGET_CLONE_FLOAT16 | JL_TARGET_CLONE_BFLOAT16;
 
 // Treat identical mapping as missing and return `def` in that case.
 // We mainly need this to identify cloned function using value map after LLVM cloning
@@ -126,12 +126,14 @@ static uint32_t collect_func_info(Function &F, const Triple &TT, bool &has_vecca
             }
 
             for (size_t i = 0; i < I.getNumOperands(); i++) {
-                if(I.getOperand(i)->getType()->isHalfTy()){
+                if(I.getOperand(i)->getType()->isHalfTy()) {
                     flag |= JL_TARGET_CLONE_FLOAT16;
                 }
-                // Check for BFloat16 when they are added to julia can be done here
+                if(I.getOperand(i)->getType()->isBFloatTy()) {
+                    flag |= JL_TARGET_CLONE_BFLOAT16;
+                }
             }
-            uint32_t veccall_flags = JL_TARGET_CLONE_SIMD | JL_TARGET_CLONE_MATH | JL_TARGET_CLONE_CPU | JL_TARGET_CLONE_FLOAT16;
+            uint32_t veccall_flags = JL_TARGET_CLONE_SIMD | JL_TARGET_CLONE_MATH | JL_TARGET_CLONE_CPU | JL_TARGET_CLONE_FLOAT16 | JL_TARGET_CLONE_BFLOAT16;
             if (has_veccall && (flag & veccall_flags) == veccall_flags) {
                 return flag;
             }
diff --git a/src/processor.h b/src/processor.h
index a3ebdf4f8c605..696d725ed826b 100644
--- a/src/processor.h
+++ b/src/processor.h
@@ -41,6 +41,8 @@ enum {
     JL_TARGET_CLONE_CPU = 1 << 8,
     // Clone when the function uses fp16
     JL_TARGET_CLONE_FLOAT16 = 1 << 9,
+    // Clone when the function uses bf16
+    JL_TARGET_CLONE_BFLOAT16 = 1 << 10,
 };
 
 #define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) JL_FEATURE_DEF(name, bit, llvmver)
diff --git a/src/processor_x86.cpp b/src/processor_x86.cpp
index 73e0992bcf37c..13dabd4e42db7 100644
--- a/src/processor_x86.cpp
+++ b/src/processor_x86.cpp
@@ -961,6 +961,13 @@ static void ensure_jit_target(bool imaging)
                 break;
             }
         }
+        static constexpr uint32_t clone_bf16[] = {Feature::avx512bf16};
+        for (auto fe: clone_bf16) {
+            if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
+                t.en.flags |= JL_TARGET_CLONE_BFLOAT16;
+                break;
+            }
+        }
     }
 }
 

From 8e4bc16a35c20cb49867c8362831c94551b491d1 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 3 Oct 2023 10:13:44 +0200
Subject: [PATCH 06/12] Always demote BFloat16.

---
 src/llvm-demote-float16.cpp | 15 +++-----------
 test/llvmpasses/float16.ll  | 41 +------------------------------------
 2 files changed, 4 insertions(+), 52 deletions(-)

diff --git a/src/llvm-demote-float16.cpp b/src/llvm-demote-float16.cpp
index 976b63d5f6b26..5d0d9f5d37c40 100644
--- a/src/llvm-demote-float16.cpp
+++ b/src/llvm-demote-float16.cpp
@@ -73,21 +73,12 @@ static bool have_fp16(Function &caller, const Triple &TT) {
 }
 
 static bool have_bf16(Function &caller, const Triple &TT) {
-    Attribute FSAttr = caller.getFnAttribute("target-features");
-    StringRef FS = "";
-    if (FSAttr.isValid())
-        FS = FSAttr.getValueAsString();
-    else if (jl_ExecutionEngine)
-        FS = jl_ExecutionEngine->getTargetFeatureString();
-    // else probably called from opt, just do nothing
-    if (TT.getArch() == Triple::x86_64) {
-        if (FS.find("+avx512bf16") != llvm::StringRef::npos){
-            return true;
-        }
-    }
     if (caller.hasFnAttribute("julia.hasbf16")) {
         return true;
     }
+
+    // there are no targets that fully support bfloat yet;
+    // AVX512BF16 only provides conversion and dot product instructions.
     return false;
 }
 
diff --git a/test/llvmpasses/float16.ll b/test/llvmpasses/float16.ll
index 1ea96e55aa6b8..0c37be449d959 100644
--- a/test/llvmpasses/float16.ll
+++ b/test/llvmpasses/float16.ll
@@ -101,7 +101,7 @@ top:
   ret half %13
 }
 
-define bfloat @demote_bfloat_test(bfloat %a, bfloat %b) #0 {
+define bfloat @demote_bfloat_test(bfloat %a, bfloat %b) {
 top:
 ; CHECK-LABEL: @demote_bfloat_test(
 ; CHECK-NEXT:  top:
@@ -162,44 +162,5 @@ top:
   ret bfloat %13
 }
 
-define bfloat @native_bfloat_test(bfloat %a, bfloat %b) #1 {
-; CHECK-LABEL: @native_bfloat_test(
-; CHECK-NEXT   top:
-; CHECK-NEXT     %0 = fadd bfloat %a, %b
-; CHECK-NEXT     %1 = fadd bfloat %0, %b
-; CHECK-NEXT     %2 = fadd bfloat %1, %b
-; CHECK-NEXT     %3 = fmul bfloat %2, %b
-; CHECK-NEXT     %4 = fdiv bfloat %3, %b
-; CHECK-NEXT     %5 = insertelement <2 x bfloat> undef, bfloat %a, i32 0
-; CHECK-NEXT     %6 = insertelement <2 x bfloat> %5, bfloat %b, i32 1
-; CHECK-NEXT     %7 = insertelement <2 x bfloat> undef, bfloat %b, i32 0
-; CHECK-NEXT     %8 = insertelement <2 x bfloat> %7, bfloat %b, i32 1
-; CHECK-NEXT     %9 = fadd <2 x bfloat> %6, %8
-; CHECK-NEXT     %10 = extractelement <2 x bfloat> %9, i32 0
-; CHECK-NEXT     %11 = extractelement <2 x bfloat> %9, i32 1
-; CHECK-NEXT     %12 = fadd bfloat %10, %11
-; CHECK-NEXT     %13 = fadd bfloat %12, %4
-; CHECK-NEXT     ret bfloat %13
-;
-top:
-  %0 = fadd bfloat %a, %b
-  %1 = fadd bfloat %0, %b
-  %2 = fadd bfloat %1, %b
-  %3 = fmul bfloat %2, %b
-  %4 = fdiv bfloat %3, %b
-  %5 = insertelement <2 x bfloat> undef, bfloat %a, i32 0
-  %6 = insertelement <2 x bfloat> %5, bfloat %b, i32 1
-  %7 = insertelement <2 x bfloat> undef, bfloat %b, i32 0
-  %8 = insertelement <2 x bfloat> %7, bfloat %b, i32 1
-  %9 = fadd <2 x bfloat> %6, %8
-  %10 = extractelement <2 x bfloat> %9, i32 0
-  %11 = extractelement <2 x bfloat> %9, i32 1
-  %12 = fadd bfloat %10, %11
-  %13 = fadd bfloat %12, %4
-  ret bfloat %13
-}
-
 attributes #0 = { "target-features"="-avx512fp16" }
 attributes #1 = { "target-features"="+avx512fp16" }
-attributes #2 = { "target-features"="-avx512bf16" }
-attributes #3 = { "target-features"="+avx512bf16" }

From ae5918e2fe3d89fb4e5c992a8d2bea7255c10b21 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 3 Oct 2023 10:14:46 +0200
Subject: [PATCH 07/12] Fix doctests.

---
 doc/src/base/reflection.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/src/base/reflection.md b/doc/src/base/reflection.md
index b6246c06472a4..38c987fe8a497 100644
--- a/doc/src/base/reflection.md
+++ b/doc/src/base/reflection.md
@@ -54,6 +54,7 @@ the abstract `DataType` [`AbstractFloat`](@ref) has four (concrete) subtypes:
 julia> subtypes(AbstractFloat)
 4-element Vector{Any}:
  BigFloat
+ Core.BFloat16
  Float16
  Float32
  Float64

From e6b8db93d03af84944f48e97853b084d7ef17f8a Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 3 Oct 2023 10:48:59 +0200
Subject: [PATCH 08/12] Only ever attach zext/sext to integer types.

---
 src/ccall.cpp            | 3 ++-
 src/codegen.cpp          | 3 ++-
 src/jl_exported_data.inc | 1 +
 src/jltypes.c            | 2 +-
 src/julia.h              | 1 +
 src/staticdata.c         | 3 ++-
 6 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/ccall.cpp b/src/ccall.cpp
index cb069b58c3761..24022ac19c3b5 100644
--- a/src/ccall.cpp
+++ b/src/ccall.cpp
@@ -1123,7 +1123,8 @@ std::string generate_func_sig(const char *fname)
             isboxed = false;
         }
         else {
-            if (jl_is_primitivetype(tti)) {
+            if (jl_is_primitivetype(tti) &&
+                jl_integer_type && jl_subtype(tti, (jl_value_t*)jl_integer_type)) {
                 // see pull req #978. need to annotate signext/zeroext for
                 // small integer arguments.
                 jl_datatype_t *bt = (jl_datatype_t*)tti;
diff --git a/src/codegen.cpp b/src/codegen.cpp
index 20f2dfe28165f..6187bf85c9da0 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -7149,7 +7149,8 @@ static jl_returninfo_t get_specsig_function(jl_codectx_t &ctx, Module *M, Value
         else if (isboxed && jl_is_immutable_datatype(jt)) {
             param.addAttribute(Attribute::ReadOnly);
         }
-        else if (jl_is_primitivetype(jt) && ty->isIntegerTy()) {
+        else if (jl_is_primitivetype(jt) && jl_integer_type &&
+                 jl_subtype(jt, (jl_value_t*)jl_integer_type)) {
             bool issigned = jl_signed_type && jl_subtype(jt, (jl_value_t*)jl_signed_type);
             Attribute::AttrKind attr = issigned ? Attribute::SExt : Attribute::ZExt;
             param.addAttribute(attr);
diff --git a/src/jl_exported_data.inc b/src/jl_exported_data.inc
index aa23b9d7b8205..3bb991ae82ecc 100644
--- a/src/jl_exported_data.inc
+++ b/src/jl_exported_data.inc
@@ -50,6 +50,7 @@
     XX(jl_gotoifnot_type) \
     XX(jl_gotonode_type) \
     XX(jl_initerror_type) \
+    XX(jl_integer_type) \
     XX(jl_int16_type) \
     XX(jl_int32_type) \
     XX(jl_int64_type) \
diff --git a/src/jltypes.c b/src/jltypes.c
index 33b52158488a3..8c005f5966d1d 100644
--- a/src/jltypes.c
+++ b/src/jltypes.c
@@ -3409,7 +3409,7 @@ void post_boot_hooks(void)
     jl_number_type  = (jl_datatype_t*)core("Number");
     jl_signed_type  = (jl_datatype_t*)core("Signed");
     jl_datatype_t *jl_unsigned_type = (jl_datatype_t*)core("Unsigned");
-    jl_datatype_t *jl_integer_type = (jl_datatype_t*)core("Integer");
+    jl_integer_type = (jl_datatype_t*)core("Integer");
 
     jl_bool_type->super = jl_integer_type;
     jl_uint8_type->super = jl_unsigned_type;
diff --git a/src/julia.h b/src/julia.h
index a357bdf558360..842bc16068b76 100644
--- a/src/julia.h
+++ b/src/julia.h
@@ -853,6 +853,7 @@ extern JL_DLLIMPORT jl_datatype_t *jl_floatingpoint_type JL_GLOBALLY_ROOTED;
 extern JL_DLLIMPORT jl_datatype_t *jl_number_type JL_GLOBALLY_ROOTED;
 extern JL_DLLIMPORT jl_datatype_t *jl_void_type JL_GLOBALLY_ROOTED;  // deprecated
 extern JL_DLLIMPORT jl_datatype_t *jl_nothing_type JL_GLOBALLY_ROOTED;
+extern JL_DLLIMPORT jl_datatype_t *jl_integer_type JL_GLOBALLY_ROOTED;
 extern JL_DLLIMPORT jl_datatype_t *jl_signed_type JL_GLOBALLY_ROOTED;
 extern JL_DLLIMPORT jl_datatype_t *jl_voidpointer_type JL_GLOBALLY_ROOTED;
 extern JL_DLLIMPORT jl_datatype_t *jl_uint8pointer_type JL_GLOBALLY_ROOTED;
diff --git a/src/staticdata.c b/src/staticdata.c
index df5652a5719c4..4d1be4bd295e3 100644
--- a/src/staticdata.c
+++ b/src/staticdata.c
@@ -99,7 +99,7 @@ extern "C" {
 // TODO: put WeakRefs on the weak_refs list during deserialization
 // TODO: handle finalizers
 
-#define NUM_TAGS    160
+#define NUM_TAGS    161
 
 // An array of references that need to be restored from the sysimg
 // This is a manually constructed dual of the gvars array, which would be produced by codegen for Julia code, for C.
@@ -197,6 +197,7 @@ jl_value_t **const*const get_tags(void) {
         INSERT_TAG(jl_bfloat16_type);
         INSERT_TAG(jl_floatingpoint_type);
         INSERT_TAG(jl_number_type);
+        INSERT_TAG(jl_integer_type);
         INSERT_TAG(jl_signed_type);
         INSERT_TAG(jl_pair_type);
 

From 577f4d133c3486be827406be27b07f9b772f3b2d Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 4 Oct 2023 10:16:21 +0200
Subject: [PATCH 09/12] Simplify conversion routines.

---
 src/runtime_intrinsics.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c
index 287d22314c027..b42b7d9832383 100644
--- a/src/runtime_intrinsics.c
+++ b/src/runtime_intrinsics.c
@@ -227,12 +227,7 @@ JL_DLLEXPORT float julia__truncsfbf2(float param) JL_NOTSAFEPOINT
         uint32_t bits = *((uint32_t*) &param);
 
         // round to nearest even
-        uint32_t bit_above_round = (bits >> 17) & 1;
-        uint32_t round_bit = (bits >> 16) & 1;
-        uint32_t sticky_bit = (bits & 0xFFFF) != 0;
-        if (round_bit && (sticky_bit || bit_above_round))
-            bits += 0x10000; // Add 1 to bit just above the target bits
-
+        bits += 0x7fff + ((bits >> 16) & 1);
         result = (uint16_t)(bits >> 16);
     }
 
@@ -249,10 +244,8 @@ JL_DLLEXPORT float julia__truncdfbf2(double param) JL_NOTSAFEPOINT
     uint32_t resi;
     memcpy(&resi, &res, sizeof(res));
 
-    // Handle subnormals: If this logic is activated, it indicates that when we
-    // cast our double to a float, the float is a subnormal number. However,
     // bfloat16 uses the same exponent as float32, so we don't need special handling
-    // for subnormals when truncating to bfloat16.
+    // for subnormals when truncating float64 to bfloat16.
 
     if ((resi & 0x1ffu) == 0x100u) { // if we are halfway between 2 bfloat16 values
         // adjust the value by 1 ULP in the direction that will make bfloat16(res) give the right answer

From 13a453d47c0dba4024d3430d1d8a88a503f7925a Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 4 Oct 2023 10:35:58 +0200
Subject: [PATCH 10/12] Fix doctest: subtypes(AbstractFloat) now has 5 elements.

---
 doc/src/base/reflection.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/src/base/reflection.md b/doc/src/base/reflection.md
index 38c987fe8a497..2798cfe2e7530 100644
--- a/doc/src/base/reflection.md
+++ b/doc/src/base/reflection.md
@@ -52,7 +52,7 @@ the abstract `DataType` [`AbstractFloat`](@ref) has four (concrete) subtypes:
 
 ```jldoctest; setup = :(using InteractiveUtils)
 julia> subtypes(AbstractFloat)
-4-element Vector{Any}:
+5-element Vector{Any}:
  BigFloat
  Core.BFloat16
  Float16

From 01d9a86cf59d9b3f4af71315463c3a965cbe6d86 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 4 Oct 2023 16:36:08 +0200
Subject: [PATCH 11/12] Address review comments.

---
 src/ccall.cpp            | 12 +++++-------
 src/codegen.cpp          |  3 +--
 src/jl_exported_data.inc |  1 -
 src/jltypes.c            |  2 +-
 src/julia.h              |  1 -
 src/staticdata.c         |  3 +--
 6 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/src/ccall.cpp b/src/ccall.cpp
index 24022ac19c3b5..3c42e46d273cf 100644
--- a/src/ccall.cpp
+++ b/src/ccall.cpp
@@ -1123,8 +1123,11 @@ std::string generate_func_sig(const char *fname)
             isboxed = false;
         }
         else {
-            if (jl_is_primitivetype(tti) &&
-                jl_integer_type && jl_subtype(tti, (jl_value_t*)jl_integer_type)) {
+            t = _julia_struct_to_llvm(ctx, LLVMCtx, tti, &isboxed, llvmcall);
+            if (t == getVoidTy(LLVMCtx)) {
+                return make_errmsg(fname, i + 1, " type doesn't correspond to a C type");
+            }
+            if (jl_is_primitivetype(tti) && t->isIntegerTy()) {
                 // see pull req #978. need to annotate signext/zeroext for
                 // small integer arguments.
                 jl_datatype_t *bt = (jl_datatype_t*)tti;
@@ -1135,11 +1138,6 @@ std::string generate_func_sig(const char *fname)
                         ab.addAttribute(Attribute::ZExt);
                 }
             }
-
-            t = _julia_struct_to_llvm(ctx, LLVMCtx, tti, &isboxed, llvmcall);
-            if (t == getVoidTy(LLVMCtx)) {
-                return make_errmsg(fname, i + 1, " type doesn't correspond to a C type");
-            }
         }
 
         Type *pat;
diff --git a/src/codegen.cpp b/src/codegen.cpp
index 6187bf85c9da0..20f2dfe28165f 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -7149,8 +7149,7 @@ static jl_returninfo_t get_specsig_function(jl_codectx_t &ctx, Module *M, Value
         else if (isboxed && jl_is_immutable_datatype(jt)) {
             param.addAttribute(Attribute::ReadOnly);
         }
-        else if (jl_is_primitivetype(jt) && jl_integer_type &&
-                 jl_subtype(jt, (jl_value_t*)jl_integer_type)) {
+        else if (jl_is_primitivetype(jt) && ty->isIntegerTy()) {
             bool issigned = jl_signed_type && jl_subtype(jt, (jl_value_t*)jl_signed_type);
             Attribute::AttrKind attr = issigned ? Attribute::SExt : Attribute::ZExt;
             param.addAttribute(attr);
diff --git a/src/jl_exported_data.inc b/src/jl_exported_data.inc
index 3bb991ae82ecc..aa23b9d7b8205 100644
--- a/src/jl_exported_data.inc
+++ b/src/jl_exported_data.inc
@@ -50,7 +50,6 @@
     XX(jl_gotoifnot_type) \
     XX(jl_gotonode_type) \
     XX(jl_initerror_type) \
-    XX(jl_integer_type) \
     XX(jl_int16_type) \
     XX(jl_int32_type) \
     XX(jl_int64_type) \
diff --git a/src/jltypes.c b/src/jltypes.c
index 8c005f5966d1d..33b52158488a3 100644
--- a/src/jltypes.c
+++ b/src/jltypes.c
@@ -3409,7 +3409,7 @@ void post_boot_hooks(void)
     jl_number_type  = (jl_datatype_t*)core("Number");
     jl_signed_type  = (jl_datatype_t*)core("Signed");
     jl_datatype_t *jl_unsigned_type = (jl_datatype_t*)core("Unsigned");
-    jl_integer_type = (jl_datatype_t*)core("Integer");
+    jl_datatype_t *jl_integer_type = (jl_datatype_t*)core("Integer");
 
     jl_bool_type->super = jl_integer_type;
     jl_uint8_type->super = jl_unsigned_type;
diff --git a/src/julia.h b/src/julia.h
index 842bc16068b76..a357bdf558360 100644
--- a/src/julia.h
+++ b/src/julia.h
@@ -853,7 +853,6 @@ extern JL_DLLIMPORT jl_datatype_t *jl_floatingpoint_type JL_GLOBALLY_ROOTED;
 extern JL_DLLIMPORT jl_datatype_t *jl_number_type JL_GLOBALLY_ROOTED;
 extern JL_DLLIMPORT jl_datatype_t *jl_void_type JL_GLOBALLY_ROOTED;  // deprecated
 extern JL_DLLIMPORT jl_datatype_t *jl_nothing_type JL_GLOBALLY_ROOTED;
-extern JL_DLLIMPORT jl_datatype_t *jl_integer_type JL_GLOBALLY_ROOTED;
 extern JL_DLLIMPORT jl_datatype_t *jl_signed_type JL_GLOBALLY_ROOTED;
 extern JL_DLLIMPORT jl_datatype_t *jl_voidpointer_type JL_GLOBALLY_ROOTED;
 extern JL_DLLIMPORT jl_datatype_t *jl_uint8pointer_type JL_GLOBALLY_ROOTED;
diff --git a/src/staticdata.c b/src/staticdata.c
index 4d1be4bd295e3..df5652a5719c4 100644
--- a/src/staticdata.c
+++ b/src/staticdata.c
@@ -99,7 +99,7 @@ extern "C" {
 // TODO: put WeakRefs on the weak_refs list during deserialization
 // TODO: handle finalizers
 
-#define NUM_TAGS    161
+#define NUM_TAGS    160
 
 // An array of references that need to be restored from the sysimg
 // This is a manually constructed dual of the gvars array, which would be produced by codegen for Julia code, for C.
@@ -197,7 +197,6 @@ jl_value_t **const*const get_tags(void) {
         INSERT_TAG(jl_bfloat16_type);
         INSERT_TAG(jl_floatingpoint_type);
         INSERT_TAG(jl_number_type);
-        INSERT_TAG(jl_integer_type);
         INSERT_TAG(jl_signed_type);
         INSERT_TAG(jl_pair_type);
 

From 5ab8b4665789fd59bee76955b2acdd657dcd9d65 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 5 Oct 2023 16:33:50 +0200
Subject: [PATCH 12/12] Revert Float16 ABI change, for now.

---
 src/abi_x86_64.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/abi_x86_64.cpp b/src/abi_x86_64.cpp
index 5938e1e5778a2..7800c44b4d3ae 100644
--- a/src/abi_x86_64.cpp
+++ b/src/abi_x86_64.cpp
@@ -118,8 +118,7 @@ struct Classification {
 void classifyType(Classification& accum, jl_datatype_t *dt, uint64_t offset) const
 {
     // Floating point types
-    if (dt == jl_float64_type || dt == jl_float32_type || dt == jl_float16_type ||
-        dt == jl_bfloat16_type) {
+    if (dt == jl_float64_type || dt == jl_float32_type || dt == jl_bfloat16_type) {
         accum.addField(offset, Sse);
     }
     // Misc types