From f5fd262cd3be554b97ee6019051e8b58f08947e8 Mon Sep 17 00:00:00 2001
From: zerbina <100542850+zerbina@users.noreply.github.com>
Date: Sun, 13 Aug 2023 15:40:59 +0200
Subject: [PATCH] sem: decide storage type for enums on definition (#839)

## Summary

Centralize the decision about the underlying storage type for enum
values in semantic analysis. Since the decision now happens in a single
place instead of being scattered across the compiler, it becomes
significantly easier to change the enum storage behaviour later on.

In addition, this fixes two bugs with 1- and 2-byte enum values (which
use unsigned integer storage) whose most significant bit is set and
that affected code running in the VM:
- loading the values from memory resulted in invalid values
- storing them inside aggregate `const`s resulted in them having
  invalid values at run-time (this one affected only the VM and JS
  backends)

## Details

The key changes are:
- `tyEnum` now stores the storage type in its second type slot, meaning
  that `skipTypes(tyEnum)` yields the storage type
- size and alignment computation redirects to the storage type
- `isUnsigned` considers a `tyEnum`'s storage type
- the code generators use the storage type provided by `tyEnum` instead
  of deciding it themselves

Using the second type slot of `tyEnum` keeps both forward and backward
compatibility: it's possible that some of the current code still
depends on an enum type's first slot always being `nil`, and in the
future the first slot could be used to store the enum's base type.

The storage type is set directly on enum definition (`semEnum`). Before
the enum fields are analyzed, a preliminary storage type (`tyInt`) is
added to the enum; this allows the enum's values to be used *before*
the enum type is fully produced. Once the full value range is known,
the correct storage type is selected.

Choosing the storage type happens in the same way as it previously did
in the C code generator (`ccgtypes`). Apart from the information now
already being available during semantic analysis, this only affects the
VM target, where non-negative enum values requiring 32 or 64 bits of
storage were previously stored as unsigned integers (they're now stored
as *signed* integers, in line with the C target).

For the VM code generator, `tyEnum` is added to the `IrrelevantTypes`
set, meaning that the code generator now transparently treats all enum
types as their storage type, with an exception being made for the
to-string operation.
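To make the selection rule easier to follow, here is a small
illustration (not part of the patch) of which storage type `semEnum`
now picks for various value ranges. The type names are invented for the
example; the `sizeof` values follow from the storage types listed in
the `semtypes.nim` hunk below:

```nim
type
  Small = enum      # last value fits in 8 bits  -> unsigned 8-bit storage
    smallA, smallB
  Medium = enum     # last value needs 16 bits   -> unsigned 16-bit storage
    medA = 0, medB = 60_000
  Wide = enum       # last value needs 32 bits   -> signed 32-bit storage
    wideA = 0, wideB = 100_000
  Negative = enum   # negative first value       -> signed 32-bit storage
    negA = -1, negB = 0

# size and alignment now redirect to the selected storage type
doAssert sizeof(Small)    == 1
doAssert sizeof(Medium)   == 2
doAssert sizeof(Wide)     == 4
doAssert sizeof(Negative) == 4
```

With this change, the VM also stores `Wide`-like values as signed
integers, matching the C target.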
---
 compiler/ast/ast.nim                         |  7 ++-
 compiler/ast/ast_types.nim                   |  3 +
 compiler/ast/types.nim                       |  2 +-
 compiler/backend/ccgtypes.nim                | 26 +-------
 compiler/backend/cgirgen.nim                 |  8 +--
 compiler/backend/jsgen.nim                   |  4 +-
 compiler/sem/semtypes.nim                    | 31 ++++++++++
 compiler/sem/sizealignoffsetimpl.nim         | 22 ++-----
 compiler/vm/vmcompilerserdes.nim             | 11 +++-
 compiler/vm/vmgen.nim                        | 22 ++++---
 compiler/vm/vmtypegen.nim                    | 15 +----
 tests/lang_defs/enum/tstored_as_unsigned.nim | 65 ++++++++++++++++++++
 12 files changed, 142 insertions(+), 74 deletions(-)
 create mode 100644 tests/lang_defs/enum/tstored_as_unsigned.nim

diff --git a/compiler/ast/ast.nim b/compiler/ast/ast.nim
index 54bbe3187b2..d2f9c26d74a 100644
--- a/compiler/ast/ast.nim
+++ b/compiler/ast/ast.nim
@@ -255,7 +255,12 @@ proc newIntTypeNode*(intVal: BiggestInt, typ: PType): PNode =
   of tyUInt16: result = newNode(nkUInt16Lit)
   of tyUInt32: result = newNode(nkUInt32Lit)
   of tyUInt64: result = newNode(nkUInt64Lit)
-  of tyBool, tyEnum:
+  of tyEnum:
+    # XXX: the kind for the underlying type should be used here, but too much
+    #      code still relies on literal enum values being represented as
+    #      exactly ``nkIntLit``
+    result = newNode(nkIntLit)
+  of tyBool:
     # XXX: does this really need to be the kind nkIntLit?
     result = newNode(nkIntLit)
   of tyStatic: # that's a pre-existing bug, will fix in another PR
diff --git a/compiler/ast/ast_types.nim b/compiler/ast/ast_types.nim
index eec5f035de1..7f91375e1df 100644
--- a/compiler/ast/ast_types.nim
+++ b/compiler/ast/ast_types.nim
@@ -475,6 +475,9 @@ type
     tyGenericParam, ## ``a`` in the above patterns
     tyDistinct,
     tyEnum,
+      ## enum type. `base` stores the base type (currently always nil) and
+      ## `lastSon` the storage type (also referred to as "underlying type"),
+      ## which is the type used for the in-memory representation
     tyOrdinal, ## integer types (including enums and boolean)
     tyArray,
     tyObject,
diff --git a/compiler/ast/types.nim b/compiler/ast/types.nim
index 6b61c18681e..e79cd653662 100644
--- a/compiler/ast/types.nim
+++ b/compiler/ast/types.nim
@@ -100,7 +100,7 @@ proc isPureObject*(typ: PType): bool =
   result = t.sym != nil and sfPure in t.sym.flags
 
 func isUnsigned*(t: PType): bool {.inline.} =
-  t.skipTypes(abstractInst).kind in {tyChar, tyUInt..tyUInt64}
+  t.skipTypes(abstractInst + {tyEnum}).kind in {tyChar, tyUInt..tyUInt64}
 
 proc getOrdValue*(n: PNode; onError = high(Int128)): Int128 =
   let k =
diff --git a/compiler/backend/ccgtypes.nim b/compiler/backend/ccgtypes.nim
index 49d123e4c82..fe31f776f46 100644
--- a/compiler/backend/ccgtypes.nim
+++ b/compiler/backend/ccgtypes.nim
@@ -129,18 +129,8 @@ proc mapType(conf: ConfigRef; typ: PType; kind: TSymKind): TCTypeKind =
     doAssert typ.isResolvedUserTypeClass
     return mapType(conf, typ.lastSon, kind)
   of tyGenericBody, tyGenericInst, tyGenericParam, tyDistinct, tyOrdinal,
-     tyTypeDesc, tyAlias, tySink, tyInferred:
+     tyTypeDesc, tyAlias, tySink, tyInferred, tyEnum:
     result = mapType(conf, lastSon(typ), kind)
-  of tyEnum:
-    if firstOrd(conf, typ) < 0:
-      result = ctInt32
-    else:
-      case int(getSize(conf, typ))
-      of 1: result = ctUInt8
-      of 2: result = ctUInt16
-      of 4: result = ctInt32
-      of 8: result = ctInt64
-      else: result = ctInt32
   of tyRange: result = mapType(conf, typ[0], kind)
   of tyPtr, tyVar, tyLent, tyRef:
     var base = skipTypes(typ.lastSon, typedescInst)
@@ -661,18 +651,8 @@ proc getTypeDescAux(m: BModule, origTyp: PType, check: var IntSet; kind: TSymKin
       result = getTypeName(m, origTyp, sig)
       if not (sfImportc in t.sym.flags and t.sym.magic == mNone):
        m.typeCache[sig] = result
-        var size: int
-        if firstOrd(m.config, t) < 0:
-          m.s[cfsTypes].addf("typedef NI32 $1;$n", [result])
-          size = 4
-        else:
-          size = int(getSize(m.config, t))
-          case size
-          of 1: m.s[cfsTypes].addf("typedef NU8 $1;$n", [result])
-          of 2: m.s[cfsTypes].addf("typedef NU16 $1;$n", [result])
-          of 4: m.s[cfsTypes].addf("typedef NI32 $1;$n", [result])
-          of 8: m.s[cfsTypes].addf("typedef NI64 $1;$n", [result])
-          else: internalError(m.config, t.sym.info, "getTypeDescAux: enum")
+        m.s[cfsTypes].addf("typedef $1 $2;$n",
+                           [getTypeDescAux(m, t.lastSon, check, skVar), result])
       when false:
         let owner = hashOwner(t.sym)
         if not gDebugInfo.hasEnum(t.sym.name.s, t.sym.info.line, owner):
diff --git a/compiler/backend/cgirgen.nim b/compiler/backend/cgirgen.nim
index 139c54b3607..902e8c0981a 100644
--- a/compiler/backend/cgirgen.nim
+++ b/compiler/backend/cgirgen.nim
@@ -262,8 +262,8 @@ proc translateLit*(val: PNode): CgNode =
   case val.kind
   of nkIntLiterals:
     # use the type for deciding what whether it's a signed or unsigned value
-    case val.typ.skipTypes(abstractRange).kind
-    of tyInt..tyInt64, tyEnum, tyBool:
+    case val.typ.skipTypes(abstractRange + {tyEnum}).kind
+    of tyInt..tyInt64, tyBool:
       node(cnkIntLit, intVal, val.intVal)
     of tyUInt..tyUInt64, tyChar:
       node(cnkUIntLit, intVal, val.intVal)
@@ -713,11 +713,11 @@ proc tbRegion(tree: TreeWithSource, cl: var TranslateCl, prev: sink Values,
               cr: var TreeCursor): CgNode
 
 proc newIntLit(val: Int128, t: PType): CgNode =
-  case t.skipTypes(abstractVarRange).kind
+  case t.skipTypes(abstractVarRange + {tyEnum}).kind
   of tyUInt..tyUInt64, tyChar:
     CgNode(kind: cnkUIntLit, info: unknownLineInfo, typ: t,
            intVal: cast[BiggestInt](toUInt(val)))
-  of tyInt..tyInt64, tyBool, tyEnum:
+  of tyInt..tyInt64, tyBool:
     CgNode(kind: cnkIntLit, info: unknownLineInfo, typ: t,
            intVal: toInt(val))
   else:
diff --git a/compiler/backend/jsgen.nim b/compiler/backend/jsgen.nim
index 43b871a119d..bcd609dcd70 100644
--- a/compiler/backend/jsgen.nim
+++ b/compiler/backend/jsgen.nim
@@ -210,7 +210,7 @@ proc mapType(typ: PType; indirect = false): TJSTypeKind =
   of tyRange, tyDistinct, tyOrdinal, tyProxy, tyLent:
     # tyLent is no-op as JS has pass-by-reference semantics
     result = mapType(t[0], indirect)
-  of tyInt..tyInt64, tyUInt..tyUInt64, tyEnum, tyChar: result = etyInt
+  of tyInt..tyInt64, tyUInt..tyUInt64, tyChar: result = etyInt
   of tyBool: result = etyBool
   of tyFloat..tyFloat128: result = etyFloat
   of tySet: result = etyObject # map a set to a table
@@ -224,7 +224,7 @@ proc mapType(typ: PType; indirect = false): TJSTypeKind =
     tyAnd, tyOr, tyNot, tyAnything, tyVoid:
     result = etyNone
   of tyGenericInst, tyInferred, tyAlias, tyUserTypeClass, tyUserTypeClassInst,
-     tySink:
+     tySink, tyEnum:
     result = mapType(typ.lastSon, indirect)
   of tyStatic:
     if t.n != nil: result = mapType(t.lastSon, indirect)
diff --git a/compiler/sem/semtypes.nim b/compiler/sem/semtypes.nim
index f33674fc229..35fe2761099 100644
--- a/compiler/sem/semtypes.nim
+++ b/compiler/sem/semtypes.nim
@@ -41,6 +41,10 @@ proc semEnum(c: PContext, n: PNode, prev: PType): PType =
       localReport(c.config, n[0], reportAst(rsemIllformedAst, n[0]))
 
   rawAddSon(result, nil) # base type; always nil
+  # add a preliminary storage type such that the enum can already be
+  # used in its own definition
+  rawAddSon(result, getSysType(c.graph, n.info, tyInt))
+
   let isPure = result.sym != nil and sfPure in result.sym.flags
   var symbols: TStrTable
  if isPure: initStrTable(symbols)
@@ -143,6 +147,33 @@ proc semEnum(c: PContext, n: PNode, prev: PType): PType =
        wrongRedefinition(c, e.info, e, conflict)
     inc(counter)
+
+  # now that we know the full value range, select the correct storage type
+  # of the enum:
+  let tk =
+    if firstOrd(c.config, result) < Zero:
+      tyInt32 # use signed int32
+    elif result.size != szUncomputedSize:
+      # use the manually specified size for selecting the storage type
+      let s = result.size
+      if s <= 1: tyUInt8
+      elif s <= 2: tyUInt16
+      elif s <= 4: tyInt32
+      elif s <= 8: tyInt64
+      else: unreachable()
+    else:
+      let lastOrd = lastOrd(c.config, result)
+      if lastOrd < (1 shl 8):
+        tyUInt8
+      elif lastOrd < (1 shl 16):
+        tyUInt16
+      elif lastOrd < (BiggestInt(1) shl 32):
+        tyInt32
+      else:
+        tyInt64
+
+  result[1] = getSysType(c.graph, n.info, tk)
+
   if isPure and sfExported in result.sym.flags:
     addPureEnum(c, LazySym(sym: result.sym))
 
diff --git a/compiler/sem/sizealignoffsetimpl.nim b/compiler/sem/sizealignoffsetimpl.nim
index 7ca42bbe531..2795be25539 100644
--- a/compiler/sem/sizealignoffsetimpl.nim
+++ b/compiler/sem/sizealignoffsetimpl.nim
@@ -277,23 +277,11 @@ proc computeSizeAlign(conf: ConfigRef; typ: PType) =
     typ.align = base.align
   of tyEnum:
-    if firstOrd(conf, typ) < Zero:
-      typ.size = 4 # use signed int32
-      typ.align = 4
-    else:
-      let lastOrd = toInt64(lastOrd(conf, typ)) # BUGFIX: use lastOrd!
-      if lastOrd < `shl`(1, 8):
-        typ.size = 1
-        typ.align = 1
-      elif lastOrd < `shl`(1, 16):
-        typ.size = 2
-        typ.align = 2
-      elif lastOrd < `shl`(BiggestInt(1), 32):
-        typ.size = 4
-        typ.align = 4
-      else:
-        typ.size = 8
-        typ.align = int16(conf.floatInt64Align)
+    let base = typ.lastSon
+    computeSizeAlign(conf, base)
+    # use the size and alignment of the underlying type:
+    typ.size = base.size
+    typ.align = base.align
   of tySet:
     if typ[0].kind == tyGenericParam:
       typ.size = szUncomputedSize
diff --git a/compiler/vm/vmcompilerserdes.nim b/compiler/vm/vmcompilerserdes.nim
index 3c6f0015911..531c1f5fa92 100644
--- a/compiler/vm/vmcompilerserdes.nim
+++ b/compiler/vm/vmcompilerserdes.nim
@@ -272,14 +272,19 @@ proc deserialize(c: TCtx, m: VmMemoryRegion, vt: PVmType, formal, t: PType, info
       of akInt: readUInt(m)
       of akDiscriminator: readDiscriminant(m, vt.numBits)
       else: unreachable()
-    setResult(nkUIntLit, intVal, i)
-  of tyBool, tyEnum, tyInt..tyInt64:
+    result = newIntTypeNode(i, formal)
+    result.info = info
+  of tyBool, tyInt..tyInt64:
     let i =
       case vt.kind
       of akInt: signExtended(readIntBits(m), BiggestInt(s))
      of akDiscriminator: readDiscriminant(m, vt.numBits)
       else: unreachable()
-    setResult(nkIntLit, intVal, i)
+    result = newIntTypeNode(i, formal)
+    result.info = info
+  of tyEnum:
+    # the value is stored as the enum's underlying type
+    result = deserialize(c, m, vt, formal, t.lastSon, info)
   of tyFloat32:
     assert vt.kind == akFloat
     setResult(nkFloat32Lit, floatVal, readFloat32(m))
diff --git a/compiler/vm/vmgen.nim b/compiler/vm/vmgen.nim
index d7bda647821..79d54c715e5 100644
--- a/compiler/vm/vmgen.nim
+++ b/compiler/vm/vmgen.nim
@@ -98,7 +98,7 @@ type
     val: TRegister ## the register holding the loaded value
 
 const
-  IrrelevantTypes = abstractInst + {tyStatic} - {tyTypeDesc}
+  IrrelevantTypes = abstractInst + {tyStatic, tyEnum} - {tyTypeDesc}
     ## the set of types that are not relevant to the VM. ``tyTypeDesc``, while
     ## not relevant right now, is likely going to be in the future.
@@ -333,8 +333,8 @@ proc patch(c: var TCtx, p: TPosition) =
       TInstrType(diff+wordExcess) shl regBxShift).TInstr
 
 proc getSlotKind(t: PType): TSlotKind =
-  case t.skipTypes(abstractRange-{tyTypeDesc}).kind
-  of tyBool, tyChar, tyEnum, tyOrdinal, tyInt..tyInt64, tyUInt..tyUInt64:
+  case t.skipTypes(IrrelevantTypes+{tyRange}).kind
+  of tyBool, tyChar, tyInt..tyInt64, tyUInt..tyUInt64:
     slotTempInt
   of tyString, tyCstring:
     slotTempStr
@@ -1207,7 +1207,7 @@ proc genNumberConv(c: var TCtx, info: CgNode, dest, src: TRegister,
   ## numeric types.
   const
     Floats = {tyFloat..tyFloat64}
-    Signed = {tyInt..tyInt64, tyEnum}
+    Signed = {tyInt..tyInt64}
     Unsigned = {tyUInt..tyUInt64, tyChar, tyBool}
 
   template payload(op: NumericConvKind): uint16 =
@@ -1291,10 +1291,12 @@ proc genConv(c: var TCtx; n, arg: CgNode; dest: var TDest) =
 
 proc genToStr(c: var TCtx, n, arg: CgNode, dest: var TDest) =
   # TODO: don't use ``opcConv`` for to-string conversions
+  # don't skip enum types
+  const Skip = IrrelevantTypes - {tyEnum}
   prepare(c, dest, n, n.typ)
   let tmp = c.genx(arg)
-  c.gABx(n, opcConv, dest, c.genTypeInfo(n.typ.skipTypes(IrrelevantTypes)))
-  c.gABx(n, opcConv, tmp, c.genTypeInfo(arg.typ.skipTypes(IrrelevantTypes)))
+  c.gABx(n, opcConv, dest, c.genTypeInfo(n.typ.skipTypes(Skip)))
+  c.gABx(n, opcConv, tmp, c.genTypeInfo(arg.typ.skipTypes(Skip)))
   c.freeTemp(tmp)
 
 proc genObjConv(c: var TCtx, n: CgNode, dest: var TDest) =
@@ -1513,8 +1515,8 @@ proc genSetElem(c: var TCtx, n: CgNode, typ: PType): TRegister {.inline.} =
   genSetElem(c, n, first)
 
 func fitsRegister(t: PType): bool =
-  let st = t.skipTypes(abstractInst + {tyStatic} - {tyTypeDesc})
-  st.kind in { tyRange, tyEnum, tyBool, tyInt..tyUInt64, tyChar, tyPtr, tyPointer} or
+  let st = t.skipTypes(IrrelevantTypes + {tyRange})
+  st.kind in { tyBool, tyInt..tyUInt64, tyChar, tyPtr, tyPointer} or
     (st.sym != nil and st.sym.magic == mPNimrodNode) # NimNode goes into register too
 
 proc ldNullOpcode(t: PType): TOpcode =
@@ -1522,8 +1524,8 @@ proc ldNullOpcode(t: PType): TOpcode =
   if fitsRegister(t): opcLdNullReg else: opcLdNull
 
 proc whichAsgnOpc(n: CgNode; requiresCopy = true): TOpcode =
-  case n.typ.skipTypes(abstractRange-{tyTypeDesc}).kind
-  of tyBool, tyChar, tyEnum, tyOrdinal, tyInt..tyInt64, tyUInt..tyUInt64:
+  case n.typ.skipTypes(IrrelevantTypes + {tyRange}).kind
+  of tyBool, tyChar, tyInt..tyInt64, tyUInt..tyUInt64:
     opcAsgnInt
   of tyFloat..tyFloat128:
     opcAsgnFloat
diff --git a/compiler/vm/vmtypegen.nim b/compiler/vm/vmtypegen.nim
index db5d71bac85..9c9543519a1 100644
--- a/compiler/vm/vmtypegen.nim
+++ b/compiler/vm/vmtypegen.nim
@@ -130,19 +130,8 @@ func getAtomicType(cache: TypeInfoCache, conf: ConfigRef, t: PType): Option[PVmT
     if t.sym != nil and t.sym.magic == mPNimrodNode: cache.nodeType
     else: nil
   of tyEnum:
-    # This mirrors the logic for `tyEnum` in `types.getSize`
-    {.cast(noSideEffect).}: # cast away the reporting related side effects
-                            # of `firstOrd`/`lastOrd`
-      if firstOrd(conf, t) < Zero:
-        cache.intTypes[tyInt32]
-      else:
-        let lastOrd = toInt64(lastOrd(conf, t)) # BUGFIX: use lastOrd!
-        let tk =
-          if lastOrd < (1 shl 8): tyUInt8
-          elif lastOrd < (1 shl 16): tyUInt16
-          elif lastOrd < (BiggestInt(1) shl 32): tyUInt32
-          else: tyUInt64
-        cache.uintTypes[tk]
+    # use the underlying type:
+    getAtomicType(cache, conf, t.lastSon).get()
   else:
     nil
 
diff --git a/tests/lang_defs/enum/tstored_as_unsigned.nim b/tests/lang_defs/enum/tstored_as_unsigned.nim
new file mode 100644
index 00000000000..ed10463e36d
--- /dev/null
+++ b/tests/lang_defs/enum/tstored_as_unsigned.nim
@@ -0,0 +1,65 @@
+discard """
+  targets: "c js vm"
+  description: "
+    Ensure that enum values stored as unsigned integers are properly read as
+    such
+  "
+"""
+
+type
+  EnumA = enum
+    enuA_a = 0
+    enuA_b = high(uint8)
+
+  EnumB = enum
+    enuB_a = 0
+    enuB_b = high(uint16)
+
+doAssert sizeof(EnumA) == sizeof(uint8)
+doAssert sizeof(EnumB) == sizeof(uint16)
+
+proc test() =
+  # globals could interfere with the test, so use a procedure in order to
+  # ensure that locals are used. In addition, assign the enum values to
+  # variables so that the ``ord`` calls are not folded away
+  var val1 = enuA_b
+  doAssert ord(val1) == int high(uint8)
+
+  var val2 = enuB_b
+  doAssert ord(val2) == int high(uint16)
+
+  # the VM has to narrow the value after loading it from a memory
+  # location, so also test that case by accessing an array
+
+  var arr1 = [enuA_b]
+  doAssert ord(arr1[0]) == int high(uint8)
+
+  var arr2 = [enuB_b]
+  doAssert ord(arr2[0]) == int high(uint16)
+
+test()
+
+block compile_time_run_time_boundary:
+  # make sure that enum values stored as unsigned integers properly cross the
+  # compile-time/run-time boundary
+  proc get[T](): T = high(T)
+
+  block uint8_value:
+    const
+      folded   = (enuA_b,)       # the VM is not used
+      computed = (get[EnumA](),) # ``vmcompilerserdes`` is used
+
+    var val = folded
+    doAssert val[0] == enuA_b
+    val = computed
+    doAssert val[0] == enuA_b
+
+  block uint16_value:
+    const
+      folded   = (enuB_b,)       # the VM is not used
+      computed = (get[EnumB](),) # ``vmcompilerserdes`` is used
+
+    var val = folded
+    doAssert val[0] == enuB_b
+    val = computed
+    doAssert val[0] == enuB_b
\ No newline at end of file