From 0bfd8539e254db538a5d3d132acdf4d56968766a Mon Sep 17 00:00:00 2001
From: Ivan Gotovchits <ivg@ieee.org>
Date: Thu, 9 Jul 2020 09:06:21 -0400
Subject: [PATCH] publishes and documents the new disassembler engine (#1173)

* publishes and documents the new disassembler engine

The engine was introduced in BAP 2.0 but wasn't fully published or
documented.

Some parts of the interface were published but were lacking the
documentation, some were just missing.

We also publish the low-level interface to the disassembler engine,
which is totally new.

Last but not least we provide a precise description of the algorithm
and the interaction of the engine with the knowledge base.

* some grammar tweaks and cleanup
---
 lib/bap/bap.mli                      | 369 ++++++++++++++++++++++++++-
 lib/bap_disasm/bap_disasm.ml         |   1 +
 lib/bap_disasm/bap_disasm.mli        |   1 +
 lib/bap_disasm/bap_disasm_driver.ml  |  27 +-
 lib/bap_disasm/bap_disasm_driver.mli |  12 +
 5 files changed, 399 insertions(+), 11 deletions(-)

diff --git a/lib/bap/bap.mli b/lib/bap/bap.mli
index 209495e88..069c7ac0e 100644
--- a/lib/bap/bap.mli
+++ b/lib/bap/bap.mli
@@ -6596,17 +6596,253 @@ module Std : sig
 
   end
 
-  (** Disassembled program.
-      An interface for diassembling things. *)
+  (** The interface to the disassembler level.
+
+      The following definitions are used in documentation of modules
+      and functions in this interface.
+
+      An {i instruction} is a sequence of consecutive bytes that has
+      known decoding in the given instruction set architecture
+      (ISA). The following semantic properties of an instruction, as
+      provided by ISA specification. In the definitions below the
+      following properties play an important role
+      (see {!Insn.property} for more details about the properties):
+      - [call];
+      - [jump].
+
+      An {i instruction address} is the address of the first byte of
+      the instruction.
+
+      A [jump] instruction {i destination} is an address defined by
+      ISA specification to which the control flow should transfer if
+      the jump is taken. Potentially, it is possible that the
+      destination of a jump instruction follows the instruction, but
+      otherwise, the instruction that follows the instruction is not
+      the destination, only destinations of the {i taken} jump are
+      considered to be in the set of destinations of an instruction.
+
+      An instruction is a {i conditional} [jump] if it is a [jump]
+      instruction that is not always taken, as defined by the ISA
+      specification.
+
+      An instruction is a {i barrier} if it a [jump] that is not a
+      [call] and is not conditional.
+
+      An {i execution order}, is an order in which CPU executes
+      instructions.
+
+      The {i linear order} of a sequence of instructions is the
+      ascending order of their addresses.
+
+      An instruction is {i delayed} by [m > 0] instructions if it takes
+      effect not immediately but after [m] other instructions are
+      executed.
+
+      An instruction [i(k)] {i follows} the instruction [i(j)] if
+      [i(j)] is not a barrier and either the address of [i(k)] is
+      the successor of the address of the last byte of [i(j)] or if
+      either [i(k+m)] or [i(k)] is an instruction that is delayed
+      by [m > 0] instructions.
+
+      A {i chain of instructions} is a sequence of instruction
+      [{i(0); ...; i(k),i(k+1),i(n)}] so that i(k+1) is either
+      a resolved destination of [i(k)] or follows it. An instruction
+      can belong to more than one chain.
+
+      A {i valid chain of instructions} is a chain where the last
+      instruction is a [jump] instruction that is either indirect or
+      its destinations belong to some previous jump in the same
+      chain.
+
+      An instruction is {i valid} if it belongs to a valid chain of
+      instructions.
+
+      A byte is {i data} if one the following is true:
+      1) its address is an address of an instruction that is not valid;
+      2) it was classified in the knowledge base as data;
+      3) it is not an instruction.
+
+      A {i basic block} is an non-empty instruction chain
+      [{i(1); ... i(n)}] such that for each [1 < i <= n],
+      - [i(i)] follows [i(i-1)];
+      - there is no valid instruction in the knowledge base that has
+        [i(i)] as a known destination;
+      - [i(i)] is not a jump when [i < n].
+
+      A {i subroutine} is a non-empty finite set of basic blocks
+      [{b(1); ..; b(n)}] such that [b(1)] dominates each block in
+      [{b(2); ..; b(n)}] (which also implies that they are
+      reachable) and [b(1)] is called the {i entry} block (or point).
+  *)
   module Disasm : sig
 
+    (** Disassembler Driver.
+
+        This is a low-level interface to the CFG reconstruction and
+        disassembling engine. It is used by BAP's high-level
+        components, such as the recursive-descent disassembler, so
+        there is in general no need to use it directly, unless you're
+        devising a custom disassembly pipe-line.
+
+
+        The disassembler is driven and controlled by the knowledge
+        base, so it is possible to modify the behavior of the BAP
+        disassembler layer through the knowledge base and turn the
+        default recursive-descent mode into something more
+        conservative, e.g., speculative, superset, shingled, or even
+        probabilistic disassembler.
+
+        {3 Algorithm}
+
+        {4 Memory Classification}
+
+        When the driver is fed with a new memory region (using the
+        {!Driver.scan} function), it uses the knowledge base to
+        initially classify addresses that belong to this region.
+
+        For each byte in the region, it creates a temporary
+        [core-theory:program] object and sets is [label-address]
+        property to the address of that byte. It then queries if it is
+        a function start ([core-theory:is-function]) and if it is
+        known to be code or data (if the [core-theory:is-valid]
+        property is [(true)] then it is classified as code and if it
+        [(false)] then it is data, otherwise, it is undetermined). All
+        objects created during classification are deleted immediately
+        after the query and never committed to the knowledge base
+        (they are scoped objects). Therefore it is fine to speculate
+        and assume that all bytes are code by providing [(true)] to
+        the [is-valid] property of each byte.
+
+
+        {4 Disassembling}
+
+        The disassembling starts from each function start (as
+        identified by the previous step) and continues until there is
+        no more unprocessed function starts and all addresses, which
+        were classified as code, are either successfully disassembled
+        or proved to be data.
+
+        During disassembling the driver will discover more jump
+        destinations and add them to the worklist. The default mode is
+        speculative, i.e., when we meet a barrier, we continue
+        disassembling beyond it. If the worklist is empty, but the set
+        of addresses that were classified as code (in the first step)
+        is still not empty, which means that these addresses are not
+        reachable from the initially provided starting points
+        (function starts) then the minimal address is extracted from
+        the set and is assumed to be a start of a basic block and
+        added to the worklist.
+
+        The process continues until both the worklist and the set of
+        code addresses are empty. When the process converges the
+        knowledge base will contain all disassembled instructions
+        (though some of them might be invalid). The result of the
+        disassembly is the value that contains information sufficient
+        to reconstruct the control-flow graph of the program. It could
+        be queried directly, using various accessors or folded over
+        with the [explore] function, which is a generalized
+        control-flow graph building function.
+
+        {4 Backtracking}
+
+        The disassembler has a backtracking mechanism that enables it
+        to track each disassembled byte back to the memory byte that
+        was initially marked as code or a function start. When we
+        identify an instruction chain that is invalid, i.e., when data
+        follow a machine instruction or when its destination is some
+        data, we retract the whole chain of instructions. This ensures
+        that all valid instructions belong to at least one valid
+        instruction chain. The justification for not including invalid
+        instructions chains into the disassembly is that such chains
+        will unconditionally switch the CPU into the invalid
+        instruction state which will terminate the program. Since such
+        a chain can't include system calls or CPU exceptions (both are
+        not barriers) it can't have any side-effects visible outside
+        of the process so it could be safely ignored.
+
+        {4 Delay slots}
+
+        Any instruction could have a delay ([core-theory:delay]) that
+        is greater than zero. In that case the execution order of the
+        instructions will not be equal to the linear order of the
+        instructions addresses and [m] instructions that follow the
+        delayed instruction will be executed before that instruction
+        (put in the basic block before it), where [m] is the size of
+        the delay slot, expressed in instructions.
+
+        @since 2.0.0
+    *)
     module Driver : sig
-      type state
+
+      (** information necessary to build the control-flow graph.
+
+          @since 2.2.0 implements [bin_io]
+      *)
+      type state [@@deriving bin_io]
+
+      (** abstract type for a sequence of instructions.  *)
       type insns
 
+
+      (** abstract representation of a jump instruction.  *)
+      type jump
+
+      (** [init] the initial disassembler state.  *)
       val init : state
+
+
+      (** [merge x y] is a sum of information in states [x] and [y].
+
+          @since 2.2.0
+      *)
+      val merge : state -> state -> state
+
+
+      (** [scan mem state] updates the state.
+
+          If bytes in [mem] were already scanned then returns [state]
+          without changes. Nothing is stored in the knowledge base.
+
+          If it is a new memory region then classifies and
+          disassembles the whole region. For each disassembled address
+          [p] an object [Theory.Label.for_addr p] is stored in the
+          knowledge base. It will contain a fully disassembled and
+          lifted instruction. All instructions that are subroutine
+          entry points will have [is-subroutine] property set to [(true)].
+
+          See {! sec:Algorithm} for the detailed description of the
+          algorithm.
+      *)
       val scan : mem -> state -> state knowledge
 
+
+      (** [explore ~block ~node ~edge ~init state] builds a
+          control-flow graph from the [state].
+
+          This function is a generalized fold function that calls the
+          user provided functions to construct the graph, which has
+          abstract type ['g].
+
+          - [block mem insns] creates a basic block of type ['n] which
+            covers memory [mem]; [insns] is the sequence of
+            instructions that constitute that memory, use [list_insns insns]
+            to get instructions in the linear order of their
+            addresses, or [execution_order] to get the execution
+            order (which might be different from linear in the
+            presence of delayed and speculative instructions).
+
+          - [node x g] inserts the node [x] into graph [g];
+          - [edge x y g] inserts an edge between nodes [x] and [y];
+          - [init] is the initial graph;
+          - [follow x] if it returns [true] then the function will
+            follow this destination. Defaults to a function that
+            always evaluates to [true].
+
+          - [entry] is the entry point from which to build the graph,
+            if absent, then all basic blocks will be consecuitively,
+            in the order of ascending addresses, used as the entry
+            points.
+      *)
       val explore :
         ?entry:addr ->
         ?follow:(addr -> bool knowledge) ->
@@ -6616,11 +6852,138 @@ module Std : sig
         init:'c ->
         state -> 'c knowledge
 
+
+      (** [list_insns xs] returns the list of instructions in the
+          ascending order of their addresses (or descending if [rev]
+          is [true] (defaults to [false]. *)
       val list_insns : ?rev:bool -> insns -> Theory.Label.t list
+
+
+      (** [execution_order xs] reruns a list of instructions in the
+          order in which they will be executed by the target CPU,
+          which could be different when instructions are delayed or
+          speculatively executed. *)
       val execution_order : insns -> Theory.Label.t list knowledge
+
+      (** {3 Low-level interface}
+
+          All functions in this interface were made availabe in BAP
+          2.2.0 unless stated otherwise. *)
+
+
+      (** [subroutines state] is a set of subroutine entry points that
+          either were provided through the knowledge base or later
+          discovered as destinations of call instructions.
+
+          @since 2.2.0
+      *)
+      val subroutines : state -> Set.M(Addr).t
+
+
+      (** [blocks state] is the set of addresses of instructions that
+          start basic blocks.
+          @since 2.2.0
+
+      *)
+      val blocks : state -> Set.M(Addr).t
+
+
+      (** [is_data state x] is [true] if [x] was classified as data.
+          @since 2.2.0 *)
+      val is_data : state -> addr -> bool
+
+      (** [is_subroutine s x] is [Set.mem (subroutines s) x].
+          @since 2.2.0 *)
+      val is_subroutine : state -> addr -> bool
+
+      (** [is_block s x] is [Set.mem (blocks s) x].
+          @since 2.2.0 *)
+      val is_block : state -> addr -> bool
+
+
+      (** [is_jump s x] is [true] if [x] is the maximal address of an
+          instruction in the basic block that terminates with a jump.
+
+          Note, that when jumps are delayed, the linear order of
+          instructions differs from the execution order, so the
+          address of the last instruction is not the maximal address
+          in the basic block.
+
+          @since 2.2.0 *)
+      val is_jump : state -> addr -> bool
+
+      (** [jump state src] if [src] is the last in the linear order
+          instruction of a basic block that terminates with a jump
+          instruction then returns the descrption of that jump
+          instruction.
+
+          @since 2.2.0
+      *)
+      val jump : state -> addr -> jump option
+
+
+      (** [destinations jump] returns the set of resolved
+          destinations.
+          @since 2.2.0
+      *)
+      val destinations : jump -> Set.M(Addr).t
+
+
+      (** [is_call jump] is [true] if jump is [call].
+          @since 2.2.0 *)
+      val is_call : jump -> bool
+
+      (** [is_barrier jump] is [true] if jump is [barrier].
+          @since 2.2.0 *)
+      val is_barrier : jump -> bool
     end
 
 
+    (** A set of subroutines.
+
+        A partition of a whole program control-flow graph into a
+        quotient set of subroutines.
+
+        The algorithm builds the minimal partion, i.e., it guarantees
+        that no other partition exists that has fewer
+        elements.
+
+        The number of elements heavily depends on the number of
+        function starts that were provided as the input (e.g., if the
+        function start identification procedure has a lot of false
+        positives, expect a lot of subroutines).
+
+        @since made public since 2.2.0
+    *)
+    module Subroutines : sig
+      type t [@@deriving bin_io]
+
+      (** [empty] is the empty partition.  *)
+      val empty : t
+
+      (** [equal p1 p2] is [true] if [p1] is structurally equal [p2].  *)
+      val equal : t -> t -> bool
+
+      (** [update p s] updates the partitioning with the newly
+          disassembled blocks.  *)
+      val update : t -> Driver.state -> t knowledge
+
+      (** [belongs part ~entry addr] is true if [addr] belongs to a
+          basic block of subroutine with the given [entry].  *)
+      val belongs : t -> entry:addr -> addr -> bool
+
+      (** [entries part] is the set of entry points of all
+          subroutines in the partition.
+
+          [Set.length (entries part)] is the cardinality of the
+          partition. *)
+      val entries : t -> Set.M(Addr).t
+
+      (** [siblings part x y] is [true] if [x] and [y] belong to the
+          same subroutine.  *)
+      val siblings : t -> addr -> addr -> bool
+    end
+
     type t = disasm
 
     (** [create cfg]   *)
diff --git a/lib/bap_disasm/bap_disasm.ml b/lib/bap_disasm/bap_disasm.ml
index eb52f26d8..e6c260228 100644
--- a/lib/bap_disasm/bap_disasm.ml
+++ b/lib/bap_disasm/bap_disasm.ml
@@ -63,6 +63,7 @@ let of_rec d = {
 
 module Disasm = struct
   module Driver = Bap_disasm_driver
+  module Subroutines = Bap_disasm_calls
 
   type t = disasm
   type 'a disassembler = ?backend:string -> ?brancher:brancher -> ?rooter:rooter -> 'a
diff --git a/lib/bap_disasm/bap_disasm.mli b/lib/bap_disasm/bap_disasm.mli
index 096d1bb9b..9a7dd1942 100644
--- a/lib/bap_disasm/bap_disasm.mli
+++ b/lib/bap_disasm/bap_disasm.mli
@@ -14,6 +14,7 @@ type cfg = Bap_disasm_rec.Cfg.t [@@deriving compare]
 
 module Disasm : sig
   module Driver = Bap_disasm_driver
+  module Subroutines = Bap_disasm_calls
 
   type t = disasm
   type 'a disassembler = ?backend:string -> ?brancher:brancher -> ?rooter:rooter -> 'a
diff --git a/lib/bap_disasm/bap_disasm_driver.ml b/lib/bap_disasm/bap_disasm_driver.ml
index 0369ed8a9..e9589ab90 100644
--- a/lib/bap_disasm/bap_disasm_driver.ml
+++ b/lib/bap_disasm/bap_disasm_driver.ml
@@ -13,7 +13,7 @@ type insn = Insn.t [@@deriving sexp_of]
 type edge = [`Jump | `Cond | `Fall] [@@deriving compare]
 
 
-type dsts = {
+type jump = {
   call : bool;
   barrier : bool;
   indirect : bool;
@@ -24,7 +24,7 @@ module Machine : sig
   type task = private
     | Dest of {dst : addr; parent : task option}
     | Fall of {dst : addr; parent : task; delay : slot}
-    | Jump of {src : addr; age: int; dsts : dsts; parent : task}
+    | Jump of {src : addr; age: int; dsts : jump; parent : task}
   and slot = private
     | Ready of task option
     | Delay
@@ -38,7 +38,7 @@ module Machine : sig
     addr : addr;                (* current address *)
     dels : Set.M(Addr).t;
     begs : task list Map.M(Addr).t;       (* begins of basic blocks *)
-    jmps : dsts Map.M(Addr).t;  (* jumps  *)
+    jmps : jump Map.M(Addr).t;  (* jumps  *)
     code : Set.M(Addr).t;       (* all valid instructions *)
     data : Set.M(Addr).t;       (* all non-instructions *)
     usat : Set.M(Addr).t;       (* unsatisfied constraints *)
@@ -58,7 +58,7 @@ module Machine : sig
     ready:(state -> mem -> 'a) -> 'a
 
   val failed : state -> addr -> state
-  val jumped : state -> mem -> dsts -> int -> state
+  val jumped : state -> mem -> jump -> int -> state
   val stopped : state -> state
   val moved  : state -> mem -> state
   val is_ready : state -> bool
@@ -67,7 +67,7 @@ end = struct
   type task =
     | Dest of {dst : addr; parent : task option}
     | Fall of {dst : addr; parent : task; delay : slot}
-    | Jump of {src : addr; age: int; dsts : dsts; parent : task}
+    | Jump of {src : addr; age: int; dsts : jump; parent : task}
   and slot =
     | Ready of task option
     | Delay
@@ -87,7 +87,7 @@ end = struct
     addr : addr;                (* current address *)
     dels : Set.M(Addr).t;
     begs : task list Map.M(Addr).t;       (* begins of basic blocks *)
-    jmps : dsts Map.M(Addr).t;  (* jumps  *)
+    jmps : jump Map.M(Addr).t;  (* jumps  *)
     code : Set.M(Addr).t;       (* all valid instructions *)
     data : Set.M(Addr).t;       (* all non-instructions *)
     usat : Set.M(Addr).t;       (* unsatisfied constraints *)
@@ -364,7 +364,7 @@ type insns = Theory.Label.t list
 type state = {
   funs : Addr.Set.t;
   begs : Addr.Set.t;
-  jmps : dsts Addr.Map.t;
+  jmps : jump Addr.Map.t;
   data : Addr.Set.t;
   mems : mem list;
   debt : Machine.task list;
@@ -379,7 +379,18 @@ let init = {
   debt = [];
 }
 
-let subroutines {funs} = funs
+let subroutines x = x.funs
+let blocks x = x.begs
+let jump {jmps} = Map.find jmps
+
+let is_data {data} = Set.mem data
+let is_subroutine {funs} = Set.mem funs
+let is_jump {jmps} = Map.mem jmps
+let is_block {begs} = Set.mem begs
+
+let destinations dsts = dsts.resolved
+let is_call dsts = dsts.call
+let is_barrier dsts = dsts.barrier
 
 let query_arch addr =
   KB.Object.scoped Theory.Program.cls @@ fun obj ->
diff --git a/lib/bap_disasm/bap_disasm_driver.mli b/lib/bap_disasm/bap_disasm_driver.mli
index 353ea9b7f..1c3f82ea2 100644
--- a/lib/bap_disasm/bap_disasm_driver.mli
+++ b/lib/bap_disasm/bap_disasm_driver.mli
@@ -7,12 +7,24 @@ module Dis = Bap_disasm_basic
 
 type state [@@deriving bin_io]
 type insns
+type jump
 
 val init : state
 val scan : mem -> state -> state knowledge
 val merge : state -> state -> state
 
 val subroutines : state -> Set.M(Addr).t
+val blocks : state -> Set.M(Addr).t
+val jump : state -> addr -> jump option
+
+val is_data : state -> addr -> bool
+val is_subroutine : state -> addr -> bool
+val is_jump : state -> addr -> bool
+val is_block : state -> addr -> bool
+
+val destinations : jump -> Set.M(Addr).t
+val is_call : jump -> bool
+val is_barrier : jump -> bool
 
 val explore :
   ?entry:addr ->