diff --git a/src/lat/sausages.cc b/src/lat/sausages.cc index 7cb7a273b98..16a61b3f5eb 100644 --- a/src/lat/sausages.cc +++ b/src/lat/sausages.cc @@ -114,11 +114,11 @@ double MinimumBayesRisk::EditDistance(int32 N, int32 Q, for (int32 q = 0; q <= Q; q++) { if (q == 0) { alpha_dash_arc(q) = // line 15. - alpha_dash(s_a, q) + l(w_a, 0) + delta(); + alpha_dash(s_a, q) + l(w_a, 0, true); } else { // a1,a2,a3 are the 3 parts of min expression of line 17. int32 r_q = r(q); double a1 = alpha_dash(s_a, q-1) + l(w_a, r_q), - a2 = alpha_dash(s_a, q) + l(w_a, 0) + delta(), + a2 = alpha_dash(s_a, q) + l(w_a, 0, true), a3 = alpha_dash_arc(q-1) + l(0, r_q); alpha_dash_arc(q) = std::min(a1, std::min(a2, a3)); } @@ -166,11 +166,11 @@ void MinimumBayesRisk::AccStats() { const Arc &arc = arcs_[pre_[n][i]]; int32 s_a = arc.start_node, w_a = arc.word; BaseFloat p_a = arc.loglike; - alpha_dash_arc(0) = alpha_dash(s_a, 0) + l(w_a, 0) + delta(); // line 14. + alpha_dash_arc(0) = alpha_dash(s_a, 0) + l(w_a, 0, true); // line 14. for (int32 q = 1; q <= Q; q++) { // this loop == lines 15-18. int32 r_q = r(q); double a1 = alpha_dash(s_a, q-1) + l(w_a, r_q), - a2 = alpha_dash(s_a, q) + l(w_a, 0) + delta(), + a2 = alpha_dash(s_a, q) + l(w_a, 0, true), a3 = alpha_dash_arc(q-1) + l(0, r_q); if (a1 <= a2) { if (a1 <= a3) { b_arc[q] = 1; alpha_dash_arc(q) = a1; } diff --git a/src/lat/sausages.h b/src/lat/sausages.h index a6af91cc12f..9dab0b68713 100644 --- a/src/lat/sausages.h +++ b/src/lat/sausages.h @@ -128,8 +128,18 @@ class MinimumBayesRisk { /// Minimum-Bayes-Risk Decode. Top-level algorithm. Figure 6 of the paper. void MbrDecode(); - /// The basic edit-distance function l(a,b), as in the paper. - inline double l(int32 a, int32 b) { return (a == b ? 0.0 : 1.0); } + /// Without the 'penalize' argument this gives us the basic edit-distance + /// function l(a,b), as in the paper. 
+ /// With the 'penalize' argument it can be interpreted as the edit distance + /// plus the 'delta' from the paper, except that we make a kind of conceptual + /// bug-fix and only apply the delta if the edit-distance was not already + /// zero. This bug-fix was necessary in order to force all the stats to show + /// up, that should show up, and applying the bug-fix makes the sausage stats + /// significantly less sparse. + inline double l(int32 a, int32 b, bool penalize = false) { + if (a == b) return 0.0; + else return (penalize ? 1.0 + delta() : 1.0); + } /// returns r_q, in one-based indexing, as in the paper. inline int32 r(int32 q) { return R_[q-1]; } @@ -151,8 +161,14 @@ // epsilon (0). (But if no words in vec, just one epsilon) static void NormalizeEps(std::vector<int32> *vec); - static inline BaseFloat delta() { return 1.0e-05; } // A constant - // used in the algorithm. + // delta() is a constant used in the algorithm, which penalizes + // the use of certain epsilon transitions in the edit-distance which would cause + // words not to show up in the accumulated edit-distance statistics. + // There has been a conceptual bug-fix versus the way it was presented in + // the paper: we now add delta only if the edit-distance was not already + // zero. + static inline BaseFloat delta() { return 1.0e-05; } + /// Function used to increment map. static inline void AddToMap(int32 i, double d, std::map<int32, double> *gamma) {