diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml
index 2a2c373242201..16cd55c8a45f0 100644
--- a/mllib-local/pom.xml
+++ b/mllib-local/pom.xml
@@ -68,6 +68,13 @@
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
@@ -81,6 +88,34 @@
+  <profiles>
+    <profile>
+      <id>jvm-vectorized</id>
+      <properties>
+        <extra.source.dir>src/jvm-vectorized/java</extra.source.dir>
+      </properties>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>org.codehaus.mojo</groupId>
+            <artifactId>build-helper-maven-plugin</artifactId>
+            <executions>
+              <execution>
+                <id>add-vectorized-sources</id>
+                <phase>generate-sources</phase>
+                <goals>
+                  <goal>add-source</goal>
+                </goals>
+                <configuration>
+                  <sources>
+                    <source>${extra.source.dir}</source>
+                  </sources>
+                </configuration>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+  </profiles>
   <build>
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
diff --git a/mllib-local/src/jvm-vectorized/java/org/apache/spark/ml/linalg/VectorizedBLAS.java b/mllib-local/src/jvm-vectorized/java/org/apache/spark/ml/linalg/VectorizedBLAS.java
new file mode 100644
index 0000000000000..7db1bb9111a00
--- /dev/null
+++ b/mllib-local/src/jvm-vectorized/java/org/apache/spark/ml/linalg/VectorizedBLAS.java
@@ -0,0 +1,483 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.linalg;
+
+import com.github.fommil.netlib.F2jBLAS;
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
+public class VectorizedBLAS extends F2jBLAS {
+
+  private static final VectorSpecies<Float> FMAX = FloatVector.SPECIES_MAX;
+  private static final VectorSpecies<Double> DMAX = DoubleVector.SPECIES_MAX;
+
+  // y += alpha * x
+  @Override
+  public void daxpy(int n, double alpha, double[] x, int incx, double[] y, int incy) {
+    if (n >= 0
+        && x != null && x.length >= n && incx == 1
+        && y != null && y.length >= n && incy == 1) {
+      if (alpha != 0.)
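+      // Pattern used throughout this class: a vectorized main loop over
+      // DMAX.loopBound(n) elements (the largest multiple of the species
+      // length that is <= n), followed by a scalar loop for the leftover tail.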
{ + DoubleVector valpha = DoubleVector.broadcast(DMAX, alpha); + int i = 0; + for (; i < DMAX.loopBound(n); i += DMAX.length()) { + DoubleVector vx = DoubleVector.fromArray(DMAX, x, i); + DoubleVector vy = DoubleVector.fromArray(DMAX, y, i); + vx.fma(valpha, vy).intoArray(y, i); + } + for (; i < n; i += 1) { + y[i] += alpha * x[i]; + } + } + } else { + super.daxpy(n, alpha, x, incx, y, incy); + } + } + + // sum(x * y) + @Override + public float sdot(int n, float[] x, int incx, float[] y, int incy) { + if (n >= 0 + && x != null && x.length >= n && incx == 1 + && y != null && y.length >= n && incy == 1) { + float sum = 0.0f; + int i = 0; + FloatVector vsum = FloatVector.zero(FMAX); + for (; i < FMAX.loopBound(n); i += FMAX.length()) { + FloatVector vx = FloatVector.fromArray(FMAX, x, i); + FloatVector vy = FloatVector.fromArray(FMAX, y, i); + vsum = vx.fma(vy, vsum); + } + sum += vsum.reduceLanes(VectorOperators.ADD); + for (; i < n; i += 1) { + sum += x[i] * y[i]; + } + return sum; + } else { + return super.sdot(n, x, incx, y, incy); + } + } + + // sum(x * y) + @Override + public double ddot(int n, double[] x, int incx, double[] y, int incy) { + if (n >= 0 + && x != null && x.length >= n && incx == 1 + && y != null && y.length >= n && incy == 1) { + double sum = 0.; + int i = 0; + DoubleVector vsum = DoubleVector.zero(DMAX); + for (; i < DMAX.loopBound(n); i += DMAX.length()) { + DoubleVector vx = DoubleVector.fromArray(DMAX, x, i); + DoubleVector vy = DoubleVector.fromArray(DMAX, y, i); + vsum = vx.fma(vy, vsum); + } + sum += vsum.reduceLanes(VectorOperators.ADD); + for (; i < n; i += 1) { + sum += x[i] * y[i]; + } + return sum; + } else { + return super.ddot(n, x, incx, y, incy); + } + } + + @Override + public void dscal(int n, double alpha, double[] x, int incx) { + dscal(n, alpha, x, 0, incx); + } + + // x = alpha * x + @Override + public void dscal(int n, double alpha, double[] x, int offsetx, int incx) { + if (n >= 0 && x != null && x.length >= offsetx + n && incx == 1) { + if (alpha != 1.) { + DoubleVector valpha = DoubleVector.broadcast(DMAX, alpha); + int i = 0; + for (; i < DMAX.loopBound(n); i += DMAX.length()) { + DoubleVector vx = DoubleVector.fromArray(DMAX, x, offsetx + i); + vx.mul(valpha).intoArray(x, offsetx + i); + } + for (; i < n; i += 1) { + x[offsetx + i] *= alpha; + } + } + } else { + super.dscal(n, alpha, x, offsetx, incx); + } + } + + @Override + public void sscal(int n, float alpha, float[] x, int incx) { + sscal(n, alpha, x, 0, incx); + } + + // x = alpha * x + @Override + public void sscal(int n, float alpha, float[] x, int offsetx, int incx) { + if (n >= 0 && x != null && x.length >= offsetx + n && incx == 1) { + if (alpha != 1.) { + FloatVector valpha = FloatVector.broadcast(FMAX, alpha); + int i = 0; + for (; i < FMAX.loopBound(n); i += FMAX.length()) { + FloatVector vx = FloatVector.fromArray(FMAX, x, offsetx + i); + vx.mul(valpha).intoArray(x, offsetx + i); + } + for (; i < n; i += 1) { + x[offsetx + i] *= alpha; + } + } + } else { + super.sscal(n, alpha, x, offsetx, incx); + } + } + + // y = alpha * a * x + beta * y + @Override + public void dspmv(String uplo, int n, double alpha, double[] a, + double[] x, int incx, double beta, double[] y, int incy) { + if ("U".equals(uplo) + && n >= 0 + && a != null && a.length >= n * (n + 1) / 2 + && x != null && x.length >= n && incx == 1 + && y != null && y.length >= n && incy == 1) { + // y = beta * y + dscal(n, beta, y, 1); + // y += alpha * A * x + if (alpha != 0.) 
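+      // a holds the upper triangle in packed column-major order: element (i, j)
+      // with i <= j is stored at a[i + j * (j + 1) / 2]. Each iteration below
+      // applies one packed column to y, covering both the dot-product term for
+      // y[row] and the symmetric scatter into y[0..row-1].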
{ + DoubleVector valpha = DoubleVector.broadcast(DMAX, alpha); + for (int row = 0; row < n; row += 1) { + int col = 0; + DoubleVector vyrowsum = DoubleVector.zero(DMAX); + DoubleVector valphaxrow = DoubleVector.broadcast(DMAX, alpha * x[row]); + for (; col < DMAX.loopBound(row); col += DMAX.length()) { + DoubleVector vx = DoubleVector.fromArray(DMAX, x, col); + DoubleVector vy = DoubleVector.fromArray(DMAX, y, col); + DoubleVector va = DoubleVector.fromArray(DMAX, a, col + row * (row + 1) / 2); + vyrowsum = valpha.mul(vx).fma(va, vyrowsum); + valphaxrow.fma(va, vy).intoArray(y, col); + } + y[row] += vyrowsum.reduceLanes(VectorOperators.ADD); + for (; col < row; col += 1) { + y[row] += alpha * x[col] * a[col + row * (row + 1) / 2]; + y[col] += alpha * x[row] * a[col + row * (row + 1) / 2]; + } + y[row] += alpha * x[col] * a[col + row * (row + 1) / 2]; + } + } + } else { + super.dspmv(uplo, n, alpha, a, x, incx, beta, y, incy); + } + } + + // a += alpha * x * x.t + @Override + public void dspr(String uplo, int n, double alpha, double[] x, int incx, double[] a) { + if ("U".equals(uplo) + && n >= 0 + && x != null && x.length >= n && incx == 1 + && a != null && a.length >= n * (n + 1) / 2) { + if (alpha != 0.) { + for (int row = 0; row < n; row += 1) { + int col = 0; + DoubleVector valphaxrow = DoubleVector.broadcast(DMAX, alpha * x[row]); + for (; col < DMAX.loopBound(row + 1); col += DMAX.length()) { + DoubleVector vx = DoubleVector.fromArray(DMAX, x, col); + DoubleVector va = DoubleVector.fromArray(DMAX, a, col + row * (row + 1) / 2); + vx.fma(valphaxrow, va).intoArray(a, col + row * (row + 1) / 2); + } + for (; col < row + 1; col += 1) { + a[col + row * (row + 1) / 2] += alpha * x[row] * x[col]; + } + } + } + } else { + super.dspr(uplo, n, alpha, x, incx, a); + } + } + + // a += alpha * x * x.t + @Override + public void dsyr(String uplo, int n, double alpha, double[] x, int incx, double[] a, int lda) { + if ("U".equals(uplo) + && n >= 0 + && x != null && x.length >= n && incx == 1 + && a != null && a.length >= n * n && lda == n) { + if (alpha != 0.) { + for (int row = 0; row < n; row += 1) { + int col = 0; + DoubleVector valphaxrow = DoubleVector.broadcast(DMAX, alpha * x[row]); + for (; col < DMAX.loopBound(row + 1); col += DMAX.length()) { + DoubleVector vx = DoubleVector.fromArray(DMAX, x, col); + DoubleVector va = DoubleVector.fromArray(DMAX, a, col + row * n); + vx.fma(valphaxrow, va).intoArray(a, col + row * n); + } + for (; col < row + 1; col += 1) { + a[col + row * n] += alpha * x[row] * x[col]; + } + } + } + } else { + super.dsyr(uplo, n, alpha, x, incx, a, lda); + } + } + + @Override + public void dgemv(String trans, int m, int n, + double alpha, double[] a, int lda, double[] x, int incx, + double beta, double[] y, int incy) { + dgemv(trans, m, n, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy); + } + + // y = alpha * A * x + beta * y + @Override + public void dgemv(String trans, int m, int n, + double alpha, double[] a, int offseta, int lda, double[] x, int offsetx, int incx, + double beta, double[] y, int offsety, int incy) { + if ("N".equals(trans) + && m >= 0 && n >= 0 + && a != null && a.length >= offseta + m * n && lda == m + && x != null && x.length >= offsetx + n && incx == 1 + && y != null && y.length >= offsety + m && incy == 1) { + // y = beta * y + dscal(m, beta, y, offsety, 1); + // y += alpha * A * x + if (alpha != 0.) 
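+      // Non-transposed case: y accumulates alpha * x[col] * A[:, col] one column
+      // at a time; columns of A are contiguous, so each update is a vector FMA.
+      // The "T" branch below instead computes one dot product per column of A.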
{ + DoubleVector valpha = DoubleVector.broadcast(DMAX, alpha); + for (int col = 0; col < n; col += 1) { + int row = 0; + for (; row < DMAX.loopBound(m); row += DMAX.length()) { + DoubleVector va = DoubleVector.fromArray(DMAX, a, offseta + row + col * m); + DoubleVector vy = DoubleVector.fromArray(DMAX, y, offsety + row); + valpha.mul(x[offsetx + col]).fma(va, vy) + .intoArray(y, offsety + row); + } + for (; row < m; row += 1) { + y[offsety + row] += alpha * x[offsetx + col] * a[offseta + row + col * m]; + } + } + } + } else if ("T".equals(trans) + && m >= 0 && n >= 0 + && a != null && a.length >= offseta + m * n && lda == m + && x != null && x.length >= offsetx + m && incx == 1 + && y != null && y.length >= offsety + n && incy == 1) { + if (alpha != 0. || beta != 1.) { + for (int col = 0; col < n; col += 1) { + double sum = 0.; + int row = 0; + DoubleVector vsum = DoubleVector.zero(DMAX); + for (; row < DMAX.loopBound(m); row += DMAX.length()) { + DoubleVector va = DoubleVector.fromArray(DMAX, a, offseta + row + col * m); + DoubleVector vx = DoubleVector.fromArray(DMAX, x, offsetx + row); + vsum = va.fma(vx, vsum); + } + sum += vsum.reduceLanes(VectorOperators.ADD); + for (; row < m; row += 1) { + sum += x[offsetx + row] * a[offseta + row + col * m]; + } + y[offsety + col] = alpha * sum + beta * y[offsety + col]; + } + } + } else { + super.dgemv(trans, m, n, alpha, a, offseta, lda, x, offsetx, incx, beta, y, offsety, incy); + } + } + + @Override + public void sgemv(String trans, int m, int n, + float alpha, float[] a, int lda, float[] x, int incx, + float beta, float[] y, int incy) { + sgemv(trans, m, n, alpha, a, 0, lda, x, 0, incx, beta, y, 0, incy); + } + + // y = alpha * A * x + beta * y + @Override + public void sgemv(String trans, int m, int n, + float alpha, float[] a, int offseta, int lda, float[] x, int offsetx, int incx, + float beta, float[] y, int offsety, int incy) { + if ("N".equals(trans) + && m >= 0 && n >= 0 + && a != null && a.length >= offseta + m * n && lda == m + && x != null && x.length >= offsetx + n && incx == 1 + && y != null && y.length >= offsety + m && incy == 1) { + // y = beta * y + sscal(m, beta, y, offsety, 1); + // y += alpha * A * x + if (alpha != 0.f) { + FloatVector valpha = FloatVector.broadcast(FMAX, alpha); + for (int col = 0; col < n; col += 1) { + int row = 0; + for (; row < FMAX.loopBound(m); row += FMAX.length()) { + FloatVector va = FloatVector.fromArray(FMAX, a, offseta + row + col * m); + FloatVector vy = FloatVector.fromArray(FMAX, y, offsety + row); + valpha.mul(x[offsetx + col]).fma(va, vy) + .intoArray(y, offsety + row); + } + for (; row < m; row += 1) { + y[offsety + row] += alpha * x[offsetx + col] * a[offseta + row + col * m]; + } + } + } + } else if ("T".equals(trans) + && m >= 0 && n >= 0 + && a != null && a.length >= offseta + m * n && lda == m + && x != null && x.length >= offsetx + m && incx == 1 + && y != null && y.length >= offsety + n && incy == 1) { + if (alpha != 0. || beta != 1.) 
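+      // Transposed case: y[col] = alpha * dot(A[:, col], x) + beta * y[col],
+      // using a vectorized dot product since each column of A is contiguous.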
{ + for (int col = 0; col < n; col += 1) { + float sum = 0.f; + int row = 0; + FloatVector vsum = FloatVector.zero(FMAX); + for (; row < FMAX.loopBound(m); row += FMAX.length()) { + FloatVector va = FloatVector.fromArray(FMAX, a, offseta + row + col * m); + FloatVector vx = FloatVector.fromArray(FMAX, x, offsetx + row); + vsum = va.fma(vx, vsum); + } + sum += vsum.reduceLanes(VectorOperators.ADD); + for (; row < m; row += 1) { + sum += x[offsetx + row] * a[offseta + row + col * m]; + } + y[offsety + col] = alpha * sum + beta * y[offsety + col]; + } + } + } else { + super.sgemv(trans, m, n, alpha, a, offseta, lda, x, offsetx, incx, beta, y, offsety, incy); + } + } + + @Override + public void dgemm(String transa, String transb, int m, int n, int k, + double alpha, double[] a, int lda, double[] b, int ldb, + double beta, double[] c, int ldc) { + dgemm(transa, transb, m, n, k, alpha, a, 0, lda, b, 0, ldb, beta, c, 0, ldc); + } + + // c = alpha * a * b + beta * c + @Override + public void dgemm(String transa, String transb, int m, int n, int k, + double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, + double beta, double[] c, int offsetc, int ldc) { + if ("N".equals(transa) && "N".equals(transb) + && m >= 0 && n >= 0 && k >= 0 + && a != null && a.length >= offseta + m * k && lda == m + && b != null && b.length >= offsetb + k * n && ldb == k + && c != null && c.length >= offsetc + m * n && ldc == m) { + // C = beta * C + dscal(m * n, beta, c, offsetc, 1); + // C += alpha * A * B + if (alpha != 0.) { + DoubleVector valpha = DoubleVector.broadcast(DMAX, alpha); + for (int col = 0; col < n; col += 1) { + for (int i = 0; i < k; i += 1) { + int row = 0; + for (; row < DMAX.loopBound(m); row += DMAX.length()) { + DoubleVector va = DoubleVector.fromArray(DMAX, a, offseta + i * m + row); + DoubleVector vc = DoubleVector.fromArray(DMAX, c, offsetc + col * m + row); + valpha.mul(b[offsetb + col * k + i]).fma(va, vc) + .intoArray(c, offsetc + col * m + row); + } + for (; row < m; row += 1) { + c[offsetc + col * m + row] += alpha * a[offseta + i * m + row] * b[offsetb + col * k + i]; + } + } + } + } + } else if ("N".equals(transa) && "T".equals(transb) + && m >= 0 && n >= 0 && k >= 0 + && a != null && a.length >= offseta + m * k && lda == m + && b != null && b.length >= offsetb + k * n && ldb == n + && c != null && c.length >= offsetc + m * n && ldc == m) { + // C = beta * C + dscal(m * n, beta, c, offsetc, 1); + // C += alpha * A * B + if (alpha != 0.) { + DoubleVector valpha = DoubleVector.broadcast(DMAX, alpha); + for (int i = 0; i < k; i += 1) { + for (int col = 0; col < n; col += 1) { + int row = 0; + for (; row < DMAX.loopBound(m); row += DMAX.length()) { + DoubleVector va = DoubleVector.fromArray(DMAX, a, offseta + i * m + row); + DoubleVector vc = DoubleVector.fromArray(DMAX, c, offsetc + col * m + row); + valpha.mul(b[offsetb + col + i * n]).fma(va, vc) + .intoArray(c, offsetc + col * m + row); + } + for (; row < m; row += 1) { + c[offsetc + col * m + row] += alpha * a[offseta + i * m + row] * b[offsetb + col + i * n]; + } + } + } + } + } else if ("T".equals(transa) && "N".equals(transb) + && m >= 0 && n >= 0 && k >= 0 + && a != null && a.length >= offseta + m * k && lda == k + && b != null && b.length >= offsetb + k * n && ldb == k + && c != null && c.length >= offsetc + m * n && ldc == m) { + if (alpha != 0. || beta != 1.) 
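+      // A^T * B: c[row, col] is a length-k dot product between column `row` of A
+      // (A is k x m here, so that column is contiguous) and column `col` of B.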
{ + for (int col = 0; col < n; col += 1) { + for (int row = 0; row < m; row += 1) { + double sum = 0.; + int i = 0; + DoubleVector vsum = DoubleVector.zero(DMAX); + for (; i < DMAX.loopBound(k); i += DMAX.length()) { + DoubleVector va = DoubleVector.fromArray(DMAX, a, offseta + i + row * k); + DoubleVector vb = DoubleVector.fromArray(DMAX, b, offsetb + col * k + i); + vsum = va.fma(vb, vsum); + } + sum += vsum.reduceLanes(VectorOperators.ADD); + for (; i < k; i += 1) { + sum += a[offseta + i + row * k] * b[offsetb + col * k + i]; + } + if (beta != 0.) { + c[offsetc + col * m + row] = alpha * sum + beta * c[offsetc + col * m + row]; + } else { + c[offsetc + col * m + row] = alpha * sum; + } + } + } + } + } else if ("T".equals(transa) && "T".equals(transb) + && m >= 0 && n >= 0 && k >= 0 + && a != null && a.length >= offseta + m * k && lda == k + && b != null && b.length >= offsetb + k * n && ldb == n + && c != null && c.length >= offsetc + m * n && ldc == m) { + if (alpha != 0. || beta != 1.) { + // FIXME: do block by block + for (int col = 0; col < n; col += 1) { + for (int row = 0; row < m; row += 1) { + double sum = 0.; + for (int i = 0; i < k; i += 1) { + sum += a[offseta + i + row * k] * b[offsetb + col + i * n]; + } + if (beta != 0.) { + c[offsetc + col * m + row] = alpha * sum + beta * c[offsetc + col * m + row]; + } else { + c[offsetc + col * m + row] = alpha * sum; + } + } + } + } + } else { + super.dgemm(transa, transb, m, n, k, + alpha, a, offseta, lda, b, offsetb, ldb, + beta, c, offsetc, ldc); + } + } +} diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/BLAS.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/BLAS.scala index b6c1b011f004c..518c71129a970 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/BLAS.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/BLAS.scala @@ -18,28 +18,51 @@ package org.apache.spark.ml.linalg import com.github.fommil.netlib.{BLAS => NetlibBLAS, F2jBLAS} -import com.github.fommil.netlib.BLAS.{getInstance => NativeBLAS} /** * BLAS routines for MLlib's vectors and matrices. */ private[spark] object BLAS extends Serializable { - @transient private var _f2jBLAS: NetlibBLAS = _ + @transient private var _javaBLAS: NetlibBLAS = _ @transient private var _nativeBLAS: NetlibBLAS = _ private val nativeL1Threshold: Int = 256 - // For level-1 function dspmv, use f2jBLAS for better performance. - private[ml] def f2jBLAS: NetlibBLAS = { - if (_f2jBLAS == null) { - _f2jBLAS = new F2jBLAS + // For level-1 function dspmv, use javaBLAS for better performance. + private[ml] def javaBLAS: NetlibBLAS = { + if (_javaBLAS == null) { + _javaBLAS = + try { + // scalastyle:off classforname + Class.forName("org.apache.spark.ml.linalg.VectorizedBLAS", true, + Option(Thread.currentThread().getContextClassLoader) + .getOrElse(getClass.getClassLoader)) + .newInstance() + .asInstanceOf[NetlibBLAS] + // scalastyle:on classforname + } catch { + case _: Throwable => new F2jBLAS + } + } + _javaBLAS + } + + // For level-3 routines, we use the native BLAS. 
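+  // If netlib-java could not bind a native library, getInstance returns F2jBLAS;
+  // in that case fall back to javaBLAS, which may itself be VectorizedBLAS.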
+ private[ml] def nativeBLAS: NetlibBLAS = { + if (_nativeBLAS == null) { + _nativeBLAS = + if (NetlibBLAS.getInstance.isInstanceOf[F2jBLAS]) { + javaBLAS + } else { + NetlibBLAS.getInstance + } } - _f2jBLAS + _nativeBLAS } private[ml] def getBLAS(vectorSize: Int): NetlibBLAS = { if (vectorSize < nativeL1Threshold) { - f2jBLAS + javaBLAS } else { nativeBLAS } @@ -235,14 +258,6 @@ private[spark] object BLAS extends Serializable { } } - // For level-3 routines, we use the native BLAS. - private[ml] def nativeBLAS: NetlibBLAS = { - if (_nativeBLAS == null) { - _nativeBLAS = NativeBLAS - } - _nativeBLAS - } - /** * Adds alpha * x * x.t to a matrix in-place. This is the same as BLAS's ?SPR. * @@ -267,7 +282,7 @@ private[spark] object BLAS extends Serializable { x: DenseVector, beta: Double, y: DenseVector): Unit = { - f2jBLAS.dspmv("U", n, alpha, A.values, x.values, 1, beta, y.values, 1) + javaBLAS.dspmv("U", n, alpha, A.values, x.values, 1, beta, y.values, 1) } /** @@ -279,7 +294,7 @@ private[spark] object BLAS extends Serializable { val n = v.size v match { case DenseVector(values) => - NativeBLAS.dspr("U", n, alpha, values, 1, U) + nativeBLAS.dspr("U", n, alpha, values, 1, U) case SparseVector(size, indices, values) => val nnz = indices.length var colStartIdx = 0 diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index 1254ed747aeb4..3272a50508a3d 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -22,7 +22,6 @@ import java.util.{Arrays, Random} import scala.collection.mutable.{ArrayBuffer, ArrayBuilder => MArrayBuilder, HashSet => MHashSet} import breeze.linalg.{CSCMatrix => BSM, DenseMatrix => BDM, Matrix => BM} -import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.annotation.Since @@ -457,7 +456,7 @@ class DenseMatrix @Since("2.0.0") ( if (isTransposed) { Iterator.tabulate(numCols) { j => val col = new Array[Double](numRows) - blas.dcopy(numRows, values, j, numCols, col, 0, 1) + BLAS.nativeBLAS.dcopy(numRows, values, j, numCols, col, 0, 1) new DenseVector(col) } } else { diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/BLASBenchmark.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/BLASBenchmark.scala new file mode 100644 index 0000000000000..1dcfcf9ebb034 --- /dev/null +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/BLASBenchmark.scala @@ -0,0 +1,502 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.spark.ml.linalg
+
+import com.github.fommil.netlib.{BLAS => NetlibBLAS, F2jBLAS}
+
+import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
+
+/**
+ * Benchmark for comparing BLAS implementations (f2j, native, and Vector API).
+ * To run this benchmark:
+ * {{{
+ * 1. without sbt: bin/spark-submit --class <this class> <spark mllib test jar>
+ * 2. build/sbt "mllib-local/test:runMain <this class>"
+ * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "mllib-local/test:runMain <this class>"
+ *    Results will be written to "benchmarks/BLASBenchmark-results.txt".
+ * }}}
+ */
+object BLASBenchmark extends BenchmarkBase {
+
+  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+
+    val iters = 1e2.toInt
+    val rnd = new scala.util.Random(0)
+
+    val f2jBLAS = new F2jBLAS
+    val nativeBLAS = NetlibBLAS.getInstance
+    val vectorBLAS =
+      try {
+        // scalastyle:off classforname
+        Class.forName("org.apache.spark.ml.linalg.VectorizedBLAS", true,
+          Option(Thread.currentThread().getContextClassLoader)
+            .getOrElse(getClass.getClassLoader))
+          .newInstance()
+          .asInstanceOf[NetlibBLAS]
+        // scalastyle:on classforname
+      } catch {
+        case _: Throwable => new F2jBLAS
+      }
+
+    // scalastyle:off println
+    println("nativeBLAS = " + nativeBLAS.getClass.getName)
+    println("f2jBLAS = " + f2jBLAS.getClass.getName)
+    println("vectorBLAS = " + vectorBLAS.getClass.getName)
+    // scalastyle:on println
+
+    runBenchmark("daxpy") {
+      val n = 1e7.toInt
+      val alpha = rnd.nextDouble
+      val x = Array.fill(n) { rnd.nextDouble }
+      val y = Array.fill(n) { rnd.nextDouble }
+
+      val benchmark = new Benchmark("daxpy", n, iters, output = output)
+
+      benchmark.addCase("f2j") { _ =>
+        f2jBLAS.daxpy(n, alpha, x, 1, y, 1)
+      }
+
+      if (!nativeBLAS.getClass.equals(classOf[F2jBLAS])) {
+        benchmark.addCase("native") { _ =>
+          nativeBLAS.daxpy(n, alpha, x, 1, y, 1)
+        }
+      }
+
+      if (!vectorBLAS.getClass.equals(classOf[F2jBLAS])) {
+        benchmark.addCase("vector") { _ =>
+          vectorBLAS.daxpy(n, alpha, x, 1, y, 1)
+        }
+      }
+
+      benchmark.run()
+    }
+
+    runBenchmark("ddot") {
+      val n = 1e7.toInt
+      val x = Array.fill(n) { rnd.nextDouble }
+      val y = Array.fill(n) { rnd.nextDouble }
+
+      val benchmark = new Benchmark("ddot", n, iters, output = output)
+
+      benchmark.addCase("f2j") { _ =>
+        f2jBLAS.ddot(n, x, 1, y, 1)
+      }
+
+      if (!nativeBLAS.getClass.equals(classOf[F2jBLAS])) {
+        benchmark.addCase("native") { _ =>
+          nativeBLAS.ddot(n, x, 1, y, 1)
+        }
+      }
+
+      if (!vectorBLAS.getClass.equals(classOf[F2jBLAS])) {
+        benchmark.addCase("vector") { _ =>
+          vectorBLAS.ddot(n, x, 1, y, 1)
+        }
+      }
+
+      benchmark.run()
+    }
+
+    runBenchmark("sdot") {
+      val n = 1e7.toInt
+      val x = Array.fill(n) { rnd.nextFloat }
+      val y = Array.fill(n) { rnd.nextFloat }
+
+      val benchmark = new Benchmark("sdot", n, iters, output = output)
+
+      benchmark.addCase("f2j") { _ =>
+        f2jBLAS.sdot(n, x, 1, y, 1)
+      }
+
+      if (!nativeBLAS.getClass.equals(classOf[F2jBLAS])) {
+        benchmark.addCase("native") { _ =>
+          nativeBLAS.sdot(n, x, 1, y, 1)
+        }
+      }
+
+      if (!vectorBLAS.getClass.equals(classOf[F2jBLAS])) {
+        benchmark.addCase("vector") { _ =>
+          vectorBLAS.sdot(n, x, 1, y, 1)
+        }
+      }
+
+      benchmark.run()
+    }
+
+    runBenchmark("dscal") {
+      val n = 1e7.toInt
+      val alpha = rnd.nextDouble
+      val x = Array.fill(n) { rnd.nextDouble }
+
+      val benchmark = new Benchmark("dscal", n, iters, output = output)
+
+      benchmark.addCase("f2j") { _ =>
+        f2jBLAS.dscal(n, alpha, x, 1)
+      }
+
+      if (!nativeBLAS.getClass.equals(classOf[F2jBLAS])) {
+        benchmark.addCase("native") { _ =>
+          nativeBLAS.dscal(n, alpha, x, 1)
+        }
+      }
+
+      if (!vectorBLAS.getClass.equals(classOf[F2jBLAS])) {
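+        // Benchmark the Vector API case only when VectorizedBLAS actually
+        // loaded (vectorBLAS falls back to F2jBLAS otherwise).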
benchmark.addCase("vector") { _ => + vectorBLAS.dscal(n, alpha, x, 1) + } + } + + benchmark.run() + } + + runBenchmark("sscal") { + val n = 1e7.toInt + val alpha = rnd.nextFloat + val x = Array.fill(n) { rnd.nextFloat } + + val benchmark = new Benchmark("sscal", n, iters, output = output) + + benchmark.addCase("f2j") { _ => + f2jBLAS.sscal(n, alpha, x, 1) + } + + if (!nativeBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("native") { _ => + nativeBLAS.sscal(n, alpha, x, 1) + } + } + + if (!vectorBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("vector") { _ => + vectorBLAS.sscal(n, alpha, x, 1) + } + } + + benchmark.run() + } + + runBenchmark("dspmv[U]") { + val n = 1e4.toInt + val alpha = rnd.nextDouble + val a = Array.fill(n * (n + 1) / 2) { rnd.nextDouble } + val x = Array.fill(n) { rnd.nextDouble } + val beta = rnd.nextDouble + val y = Array.fill(n) { rnd.nextDouble } + + val benchmark = new Benchmark("dspmv[U]", n, iters, output = output) + + benchmark.addCase("f2j") { _ => + f2jBLAS.dspmv("U", n, alpha, a, x, 1, beta, y, 1) + } + + if (!nativeBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("native") { _ => + nativeBLAS.dspmv("U", n, alpha, a, x, 1, beta, y, 1) + } + } + + if (!vectorBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("vector") { _ => + vectorBLAS.dspmv("U", n, alpha, a, x, 1, beta, y, 1) + } + } + + benchmark.run() + } + + runBenchmark("dspr[U]") { + val n = 1e4.toInt + val alpha = rnd.nextDouble + val x = Array.fill(n) { rnd.nextDouble } + val a = Array.fill(n * (n + 1) / 2) { rnd.nextDouble } + + val benchmark = new Benchmark("dspr[U]", n, iters, output = output) + + benchmark.addCase("f2j") { _ => + f2jBLAS.dspr("U", n, alpha, x, 1, a) + } + + if (!nativeBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("native") { _ => + nativeBLAS.dspr("U", n, alpha, x, 1, a) + } + } + + if (!vectorBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("vector") { _ => + vectorBLAS.dspr("U", n, alpha, x, 1, a) + } + } + + benchmark.run() + } + + runBenchmark("dsyr[U]") { + val n = 1e4.toInt + val alpha = rnd.nextDouble + val x = Array.fill(n) { rnd.nextDouble } + val a = Array.fill(n * n) { rnd.nextDouble } + + val benchmark = new Benchmark("dsyr[U]", n, iters, output = output) + + benchmark.addCase("f2j") { _ => + f2jBLAS.dsyr("U", n, alpha, x, 1, a, n) + } + + if (!nativeBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("native") { _ => + nativeBLAS.dsyr("U", n, alpha, x, 1, a, n) + } + } + + if (!vectorBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("vector") { _ => + vectorBLAS.dsyr("U", n, alpha, x, 1, a, n) + } + } + + benchmark.run() + } + + runBenchmark("dgemv[N]") { + val m = 1e4.toInt + val n = 1e3.toInt + val alpha = rnd.nextDouble + val a = Array.fill(m * n) { rnd.nextDouble } + val lda = m + val x = Array.fill(n) { rnd.nextDouble } + val beta = rnd.nextDouble + val y = Array.fill(m) { rnd.nextDouble } + + val benchmark = new Benchmark("dgemv[N]", n, iters, output = output) + + benchmark.addCase("f2j") { _ => + f2jBLAS.dgemv("N", m, n, alpha, a, lda, x, 1, beta, y, 1) + } + + if (!nativeBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("native") { _ => + nativeBLAS.dgemv("N", m, n, alpha, a, lda, x, 1, beta, y, 1) + } + } + + if (!vectorBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("vector") { _ => + vectorBLAS.dgemv("N", m, n, alpha, a, lda, x, 1, beta, y, 1) + } + } + + benchmark.run() + } + + runBenchmark("dgemv[T]") { + val m = 1e4.toInt + 
val n = 1e3.toInt + val alpha = rnd.nextDouble + val a = Array.fill(m * n) { rnd.nextDouble } + val lda = m + val x = Array.fill(m) { rnd.nextDouble } + val beta = rnd.nextDouble + val y = Array.fill(n) { rnd.nextDouble } + + val benchmark = new Benchmark("dgemv[T]", n, iters, output = output) + + benchmark.addCase("f2j") { _ => + f2jBLAS.dgemv("T", m, n, alpha, a, lda, x, 1, beta, y, 1) + } + + if (!nativeBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("native") { _ => + nativeBLAS.dgemv("T", m, n, alpha, a, lda, x, 1, beta, y, 1) + } + } + + if (!vectorBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("vector") { _ => + vectorBLAS.dgemv("T", m, n, alpha, a, lda, x, 1, beta, y, 1) + } + } + + benchmark.run() + } + + runBenchmark("sgemv[N]") { + val m = 1e4.toInt + val n = 1e3.toInt + val alpha = rnd.nextFloat + val a = Array.fill(m * n) { rnd.nextFloat } + val lda = m + val x = Array.fill(n) { rnd.nextFloat } + val beta = rnd.nextFloat + val y = Array.fill(m) { rnd.nextFloat } + + val benchmark = new Benchmark("sgemv[N]", n, iters, output = output) + + benchmark.addCase("f2j") { _ => + f2jBLAS.sgemv("N", m, n, alpha, a, lda, x, 1, beta, y, 1) + } + + if (!nativeBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("native") { _ => + nativeBLAS.sgemv("N", m, n, alpha, a, lda, x, 1, beta, y, 1) + } + } + + if (!vectorBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("vector") { _ => + vectorBLAS.sgemv("N", m, n, alpha, a, lda, x, 1, beta, y, 1) + } + } + + benchmark.run() + } + + runBenchmark("sgemv[T]") { + val m = 1e4.toInt + val n = 1e3.toInt + val alpha = rnd.nextFloat + val a = Array.fill(m * n) { rnd.nextFloat } + val lda = m + val x = Array.fill(m) { rnd.nextFloat } + val beta = rnd.nextFloat + val y = Array.fill(n) { rnd.nextFloat } + + val benchmark = new Benchmark("sgemv[T]", n, iters, output = output) + + benchmark.addCase("f2j") { _ => + f2jBLAS.sgemv("T", m, n, alpha, a, lda, x, 1, beta, y, 1) + } + + if (!nativeBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("native") { _ => + nativeBLAS.sgemv("T", m, n, alpha, a, lda, x, 1, beta, y, 1) + } + } + + if (!vectorBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("vector") { _ => + vectorBLAS.sgemv("T", m, n, alpha, a, lda, x, 1, beta, y, 1) + } + } + + benchmark.run() + } + + runBenchmark("dgemm[N,N]") { + val m = 1e3.toInt + val n = 1e2.toInt + val k = 1e3.toInt + val alpha = rnd.nextDouble + val a = Array.fill(m * k) { rnd.nextDouble } + val lda = m + val b = Array.fill(k * n) { rnd.nextDouble } + val ldb = k + val beta = rnd.nextDouble + val c = Array.fill(m * n) { rnd.nextDouble } + var ldc = m + + val benchmark = new Benchmark("dgemm[N,N]", m*n, iters, output = output) + + benchmark.addCase("f2j") { _ => + f2jBLAS.dgemm("N", "N", m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + } + + if (!nativeBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("native") { _ => + nativeBLAS.dgemm("N", "N", m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + } + } + + if (!vectorBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("vector") { _ => + vectorBLAS.dgemm("N", "N", m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + } + } + + benchmark.run() + } + + runBenchmark("dgemm[N,T]") { + val m = 1e3.toInt + val n = 1e2.toInt + val k = 1e3.toInt + val alpha = rnd.nextDouble + val a = Array.fill(m * k) { rnd.nextDouble } + val lda = m + val b = Array.fill(k * n) { rnd.nextDouble } + val ldb = n + val beta = rnd.nextDouble + val c = Array.fill(m * n) { 
rnd.nextDouble } + var ldc = m + + val benchmark = new Benchmark("dgemm[N,T]", m*n, iters, output = output) + + benchmark.addCase("f2j") { _ => + f2jBLAS.dgemm("N", "T", m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + } + + if (!nativeBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("native") { _ => + nativeBLAS.dgemm("N", "T", m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + } + } + + if (!vectorBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("vector") { _ => + vectorBLAS.dgemm("N", "T", m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + } + } + + benchmark.run() + } + + runBenchmark("dgemm[T,N]") { + val m = 1e3.toInt + val n = 1e2.toInt + val k = 1e3.toInt + val alpha = rnd.nextDouble + val a = Array.fill(m * k) { rnd.nextDouble } + val lda = k + val b = Array.fill(k * n) { rnd.nextDouble } + val ldb = k + val beta = rnd.nextDouble + val c = Array.fill(m * n) { rnd.nextDouble } + var ldc = m + + val benchmark = new Benchmark("dgemm[T,N]", m*n, iters, output = output) + + benchmark.addCase("f2j") { _ => + f2jBLAS.dgemm("T", "N", m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + } + + if (!nativeBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("native") { _ => + nativeBLAS.dgemm("T", "N", m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + } + } + + if (!vectorBLAS.getClass.equals(classOf[F2jBLAS])) { + benchmark.addCase("vector") { _ => + vectorBLAS.dgemm("T", "N", m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) + } + } + + benchmark.run() + } + } +} diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/BLASSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/BLASSuite.scala index 781f3da313d82..ce177666f34e6 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/BLASSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/BLASSuite.scala @@ -24,7 +24,7 @@ import org.apache.spark.ml.util.TestingUtils._ class BLASSuite extends SparkMLFunSuite { test("nativeL1Threshold") { - assert(getBLAS(128) == BLAS.f2jBLAS) + assert(getBLAS(128) == BLAS.javaBLAS) assert(getBLAS(256) == BLAS.nativeBLAS) assert(getBLAS(512) == BLAS.nativeBLAS) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/ann/BreezeUtil.scala b/mllib/src/main/scala/org/apache/spark/ml/ann/BreezeUtil.scala index 6bbe7e1cb2134..ce9093a6238ee 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/ann/BreezeUtil.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/ann/BreezeUtil.scala @@ -18,7 +18,8 @@ package org.apache.spark.ml.ann import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} -import com.github.fommil.netlib.BLAS.{getInstance => NativeBLAS} + +import org.apache.spark.ml.linalg.BLAS /** * In-place DGEMM and DGEMV for Breeze @@ -41,7 +42,7 @@ private[ann] object BreezeUtil { require(A.cols == B.rows, "A & B Dimension mismatch!") require(A.rows == C.rows, "A & C Dimension mismatch!") require(B.cols == C.cols, "A & C Dimension mismatch!") - NativeBLAS.dgemm(transposeString(A), transposeString(B), C.rows, C.cols, A.cols, + BLAS.nativeBLAS.dgemm(transposeString(A), transposeString(B), C.rows, C.cols, A.cols, alpha, A.data, A.offset, A.majorStride, B.data, B.offset, B.majorStride, beta, C.data, C.offset, C.rows) } @@ -57,7 +58,7 @@ private[ann] object BreezeUtil { def dgemv(alpha: Double, A: BDM[Double], x: BDV[Double], beta: Double, y: BDV[Double]): Unit = { require(A.cols == x.length, "A & x Dimension mismatch!") require(A.rows == y.length, "A & y Dimension mismatch!") - NativeBLAS.dgemv(transposeString(A), A.rows, A.cols, + 
BLAS.nativeBLAS.dgemv(transposeString(A), A.rows, A.cols, alpha, A.data, A.offset, A.majorStride, x.data, x.offset, x.stride, beta, y.data, y.offset, y.stride) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala index ca43ef863003a..453e28609a0c2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala @@ -17,14 +17,13 @@ package org.apache.spark.ml.classification -import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.json4s.{DefaultFormats, JObject} import org.json4s.JsonDSL._ import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging import org.apache.spark.ml.feature.Instance -import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} +import org.apache.spark.ml.linalg.{BLAS, DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.regression.DecisionTreeRegressionModel import org.apache.spark.ml.tree._ @@ -371,7 +370,7 @@ class GBTClassificationModel private[ml]( /** Raw prediction for the positive class. */ private def margin(features: Vector): Double = { val treePredictions = _trees.map(_.rootNode.predictImpl(features).prediction) - blas.ddot(getNumTrees, treePredictions, 1, _treeWeights, 1) + BLAS.nativeBLAS.ddot(getNumTrees, treePredictions, 1, _treeWeights, 1) } /** (private[ml]) Convert to a model in the old API */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 0bc5b9a9ca09f..57fb46b451689 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -980,7 +980,7 @@ class LogisticRegression @Since("1.2.0") ( if (fitWithMean) { if (multinomial) { val adapt = Array.ofDim[Double](numClasses) - BLAS.f2jBLAS.dgemv("N", numClasses, numFeatures, 1.0, + BLAS.javaBLAS.dgemv("N", numClasses, numFeatures, 1.0, initialSolution, numClasses, scaledMean, 1, 0.0, adapt, 1) BLAS.getBLAS(numFeatures).daxpy(numClasses, 1.0, adapt, 0, 1, initialSolution, numClasses * numFeatures, 1) @@ -1016,7 +1016,7 @@ class LogisticRegression @Since("1.2.0") ( if (fitWithMean && solution != null) { if (multinomial) { val adapt = Array.ofDim[Double](numClasses) - BLAS.f2jBLAS.dgemv("N", numClasses, numFeatures, 1.0, + BLAS.javaBLAS.dgemv("N", numClasses, numFeatures, 1.0, solution, numClasses, scaledMean, 1, 0.0, adapt, 1) BLAS.getBLAS(numFeatures).daxpy(numClasses, -1.0, adapt, 0, 1, solution, numClasses * numFeatures, 1) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSH.scala index 475fc5b7f8ccf..16d711197fdde 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSH.scala @@ -183,8 +183,8 @@ class BucketedRandomProjectionLSH(override val uid: String) var i = 0 while (i < localNumHashTables) { val offset = i * inputDim - val norm = BLAS.f2jBLAS.dnrm2(inputDim, values, offset, 1) - if (norm != 0) BLAS.f2jBLAS.dscal(inputDim, 1.0 / norm, values, offset, 1) + val norm = 
BLAS.javaBLAS.dnrm2(inputDim, values, offset, 1) + if (norm != 0) BLAS.javaBLAS.dscal(inputDim, 1.0 / norm, values, offset, 1) i += 1 } val randMatrix = new DenseMatrix(localNumHashTables, inputDim, values, true) diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index 9a24d3b0d09db..a0ddf7129c9b5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -26,7 +26,6 @@ import scala.reflect.ClassTag import scala.util.{Sorting, Try} import scala.util.hashing.byteswap64 -import com.github.fommil.netlib.BLAS.{getInstance => blas} import com.google.common.collect.{Ordering => GuavaOrdering} import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats @@ -483,7 +482,7 @@ class ALSModel private[ml] ( Iterator.range(0, m).flatMap { i => // scores = i-th vec in srcMat * dstMat - BLAS.f2jBLAS.sgemv("T", rank, n, 1.0F, dstMat, 0, rank, + BLAS.javaBLAS.sgemv("T", rank, n, 1.0F, dstMat, 0, rank, srcMat, i * rank, 1, 0.0F, scores, 0, 1) val srcId = srcIds(i) @@ -895,9 +894,9 @@ object ALS extends DefaultParamsReadable[ALS] with Logging { require(c >= 0.0) require(a.length == k) copyToDouble(a) - blas.dspr(upper, k, c, da, 1, ata) + BLAS.nativeBLAS.dspr(upper, k, c, da, 1, ata) if (b != 0.0) { - blas.daxpy(k, b, da, 1, atb, 1) + BLAS.nativeBLAS.daxpy(k, b, da, 1, atb, 1) } this } @@ -905,8 +904,8 @@ object ALS extends DefaultParamsReadable[ALS] with Logging { /** Merges another normal equation object. */ def merge(other: NormalEquation): NormalEquation = { require(other.k == k) - blas.daxpy(ata.length, 1.0, other.ata, 1, ata, 1) - blas.daxpy(atb.length, 1.0, other.atb, 1, atb, 1) + BLAS.nativeBLAS.daxpy(ata.length, 1.0, other.ata, 1, ata, 1) + BLAS.nativeBLAS.daxpy(atb.length, 1.0, other.atb, 1, atb, 1) this } @@ -1280,8 +1279,8 @@ object ALS extends DefaultParamsReadable[ALS] with Logging { val random = new XORShiftRandom(byteswap64(seed ^ srcBlockId)) val factors = Array.fill(inBlock.srcIds.length) { val factor = Array.fill(rank)(random.nextGaussian().toFloat) - val nrm = blas.snrm2(rank, factor, 1) - blas.sscal(rank, 1.0f / nrm, factor, 1) + val nrm = BLAS.nativeBLAS.snrm2(rank, factor, 1) + BLAS.nativeBLAS.sscal(rank, 1.0f / nrm, factor, 1) factor } (srcBlockId, factors) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala index 78d5ddaa2758b..fd8af71d43568 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala @@ -17,13 +17,12 @@ package org.apache.spark.ml.regression -import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.json4s.{DefaultFormats, JObject} import org.json4s.JsonDSL._ import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging -import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.linalg.{BLAS, Vector} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tree._ import org.apache.spark.ml.tree.impl.GradientBoostedTrees @@ -299,7 +298,7 @@ class GBTRegressionModel private[ml]( // TODO: When we add a generic Boosting class, handle transform there? 
SPARK-7129 // Classifies by thresholding sum of weighted tree predictions val treePredictions = _trees.map(_.rootNode.predictImpl(features).prediction) - blas.ddot(getNumTrees, treePredictions, 1, _treeWeights, 1) + BLAS.nativeBLAS.ddot(getNumTrees, treePredictions, 1, _treeWeights, 1) } @Since("1.4.0") diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala index 37e695ef88b14..c86bc0d9a36f3 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala @@ -17,12 +17,10 @@ package org.apache.spark.ml.classification -import com.github.fommil.netlib.BLAS - import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.classification.LinearSVCSuite.generateSVMInput import org.apache.spark.ml.feature.{Instance, LabeledPoint} -import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.regression.DecisionTreeRegressionModel import org.apache.spark.ml.tree._ @@ -170,8 +168,6 @@ class GBTClassifierSuite extends MLTest with DefaultReadWriteTest { val numFeatures = trainingDataset.select(featuresCol).first().getAs[Vector](0).size assert(gbtModel.numFeatures === numFeatures) - val blas = BLAS.getInstance() - val validationDataset = validationData.toDF(labelCol, featuresCol) testTransformer[(Double, Vector)](validationDataset, gbtModel, "rawPrediction", "features", "probability", "prediction") { @@ -179,7 +175,7 @@ class GBTClassifierSuite extends MLTest with DefaultReadWriteTest { assert(raw.size === 2) // check that raw prediction is tree predictions dot tree weights val treePredictions = gbtModel.trees.map(_.rootNode.predictImpl(features).prediction) - val prediction = blas.ddot(gbtModel.getNumTrees, treePredictions, 1, + val prediction = BLAS.nativeBLAS.ddot(gbtModel.getNumTrees, treePredictions, 1, gbtModel.treeWeights, 1) assert(raw ~== Vectors.dense(-prediction, prediction) relTol eps) diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala index 28275eb06cf0d..cebd8cac057f1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala @@ -24,14 +24,13 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import scala.collection.mutable.{ArrayBuffer, WrappedArray} -import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.commons.io.FileUtils import org.apache.commons.io.filefilter.TrueFileFilter import org.scalatest.BeforeAndAfterEach import org.apache.spark._ import org.apache.spark.internal.Logging -import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.linalg.{BLAS, Vectors} import org.apache.spark.ml.recommendation.ALS._ import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ @@ -296,7 +295,7 @@ class ALSSuite extends MLTest with DefaultReadWriteTest with Logging { for ((userId, userFactor) <- userFactors; (itemId, itemFactor) <- itemFactors) { val x = random.nextDouble() if (x < totalFraction) { - val rating = blas.sdot(rank, userFactor, 1, itemFactor, 1) + val rating = 
BLAS.nativeBLAS.sdot(rank, userFactor, 1, itemFactor, 1) if (x < trainingFraction) { val noise = noiseStd * random.nextGaussian() training += Rating(userId, itemId, rating + noise.toFloat) @@ -1194,7 +1193,7 @@ object ALSSuite extends Logging { val training = ArrayBuffer.empty[Rating[Int]] val test = ArrayBuffer.empty[Rating[Int]] for ((userId, userFactor) <- userFactors; (itemId, itemFactor) <- itemFactors) { - val rating = blas.sdot(rank, userFactor, 1, itemFactor, 1) + val rating = BLAS.nativeBLAS.sdot(rank, userFactor, 1, itemFactor, 1) val threshold = if (rating > 0) positiveFraction else negativeFraction val observed = random.nextDouble() < threshold if (observed) { diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 7bfb0a4aa1f3b..97f5e2f4ff683 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -291,6 +291,12 @@ object SparkBuild extends PomBuild { (SbtCompile / publishLocal) := publishTask((SbtCompile / publishLocalConfiguration)).value, publishLocal := Seq((MavenCompile / publishLocal), (SbtCompile / publishLocal)).dependOn.value, + javaOptions ++= { + val versionParts = System.getProperty("java.version").split("[+.\\-]+", 3) + var major = versionParts(0).toInt + if (major >= 16) Seq("--add-modules=jdk.incubator.vector") else Seq.empty + }, + (Compile / doc / javacOptions) ++= { val versionParts = System.getProperty("java.version").split("[+.\\-]+", 3) var major = versionParts(0).toInt