sanity-check: fix issues caused by sync with upstream llama.cpp
zhouwg committed Mar 28, 2024
1 parent a7a975c commit 877f2de
Showing 12 changed files with 102 additions and 33 deletions.
4 changes: 1 addition & 3 deletions README.md
@@ -223,7 +223,7 @@ https://github.com/zhouwg/kantv/assets/6889919/2fabcb24-c00b-4289-a06e-05b98ecd2

----

-![635425083](https://github.com/zhouwg/kantv/assets/6889919/bcf5103e-af44-4a82-bc9f-3a06e4265781)
+![1697162123](https://github.com/zhouwg/kantv/assets/6889919/d6b9ab54-ff27-43f7-b169-25c614ca3280)

<details>
<summary>some other screenshots</summary>
@@ -264,8 +264,6 @@ https://github.com/zhouwg/kantv/assets/6889919/2fabcb24-c00b-4289-a06e-05b98ecd2

- improve <b>quality</b> of real-time English subtitle which powered by great and excellent and amazing ![whisper.cpp](https://github.com/ggerganov/whisper.cpp)

-- <a href="https://github.com/ggerganov/ggml/issues/771">adding native backend for Qualcomm mobile SoC(Qualcomm Snapdragon 8 Gen 3)</a>

- real-time Chinese subtitle for online English TV by great and excellent and amazing ![whisper.cpp](https://github.com/ggerganov/whisper.cpp)

- bugfix in UI layer(Java)
@@ -184,7 +184,7 @@ public void initView() {
+ "Arch:" + Build.CPU_ABI + "(" + systemInfo + ")";
_txtGGMLInfo.setText("");
_txtGGMLInfo.append(phoneInfo + "\n");
_txtGGMLInfo.append("Powered by whisper.cpp(fff24a0148fe194df4997a738eeceddd724959c3,Thu Mar 21 22:23:30 2024)(https://github.com/ggerganov/whisper.cpp)\n");
_txtGGMLInfo.append("Powered by whisper.cpp(https://github.com/ggerganov/whisper.cpp)\n");


Spinner spinnerBenchType = mActivity.findViewById(R.id.spinnerBenchType);
@@ -289,9 +289,13 @@ public void onNothingSelected(AdapterView<?> parent) {

//TODO: better method
//sanity check begin
if (strModeName.startsWith("llama")) {
if (strModeName.contains("llama")) {
isLLMModel = true;
} else if (strModeName.startsWith("qwen")) {
} else if (strModeName.contains("qwen")) {
isLLMModel = true;
} else if (strModeName.contains("baichuan")) {
isLLMModel = true;
} else if (strModeName.contains("gemma")) {
isLLMModel = true;
}
if (isLLMModel)
@@ -320,7 +324,7 @@ public void onNothingSelected(AdapterView<?> parent) {
File sampleFile = new File(CDEUtils.getDataPath() + ggmlSampleFileName);

if (!selectModeFile.exists() || (!sampleFile.exists())) {
CDEUtils.showMsgBox(mActivity, "pls check whether GGML's model file and sample file(jfk.wav) exist in /sdcard/kantv/");
CDEUtils.showMsgBox(mActivity, "pls check whether GGML's model file:" + selectModeFileName + " and sample file(jfk.wav) exist in /sdcard/kantv/");
return;
}
//sanity check end
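
The TODO above hints at a better method. A minimal table-driven sketch of the same check, covering the model families this commit handles (the keyword array and the helper name isLLMModelName are illustrative assumptions, not code from the repo):

// Sketch of a table-driven variant of the model-name sanity check above.
// The keyword list mirrors the families handled in this commit; the helper
// name isLLMModelName is an assumption for illustration only.
private static final String[] LLM_KEYWORDS = {"llama", "qwen", "baichuan", "gemma"};

private static boolean isLLMModelName(String strModeName) {
    for (String keyword : LLM_KEYWORDS) {
        if (strModeName.contains(keyword)) {
            return true; // any known LLM family keyword marks this as an LLM model
        }
    }
    return false;
}

A new GGUF family would then need only one new table entry, kept in sync with the model list in arrays.xml below.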
@@ -98,7 +98,8 @@ public class LLMResearchFragment extends BaseMvpFragment<LLMResearchPresenter> i
private long endTime = 0;
private long duration = 0;
private String strBenchmarkInfo;
-private String strUserInput = "how many days in March 2024?";
+//private String strUserInput = "how many days in March 2024?";
+private String strUserInput = "introduce the movie Once Upon a Time in America briefly, less than 100 words.";

private AtomicBoolean isBenchmarking = new AtomicBoolean(false);
private ProgressDialog mProgressDialog;
@@ -112,21 +113,25 @@ public class LLMResearchFragment extends BaseMvpFragment<LLMResearchPresenter> i
// https://huggingface.co/TheBloke/Llama-2-70B-Chat-GGUF




// https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GGUF/resolve/main/qwen1_5-1_8b-chat-q4_0.gguf //1.1 GB




// https://huggingface.co/TheBloke/blossom-v3-baichuan2-7B-GGUF
// https://huggingface.co/shaowenchen/baichuan2-7b-chat-gguf
// https://huggingface.co/TheBloke/blossom-v3-baichuan2-7B-GGUF/blob/main/blossom-v3-baichuan2-7b.Q4_K_M.gguf // 4.61 GB


+// https://huggingface.co/mlabonne/gemma-2b-GGUF/tree/main
+// https://huggingface.co/mlabonne/gemma-2b-GGUF/resolve/main/gemma-2b.Q4_K_M.gguf // 1.5 GB
+// https://huggingface.co/mlabonne/gemma-2b-GGUF/resolve/main/gemma-2b.Q8_0.gguf // 2.67 GB


//private String ggmlModelFileName = "llama-2-7b.Q4_K_M.gguf"; //4.08 GB
//private String ggmlModelFileName = "llama-2-7b-chat.Q4_K_M.gguf"; //4.08 GB
//private String ggmlModelFileName = "qwen1_5-1_8b-chat-q4_0.gguf"; // 1.1 GB
private String ggmlModelFileName = "blossom-v3-baichuan2-7b.Q4_K_M.gguf"; // 4.61 GB
//private String ggmlModelFileName = "baichuan2-7b.Q4_K_M.gguf"; // 4.61 GB
//private String ggmlModelFileName = "gemma-2b.Q4_K_M.gguf"; // 1.5 GB
private String ggmlModelFileName = "gemma-2b.Q8_0.gguf"; // 2.67 GB

private Context mContext;
private Activity mActivity;
@@ -205,6 +210,8 @@ public void initView() {

_btnInference.setOnClickListener(v -> {
String strPrompt = _txtUserInput.getText().toString();

+//sanity check begin
if (strPrompt.isEmpty()) {
//CDEUtils.showMsgBox(mActivity, "pls check your input");
//return;
@@ -230,6 +237,8 @@ public void initView() {
CDEUtils.showMsgBox(mActivity, "pls check whether GGML's model file exist in /sdcard/kantv/");
return;
}
+//sanity check end

ggmlModelFileName = selectModeFileName;
CDELog.j(TAG, "model file:" + CDEUtils.getDataPath() + selectModeFileName);

2 changes: 1 addition & 1 deletion cdeosplayer/kantv/src/main/res/layout/fragment_asr.xml
@@ -95,7 +95,7 @@
android:id="@+id/btnBenchmark"
android:layout_width="wrap_content"
android:layout_height="30dp"
android:layout_marginLeft="30dp"
android:layout_marginLeft="10dp"
android:background="@drawable/button_drawable"
android:text="Benchmark"
android:textAllCaps="false"
4 changes: 2 additions & 2 deletions cdeosplayer/kantv/src/main/res/layout/fragment_llm.xml
@@ -49,10 +49,10 @@
<EditText
android:id="@+id/txtUserInput"
android:layout_width="match_parent"
android:layout_height="50dp"
android:layout_height="60dp"
android:gravity="top"
android:background="@drawable/textview_border"
android:hint="how many days in March 2024?"
android:hint="introduce the movie Once Upon a Time in America briefly."
android:layout_marginTop="5dp"
/>

3 changes: 3 additions & 0 deletions cdeosplayer/kantv/src/main/res/values/arrays.xml
@@ -111,5 +111,8 @@
<item>large</item>
<item>llama-2-7b-chat.Q4_K_M</item>
<item>qwen1_5-1_8b-chat-q4_0</item>
+<item>baichuan2-7b.Q4_K_M</item>
+<item>gemma-2b.Q4_K_M</item>
+<item>gemma-2b.Q8_0</item>
</string-array>
</resources>
12 changes: 7 additions & 5 deletions external/ggml/jni/ggml-jni-impl.cpp
@@ -142,12 +142,14 @@ typedef struct {
size_t n_decoding_mode; // 0:WHISPER_SAMPLING_GREEDY 1:WHISPER_SAMPLING_BEAM_SEARCH

size_t n_asr_mode; // 0: normal transcription 1: asr pressure test 2:benchmark 3: transcription + audio record
-size_t n_benchmark_type; // what to benchmark: 0: asr, 1: memcpy 2: mulmat 3: whisper_encode/whisper full benchmark
-bool b_use_gpu;
+size_t n_benchmark_type; // what to benchmark:
+                         // 0: asr(transcription) 1: memcpy 2: mulmat 3: full/whisper_encode 4: matrix 5: LLAMA inference

-bool b_abort_benchmark; //TODO: for abort time-consuming benchmark from UI layer. not works perfectly as expected
+bool b_use_gpu; // TODO: not used on Android device currently, ref: https://github.com/ggerganov/ggml/issues/771

-fifo_buffer_t * asr_fifo; //fifo for ASR data producer-consumer
+bool b_abort_benchmark; // TODO: for abort time-consuming task from UI layer. not works as expected
+
+fifo_buffer_t * asr_fifo; // fifo for ASR data producer-consumer

size_t n_sample_size;

@@ -158,7 +160,7 @@

class whisper_asr * p_asr; // attention memory leak, smart pointer should not be used here for performance consideration

-pthread_mutex_t mutex;
+pthread_mutex_t mutex; // not used since 03-19-2024

//only for troubleshooting issue
bool b_pre_convert;
2 changes: 1 addition & 1 deletion external/ggml/llamacpp/Makefile
@@ -670,7 +670,7 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
unicode.o: unicode.cpp unicode.h
$(CXX) $(CXXFLAGS) -c $< -o $@

-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o ../../ffmpeg/libavutil/cde_log.o
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o ../../ffmpeg/libavutil/cde_log.o

llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
2 changes: 1 addition & 1 deletion external/ggml/llamacpp/diff-with-upstream-llamacpp.sh
@@ -32,7 +32,7 @@ echo -e "upstream llamacpp path: ${UPSTREAM_LLAMACPP_PATH}\n"
echo -e "local llamacpp path: ${LOCAL_LLAMACPP_PATH}\n"

#the following method borrow from bench-all.sh in GGML's project whisper.cpp
-LLAMACPP_SRCS=(ggml-alloc.c ggml-alloc.h ggml-backend.c ggml-backend.h ggml.c ggml.h ggml-quants.c ggml-quants.h llama.cpp llama.h unicode.h unicode.cpp unicode-data.h unicode-data.cpp)
+LLAMACPP_SRCS=(ggml-alloc.c ggml-alloc.h ggml-backend.c ggml-backend.h ggml.c ggml.h ggml-quants.c ggml-quants.h llama.cpp llama.h unicode.h unicode.cpp unicode-data.h unicode-data.cpp ggml-common.h)
for file in "${LLAMACPP_SRCS[@]}"; do
echo "diff $file ${UPSTREAM_LLAMACPP_PATH}/$file"
diff ${LOCAL_LLAMACPP_PATH}/$file ${UPSTREAM_LLAMACPP_PATH}/$file
22 changes: 22 additions & 0 deletions external/ggml/llamacpp/ggml-common.h
@@ -377,6 +377,27 @@ typedef struct {
} block_iq1_s;
static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");

+// 1.75 bpw
+typedef struct {
+uint8_t qs[QK_K/8]; // grid index, low 8 bits
+uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
+#if QK_K == 64
+ggml_half d;
+#endif
+uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
+} block_iq1_m;
+#if QK_K == 64
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
+#else
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
+#endif
+
+// Used by IQ1_M quants
+typedef union {
+ggml_half f16;
+uint16_t u16;
+} iq1m_scale_t;

// Non-linear quants
#define QK4_NL 32
typedef struct {
@@ -1050,6 +1071,7 @@

#define NGRID_IQ1S 2048
#define IQ1S_DELTA 0.125f
+#define IQ1M_DELTA 0.125f
#if defined(GGML_COMMON_IMPL_C)
GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
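As a quick check on the "1.75 bpw" comment in the new block_iq1_m above: for the default QK_K == 256 the optional ggml_half d member is compiled out, and the block size follows directly from the struct layout (a standalone sketch, not part of the header):

// qs:     QK_K/8  = 32 bytes  (low 8 bits of the grid index)
// qh:     QK_K/16 = 16 bytes  (high 3 bits + grid shift bit)
// scales: QK_K/32 =  8 bytes  (3-bit block scales)
// total:  56 bytes = 448 bits for 256 weights -> 448 / 256 = 1.75 bits per weight
#define QK_K 256
_Static_assert((QK_K / 8 + QK_K / 16 + QK_K / 32) * 8 == 448,
               "iq1_m: 56 bytes per block of 256 weights, i.e. 1.75 bpw");

For QK_K == 64 the extra ggml_half pushes the block to 16 bytes for 64 weights, i.e. 2 bpw.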
49 changes: 40 additions & 9 deletions external/ggml/llamacpp/llama.cpp
@@ -7,6 +7,11 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"

+#ifdef TARGET_ANDROID
+#include "kantv-asr.h"
+#include "ggml-jni.h"
+#endif

#ifdef GGML_USE_CUDA
# include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
@@ -15617,15 +15622,41 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
void llama_print_timings(struct llama_context * ctx) {
const llama_timings timings = llama_get_timings(ctx);

-LLAMA_LOG_INFO("\n");
-LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
-LLAMA_LOG_INFO("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-        __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-        __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-        __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
+#ifdef TARGET_ANDROID
+std::ostringstream timing;
+timing << "llama-timings:\t";
+#endif
+
+LOGGV("\n");
+LOGGV("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
+LOGGV("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+        __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
+LOGGV("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+        __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
+LOGGV("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+        __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
+LOGGV("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
+
+#ifdef TARGET_ANDROID
+timing << "   load time = " << std::setw(10) << std::fixed << std::setprecision(2) << (timings.t_load_ms) << " ms";
+
+timing << "\n";
+timing << " sample time = " << std::setw(10) << std::fixed << std::setprecision(2) << (timings.t_sample_ms) << " ms / "
+       << timings.n_sample << " runs (" << (timings.t_sample_ms / timings.n_sample) << " ms per token, "
+       << (1e3 / timings.t_sample_ms * timings.n_sample) << " tokens per second)";
+timing << "\n";
+
+timing << "prompt eval time = " << std::setw(10) << std::fixed << std::setprecision(2) << timings.t_p_eval_ms << " ms / "
+       << timings.n_p_eval << " tokens (" << (timings.t_p_eval_ms / timings.n_p_eval) << " ms per token, " << (1e3 / timings.t_p_eval_ms * timings.n_p_eval)
+       << " tokens per second)";
+timing << "\n";
+
+timing << " total time = " << std::setw(10) << std::fixed << std::setprecision(2) << ((timings.t_end_ms - timings.t_start_ms)) << " ms / "
+       << (timings.n_p_eval + timings.n_eval) << " tokens\n";
+
+std::string result = timing.str();
+kantv_asr_notify_benchmark(result);
+#endif
}

void llama_reset_timings(struct llama_context * ctx) {
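One gap worth noting in the new TARGET_ANDROID block above: the string pushed to kantv_asr_notify_benchmark covers the load, sample, prompt-eval and total times, but not the eval time that the LOGGV lines report. A sketch of the missing entry, styled after its neighbors (an assumption, not part of this commit):

// Hypothetical "eval time" entry for the timing string; t_eval_ms and
// n_eval are fields that llama_get_timings() already fills in above.
timing << "       eval time = " << std::setw(10) << std::fixed << std::setprecision(2)
       << timings.t_eval_ms << " ms / " << timings.n_eval << " runs ("
       << (timings.t_eval_ms / timings.n_eval) << " ms per token, "
       << (1e3 / timings.t_eval_ms * timings.n_eval) << " tokens per second)";
timing << "\n";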
2 changes: 1 addition & 1 deletion external/ggml/llamacpp/sync-with-upstream-llamacpp.sh
@@ -34,7 +34,7 @@ echo -e "local llamacpp path: ${LOCAL_LLAMACPP_PATH}\n"
echo -e "sync source code on ${SYNC_TIME}\n\n"

#the following method borrow from bench-all.sh in GGML's project whisper.cpp
-LLAMACPP_SRCS=(ggml-alloc.c ggml-alloc.h ggml-backend.c ggml-backend.h ggml.c ggml.h ggml-quants.c ggml-quants.h llama.cpp llama.h unicode.h unicode.cpp unicode-data.h unicode-data.cpp)
+LLAMACPP_SRCS=(ggml-alloc.c ggml-alloc.h ggml-backend.c ggml-backend.h ggml.c ggml.h ggml-quants.c ggml-quants.h llama.cpp llama.h unicode.h unicode.cpp unicode-data.h unicode-data.cpp ggml-common.h)
for file in "${LLAMACPP_SRCS[@]}"; do
/bin/cp -fv ${UPSTREAM_LLAMACPP_PATH}/$file ${LOCAL_LLAMACPP_PATH}/$file
done

1 comment on commit 877f2de

@zhouwg (Owner, Author) commented on 877f2de, Mar 28, 2024


After spending 3 days on llama.cpp, I found that some famous LLM models' answers are sometimes wildly incorrect. I really have no idea how llama.cpp can actually be used in project kantv. Something like real-time subtitles for online TV, as powered by the great whisper.cpp?

I have to say that:

  • whisper.cpp is real magic AI tech and has a lot of practical application scenarios

  • an LLM is just a very immature chat toy / powerful database: it lacks practical application scenarios and has too many unknown errors; it's also a kind of "magic"

  • someone said AI could replace human programmers; now I think that's totally impossible, because programmers like the original author of GGML/whisper.cpp can NOT be replaced by a "magic" LLM. At the same time, I think an AI-assisted human programmer can NOT become a real/leading programmer (such as the original author of GGML/whisper.cpp). If one day a leading programmer (such as the original author of GGML/whisper.cpp) could be replaced by AI/AGI, it would be a terrible story for humanity.

So I should not spend more time on llama.cpp, because it's really too far away from / too hard for a non-AI programmer.
