sanity-check: fix issues caused by sync with upstream llama.cpp
zhouwg committed Mar 28, 2024
1 parent a7a975c commit 877f2de
Showing 12 changed files with 102 additions and 33 deletions.
4 changes: 1 addition & 3 deletions README.md
@@ -223,7 +223,7 @@ https://github.com/zhouwg/kantv/assets/6889919/2fabcb24-c00b-4289-a06e-05b98ecd2

----

-![635425083](https://github.com/zhouwg/kantv/assets/6889919/bcf5103e-af44-4a82-bc9f-3a06e4265781)
+![1697162123](https://github.com/zhouwg/kantv/assets/6889919/d6b9ab54-ff27-43f7-b169-25c614ca3280)

<details>
<summary>some other screenshots</summary>
@@ -264,8 +264,6 @@ https://github.com/zhouwg/kantv/assets/6889919/2fabcb24-c00b-4289-a06e-05b98ecd2

- improve <b>quality</b> of real-time English subtitle which powered by great and excellent and amazing ![whisper.cpp](https://github.com/ggerganov/whisper.cpp)

-- <a href="https://github.com/ggerganov/ggml/issues/771">adding native backend for Qualcomm mobile SoC(Qualcomm Snapdragon 8 Gen 3)</a>

- real-time Chinese subtitle for online English TV by great and excellent and amazing ![whisper.cpp](https://github.com/ggerganov/whisper.cpp)

- bugfix in UI layer(Java)
@@ -184,7 +184,7 @@ public void initView() {
+ "Arch:" + Build.CPU_ABI + "(" + systemInfo + ")";
_txtGGMLInfo.setText("");
_txtGGMLInfo.append(phoneInfo + "\n");
_txtGGMLInfo.append("Powered by whisper.cpp(fff24a0148fe194df4997a738eeceddd724959c3,Thu Mar 21 22:23:30 2024)(https://github.com/ggerganov/whisper.cpp)\n");
_txtGGMLInfo.append("Powered by whisper.cpp(https://github.com/ggerganov/whisper.cpp)\n");


Spinner spinnerBenchType = mActivity.findViewById(R.id.spinnerBenchType);
@@ -289,9 +289,13 @@ public void onNothingSelected(AdapterView<?> parent) {

//TODO: better method
//sanity check begin
if (strModeName.startsWith("llama")) {
if (strModeName.contains("llama")) {
isLLMModel = true;
} else if (strModeName.startsWith("qwen")) {
} else if (strModeName.contains("qwen")) {
isLLMModel = true;
} else if (strModeName.contains("baichuan")) {
isLLMModel = true;
} else if (strModeName.contains("gemma")) {
isLLMModel = true;
}
if (isLLMModel)
@@ -320,7 +324,7 @@ public void onNothingSelected(AdapterView<?> parent) {
File sampleFile = new File(CDEUtils.getDataPath() + ggmlSampleFileName);

if (!selectModeFile.exists() || (!sampleFile.exists())) {
CDEUtils.showMsgBox(mActivity, "pls check whether GGML's model file and sample file(jfk.wav) exist in /sdcard/kantv/");
CDEUtils.showMsgBox(mActivity, "pls check whether GGML's model file:" + selectModeFileName + " and sample file(jfk.wav) exist in /sdcard/kantv/");
return;
}
//sanity check end
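
The TODO above hints at a better method. A minimal table-driven sketch of the same check, covering the model families this commit handles (the keyword array and the helper name isLLMModelName are illustrative assumptions, not code from the repo):

// Sketch of a table-driven variant of the model-name sanity check above.
// The keyword list mirrors the families handled in this commit; the helper
// name isLLMModelName is an assumption for illustration only.
private static final String[] LLM_KEYWORDS = {"llama", "qwen", "baichuan", "gemma"};

private static boolean isLLMModelName(String strModeName) {
    for (String keyword : LLM_KEYWORDS) {
        if (strModeName.contains(keyword)) {
            return true; // any known LLM family keyword marks this as an LLM model
        }
    }
    return false;
}

A new GGUF family would then need only one new table entry, kept in sync with the model list in arrays.xml below.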
@@ -98,7 +98,8 @@ public class LLMResearchFragment extends BaseMvpFragment<LLMResearchPresenter> i
private long endTime = 0;
private long duration = 0;
private String strBenchmarkInfo;
-private String strUserInput = "how many days in March 2024?";
+//private String strUserInput = "how many days in March 2024?";
+private String strUserInput = "introduce the movie Once Upon a Time in America briefly, less than 100 words.";

private AtomicBoolean isBenchmarking = new AtomicBoolean(false);
private ProgressDialog mProgressDialog;
@@ -112,21 +113,25 @@ public class LLMResearchFragment extends BaseMvpFragment<LLMResearchPresenter> i
// https://huggingface.co/TheBloke/Llama-2-70B-Chat-GGUF




// https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GGUF/resolve/main/qwen1_5-1_8b-chat-q4_0.gguf //1.1 GB




// https://huggingface.co/TheBloke/blossom-v3-baichuan2-7B-GGUF
// https://huggingface.co/shaowenchen/baichuan2-7b-chat-gguf
// https://huggingface.co/TheBloke/blossom-v3-baichuan2-7B-GGUF/blob/main/blossom-v3-baichuan2-7b.Q4_K_M.gguf // 4.61 GB


+// https://huggingface.co/mlabonne/gemma-2b-GGUF/tree/main
+// https://huggingface.co/mlabonne/gemma-2b-GGUF/resolve/main/gemma-2b.Q4_K_M.gguf // 1.5 GB
+// https://huggingface.co/mlabonne/gemma-2b-GGUF/resolve/main/gemma-2b.Q8_0.gguf // 2.67 GB


//private String ggmlModelFileName = "llama-2-7b.Q4_K_M.gguf"; //4.08 GB
//private String ggmlModelFileName = "llama-2-7b-chat.Q4_K_M.gguf"; //4.08 GB
//private String ggmlModelFileName = "qwen1_5-1_8b-chat-q4_0.gguf"; // 1.1 GB
private String ggmlModelFileName = "blossom-v3-baichuan2-7b.Q4_K_M.gguf"; // 4.61 GB
//private String ggmlModelFileName = "baichuan2-7b.Q4_K_M.gguf"; // 4.61 GB
//private String ggmlModelFileName = "gemma-2b.Q4_K_M.gguf"; // 1.5 GB
private String ggmlModelFileName = "gemma-2b.Q8_0.gguf"; // 2.67 GB

private Context mContext;
private Activity mActivity;
@@ -205,6 +210,8 @@ public void initView() {

_btnInference.setOnClickListener(v -> {
String strPrompt = _txtUserInput.getText().toString();

+//sanity check begin
if (strPrompt.isEmpty()) {
//CDEUtils.showMsgBox(mActivity, "pls check your input");
//return;
@@ -230,6 +237,8 @@ public void initView() {
CDEUtils.showMsgBox(mActivity, "pls check whether GGML's model file exist in /sdcard/kantv/");
return;
}
+//sanity check end

ggmlModelFileName = selectModeFileName;
CDELog.j(TAG, "model file:" + CDEUtils.getDataPath() + selectModeFileName);

2 changes: 1 addition & 1 deletion cdeosplayer/kantv/src/main/res/layout/fragment_asr.xml
@@ -95,7 +95,7 @@
android:id="@+id/btnBenchmark"
android:layout_width="wrap_content"
android:layout_height="30dp"
android:layout_marginLeft="30dp"
android:layout_marginLeft="10dp"
android:background="@drawable/button_drawable"
android:text="Benchmark"
android:textAllCaps="false"
4 changes: 2 additions & 2 deletions cdeosplayer/kantv/src/main/res/layout/fragment_llm.xml
@@ -49,10 +49,10 @@
<EditText
android:id="@+id/txtUserInput"
android:layout_width="match_parent"
android:layout_height="50dp"
android:layout_height="60dp"
android:gravity="top"
android:background="@drawable/textview_border"
android:hint="how many days in March 2024?"
android:hint="introduce the movie Once Upon a Time in America briefly."
android:layout_marginTop="5dp"
/>

3 changes: 3 additions & 0 deletions cdeosplayer/kantv/src/main/res/values/arrays.xml
@@ -111,5 +111,8 @@
<item>large</item>
<item>llama-2-7b-chat.Q4_K_M</item>
<item>qwen1_5-1_8b-chat-q4_0</item>
+<item>baichuan2-7b.Q4_K_M</item>
+<item>gemma-2b.Q4_K_M</item>
+<item>gemma-2b.Q8_0</item>
</string-array>
</resources>
12 changes: 7 additions & 5 deletions external/ggml/jni/ggml-jni-impl.cpp
@@ -142,12 +142,14 @@ typedef struct {
size_t n_decoding_mode; // 0:WHISPER_SAMPLING_GREEDY 1:WHISPER_SAMPLING_BEAM_SEARCH

size_t n_asr_mode; // 0: normal transcription 1: asr pressure test 2:benchmark 3: transcription + audio record
-size_t n_benchmark_type; // what to benchmark: 0: asr, 1: memcpy 2: mulmat 3: whisper_encode/whisper full benchmark
-bool b_use_gpu;
+size_t n_benchmark_type; // what to benchmark:
+                         // 0: asr(transcription) 1: memcpy 2: mulmat 3: full/whisper_encode 4: matrix 5: LLAMA inference

-bool b_abort_benchmark; //TODO: for abort time-consuming benchmark from UI layer. not works perfectly as expected
+bool b_use_gpu; // TODO: not used on Android device currently, ref: https://github.com/ggerganov/ggml/issues/771

-fifo_buffer_t * asr_fifo; //fifo for ASR data producer-consumer
+bool b_abort_benchmark; // TODO: for abort time-consuming task from UI layer. not works as expected
+
+fifo_buffer_t * asr_fifo; // fifo for ASR data producer-consumer

size_t n_sample_size;

@@ -158,7 +160,7 @@

class whisper_asr * p_asr; // attention memory leak, smart pointer should not be used here for performance consideration

-pthread_mutex_t mutex;
+pthread_mutex_t mutex; // not used since 03-19-2024

//only for troubleshooting issue
bool b_pre_convert;
2 changes: 1 addition & 1 deletion external/ggml/llamacpp/Makefile
@@ -670,7 +670,7 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
unicode.o: unicode.cpp unicode.h
$(CXX) $(CXXFLAGS) -c $< -o $@

-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o ../../ffmpeg/libavutil/cde_log.o
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o ../../ffmpeg/libavutil/cde_log.o

llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
2 changes: 1 addition & 1 deletion external/ggml/llamacpp/diff-with-upstream-llamacpp.sh
@@ -32,7 +32,7 @@ echo -e "upstream llamacpp path: ${UPSTREAM_LLAMACPP_PATH}\n"
echo -e "local llamacpp path: ${LOCAL_LLAMACPP_PATH}\n"

#the following method borrow from bench-all.sh in GGML's project whisper.cpp
-LLAMACPP_SRCS=(ggml-alloc.c ggml-alloc.h ggml-backend.c ggml-backend.h ggml.c ggml.h ggml-quants.c ggml-quants.h llama.cpp llama.h unicode.h unicode.cpp unicode-data.h unicode-data.cpp)
+LLAMACPP_SRCS=(ggml-alloc.c ggml-alloc.h ggml-backend.c ggml-backend.h ggml.c ggml.h ggml-quants.c ggml-quants.h llama.cpp llama.h unicode.h unicode.cpp unicode-data.h unicode-data.cpp ggml-common.h)
for file in "${LLAMACPP_SRCS[@]}"; do
echo "diff $file ${UPSTREAM_LLAMACPP_PATH}/$file"
diff ${LOCAL_LLAMACPP_PATH}/$file ${UPSTREAM_LLAMACPP_PATH}/$file
22 changes: 22 additions & 0 deletions external/ggml/llamacpp/ggml-common.h
@@ -377,6 +377,27 @@ typedef struct {
} block_iq1_s;
static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");

+// 1.75 bpw
+typedef struct {
+uint8_t qs[QK_K/8]; // grid index, low 8 bits
+uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
+#if QK_K == 64
+ggml_half d;
+#endif
+uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
+} block_iq1_m;
+#if QK_K == 64
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
+#else
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
+#endif
+
+// Used by IQ1_M quants
+typedef union {
+ggml_half f16;
+uint16_t u16;
+} iq1m_scale_t;

// Non-linear quants
#define QK4_NL 32
typedef struct {
@@ -1050,6 +1071,7 @@

#define NGRID_IQ1S 2048
#define IQ1S_DELTA 0.125f
+#define IQ1M_DELTA 0.125f
#if defined(GGML_COMMON_IMPL_C)
GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
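As a quick check on the "1.75 bpw" comment in the new block_iq1_m above: for the default QK_K == 256 the optional ggml_half d member is compiled out, and the block size follows directly from the struct layout (a standalone sketch, not part of the header):

// qs:     QK_K/8  = 32 bytes  (low 8 bits of the grid index)
// qh:     QK_K/16 = 16 bytes  (high 3 bits + grid shift bit)
// scales: QK_K/32 =  8 bytes  (3-bit block scales)
// total:  56 bytes = 448 bits for 256 weights -> 448 / 256 = 1.75 bits per weight
#define QK_K 256
_Static_assert((QK_K / 8 + QK_K / 16 + QK_K / 32) * 8 == 448,
               "iq1_m: 56 bytes per block of 256 weights, i.e. 1.75 bpw");

For QK_K == 64 the extra ggml_half pushes the block to 16 bytes for 64 weights, i.e. 2 bpw.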
49 changes: 40 additions & 9 deletions external/ggml/llamacpp/llama.cpp
@@ -7,6 +7,11 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"

+#ifdef TARGET_ANDROID
+#include "kantv-asr.h"
+#include "ggml-jni.h"
+#endif

#ifdef GGML_USE_CUDA
# include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
@@ -15617,15 +15622,41 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
void llama_print_timings(struct llama_context * ctx) {
const llama_timings timings = llama_get_timings(ctx);

-LLAMA_LOG_INFO("\n");
-LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
-LLAMA_LOG_INFO("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-        __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-        __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-        __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
+#ifdef TARGET_ANDROID
+std::ostringstream timing;
+timing << "llama-timings:\t";
+#endif
+
+LOGGV("\n");
+LOGGV("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
+LOGGV("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+        __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
+LOGGV("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+        __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
+LOGGV("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+        __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
+LOGGV("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
+
+#ifdef TARGET_ANDROID
+timing << "   load time = " << std::setw(10) << std::fixed << std::setprecision(2) << (timings.t_load_ms) << " ms";
+
+timing << "\n";
+timing << " sample time = " << std::setw(10) << std::fixed << std::setprecision(2) << (timings.t_sample_ms) << " ms / "
+       << timings.n_sample << " runs (" << (timings.t_sample_ms / timings.n_sample) << " ms per token, "
+       << (1e3 / timings.t_sample_ms * timings.n_sample) << " tokens per second)";
+timing << "\n";
+
+timing << "prompt eval time = " << std::setw(10) << std::fixed << std::setprecision(2) << timings.t_p_eval_ms << " ms / "
+       << timings.n_p_eval << " tokens (" << (timings.t_p_eval_ms / timings.n_p_eval) << " ms per token, " << (1e3 / timings.t_p_eval_ms * timings.n_p_eval)
+       << " tokens per second)";
+timing << "\n";
+
+timing << " total time = " << std::setw(10) << std::fixed << std::setprecision(2) << ((timings.t_end_ms - timings.t_start_ms)) << " ms / "
+       << (timings.n_p_eval + timings.n_eval) << " tokens\n";
+
+std::string result = timing.str();
+kantv_asr_notify_benchmark(result);
+#endif
}

void llama_reset_timings(struct llama_context * ctx) {
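One gap worth noting in the new TARGET_ANDROID block above: the string pushed to kantv_asr_notify_benchmark covers the load, sample, prompt-eval and total times, but not the eval time that the LOGGV lines report. A sketch of the missing entry, styled after its neighbors (an assumption, not part of this commit):

// Hypothetical "eval time" entry for the timing string; t_eval_ms and
// n_eval are fields that llama_get_timings() already fills in above.
timing << "       eval time = " << std::setw(10) << std::fixed << std::setprecision(2)
       << timings.t_eval_ms << " ms / " << timings.n_eval << " runs ("
       << (timings.t_eval_ms / timings.n_eval) << " ms per token, "
       << (1e3 / timings.t_eval_ms * timings.n_eval) << " tokens per second)";
timing << "\n";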
2 changes: 1 addition & 1 deletion external/ggml/llamacpp/sync-with-upstream-llamacpp.sh
@@ -34,7 +34,7 @@ echo -e "local llamacpp path: ${LOCAL_LLAMACPP_PATH}\n"
echo -e "sync source code on ${SYNC_TIME}\n\n"

#the following method borrow from bench-all.sh in GGML's project whisper.cpp
-LLAMACPP_SRCS=(ggml-alloc.c ggml-alloc.h ggml-backend.c ggml-backend.h ggml.c ggml.h ggml-quants.c ggml-quants.h llama.cpp llama.h unicode.h unicode.cpp unicode-data.h unicode-data.cpp)
+LLAMACPP_SRCS=(ggml-alloc.c ggml-alloc.h ggml-backend.c ggml-backend.h ggml.c ggml.h ggml-quants.c ggml-quants.h llama.cpp llama.h unicode.h unicode.cpp unicode-data.h unicode-data.cpp ggml-common.h)
for file in "${LLAMACPP_SRCS[@]}"; do
/bin/cp -fv ${UPSTREAM_LLAMACPP_PATH}/$file ${LOCAL_LLAMACPP_PATH}/$file
done

1 comment on commit 877f2de

@zhouwg (Owner, Author) commented on 877f2de, Mar 28, 2024


After spending 3 days on llama.cpp, I found that some famous LLM models' answers are sometimes wildly incorrect. I really have no idea how llama.cpp can actually be used in project kantv. Something like real-time subtitles for online TV, as powered by the great whisper.cpp?

I have to say that:

  • whisper.cpp is real magic AI tech and has a lot of practical application scenarios

  • an LLM is just a very immature chat toy / powerful database: it lacks practical application scenarios and has too many unknown errors; it's also a kind of "magic"

  • someone said AI could replace human programmers; now I think that's totally impossible, because programmers like the original author of GGML/whisper.cpp can NOT be replaced by a "magic" LLM. At the same time, I think an AI-assisted human programmer can NOT become a real/leading programmer (such as the original author of GGML/whisper.cpp). If one day a leading programmer (such as the original author of GGML/whisper.cpp) could be replaced by AI/AGI, it would be a terrible story for humanity.

So I should not spend more time on llama.cpp, because it's really too far away from / too hard for a non-AI programmer.
