-
Notifications
You must be signed in to change notification settings - Fork 0
/
model_generate.py
142 lines (133 loc) · 4 KB
/
model_generate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from time import time
from ctransformers import LLM, Config
from log import log
def model_generate(
    prompt: str,
    model_name: str,
    llm: LLM,
    config: Config,
) -> dict:
    """Run one non-streaming generation and build a text-completion body.

    The sampling settings are read from ``config`` and forwarded to ``llm``
    as keyword arguments. The generated text is wrapped in a dictionary
    whose ``object`` field is ``"text_completion"``.

    Args:
        prompt: Text handed to the model unchanged.
        model_name: Value echoed back in the ``model`` field of the response.
        llm: Callable model instance that produces the completion text.
        config: Object providing the sampling settings ``top_k``, ``top_p``,
            ``temperature``, ``repetition_penalty``, ``last_n_tokens``,
            ``seed``, ``batch_size``, ``threads``, ``max_new_tokens`` and
            ``stop`` as attributes.

    Returns:
        The response body as a plain dictionary containing a single choice.
    """
    # Taken before generation starts. Truncated to whole seconds because the
    # OpenAI response schema defines ``created`` as an integer timestamp.
    created = int(time())

    # Settings copied verbatim from the config into the llm call; the order
    # here is both the logging order and the keyword order.
    sampling_names = (
        "top_k",
        "top_p",
        "temperature",
        "repetition_penalty",
        "last_n_tokens",
        "seed",
        "batch_size",
        "threads",
        "max_new_tokens",
        "stop",
    )
    sampling = {}
    for name in sampling_names:
        value = getattr(config, name)
        log.debug("%s: %s", name, value)
        sampling[name] = value

    log.debug("prompt: %s", prompt)
    log.debug("Getting from ctransformer instance")
    result: str = llm(  # pyright: ignore [reportGeneralTypeIssues]
        prompt=prompt,
        stream=False,
        reset=True,
        **sampling,
    )

    http_response = {
        # NOTE(review): the id is a fixed placeholder, not unique per request.
        "id": "id",
        "object": "text_completion",
        "created": created,
        "model": model_name,
        "choices": [
            {
                "index": 0,
                "text": result,
                "logprobs": None,
                # Constant value; the actual reason generation ended is not
                # inspected here.
                "finish_reason": "end_of_token",
            }
        ],
        # Token accounting is not implemented; counts are always zero.
        "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
    }
    log.debug("http_response:%s ", http_response)
    return http_response
def chat_model_generate(
    prompt: str,
    model_name: str,
    llm: LLM,
    config: Config,
) -> dict:
    """Run one non-streaming generation and build a chat-completion body.

    The sampling settings are read from ``config`` and forwarded to ``llm``
    as keyword arguments. The generated text becomes the ``content`` of a
    single assistant message inside a dictionary whose ``object`` field is
    ``"chat.completion"``.

    Args:
        prompt: Text handed to the model unchanged.
        model_name: Value echoed back in the ``model`` field of the response.
        llm: Callable model instance that produces the reply text.
        config: Object providing the sampling settings ``top_k``, ``top_p``,
            ``temperature``, ``repetition_penalty``, ``last_n_tokens``,
            ``seed``, ``batch_size``, ``threads``, ``max_new_tokens`` and
            ``stop`` as attributes.

    Returns:
        The response body as a plain dictionary containing a single choice.
    """
    # Taken before generation starts. Truncated to whole seconds because the
    # OpenAI response schema defines ``created`` as an integer timestamp.
    created = int(time())

    # Settings copied verbatim from the config into the llm call; the order
    # here is both the logging order and the keyword order.
    sampling_names = (
        "top_k",
        "top_p",
        "temperature",
        "repetition_penalty",
        "last_n_tokens",
        "seed",
        "batch_size",
        "threads",
        "max_new_tokens",
        "stop",
    )
    sampling = {}
    for name in sampling_names:
        value = getattr(config, name)
        log.debug("%s: %s", name, value)
        sampling[name] = value

    log.debug("prompt: %s", prompt)
    log.debug("Getting from ctransformer instance")
    result: str = llm(  # pyright: ignore [reportGeneralTypeIssues]
        prompt=prompt,
        stream=False,
        reset=True,
        **sampling,
    )

    http_response = {
        # NOTE(review): the id is a fixed placeholder, not unique per request.
        "id": "id",
        "object": "chat.completion",
        "created": created,
        "model": model_name,
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": result,
                },
                # Constant value; the actual reason generation ended is not
                # inspected here.
                "finish_reason": "end_of_token",
            }
        ],
        # Token accounting is not implemented; counts are always zero.
        "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
    }
    log.debug("http_response:%s ", http_response)
    return http_response