From 0c621357ac7df30d4f77b427e44bfb5cf9df1485 Mon Sep 17 00:00:00 2001
From: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
Date: Wed, 24 May 2023 10:50:51 +0800
Subject: [PATCH 01/13] add embedding support in main.py

---
 main.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/main.py b/main.py
index 903b1cb..b13022a 100644
--- a/main.py
+++ b/main.py
@@ -22,8 +22,11 @@ import transformers
 
 DEFAULT_MODEL = "THUDM/chatglm-6b-int4"
+# todo
+DEFAULT_EMBEDDING_MODEL = ""
 TOKENIZER = os.environ.get("MODELZ_TOKENIZER", DEFAULT_MODEL)
 MODEL = os.environ.get("MODELZ_MODEL", DEFAULT_MODEL)
+EMBEDDING_MODEL = os.environ.get("MODELZ_EMBEDDING_MODEL", DEFAULT_EMBEDDING_MODEL)
 
 logger = logging.getLogger(__name__)
 
@@ -153,14 +156,44 @@ async def on_post(self, req: Request, resp: Response):
         resp.data = completion.to_json()
 
 
+class Embeddings:
+    def __init__(self, model_name: str) -> None:
+        self.model_name = model_name
+
+    async def on_post(self, req: Request, resp: Response):
+        buf = await req.stream.readall()
+        try:
+            prompt_req = PromptCompletionRequest.from_bytes(buf=buf)
+        except msgspec.ValidationError as err:
+            logger.info(f"Failed to parse request: {err}")
+            # return 400 otherwise the client will retry
+            resp.status = falcon.HTTP_400
+            resp.data = ErrorResponse.from_validation_err(err, str(buf)).to_json()
+            return
+
+        tokens = llm.encode(prompt_req.prompt)
+        with torch.no_grad():
+            outputs = llm.model(**tokens)
+            last_hidden_states = (
+                outputs.last_hidden_state
+            )  # shape: (batch_size, sequence_length, hidden_size)
+            embeddings = torch.mean(
+                last_hidden_states, dim=1
+            ).tolist()  # get mean over sequence_length dimension
+
+        resp.data = {"embeddings": embeddings}
+
+
 app = App()
 app.add_route("/", Ping())
 app.add_route("/completions", Completions(model_name=MODEL))
 app.add_route("/chat/completions", ChatCompletions(model_name=MODEL))
+app.add_route("/embeddings", Embeddings(model_name=EMBEDDING_MODEL))
 # refer to https://platform.openai.com/docs/api-reference/chat
 # make it fully compatible with the current OpenAI API endpoints
 app.add_route("/v1/completions", Completions(model_name=MODEL))
 app.add_route("/v1/chat/completions", ChatCompletions(model_name=MODEL))
+app.add_route("/v1/embeddings", Embeddings(model_name=EMBEDDING_MODEL))
 
 
 if __name__ == "__main__":
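Note (PATCH 01/13): Falcon's Response.data expects a byte string (resp.media
is the attribute that accepts a dict), so the plain dict assigned at the end
of this first cut would fail when the response body is rendered; patch 02
replaces it with an encoded llmspec response. The pooling itself is a plain
mean over the sequence dimension of last_hidden_state. A minimal standalone
sketch of that strategy, assuming a generic Hugging Face encoder (the model
name is illustrative; llm.encode and llm.model in the patch wrap an
equivalent tokenizer and model pair). Note that a plain mean also averages
over padding tokens, which patch 09 later corrects with mask-aware pooling:

    import torch
    from transformers import AutoModel, AutoTokenizer

    name = "sentence-transformers/all-MiniLM-L6-v2"  # illustrative choice
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModel.from_pretrained(name)

    tokens = tokenizer(["hello world"], return_tensors="pt")
    with torch.no_grad():
        outputs = model(**tokens)
    # last_hidden_state: (batch_size, sequence_length, hidden_size);
    # averaging over dim=1 yields one vector per input.
    embeddings = outputs.last_hidden_state.mean(dim=1)  # (batch_size, hidden_size)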
From 49d193606238a2b5d649a1575febd05d487957e9 Mon Sep 17 00:00:00 2001
From: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
Date: Wed, 24 May 2023 11:51:34 +0800
Subject: [PATCH 02/13] use llmspec structs, add engine route

Signed-off-by: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
---
 main.py | 35 ++++++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)

diff --git a/main.py b/main.py
index b13022a..0f2c988 100644
--- a/main.py
+++ b/main.py
@@ -18,6 +18,9 @@
     TokenUsage,
     LanguageModels,
     ErrorResponse,
+    EmbeddingRequest,
+    EmbeddingResponse,
+    EmbeddingData,
 )
 
 import transformers
@@ -163,25 +166,35 @@ def __init__(self, model_name: str) -> None:
 
     async def on_post(self, req: Request, resp: Response):
         buf = await req.stream.readall()
         try:
-            prompt_req = PromptCompletionRequest.from_bytes(buf=buf)
+            # todo: llmspec hasn't implemented from_bytes for EmbeddingRequest
+            embedding_req = msgspec.json.decode(buf, type=EmbeddingRequest)
         except msgspec.ValidationError as err:
             logger.info(f"Failed to parse request: {err}")
-            # return 400 otherwise the client will retry
+            resp.status = falcon.HTTP_400
             resp.status = falcon.HTTP_400
             resp.data = ErrorResponse.from_validation_err(err, str(buf)).to_json()
             return
 
-        tokens = llm.encode(prompt_req.prompt)
+        tokens = llm.encode(embedding_req.input)
         with torch.no_grad():
             outputs = llm.model(**tokens)
-            last_hidden_states = (
-                outputs.last_hidden_state
-            )  # shape: (batch_size, sequence_length, hidden_size)
+            last_hidden_states = outputs.last_hidden_state
             embeddings = torch.mean(
                 last_hidden_states, dim=1
             ).tolist()  # get mean over sequence_length dimension
 
-        resp.data = {"embeddings": embeddings}
+        embedding_data = EmbeddingData(embedding=embeddings, index=0)
+        embedding_resp = EmbeddingResponse(
+            data=embedding_data,
+            model=self.model_name,
+            usage=TokenUsage(
+                prompt_tokens=len(tokens[0]),
+                completion_tokens=0,  # No completions performed, only embeddings generated.
+                total_tokens=len(tokens[0]),
+            ),
+        )
+        # todo: llmspec hasn't implemented to_json for EmbeddingResponse
+        resp.data = msgspec.json.encode(embedding_resp)
 
 
 app = App()
@@ -189,11 +202,19 @@ async def on_post(self, req: Request, resp: Response):
 app.add_route("/completions", Completions(model_name=MODEL))
 app.add_route("/chat/completions", ChatCompletions(model_name=MODEL))
 app.add_route("/embeddings", Embeddings(model_name=EMBEDDING_MODEL))
+app.add_route(
+    "/engines/{}/embeddings".format(EMBEDDING_MODEL),
+    Embeddings(model_name=EMBEDDING_MODEL),
+)
 # refer to https://platform.openai.com/docs/api-reference/chat
 # make it fully compatible with the current OpenAI API endpoints
 app.add_route("/v1/completions", Completions(model_name=MODEL))
 app.add_route("/v1/chat/completions", ChatCompletions(model_name=MODEL))
 app.add_route("/v1/embeddings", Embeddings(model_name=EMBEDDING_MODEL))
+app.add_route(
+    "/v1/engines/{}/embeddings".format(EMBEDDING_MODEL),
+    Embeddings(model_name=EMBEDDING_MODEL),
+)
 
 
 if __name__ == "__main__":
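Note (PATCH 02/13): msgspec validates while it parses, so a malformed body
surfaces as msgspec.ValidationError and is mapped to HTTP 400 above (the
duplicated resp.status assignment introduced in this hunk is removed again
in patch 07). A minimal illustration of typed decoding with a stand-in
struct; the real field set lives in llmspec's EmbeddingRequest:

    import msgspec

    class FakeEmbeddingRequest(msgspec.Struct):  # stand-in, not the llmspec type
        model: str
        input: str

    buf = b'{"model": "all-MiniLM-L6-v2", "input": "hello"}'
    req = msgspec.json.decode(buf, type=FakeEmbeddingRequest)  # parses and validates

    try:
        msgspec.json.decode(b'{"model": 1}', type=FakeEmbeddingRequest)
    except msgspec.ValidationError as err:
        print(err)  # names the offending field, e.g. `$.model`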
From c49699bfc9da9bdb2461639639e28302ed8f572e Mon Sep 17 00:00:00 2001
From: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
Date: Wed, 24 May 2023 12:01:13 +0800
Subject: [PATCH 03/13] use sentence_transformers

Signed-off-by: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
---
 main.py          | 14 +++++---------
 requirements.txt |  1 +
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/main.py b/main.py
index 0f2c988..1ba6e3a 100644
--- a/main.py
+++ b/main.py
@@ -23,10 +23,10 @@
     EmbeddingData,
 )
 import transformers
+from sentence_transformers import SentenceTransformer
 
 DEFAULT_MODEL = "THUDM/chatglm-6b-int4"
-# todo
-DEFAULT_EMBEDDING_MODEL = ""
+DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 TOKENIZER = os.environ.get("MODELZ_TOKENIZER", DEFAULT_MODEL)
 MODEL = os.environ.get("MODELZ_MODEL", DEFAULT_MODEL)
 EMBEDDING_MODEL = os.environ.get("MODELZ_EMBEDDING_MODEL", DEFAULT_EMBEDDING_MODEL)
@@ -175,13 +175,9 @@ async def on_post(self, req: Request, resp: Response):
             resp.data = ErrorResponse.from_validation_err(err, str(buf)).to_json()
             return
 
-        tokens = llm.encode(embedding_req.input)
-        with torch.no_grad():
-            outputs = llm.model(**tokens)
-            last_hidden_states = outputs.last_hidden_state
-            embeddings = torch.mean(
-                last_hidden_states, dim=1
-            ).tolist()  # get mean over sequence_length dimension
+        model = SentenceTransformer(self.model_name)
+        embeddings = model.encode(embedding_req.input)
+        print(embeddings)
 
         embedding_data = EmbeddingData(embedding=embeddings, index=0)
         embedding_resp = EmbeddingResponse(
diff --git a/requirements.txt b/requirements.txt
index e37681b..8158169 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,3 +9,4 @@ accelerate
 llmspec
 falcon
 uvicorn
+sentence_transformers
\ No newline at end of file

From 169e09528e760fa75d8c66adae44242765b84cd3 Mon Sep 17 00:00:00 2001
From: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
Date: Wed, 24 May 2023 12:02:39 +0800
Subject: [PATCH 04/13] fix EmbeddingData import

Signed-off-by: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
---
 main.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 1ba6e3a..7a2bedb 100644
--- a/main.py
+++ b/main.py
@@ -20,8 +20,10 @@
     ErrorResponse,
     EmbeddingRequest,
     EmbeddingResponse,
-    EmbeddingData,
 )
+
+# todo: make this importable from top level
+from llmspec.llmspec import EmbeddingData
 import transformers
 from sentence_transformers import SentenceTransformer
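Note (PATCH 03/13): the SentenceTransformer here is constructed inside
on_post, so the weights are reloaded on every request; patch 08 later hoists
it into __init__. For reference, encode() accepts a string or a list of
strings and returns a NumPy array by default, as sketched below (the shape
comment assumes the all-MiniLM-L6-v2 default, which produces 384-dimensional
vectors):

    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    embeddings = model.encode(["first sentence", "second sentence"])
    print(embeddings.shape)  # (2, 384)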
From df7f37209ec15b164cd009db5177354422ba7b4b Mon Sep 17 00:00:00 2001
From: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
Date: Wed, 24 May 2023 13:30:08 +0800
Subject: [PATCH 05/13] fix EmbeddingData

Signed-off-by: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
---
 main.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/main.py b/main.py
index 7a2bedb..90a4d3f 100644
--- a/main.py
+++ b/main.py
@@ -22,8 +22,16 @@
     EmbeddingResponse,
 )
 
+
 # todo: make this importable from top level
-from llmspec.llmspec import EmbeddingData
+# from llmspec.llmspec import EmbeddingData
+# temporary fix: embedding attr type
+class EmbeddingData(msgspec.Struct):
+    embedding: List[float] | List[List[float]]
+    index: int
+    object: str = "embedding"
+
+
 import transformers
 from sentence_transformers import SentenceTransformer
 
@@ -179,16 +187,22 @@ async def on_post(self, req: Request, resp: Response):
 
         model = SentenceTransformer(self.model_name)
         embeddings = model.encode(embedding_req.input)
-        print(embeddings)
+        # convert embeddings of type list[Tensor] | ndarray to list[float]
+        if isinstance(embeddings, list):
+            embeddings = [e.tolist() for e in embeddings]
+        elif isinstance(embeddings, torch.Tensor):
+            embeddings = embeddings.tolist()
+        else:
+            embeddings = embeddings.tolist()
 
         embedding_data = EmbeddingData(embedding=embeddings, index=0)
         embedding_resp = EmbeddingResponse(
             data=embedding_data,
             model=self.model_name,
             usage=TokenUsage(
-                prompt_tokens=len(tokens[0]),
+                prompt_tokens=0,  # No prompt tokens, only embeddings generated.
                 completion_tokens=0,  # No completions performed, only embeddings generated.
-                total_tokens=len(tokens[0]),
+                total_tokens=len(embeddings),
             ),
         )
         # todo: llmspec hasn't implemented to_json for EmbeddingResponse

From 6fd7da203eb36d56cbc127cc624f53c67dbc0bb4 Mon Sep 17 00:00:00 2001
From: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
Date: Wed, 24 May 2023 13:32:48 +0800
Subject: [PATCH 06/13] fix engine routes

Signed-off-by: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
---
 main.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/main.py b/main.py
index 90a4d3f..7399b65 100644
--- a/main.py
+++ b/main.py
@@ -209,14 +209,22 @@ async def on_post(self, req: Request, resp: Response):
         resp.data = msgspec.json.encode(embedding_resp)
 
 
+class EmbeddingsEngineRouteWrapper(Embeddings):
+    def __init__(self, model_name: str) -> None:
+        super().__init__(model_name)
+
+    async def on_post(self, req: Request, resp: Response, engine: str):
+        await super().on_post(req, resp)
+
+
 app = App()
 app.add_route("/", Ping())
 app.add_route("/completions", Completions(model_name=MODEL))
 app.add_route("/chat/completions", ChatCompletions(model_name=MODEL))
 app.add_route("/embeddings", Embeddings(model_name=EMBEDDING_MODEL))
 app.add_route(
-    "/engines/{}/embeddings".format(EMBEDDING_MODEL),
-    Embeddings(model_name=EMBEDDING_MODEL),
+    "/engines/{engine}/embeddings".format(EMBEDDING_MODEL),
+    EmbeddingsEngineRouteWrapper(model_name=EMBEDDING_MODEL),
 )
 # refer to https://platform.openai.com/docs/api-reference/chat
 # make it fully compatible with the current OpenAI API endpoints
@@ -224,8 +232,8 @@ async def on_post(self, req: Request, resp: Response):
 app.add_route("/v1/chat/completions", ChatCompletions(model_name=MODEL))
 app.add_route("/v1/embeddings", Embeddings(model_name=EMBEDDING_MODEL))
 app.add_route(
-    "/v1/engines/{}/embeddings".format(EMBEDDING_MODEL),
-    Embeddings(model_name=EMBEDDING_MODEL),
+    "/v1/engines/{engine}/embeddings".format(EMBEDDING_MODEL),
+    EmbeddingsEngineRouteWrapper(model_name=EMBEDDING_MODEL),
 )

From 8611ac2fa1443a2f4bfc5689b0385c837f5e1046 Mon Sep 17 00:00:00 2001
From: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
Date: Wed, 24 May 2023 16:10:32 +0800
Subject: [PATCH 07/13] remove duplicated line

Co-authored-by: Keming
---
 main.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/main.py b/main.py
index 7399b65..952228b 100644
--- a/main.py
+++ b/main.py
@@ -181,7 +181,6 @@ async def on_post(self, req: Request, resp: Response):
         except msgspec.ValidationError as err:
             logger.info(f"Failed to parse request: {err}")
             resp.status = falcon.HTTP_400
-            resp.status = falcon.HTTP_400
             resp.data = ErrorResponse.from_validation_err(err, str(buf)).to_json()
             return
 
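Note (PATCH 06/13): Falcon treats {engine} as a path-template field, fills it
from the URL at request time, and passes it to the responder as a keyword
argument, so the template string should be registered verbatim. Calling
str.format with a positional argument on a string whose only replacement
field is named raises KeyError: 'engine', so the .format(EMBEDDING_MODEL)
calls kept here would fail at import time. A sketch of the presumably
intended registration (the handler is a toy stand-in):

    import falcon.asgi

    class EchoEngine:
        # Falcon captures {engine} from the path and passes it as a kwarg.
        async def on_post(self, req, resp, engine: str):
            resp.media = {"engine": engine}

    app = falcon.asgi.App()
    # The template is passed verbatim, with no str.format call:
    app.add_route("/v1/engines/{engine}/embeddings", EchoEngine())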
From ee1ed2e3b51a5b38902f82b992f05f07955b68bc Mon Sep 17 00:00:00 2001
From: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
Date: Wed, 24 May 2023 16:47:27 +0800
Subject: [PATCH 08/13] fixes

Signed-off-by: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
---
 main.py | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/main.py b/main.py
index 952228b..2e42df1 100644
--- a/main.py
+++ b/main.py
@@ -20,18 +20,10 @@
     ErrorResponse,
     EmbeddingRequest,
     EmbeddingResponse,
+    EmbeddingData,
 )
 
-
-# todo: make this importable from top level
-# from llmspec.llmspec import EmbeddingData
-# temporary fix: embedding attr type
-class EmbeddingData(msgspec.Struct):
-    embedding: List[float] | List[List[float]]
-    index: int
-    object: str = "embedding"
-
-
 import transformers
 from sentence_transformers import SentenceTransformer
 
@@ -172,20 +164,19 @@ async def on_post(self, req: Request, resp: Response):
 class Embeddings:
     def __init__(self, model_name: str) -> None:
         self.model_name = model_name
+        self.model = SentenceTransformer(self.model_name)
 
     async def on_post(self, req: Request, resp: Response):
         buf = await req.stream.readall()
         try:
-            # todo: llmspec hasn't implemented from_bytes for EmbeddingRequest
-            embedding_req = msgspec.json.decode(buf, type=EmbeddingRequest)
+            embedding_req = EmbeddingRequest.from_bytes(buf=buf)
         except msgspec.ValidationError as err:
             logger.info(f"Failed to parse request: {err}")
             resp.status = falcon.HTTP_400
             resp.data = ErrorResponse.from_validation_err(err, str(buf)).to_json()
             return
 
-        model = SentenceTransformer(self.model_name)
-        embeddings = model.encode(embedding_req.input)
+        embeddings = self.model.encode(embedding_req.input)
         # convert embeddings of type list[Tensor] | ndarray to list[float]
         if isinstance(embeddings, list):
             embeddings = [e.tolist() for e in embeddings]
@@ -195,17 +186,18 @@ async def on_post(self, req: Request, resp: Response):
             embeddings = embeddings.tolist()
 
         embedding_data = EmbeddingData(embedding=embeddings, index=0)
+        # todo
+        token_count = 0
         embedding_resp = EmbeddingResponse(
             data=embedding_data,
             model=self.model_name,
             usage=TokenUsage(
-                prompt_tokens=0,  # No prompt tokens, only embeddings generated.
+                prompt_tokens=token_count,
                 completion_tokens=0,  # No completions performed, only embeddings generated.
-                total_tokens=len(embeddings),
+                total_tokens=token_count,
             ),
         )
-        # todo: llmspec hasn't implemented to_json for EmbeddingResponse
-        resp.data = msgspec.json.encode(embedding_resp)
+        resp.data = embedding_resp.to_json()
 
 
 class EmbeddingsEngineRouteWrapper(Embeddings):
@@ -216,23 +208,26 @@ async def on_post(self, req: Request, resp: Response, engine: str):
         await super().on_post(req, resp)
 
 
+embeddings = Embeddings(EMBEDDING_MODEL)
+embeddings_engine_route_wrapper = EmbeddingsEngineRouteWrapper(EMBEDDING_MODEL)
+
 app = App()
 app.add_route("/", Ping())
 app.add_route("/completions", Completions(model_name=MODEL))
 app.add_route("/chat/completions", ChatCompletions(model_name=MODEL))
-app.add_route("/embeddings", Embeddings(model_name=EMBEDDING_MODEL))
+app.add_route("/embeddings", embeddings)
 app.add_route(
     "/engines/{engine}/embeddings".format(EMBEDDING_MODEL),
-    EmbeddingsEngineRouteWrapper(model_name=EMBEDDING_MODEL),
+    embeddings_engine_route_wrapper,
 )
 # refer to https://platform.openai.com/docs/api-reference/chat
 # make it fully compatible with the current OpenAI API endpoints
 app.add_route("/v1/completions", Completions(model_name=MODEL))
 app.add_route("/v1/chat/completions", ChatCompletions(model_name=MODEL))
-app.add_route("/v1/embeddings", Embeddings(model_name=EMBEDDING_MODEL))
+app.add_route("/v1/embeddings", embeddings)
 app.add_route(
     "/v1/engines/{engine}/embeddings".format(EMBEDDING_MODEL),
-    EmbeddingsEngineRouteWrapper(model_name=EMBEDDING_MODEL),
+    embeddings_engine_route_wrapper,
 )
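Note (PATCH 08/13): with the handlers instantiated once at module load, the
model is loaded a single time and the service exposes an OpenAI-style
surface. A hypothetical smoke test against a locally running instance (the
host and port below are assumptions, not taken from this series):

    import json
    from urllib import request

    body = json.dumps({
        "model": "sentence-transformers/all-MiniLM-L6-v2",
        "input": "hello world",
    }).encode()
    req = request.Request(
        "http://localhost:8000/v1/embeddings",  # port is an assumption
        data=body,
        headers={"Content-Type": "application/json"},
    )
    with request.urlopen(req) as resp:
        print(json.load(resp))  # EmbeddingResponse-shaped JSON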
From 710a72925e08620faed0d58d7775881bff321ca0 Mon Sep 17 00:00:00 2001
From: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
Date: Wed, 24 May 2023 16:56:06 +0800
Subject: [PATCH 09/13] add token count support

Signed-off-by: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
---
 main.py | 44 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/main.py b/main.py
index 2e42df1..569b12f 100644
--- a/main.py
+++ b/main.py
@@ -6,6 +6,7 @@
 import falcon
 import msgspec
 import torch  # type: ignore
+import torch.nn.functional as F
 from falcon.asgi import App, Request, Response
 from llmspec import (
     ChatChoice,
@@ -25,7 +26,6 @@
 )
 
 import transformers
-from sentence_transformers import SentenceTransformer
 
 DEFAULT_MODEL = "THUDM/chatglm-6b-int4"
 DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
@@ -165,6 +165,44 @@ class Embeddings:
     def __init__(self, model_name: str) -> None:
         self.model_name = model_name
         self.model = SentenceTransformer(self.model_name)
+        from transformers import AutoTokenizer, AutoModel
+
+        # Load model from HuggingFace Hub
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model)
+        self.model = AutoModel.from_pretrained(self.model)
+
+    def embed_and_get_token_count(self, sentences):
+        # Mean Pooling - Take attention mask into account for correct averaging
+        def mean_pooling(model_output, attention_mask):
+            token_embeddings = model_output[
+                0
+            ]  # First element of model_output contains all token embeddings
+            input_mask_expanded = (
+                attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+            )
+            return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
+                input_mask_expanded.sum(1), min=1e-9
+            )
+
+        # Tokenize sentences
+        encoded_input = self.tokenizer(
+            sentences, padding=True, truncation=True, return_tensors='pt'
+        )
+        token_count = encoded_input['attention_mask'].sum(dim=1)
+
+        # Compute token embeddings
+        with torch.no_grad():
+            model_output = self.model(**encoded_input)
+
+        # Perform pooling
+        sentence_embeddings = mean_pooling(
+            model_output, encoded_input['attention_mask']
+        )
+
+        # Normalize embeddings
+        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+
+        return token_count, sentence_embeddings
 
     async def on_post(self, req: Request, resp: Response):
         buf = await req.stream.readall()
@@ -176,7 +214,7 @@ async def on_post(self, req: Request, resp: Response):
             resp.data = ErrorResponse.from_validation_err(err, str(buf)).to_json()
             return
 
-        embeddings = self.model.encode(embedding_req.input)
+        token_count, embeddings = self.embed_and_get_token_count(embedding_req.input)
         # convert embeddings of type list[Tensor] | ndarray to list[float]
         if isinstance(embeddings, list):
             embeddings = [e.tolist() for e in embeddings]
@@ -186,8 +224,6 @@ async def on_post(self, req: Request, resp: Response):
             embeddings = embeddings.tolist()
 
         embedding_data = EmbeddingData(embedding=embeddings, index=0)
-        # todo
-        token_count = 0
         embedding_resp = EmbeddingResponse(
             data=embedding_data,
             model=self.model_name,
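Note (PATCH 09/13): masking before the mean matters because padded positions
would otherwise drag short inputs' embeddings toward zero. One caveat:
attention_mask.sum(dim=1) yields a per-sentence tensor, while TokenUsage
presumably expects a plain integer, so a reduction along these lines is
likely still needed (a sketch of the assumed fix, not what the patch does):

    import torch

    # attention_mask for two padded sentences of true lengths 5 and 7:
    attention_mask = torch.tensor([[1, 1, 1, 1, 1, 0, 0],
                                   [1, 1, 1, 1, 1, 1, 1]])
    per_sentence = attention_mask.sum(dim=1)        # tensor([5, 7])
    prompt_tokens = int(per_sentence.sum().item())  # 12, a plain int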
From 202e40d5dd81185fd759fd4f5744bc237ef7d29f Mon Sep 17 00:00:00 2001
From: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
Date: Thu, 25 May 2023 09:54:28 +0800
Subject: [PATCH 10/13] merge embedding classes

Signed-off-by: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
---
 main.py | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/main.py b/main.py
index 569b12f..d38bcf3 100644
--- a/main.py
+++ b/main.py
@@ -204,7 +204,7 @@ def mean_pooling(model_output, attention_mask):
 
         return token_count, sentence_embeddings
 
-    async def on_post(self, req: Request, resp: Response):
+    async def on_post(self, req: Request, resp: Response, engine: str = ''):
         buf = await req.stream.readall()
         try:
             embedding_req = EmbeddingRequest.from_bytes(buf=buf)
@@ -236,16 +236,7 @@ async def on_post(self, req: Request, resp: Response, engine: str = ''):
         resp.data = embedding_resp.to_json()
 
 
-class EmbeddingsEngineRouteWrapper(Embeddings):
-    def __init__(self, model_name: str) -> None:
-        super().__init__(model_name)
-
-    async def on_post(self, req: Request, resp: Response, engine: str):
-        await super().on_post(req, resp)
-
-
 embeddings = Embeddings(EMBEDDING_MODEL)
-embeddings_engine_route_wrapper = EmbeddingsEngineRouteWrapper(EMBEDDING_MODEL)
 
 app = App()
 app.add_route("/", Ping())
@@ -245,7 +245,7 @@ async def on_post(self, req: Request, resp: Response, engine: str):
 app.add_route("/embeddings", embeddings)
 app.add_route(
     "/engines/{engine}/embeddings".format(EMBEDDING_MODEL),
-    embeddings_engine_route_wrapper,
+    embeddings,
 )
 # refer to https://platform.openai.com/docs/api-reference/chat
 # make it fully compatible with the current OpenAI API endpoints
@@ -254,7 +254,7 @@ async def on_post(self, req: Request, resp: Response, engine: str):
 app.add_route("/v1/embeddings", embeddings)
 app.add_route(
     "/v1/engines/{engine}/embeddings".format(EMBEDDING_MODEL),
-    embeddings_engine_route_wrapper,
+    embeddings,
 )

From 61bfaa27eb1a6534f1473b40e1bc939ff056da43 Mon Sep 17 00:00:00 2001
From: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
Date: Thu, 25 May 2023 09:59:01 +0800
Subject: [PATCH 11/13] fix imports

Signed-off-by: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index d38bcf3..e9b63ac 100644
--- a/main.py
+++ b/main.py
@@ -26,6 +26,7 @@
 
 import transformers
+from transformers import AutoTokenizer, AutoModel
 
 DEFAULT_MODEL = "THUDM/chatglm-6b-int4"
 DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
@@ -165,7 +166,6 @@ class Embeddings:
     def __init__(self, model_name: str) -> None:
         self.model_name = model_name
         self.model = SentenceTransformer(self.model_name)
-        from transformers import AutoTokenizer, AutoModel
 
         # Load model from HuggingFace Hub
         self.tokenizer = AutoTokenizer.from_pretrained(self.model)

From 2c640a135df2ffe5440ab93689b6430b235da166 Mon Sep 17 00:00:00 2001
From: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
Date: Thu, 25 May 2023 09:59:57 +0800
Subject: [PATCH 12/13] remove SentenceTransformer code

Signed-off-by: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
---
 main.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/main.py b/main.py
index e9b63ac..806ef69 100644
--- a/main.py
+++ b/main.py
@@ -165,9 +165,6 @@ async def on_post(self, req: Request, resp: Response):
 class Embeddings:
     def __init__(self, model_name: str) -> None:
         self.model_name = model_name
-        self.model = SentenceTransformer(self.model_name)
-
-        # Load model from HuggingFace Hub
         self.tokenizer = AutoTokenizer.from_pretrained(self.model)
         self.model = AutoModel.from_pretrained(self.model)

From ab44c24cb175c4725f620384c52cb1fa7c1d1987 Mon Sep 17 00:00:00 2001
From: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
Date: Thu, 25 May 2023 11:24:49 +0800
Subject: [PATCH 13/13] add requirements-cpu.txt

Signed-off-by: Teddy Xinyuan Chen <45612704+tddschn@users.noreply.github.com>
---
 requirements-cpu.txt | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 requirements-cpu.txt

diff --git a/requirements-cpu.txt b/requirements-cpu.txt
new file mode 100644
index 0000000..fd38477
--- /dev/null
+++ b/requirements-cpu.txt
@@ -0,0 +1,8 @@
+msgpack
+mosec
+torch --extra-index-url https://download.pytorch.org/whl/cpu
+diffusers[torch]
+transformers
+llmspec
+falcon
+uvicorn
\ No newline at end of file
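Note (final state): two loose ends appear to survive the series. First,
__init__ ends up calling AutoTokenizer.from_pretrained(self.model) before
self.model is assigned, which raises AttributeError; from_pretrained takes a
model name or path, so self.model_name was presumably intended:

    from transformers import AutoModel, AutoTokenizer

    class Embeddings:
        # Presumed intent of the final __init__: load by model name rather
        # than via the not-yet-existing self.model attribute.
        def __init__(self, model_name: str) -> None:
            self.model_name = model_name
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModel.from_pretrained(model_name)

Second, requirements-cpu.txt lists msgpack but not msgspec, which main.py
imports; msgspec looks like the intended dependency. The torch line pins CPU
wheels through an extra index URL, a pattern pip accepts inside requirements
files, so the file installs with pip install -r requirements-cpu.txt.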