models.py
import logging
from pathlib import Path
from transformers import AutoModelForCausalLM
from auto_gptq import AutoGPTQForCausalLM

# Module-level logger
LOG = logging.getLogger(__name__)


def load_quantized(model_name, args):
    path_to_model = Path(f'{args.model_dir}/{model_name}')
    LOG.debug(f"model path: {path_to_model}")
    # Force the Triton backend, which loads the whole model onto the GPU;
    # without it, inference is literally too slow to be usable.
    model = AutoGPTQForCausalLM.from_quantized(path_to_model, device="cuda:0", use_triton=True, use_safetensors=args.use_safetensors)
    return model


def load_normal(model_name, args):
    # VRAM warning: the unquantized model can need far more memory than the GPTQ version.
    path_to_model = Path(f'{args.model_dir}/{model_name}')
    LOG.debug(f"model path: {path_to_model}")
    # from_pretrained has no `device` kwarg; load first, then move to the GPU.
    model = AutoModelForCausalLM.from_pretrained(path_to_model).to("cuda:0")
    return model
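

# A minimal usage sketch, not part of the original module: it assumes `args`
# carries `model_dir` and `use_safetensors`, as the loaders above expect.
# The flag names, defaults, and model name below are illustrative placeholders.
if __name__ == "__main__":
    import argparse

    logging.basicConfig(level=logging.DEBUG)
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", default="models")
    parser.add_argument("--use_safetensors", action="store_true")
    args = parser.parse_args()

    # Pick the loader that matches the checkpoint format:
    # load_quantized for GPTQ checkpoints, load_normal otherwise.
    model = load_quantized("my-gptq-model", args)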