Hello everyone, I recently moved to a new server. Everything worked on the old server, which had 2 GPUs; the new one has 4, but I don't think that matters.
When I try to run my script, I get this error:
TypeError: GenerationMixin._get_logits_warper() missing 1 required positional argument: 'device'
As far as I know, .generate() doesn't even accept a parameter called device, so the error seems to come from the internal _get_logits_warper() method rather than from anything I pass in.
The script and the wrapper both run in the same venv.
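For context, here is a stripped-down sketch of what I think the failing call boils down to, without the LangChain wrapper (same model path and generation arguments as in my code below; I have not yet verified that this minimal version reproduces the error on its own):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers_stream_generator import init_stream_support

# Patch generate() so it accepts do_stream=True
init_stream_support()

tokenizer = AutoTokenizer.from_pretrained("/sicbert/models/mistral-7B")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    "/sicbert/models/mistral-7B", torch_dtype=torch.float16, device_map="balanced"
)
model.eval()

inputs = tokenizer("The quick brown fox jumps over the lazy dog.", return_tensors="pt")
with torch.no_grad():
    # In my wrapper, the TypeError is raised by a call like this one
    generator = model.generate(
        input_ids=inputs["input_ids"].to(model.device),
        attention_mask=inputs["attention_mask"].to(model.device),
        max_length=200,
        do_sample=True,
        do_stream=True,
    )
    for token_ids in generator:
        print(tokenizer.decode(token_ids.cpu().tolist(), skip_special_tokens=True), end="", flush=True)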
Here is my code:
from TorchLangChainWrapper import TransformersLLM
llm = TransformersLLM(model_path="/sicbert/models/mistral-7B")
for chunk in llm.stream("The quick brown fox jumps over the lazy dog."):
    print(chunk, end="", flush=True)
And here is my wrapper:
from typing import Any, Dict, Iterator, List, Mapping, Optional
from langchain_core.callbacks.manager import CallbackManagerForLLMRun
from langchain_core.language_models.llms import LLM
from langchain_core.outputs import GenerationChunk
from langchain_core.pydantic_v1 import root_validator
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers_stream_generator import init_stream_support
class TransformersLLM(LLM):
    model: Any
    tokenizer: Any
    model_path: str

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        model_path = values["model_path"]
        # Patch transformers' generate() so it accepts do_stream=True
        init_stream_support()
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_path)
            tokenizer.pad_token = tokenizer.eos_token
            model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map='balanced')
            model.eval()
            values["model"] = model
            values["tokenizer"] = tokenizer
        except Exception as e:
            raise ValueError(
                f"Could not load model from path: {model_path}. "
                f"Received error {e}"
            )
        return values
    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        inputs = self.tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs['input_ids'].to(self.model.device),
                attention_mask=inputs['attention_mask'].to(self.model.device),
                max_length=200,
                #temperature=0.7,        # Control randomness in generation
                #top_p=0.9,              # Use nucleus sampling to limit the number of tokens considered at each step
                #repetition_penalty=1.2, # Penalize repeated phrases
                #num_return_sequences=1  # Generate only one sequence
            )
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return answer
    def _stream(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[GenerationChunk]:
        inputs = self.tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            # With init_stream_support() applied, do_stream=True makes generate()
            # return a generator that yields token ids one step at a time
            generator = self.model.generate(
                input_ids=inputs['input_ids'].to(self.model.device),
                attention_mask=inputs['attention_mask'].to(self.model.device),
                max_length=200,
                do_sample=True,
                do_stream=True,
                #temperature=0.7,        # Control randomness in generation
                #top_p=0.9,              # Use nucleus sampling to limit the number of tokens considered at each step
                #repetition_penalty=1.2, # Penalize repeated phrases
                #num_return_sequences=1  # Generate only one sequence
            )
            last_tokens = []
            last_decoded_tokens = []
            for index, x in enumerate(generator):
                tokens = x.cpu().numpy().tolist()
                tokens = last_tokens + tokens
                word = self.tokenizer.decode(tokens, skip_special_tokens=True)
                if "�" in word:
                    # Incomplete multi-byte character: buffer the tokens and
                    # decode them together with the next ones
                    last_tokens = tokens
                else:
                    # Re-insert the space that per-chunk decoding drops between words
                    if " " in self.tokenizer.decode(
                        last_decoded_tokens + tokens, skip_special_tokens=True
                    ):
                        word = " " + word
                    last_tokens = []
                    last_decoded_tokens = tokens
                    chunk = GenerationChunk(text=word)
                    if run_manager:
                        run_manager.on_llm_new_token(chunk.text, chunk=chunk)
                    yield chunk
    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Return a dictionary of identifying parameters."""
        return {
            "model_name": "CustomChatModel",
        }

    @property
    def _llm_type(self) -> str:
        """Get the type of language model used by this chat model. Used for logging purposes only."""
        return "custom"