TypeError: GenerationMixin._get_logits_warper() missing 1 required positional argument: 'device'

Hello guys, I have recently changed my server. The script worked on the old server, where I had 2 GPUs; now I have 4, but I don't think that matters.

When I try to run my script, I get this error:
TypeError: GenerationMixin._get_logits_warper() missing 1 required positional argument: 'device'

But as far as I know, the .generate() function doesn't even take a parameter called device.
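
To see where that argument actually comes from, one thing I can check is the installed transformers version and the signature of the internal GenerationMixin._get_logits_warper helper that the error names (assuming that helper still exists in the installed version; it is not a .generate() parameter):

import inspect
import transformers
from transformers import GenerationMixin

print(transformers.__version__)
# The error points at an internal generation helper, not .generate() itself;
# print its signature if this transformers version still defines it.
if hasattr(GenerationMixin, "_get_logits_warper"):
    print(inspect.signature(GenerationMixin._get_logits_warper))
else:
    print("no _get_logits_warper in this transformers version")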

I also use one venv for both files, the script and the wrapper.

Here is my Code:

from TorchLangChainWrapper import TransformersLLM
llm = TransformersLLM(model_path="/sicbert/models/mistral-7B")
for chunk in llm.stream("The quick brown fox jumps over the lazy dog."):
    print(chunk, end="", flush=True)

And here is my Wrapper:

from typing import Any, Dict, Iterator, List, Mapping, Optional
from langchain_core.callbacks.manager import CallbackManagerForLLMRun
from langchain_core.language_models.llms import LLM
from langchain_core.outputs import GenerationChunk
from langchain_core.pydantic_v1 import root_validator
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers_stream_generator import init_stream_support

class TransformersLLM(LLM):
    model: Any
    tokenizer: Any
    model_path: str

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        model_path = values["model_path"]
        init_stream_support()
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_path)
            tokenizer.pad_token = tokenizer.eos_token
            model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map='balanced')
            model.eval()
            values["model"] = model
            values["tokenizer"] = tokenizer
        except Exception as e:
            raise ValueError(
                f"Could not load model from path: {model_path}. "
                f"Received error {e}"
            )
        return values

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        inputs = self.tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs['input_ids'].to(self.model.device),
                attention_mask=inputs['attention_mask'].to(self.model.device),
                max_length=200
                #temperature=0.7,  # Control randomness in generation
                #top_p=0.9,  # Use nucleus sampling to limit the number of tokens considered at each step
                #repetition_penalty=1.2,  # Penalize repeated phrases
                #num_return_sequences=1  # Generate only one sequence
            )
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return answer

    def _stream(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[GenerationChunk]:
        inputs = self.tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            generator = self.model.generate(
                    input_ids=inputs['input_ids'].to(self.model.device),
                    attention_mask=inputs['attention_mask'].to(self.model.device),
                    max_length=200,
                    do_sample=True,
                    do_stream=True
                    #temperature=0.7,  # Control randomness in generation
                    #top_p=0.9,  # Use nucleus sampling to limit the number of tokens considered at each step
                    #repetition_penalty=1.2,  # Penalize repeated phrases
                    #num_return_sequences=1  # Generate only one sequence
            )
            last_tokens = []
            last_decoded_tokens = []

            for index, x in enumerate(generator):
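                # Each item from the stream generator is the newly generated
                # token id(s) for this step. Decode incrementally: if the
                # decoded text contains "�", the bytes form an incomplete
                # multi-byte character, so hold the tokens back and retry with
                # the next token; otherwise emit the word, re-adding the space
                # that gets lost when the chunk is decoded on its own.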
                tokens = x.cpu().numpy().tolist()
                tokens = last_tokens + tokens
                word = self.tokenizer.decode(tokens, skip_special_tokens=True)
                if "�" in word:
                    last_tokens = tokens
                else:
                    if " " in self.tokenizer.decode(
                        last_decoded_tokens + tokens, skip_special_tokens=True
                    ):
                        word = " " + word
                    last_tokens = []
                    last_decoded_tokens = tokens
                
                chunk = GenerationChunk(text=word)
                if run_manager:
                    run_manager.on_llm_new_token(chunk.text, chunk=chunk)

                yield chunk
                
    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Return a dictionary of identifying parameters."""
        return {
            "model_name": "CustomChatModel",
        }

    @property
    def _llm_type(self) -> str:
        """Get the type of language model used by this chat model. Used for logging purposes only."""
        return "custom"

I got the same bug when running the demo: InternLM (LLM) + MSAgent (dataset), fine-tuned with XTuner, with the dataset downloaded via ModelScope.

(python35-paddle120-env) aistudio@jupyter-410941-8169713:~/ft-msagent$ ~/.local/bin/xtuner chat /home/aistudio/data/internlm-chat-7b --adapter internlm-7b-qlora-msagent-react --lagent
[2024-07-22 15:37:32,547] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
 [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
 [WARNING]  async_io: please install the libaio-dev package with apt
 [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
 [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
 [WARNING]  NVIDIA Inference is only supported on Ampere and newer architectures
 [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.0
 [WARNING]  using untested triton version (2.0.0), only 1.0.0 is known to be compatible
[2024-07-22 15:37:42,991] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
 [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
 [WARNING]  async_io: please install the libaio-dev package with apt
 [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
 [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
 [WARNING]  NVIDIA Inference is only supported on Ampere and newer architectures
 [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.0
 [WARNING]  using untested triton version (2.0.0), only 1.0.0 is known to be compatible
Loading checkpoint shards:   0%|                                                                                                    | 0/8 [00:00<?, ?it/s]/home/aistudio/.local/lib/python3.10/site-packages/torch/_utils.py:776: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly.  To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:18<00:00,  2.32s/it]
Loading adapter from internlm-7b-qlora-msagent-react...

double enter to end input (EXIT: exit chat, RESET: reset history) >>> How's the weather at Ji'nan China?

/home/aistudio/.local/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:540: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
  warnings.warn(
/home/aistudio/.local/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:545: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
  warnings.warn(
Traceback (most recent call last):
  File "/home/aistudio/.local/lib/python3.10/site-packages/xtuner/tools/chat.py", line 491, in <module>
    main()
  File "/home/aistudio/.local/lib/python3.10/site-packages/xtuner/tools/chat.py", line 212, in main
    response = chatbot.chat(text)
  File "/home/aistudio/.local/lib/python3.10/site-packages/lagent/agents/react.py", line 233, in chat
    response = self._llm.chat(prompt, **kwargs)
  File "/home/aistudio/.local/lib/python3.10/site-packages/lagent/llms/base_llm.py", line 191, in chat
    return self.generate(_inputs, **gen_params)
  File "/home/aistudio/.local/lib/python3.10/site-packages/lagent/llms/huggingface.py", line 133, in generate
    for status, chunk, _ in self.stream_generate(inputs, do_sample,
  File "/home/aistudio/.local/lib/python3.10/site-packages/lagent/llms/huggingface.py", line 208, in stream_generate
    logits_warper = self.model._get_logits_warper(generation_config)
TypeError: GenerationMixin._get_logits_warper() missing 1 required positional argument: 'device'

From this output, I suspected the bug was caused by a wrong version of lagent.
So I cloned the lagent repo and ran git checkout 511b03889010c4811b1701abb153e02b8e94fb5e (an old version). After pip install -e ., I reran the demo. It works!
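
In case it helps anyone hitting the same error: before pinning anything, a quick sanity check (assuming lagent is importable from the same environment) is to print the installed versions and the call site named in the traceback, to see whether it passes the device argument that newer transformers requires:

from importlib.metadata import version
import pathlib
import lagent.llms.huggingface as hf

# Installed versions of the two packages involved in the traceback.
print("transformers:", version("transformers"))
print("lagent:", version("lagent"))

# Show how the installed lagent calls the helper from the error message.
for line in pathlib.Path(hf.__file__).read_text().splitlines():
    if "_get_logits_warper" in line:
        print(line.strip())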