Hi, I'm trying to run multi-GPU inference with a Llama 3B model.
The code below works and gives good results in a single-GPU environment, but running exactly the same code in a multi-GPU environment fails with the error at the bottom of this post.
What am I missing?
import os
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
torch.cuda.empty_cache()
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
MODEL_ID = "local_path"
CSV_FILE = "mycsv.csv"
OUTPUT_FILE = "output.csv"
BATCH_SIZE = 10
DEFAULT_SCORE = 100000
# Load data
print("Loading CSV...")
data = pd.read_csv(CSV_FILE)
findings = data['col1'][:20]
impressions = data['col2'][:20]
summaries = data['col3'][:20]
# Load tokenizer and model
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")
tokenizer.pad_token_id = tokenizer.eos_token_id
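# Left padding + eos as the pad token, since batched generation with a decoder-only model needs both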
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    pad_token_id=tokenizer.eos_token_id,
    device_map="auto",
    torch_dtype=torch.float16,
)
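# (Optional sanity check I could add here, assuming accelerate sharded the model across
#  both GPUs: print(model.hf_device_map) to see which layers landed on cuda:0 vs cuda:1.)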
# Evaluation criteria
EVALUATION_CRITERIA = """
Evaluation Criteria:
- Relevance (1-5): The summary should include only important information from the findings and impressions.
- Coherence (1-5): The summary should be well-structured and logically organized.
- Consistency (1-5): The summary should align factually with the findings and impressions.
- Fluency (1-3): The summary should have proper grammar, spelling, punctuation, and sentence structure.
The lower the score, the worse the quality of the summary.
"""
# Define prompt template with special tokens
def create_prompt(finding, impression, summary):
    return my_manual_prompt  # actual prompt text omitted here
evaluation_results = []
print("Starting evaluation...")
progress_bar = tqdm(total=len(findings))
# Batch processing for evaluation
for i in range(0, len(findings), BATCH_SIZE):
    findings_batch = findings[i:i + BATCH_SIZE]
    impressions_batch = impressions[i:i + BATCH_SIZE]
    summaries_batch = summaries[i:i + BATCH_SIZE]

    # Create prompts for the batch
    prompts = [
        create_prompt(finding, impression, summary)
        for finding, impression, summary in zip(findings_batch, impressions_batch, summaries_batch)
    ]

    # Tokenize batch prompts
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)

    # Generate responses
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            pad_token_id=tokenizer.eos_token_id,
            max_new_tokens=100,
        )

    # Decode only the new tokens
    responses = tokenizer.batch_decode(
        outputs[:, inputs["input_ids"].shape[1]:],  # ignore the input prompt tokens
        skip_special_tokens=True,
    )

    # Parse responses
    for finding, impression, summary, prompt, response in zip(
        findings_batch, impressions_batch, summaries_batch, prompts, responses
    ):
        # Initialize scores
        relevance = coherence = consistency = fluency = DEFAULT_SCORE

        # Extract scores
        for line in response.split('\n'):
            line = line.strip()
            if line.startswith("Relevance:"):
                relevance = line.replace("Relevance:", "").strip()
            elif line.startswith("Coherence:"):
                coherence = line.replace("Coherence:", "").strip()
            elif line.startswith("Consistency:"):
                consistency = line.replace("Consistency:", "").strip()
            elif line.startswith("Fluency:"):
                fluency = line.replace("Fluency:", "").strip()

        # Validate scores
        try:
            # Ensure the variables are strings before using isdigit()
            relevance = int(str(relevance)) if str(relevance).isdigit() else DEFAULT_SCORE
            coherence = int(str(coherence)) if str(coherence).isdigit() else DEFAULT_SCORE
            consistency = int(str(consistency)) if str(consistency).isdigit() else DEFAULT_SCORE
            fluency = int(str(fluency)) if str(fluency).isdigit() else DEFAULT_SCORE
        except ValueError:
            relevance = coherence = consistency = fluency = DEFAULT_SCORE

        # Save results
        evaluation_results.append({
            "Findings": finding,
            "Impression": impression,
            "Summary": summary,
            "Prompt": prompt,
            "Raw Output": response,
            "Relevance": relevance,
            "Coherence": coherence,
            "Consistency": consistency,
            "Fluency": fluency,
        })

    progress_bar.update(len(findings_batch))

progress_bar.close()
# Save results to CSV
evaluation_df = pd.DataFrame(evaluation_results)
evaluation_df.to_csv(OUTPUT_FILE, index=False)
print(f"Evaluation results saved to '{OUTPUT_FILE}'.")
ERROR MESSAGE
... (more hidden) ...
.../aten/src/ATen/native/cuda/TensorCompare.cu:110: _assert_async_cuda_kernel: block: [0,0,0], thread: [0,0,0] Assertion `probability tensor contains either `inf`, `nan` or element < 0` failed.
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[1], line 88
86 # Generate responses
87 with torch.no_grad():
---> 88 outputs = model.generate(
89 input_ids=inputs["input_ids"],
90 attention_mask=inputs["attention_mask"],
91 pad_token_id=tokenizer.eos_token_id,
92 max_new_tokens=100,
93 )
95 # Decode only the new tokens
96 responses = tokenizer.batch_decode(
97 outputs[:, inputs["input_ids"].shape[1]:], # Ignore the input prompt tokens
98 skip_special_tokens=True,
99 )
File ~/miniconda3/envs/radiology/lib/python3.10/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
File ~/miniconda3/envs/radiology/lib/python3.10/site-packages/transformers/generation/utils.py:2215, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
2207 input_ids, model_kwargs = self._expand_inputs_for_generation(
2208 input_ids=input_ids,
2209 expand_size=generation_config.num_return_sequences,
2210 is_encoder_decoder=self.config.is_encoder_decoder,
2211 **model_kwargs,
2212 )
2214 # 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
-> 2215 result = self._sample(
2216 input_ids,
2217 logits_processor=prepared_logits_processor,
2218 stopping_criteria=prepared_stopping_criteria,
2219 generation_config=generation_config,
2220 synced_gpus=synced_gpus,
2221 streamer=streamer,
2222 **model_kwargs,
2223 )
2225 elif generation_mode in (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):
2226 # 11. prepare beam search scorer
2227 beam_scorer = BeamSearchScorer(
2228 batch_size=batch_size,
2229 num_beams=generation_config.num_beams,
(...)
2234 max_length=generation_config.max_length,
2235 )
File ~/miniconda3/envs/radiology/lib/python3.10/site-packages/transformers/generation/utils.py:3249, in GenerationMixin._sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
3247 probs = nn.functional.softmax(next_token_scores, dim=-1)
3248 # TODO (joao): this OP throws "skipping cudagraphs due to ['incompatible ops']", find solution
-> 3249 next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
3250 else:
3251 next_tokens = torch.argmax(next_token_scores, dim=-1)
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
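Following the hints in the traceback, what I plan to try next for narrowing this down (debugging ideas only, not a confirmed fix) is re-running with synchronous CUDA launches so the assert points at the real call site, and testing greedy decoding, since the crash happens inside the sampling branch (torch.multinomial):

# set before torch initializes CUDA (e.g. in the first notebook cell)
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# and, as a separate test, disable sampling in the generate() call
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=100,
    do_sample=False,  # greedy decoding, skips torch.multinomial
)

Any pointers on why the probability tensor would contain inf/nan only in the multi-GPU setup would be appreciated.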