Exactly the same code: single GPU OK, multi GPU ERROR

Hi, I’m trying to run multi-GPU inference code with a Llama 3B model.

I get good results when I run it in a single-GPU environment, but running exactly the same code in a multi-GPU environment fails with the error below.

What am I missing?

import os
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

torch.cuda.empty_cache()

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # expose two GPUs to this process

MODEL_ID = "local_path"
CSV_FILE = "mycsv.csv"
OUTPUT_FILE = "output.csv"
BATCH_SIZE = 10
DEFAULT_SCORE = 100000  # sentinel used when a score cannot be parsed from the model output

# Load data
print("Loading CSV...")
data = pd.read_csv(CSV_FILE)
findings = data['col1'][:20]
impressions = data['col2'][:20]
summaries = data['col3'][:20]

# Load tokenizer and model
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")  # left-pad so batched decoder-only generation works
tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    pad_token_id=tokenizer.eos_token_id,
    device_map="auto",  # let accelerate shard the model across the visible GPUs
    torch_dtype=torch.float16,
)

# Evaluation criteria
EVALUATION_CRITERIA = """
Evaluation Criteria:
- Relevance (1-5): The summary should include only important information from the findings and impressions.
- Coherence (1-5): The summary should be well-structured and logically organized.
- Consistency (1-5): The summary should align factually with the findings and impressions.
- Fluency (1-3): The summary should have proper grammar, spelling, punctuation, and sentence structure.

The lower the score, the worse the quality of the summary.
"""

# Define prompt template with special tokens
def create_prompt(finding, impression, summary):
    return (my_manual_prompt)

evaluation_results = []

print("Starting evaluation...")
progress_bar = tqdm(total=len(findings))

# Batch processing for evaluation
for i in range(0, len(findings), BATCH_SIZE):
    findings_batch = findings[i:i + BATCH_SIZE]
    impressions_batch = impressions[i:i + BATCH_SIZE]
    summaries_batch = summaries[i:i + BATCH_SIZE]
    
    # Create prompts for the batch
    prompts = [
        create_prompt(finding, impression, summary)
        for finding, impression, summary in zip(findings_batch, impressions_batch, summaries_batch)
    ]

    # Tokenize batch prompts
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)

    # Generate responses
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            pad_token_id=tokenizer.eos_token_id,
            max_new_tokens=100,
        )

    # Decode only the new tokens
    responses = tokenizer.batch_decode(
        outputs[:, inputs["input_ids"].shape[1]:],  # Ignore the input prompt tokens
        skip_special_tokens=True,
    )

    # Parse responses
    for finding, impression, summary, prompt, response in zip(
        findings_batch, impressions_batch, summaries_batch, prompts, responses
    ):
        # Initialize scores
        relevance = coherence = consistency = fluency = DEFAULT_SCORE

        # Extract scores
        for line in response.split('\n'):
            line = line.strip()
            if line.startswith("Relevance:"):
                relevance = line.replace("Relevance:", "").strip()
            elif line.startswith("Coherence:"):
                coherence = line.replace("Coherence:", "").strip()
            elif line.startswith("Consistency:"):
                consistency = line.replace("Consistency:", "").strip()
            elif line.startswith("Fluency:"):
                fluency = line.replace("Fluency:", "").strip()

        # Validate scores
        try:
            # Ensure the variables are strings before using isdigit()
            relevance = int(str(relevance)) if str(relevance).isdigit() else DEFAULT_SCORE
            coherence = int(str(coherence)) if str(coherence).isdigit() else DEFAULT_SCORE
            consistency = int(str(consistency)) if str(consistency).isdigit() else DEFAULT_SCORE
            fluency = int(str(fluency)) if str(fluency).isdigit() else DEFAULT_SCORE
        except ValueError:
            relevance = coherence = consistency = fluency = DEFAULT_SCORE


        # Save results
        evaluation_results.append({
            "Findings": finding,
            "Impression": impression,
            "Summary": summary,
            "Prompt": prompt,
            "Raw Output": response,
            "Relevance": relevance,
            "Coherence": coherence,
            "Consistency": consistency,
            "Fluency": fluency
        })

    progress_bar.update(len(findings_batch))

progress_bar.close()

# Save results to CSV
evaluation_df = pd.DataFrame(evaluation_results)
evaluation_df.to_csv(OUTPUT_FILE, index=False)

print(f"Evaluation results saved to '{OUTPUT_FILE}'.")

ERROR MESSAGE

...
../aten/src/ATen/native/cuda/TensorCompare.cu:110: _assert_async_cuda_kernel: block: [0,0,0], thread: [0,0,0] Assertion `probability tensor contains either `inf`, `nan` or element < 0` failed.
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[1], line 88
     86 # Generate responses
     87 with torch.no_grad():
---> 88     outputs = model.generate(
     89         input_ids=inputs["input_ids"],
     90         attention_mask=inputs["attention_mask"],
     91         pad_token_id=tokenizer.eos_token_id,
     92         max_new_tokens=100,
     93     )
     95 # Decode only the new tokens
     96 responses = tokenizer.batch_decode(
     97     outputs[:, inputs["input_ids"].shape[1]:],  # Ignore the input prompt tokens
     98     skip_special_tokens=True,
     99 )

File ~/miniconda3/envs/radiology/lib/python3.10/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    113 @functools.wraps(func)
    114 def decorate_context(*args, **kwargs):
    115     with ctx_factory():
--> 116         return func(*args, **kwargs)

File ~/miniconda3/envs/radiology/lib/python3.10/site-packages/transformers/generation/utils.py:2215, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
   2207     input_ids, model_kwargs = self._expand_inputs_for_generation(
   2208         input_ids=input_ids,
   2209         expand_size=generation_config.num_return_sequences,
   2210         is_encoder_decoder=self.config.is_encoder_decoder,
   2211         **model_kwargs,
   2212     )
   2214     # 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
-> 2215     result = self._sample(
   2216         input_ids,
   2217         logits_processor=prepared_logits_processor,
   2218         stopping_criteria=prepared_stopping_criteria,
   2219         generation_config=generation_config,
   2220         synced_gpus=synced_gpus,
   2221         streamer=streamer,
   2222         **model_kwargs,
   2223     )
   2225 elif generation_mode in (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):
   2226     # 11. prepare beam search scorer
   2227     beam_scorer = BeamSearchScorer(
   2228         batch_size=batch_size,
   2229         num_beams=generation_config.num_beams,
   (...)
   2234         max_length=generation_config.max_length,
   2235     )

File ~/miniconda3/envs/radiology/lib/python3.10/site-packages/transformers/generation/utils.py:3249, in GenerationMixin._sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
   3247     probs = nn.functional.softmax(next_token_scores, dim=-1)
   3248     # TODO (joao): this OP throws "skipping cudagraphs due to ['incompatible ops']", find solution
-> 3249     next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
   3250 else:
   3251     next_tokens = torch.argmax(next_token_scores, dim=-1)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
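
The traceback suggests rerunning with CUDA_LAUNCH_BLOCKING=1 so the error surfaces at the call that actually failed instead of at a later asynchronous point. A minimal sketch of how that can be set (it has to happen before torch initializes CUDA, e.g. at the very top of the script, or exported in the shell before launching):

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # force synchronous CUDA kernel launches so the stack trace points at the failing op

import torch  # import torch only after the variable is set
# ... rest of the script unchanged ...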